Source code for schrodinger.protein.gpcr.update_gpcr_db

"""
This script downloads sequence and residue data from the GPCR DB and stores it
in a sqlite database.

Copyright Schrodinger, LLC. All rights reserved.
"""

import contextlib
import itertools
import json
import uuid

from schrodinger.utils import fileutils

from . import gpcrdb
from . import sql


def create_entry_database():
    """
    Create a sqlite database of sequences and residues from the GPCR DB.

    The sqlite database is created using a random (UUID-based) filename and
    must be moved to the appropriate location for use.

    :return: Path to sqlite database
    :rtype: str
    """
    db_filename = f"{uuid.uuid4()}.sqlite"
    conn = _init_database(db_filename)
    # Enter the closing() context before anything else that can fail (the
    # download, cursor creation) so the connection is always released.
    with contextlib.closing(conn):
        cur = conn.cursor()
        # Residue primary keys keyed by residue row data; shared across all
        # entries so identical residues are stored only once.
        all_residues = {}
        next_residue_pk, next_entry_pk = 1, 1
        for i, row in enumerate(gpcrdb.download_all_entry_data()):
            next_residue_pk, next_entry_pk = _insert_row(
                cur,
                row,
                all_residues=all_residues,
                next_residue_pk=next_residue_pk,
                next_entry_pk=next_entry_pk)
            # Commit periodically rather than holding one huge transaction.
            if i % 100 == 0:
                conn.commit()
        # Flush the rows inserted since the last periodic commit.
        # contextlib.closing() only calls conn.close(), and closing a
        # sqlite3 connection does not commit a pending transaction, so
        # without this the trailing rows would be lost.
        conn.commit()
    return db_filename
def _insert_row(cur,
                row_data,
                all_residues,
                next_residue_pk=1,
                next_entry_pk=1):
    """
    Write one GPCR DB entry, its previously unseen residues, and the
    entry-to-residue links to the database.

    :param cur: Database cursor
    :type cur: sqlite3.Cursor

    :param row_data: Entry data, unpacked as ``(entry_name,
        res_number_scheme, sequence, families, residues)``
    :type row_data: tuple

    :param all_residues: Dict of residue primary keys, keyed by the residue
        row data. Updated in place with any residue inserted here, so that
        identical residues are stored only once.
    :type all_residues: dict

    :param next_residue_pk: Next primary key to use in the residue table
    :type next_residue_pk: int

    :param next_entry_pk: Primary key to use for this entry
    :type next_entry_pk: int

    :return: The next unused residue primary key and the next unused entry
        primary key
    :rtype: tuple(int, int)
    """
    entry_name, res_number_scheme, sequence, families, residues = row_data

    # Resolve every residue of this entry to a primary key, inserting a new
    # residue row only the first time a given set of values is seen.
    linked_pks = []
    for residue in residues:
        residue_key = tuple(residue[field] for field in sql.RESIDUES_KEYS)
        residue_pk = all_residues.get(residue_key)
        if residue_pk is None:
            residue_pk = next_residue_pk
            cur.execute(sql.INSERT_RESIDUE_SQL, [residue_pk, *residue_key])
            all_residues[residue_key] = residue_pk
            next_residue_pk += 1
        linked_pks.append(residue_pk)

    # The entry row itself; families are stored as a JSON string.
    families_json = json.dumps(families)
    entry_values = [
        next_entry_pk, entry_name, res_number_scheme, sequence, families_json
    ]
    cur.execute(sql.INSERT_ENTRY_SQL, entry_values)

    # One (entry pk, residue pk) link row per residue of the entry.
    link_rows = zip(itertools.repeat(next_entry_pk), linked_pks)
    cur.executemany(sql.INSERT_ENTRY_RESIDUES_SQL, link_rows)

    return next_residue_pk, next_entry_pk + 1


def _init_database(filename):
    """
    Open the database at ``filename`` and run the table-creation script.

    :param filename: Database file to open
    :type filename: str

    :return: The open connection, with the creation script committed
    :rtype: sqlite3.Connection
    """
    connection = sql.open_database(filename)
    connection.cursor().executescript(sql.CREATE_SQL)
    connection.commit()
    return connection
def main():
    """
    Script entry point: build a fresh sqlite database from the GPCR DB.
    """
    db_file = create_entry_database()
    # TODO move database file to the appropriate location (TBD)


if __name__ == "__main__":
    main()