Source code for schrodinger.protein.gpcr.update_gpcr_db
"""
This script downloads sequence and residue data from the GPCR DB and stores it
in a sqlite database.
Copyright Schrodinger, LLC. All rights reserved.
"""
import contextlib
import itertools
import json
import uuid
from schrodinger.utils import fileutils
from . import gpcrdb
from . import sql
def create_entry_database():
    """
    Create a sqlite database of sequences and residues from the GPCR DB.

    The sqlite database is created using a random (UUID-based) filename with
    no directory component and must be moved to the appropriate location for
    use.

    :return: Path to sqlite database
    :rtype: str
    """
    db_filename = f"{uuid.uuid4()}.sqlite"
    conn = _init_database(db_filename)
    # Everything that can fail after the connection is opened happens inside
    # the closing() guard so the connection is always released.
    with contextlib.closing(conn):
        cur = conn.cursor()
        row_gen = gpcrdb.download_all_entry_data()
        # Maps residue row data -> residue primary key so that identical
        # residues are stored once and shared between entries.
        all_residues = {}
        next_residue_pk, next_entry_pk = 1, 1
        for i, row in enumerate(row_gen):
            next_residue_pk, next_entry_pk = _insert_row(
                cur,
                row,
                all_residues=all_residues,
                next_residue_pk=next_residue_pk,
                next_entry_pk=next_entry_pk)
            # Commit in batches rather than once per entry.
            if i % 100 == 0:
                conn.commit()
        # Closing a sqlite3 connection does not commit pending changes, so
        # flush the rows inserted since the last periodic commit; otherwise
        # the tail of the download would be lost. This is a harmless no-op if
        # nothing is pending.
        conn.commit()
    return db_filename
def _insert_row(cur,
                row_data,
                all_residues,
                next_residue_pk=1,
                next_entry_pk=1):
    """
    Store one GPCR DB entry together with its residues.

    Residues whose data has already been stored (by this or an earlier call)
    are not inserted again; the entry is linked to the existing residue row
    instead.

    :param cur: Database cursor
    :type cur: sqlite3.Cursor

    :param row_data: Entry data, unpacked as
        ``(entry_name, res_number_scheme, sequence, families, residues)``
    :type row_data: tuple

    :param all_residues: Dict of residue primary keys, keyed by the residue
        row data. Used to reduce duplicate data in the database to save
        space. Updated in place with any residues inserted by this call.
    :type all_residues: dict

    :param next_residue_pk: Next primary key to use in the residue table
    :type next_residue_pk: int

    :param next_entry_pk: Primary key to use for this entry in the entry table
    :type next_entry_pk: int

    :return: The next unused residue primary key and the next unused entry
        primary key
    :rtype: tuple(int, int)
    """
    entry_name, res_number_scheme, sequence, families, residues = row_data

    # Resolve every residue to a primary key, inserting the ones not seen yet
    linked_residue_pks = []
    for residue in residues:
        residue_values = tuple(residue[key] for key in sql.RESIDUES_KEYS)
        residue_pk = all_residues.get(residue_values)
        if residue_pk is None:
            residue_pk = next_residue_pk
            cur.execute(sql.INSERT_RESIDUE_SQL, [residue_pk, *residue_values])
            all_residues[residue_values] = residue_pk
            next_residue_pk += 1
        linked_residue_pks.append(residue_pk)

    # Store the entry itself; the families are serialized as a JSON string
    entry_values = [
        next_entry_pk,
        entry_name,
        res_number_scheme,
        sequence,
        json.dumps(families),
    ]
    cur.execute(sql.INSERT_ENTRY_SQL, entry_values)

    # Link the entry to each of its residues, in residue order
    entry_residue_links = [(next_entry_pk, pk) for pk in linked_residue_pks]
    cur.executemany(sql.INSERT_ENTRY_RESIDUES_SQL, entry_residue_links)

    return next_residue_pk, next_entry_pk + 1
def _init_database(filename):
    """
    Open the database file and set up its tables.

    The schema script (``sql.CREATE_SQL``) is executed and committed before
    the connection is handed back to the caller, who is responsible for
    closing it.

    :param filename: Path of the database file to open
    :type filename: str

    :return: Open connection to the initialized database
    :rtype: sqlite3.Connection
    """
    connection = sql.open_database(filename)
    connection.cursor().executescript(sql.CREATE_SQL)
    connection.commit()
    return connection
def main():
    """
    Script entry point: build a fresh GPCR DB sqlite database.

    The path of the generated file is currently not used any further.
    """
    # TODO move database file to the appropriate location (TBD)
    database_path = create_entry_database()
# Only build the database when this module is run as a script, not when it
# is imported.
if __name__ == "__main__":
    main()