# Source code for schrodinger.protein.gpcr.update_gpcr_db

"""
This script downloads sequence and residue data from the GPCR DB and stores it
in a sqlite database.

Copyright Schrodinger, LLC. All rights reserved.
"""

import contextlib
import itertools
import json
import uuid

from schrodinger.utils import fileutils

from . import gpcrdb
from . import sql


def create_entry_database():
    """
    Create a sqlite database of sequences and residues from the GPCR DB.

    The sqlite database is created using a random filename and must be moved
    to the appropriate location for use.

    Rows are committed periodically while the data is downloaded, and once
    more after the last row so that entries inserted since the previous
    periodic commit are not lost when the connection is closed.

    :return: Path to sqlite database
    :rtype: str
    """
    db_filename = f"{uuid.uuid4()}.sqlite"
    conn = _init_database(db_filename)

    # Everything that happens after the connection is opened runs inside the
    # `closing` block so the connection is released even if starting the
    # download or creating the cursor fails.
    with contextlib.closing(conn):
        row_gen = gpcrdb.download_all_entry_data()
        cur = conn.cursor()
        # Shared across all entries so identical residue rows are stored once.
        all_residues = dict()
        next_residue_pk, next_entry_pk = 1, 1
        for i, row in enumerate(row_gen):
            next_residue_pk, next_entry_pk = _insert_row(
                cur,
                row,
                all_residues=all_residues,
                next_residue_pk=next_residue_pk,
                next_entry_pk=next_entry_pk)
            # Commit in batches rather than once per entry.
            if i % 100 == 0:
                conn.commit()
        # Closing the connection does not commit pending changes, so flush
        # the rows inserted since the last periodic commit.
        conn.commit()
    return db_filename


def _insert_row(cur,
                row_data,
                all_residues,
                next_residue_pk=1,
                next_entry_pk=1):
    """
    Store one GPCR DB entry, its residues and the links between them.

    Residues whose data has already been written for an earlier entry are
    not inserted again; the entry is linked to the existing residue row.

    :param cur: Database cursor
    :type cur: sqlite3.Cursor

    :param row_data: Entry data, unpacked as (entry name, residue numbering
        scheme, sequence, families, residues)
    :type row_data: tuple

    :param all_residues: Dict of residue primary keys, keyed by the residue
        row data. Updated in place with every residue inserted here. Used to
        reduce duplicate data in the database to save space.
    :type all_residues: dict

    :param next_residue_pk: Next primary key to use in the residue table
    :type next_residue_pk: int

    :param next_entry_pk: Primary key to use for this entry in the entry table
    :type next_entry_pk: int

    :return: The next unused residue primary key and the next unused entry
        primary key
    :rtype: tuple(int, int)
    """
    entry_name, res_number_scheme, sequence, families, residues = row_data

    # Resolve each residue to a primary key, inserting the ones not seen yet.
    residue_pks = []
    for residue in residues:
        residue_key = tuple(residue[column] for column in sql.RESIDUES_KEYS)
        pk = all_residues.get(residue_key)
        if pk is None:
            pk = next_residue_pk
            cur.execute(sql.INSERT_RESIDUE_SQL, [pk, *residue_key])
            all_residues[residue_key] = pk
            next_residue_pk += 1
        residue_pks.append(pk)

    # The families are stored as a JSON string in the entry row.
    entry_values = [
        next_entry_pk,
        entry_name,
        res_number_scheme,
        sequence,
        json.dumps(families),
    ]
    cur.execute(sql.INSERT_ENTRY_SQL, entry_values)

    # One link row per residue of this entry, in sequence order.
    entry_residue_links = [(next_entry_pk, pk) for pk in residue_pks]
    cur.executemany(sql.INSERT_ENTRY_RESIDUES_SQL, entry_residue_links)
    return next_residue_pk, next_entry_pk + 1


def _init_database(filename):
    """
    Open the database at the given path and create its tables.

    :param filename: Path of the database file to open
    :type filename: str

    :return: Open connection with the table-creation script committed
    :rtype: sqlite3.Connection
    """
    connection = sql.open_database(filename)
    # Run the whole schema script, then persist it before handing the
    # connection back.
    connection.cursor().executescript(sql.CREATE_SQL)
    connection.commit()
    return connection


def main():
    """
    Script entry point: build the GPCR DB sqlite database.
    """
    # The database is written to a randomly named file; its path is kept
    # here for the pending relocation step.
    # TODO move database file to the appropriate location (TBD)
    db_file = create_entry_database()


# Run the database update when this module is executed as a script.
if __name__ == "__main__":
    main()