Source code for schrodinger.livedesign.molhash

import enum
import hashlib
import json
import logging
import re

from rdkit import Chem
from rdkit.Chem import EnumerateStereoisomers
from rdkit.Chem import rdDepictor
from rdkit.Chem import rdMolHash

ATOM_PROP_MAP_NUMBER = 'molAtomMapNumber'

logger = logging.getLogger(__name__)

rdDepictor.SetPreferCoordGen(True)

EXTRA_ISOTOPE_REMOVAL_REGEX = re.compile(r'\[[1-3]0*([1-9]?[0-9]*[A-Z][a-z]?)@')
ENHANCED_STEREO_GROUP_REGEX = re.compile(r'((?:a|[&o]\d+):\d+(?:,\d+)*)')

ENHANCED_STEREO_GROUP_WEIGHTS = {
    Chem.StereoGroupType.STEREO_AND: 1000,
    Chem.StereoGroupType.STEREO_OR: 2000,
    Chem.StereoGroupType.STEREO_ABSOLUTE: 3000,
}


[docs]class HashLayer(enum.Enum):
    CANONICAL_SMILES = enum.auto()
    ENHANCED_STEREO_LABEL = enum.auto()
    ESCAPE = enum.auto()
    FORMULA = enum.auto()
    NO_STEREO_SMILES = enum.auto()
    NO_STEREO_TAUTOMER_HASH = enum.auto()
    SGROUP_DATA = enum.auto()
    TAUTOMER_HASH = enum.auto()


[docs]@enum.unique
class HashScheme(enum.Enum):
    ALL_LAYERS = tuple(HashLayer)
    NO_STEREO_LAYERS = (HashLayer.ESCAPE, HashLayer.FORMULA,
                        HashLayer.NO_STEREO_SMILES,
                        HashLayer.NO_STEREO_TAUTOMER_HASH)
    TAUTOMERIC_LAYERS = (HashLayer.ESCAPE, HashLayer.FORMULA,
                         HashLayer.NO_STEREO_TAUTOMER_HASH,
                         HashLayer.TAUTOMER_HASH)


[docs]def get_molhash(all_layers, hash_scheme: HashScheme = HashScheme.ALL_LAYERS):
    """
    Generates a molecular hash using a specified set of layers.
    See https://docs.google.com/document/d/13MLGoCPAjD6VhytcJp-F9MHhiEdmNmpjG5blraAtI8s/edit?usp=sharing
    for explanation the layer idea.

    :param mol: the molecule to generate the hash for
    :param hash_scheme: a tuple containing the information layers to be considered for the hash
    """

    h = hashlib.sha1()
    for layer in hash_scheme.value:
        h.update(all_layers[layer].encode())
    return h.hexdigest()


[docs]def get_mol_layers(original_molecule, sgroup_data="", escape=""):
    """returns a NamedTuple with the layers for a molecule that might be useful in a hash

    *NOTE* the enhanced_stereo_label layer produced by this code uses the atom numbers from the canonical_smiles layer.
    It is not suitable for combining with the tautomer layer!


    TODOs:
      - The sgroup_data layer is just being treated as a string (i.e. not canonicalized) here

    """
    # Work on a copy with all non-stereogenic Hydrogens removed
    molecule = Chem.RemoveHs(original_molecule,
                             updateExplicitCount=True,
                             sanitize=False)
    molecule.UpdatePropertyCache(False)
    molecule = strip_atom_map_labels(molecule)

    formula = rdMolHash.MolHash(molecule, rdMolHash.HashFunction.MolFormula)
    cxsmiles, canonical_mol = canonicalize_stereo_groups(molecule)
    tautomer_hash = get_stereo_tautomer_hash(canonical_mol)

    canonical_smiles, enhanced_stereo_label = get_canonical_smiles(cxsmiles)

    sgroup_data = canonicalize_sgroups(canonical_mol)
    (no_stereo_tautomer_hash,
     no_stereo_smiles) = get_no_stereo_layers(canonical_mol, canonical_smiles,
                                              tautomer_hash)
    return {
        HashLayer.CANONICAL_SMILES: canonical_smiles,
        HashLayer.ENHANCED_STEREO_LABEL: enhanced_stereo_label,
        HashLayer.ESCAPE: escape,
        HashLayer.FORMULA: formula,
        HashLayer.NO_STEREO_SMILES: no_stereo_smiles,
        HashLayer.NO_STEREO_TAUTOMER_HASH: no_stereo_tautomer_hash,
        HashLayer.SGROUP_DATA: sgroup_data,
        HashLayer.TAUTOMER_HASH: tautomer_hash,
    }


[docs]def strip_atom_map_labels(molecule):
    for at in molecule.GetAtoms():
        try:
            at.ClearProp(ATOM_PROP_MAP_NUMBER)
        except KeyError:
            pass
    return molecule


[docs]def get_stereo_tautomer_hash(molecule):
    # SHARED-7909: workaround for https://github.com/rdkit/rdkit/issues/4222
    # This can be removed once we update to an RDKit version without this bug.
    no_h_mol = Chem.RemoveAllHs(molecule, sanitize=False)
    no_h_mol.UpdatePropertyCache(False)
    return rdMolHash.MolHash(no_h_mol, rdMolHash.HashFunction.HetAtomTautomer)


[docs]def get_canonical_smiles(cxsmiles):
    smiles_parts = (p.strip() for p in cxsmiles.split('|'))
    smiles_parts = [p for p in smiles_parts if p]
    if not smiles_parts:
        return '', ''
    elif len(smiles_parts) > 2:
        raise ValueError('Unexpected number of fragments in canonical CXSMILES')

    canonical_smiles = smiles_parts[0]
    stereo_groups = ''
    if len(smiles_parts) == 2:
        # note: as with many regex-based things, this is fragile
        groups = ENHANCED_STEREO_GROUP_REGEX.findall(smiles_parts[1])
        if groups:
            # We might have other CXSMILES extensions that aren't stereo groups
            stereo_groups = f"|{','.join(sorted(groups))}|"

    return canonical_smiles, stereo_groups


[docs]def get_no_stereo_layers(molecule, canonical_smiles, tautomer):
    # SHARED-7909: Strip all Hs, including stereogenic ones.
    no_stereo_mol = Chem.RemoveAllHs(molecule, sanitize=False)
    no_stereo_mol.UpdatePropertyCache(False)
    for atom in no_stereo_mol.GetAtoms():
        if atom.GetChiralTag() != Chem.ChiralType.CHI_UNSPECIFIED:
            atom.SetChiralTag(Chem.ChiralType.CHI_UNSPECIFIED)
    for bond in no_stereo_mol.GetBonds():
        if bond.GetStereo() != Chem.BondStereo.STEREONONE:
            bond.SetStereo(Chem.BondStereo.STEREONONE)
        if bond.GetBondDir() != Chem.BondDir.NONE:
            bond.SetBondDir(Chem.BondDir.NONE)
    no_stereo_tautomer_hash = rdMolHash.MolHash(
        no_stereo_mol, rdMolHash.HashFunction.HetAtomTautomer)
    no_stereo_smiles = rdMolHash.MolHash(no_stereo_mol,
                                         rdMolHash.HashFunction.CanonicalSmiles)
    return (no_stereo_tautomer_hash, no_stereo_smiles)


[docs]def get_canonical_atom_ranks_and_bonds(mol, useSmilesOrdering=True):
    """
    returns a 2-tuple with:

    1. the canonical ranks of a molecule's atoms
    2. the bonds expressed as (canonical_atom_rank_1,canonical_atom_rank_2) where
       canonical_atom_rank_1 < canonical_atom_rank_2

    If useSmilesOrdering is True then the atom indices here correspond to the order of
    the atoms in the canonical SMILES, otherwise just the canonical atom order is used.
    useSmilesOrdering=True is a bit slower, but it allows the output to be linked to the
    canonical SMILES, which can be useful.

    """
    if mol.GetNumAtoms() == 0:
        return [], []
    if not useSmilesOrdering:
        atRanks = list(Chem.CanonicalRankAtoms(mol))
    else:
        smi = Chem.MolToSmiles(mol)
        ordertxt = mol.GetProp("_smilesAtomOutputOrder")
        smiOrder = [int(x) for x in ordertxt[1:-1].split(",") if x]
        atRanks = [0] * len(smiOrder)
        for i, idx in enumerate(smiOrder):
            atRanks[idx] = i
    # get the bonds in canonical form
    bndOrder = []
    for bnd in mol.GetBonds():
        bo = atRanks[bnd.GetBeginAtomIdx()]
        eo = atRanks[bnd.GetEndAtomIdx()]
        if bo > eo:
            bo, eo = eo, bo
        bndOrder.append((bo, eo))
    return atRanks, bndOrder


[docs]def canonicalize_data_sgroup(sg,
                             atRanks,
                             bndOrder,
                             fieldNames=None,
                             sortAtomOrder=True):
    """
    NOTES: if sortAtomOrder is true then the atom list will be sorted.
    This assumes that the order of the atoms in that list is not important

    """
    if sg.GetProp("TYPE") != "DAT" or not sg.HasProp("FIELDNAME"):
        return None

    canonicalizableFieldNames = fieldNames or ["Atrop"]
    fieldName = sg.GetProp("FIELDNAME")
    if fieldName not in canonicalizableFieldNames:
        return None

    data = sg.GetStringVectProp("DATAFIELDS")
    if len(data) > 1:
        raise ValueError(
            "cannot canonicalize data groups with multiple data fields")
    data = data[0]
    ats = tuple(atRanks[x] for x in sg.GetAtoms())
    if sortAtomOrder:
        ats = tuple(sorted(ats))
    bnds = tuple(bndOrder[x] for x in sg.GetBonds())

    res = dict(fieldName=fieldName, atom=ats, bonds=bnds, value=data)
    return res


[docs]def getCanonicalBondRep(bond, atomRanks):
    aid1 = bond.GetBeginAtomIdx()
    aid2 = bond.GetEndAtomIdx()
    if atomRanks[aid1] > atomRanks[aid2] or (atomRanks[aid1] == atomRanks[aid2]
                                             and aid1 > aid2):
        aid1, aid2 = aid2, aid1
    return (aid1, aid2)


[docs]def canonicalize_sru_sgroup(mol, sg, atRanks, bndOrder, sortAtomAndBondOrder):
    """
    NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted.
    This assumes that the ordering of those lists is not important

    """
    if sg.GetProp("TYPE") != "SRU":
        return None
    ats = tuple(atRanks[x] for x in sg.GetAtoms())
    bnds = tuple(bndOrder[x] for x in sg.GetBonds())
    if sortAtomAndBondOrder:
        ats = tuple(sorted(ats))
        bnds = tuple(sorted(bnds))
    props = sg.GetPropsAsDict()
    res = dict(
        type="SRU",
        atoms=ats,
        bonds=bnds,
        index=props.get("index", 0),
        connect=props.get("CONNECT", ""),
        label=props.get("LABEL", ""),
    )
    if "PARENT" in props:
        res["PARENT"] = props["PARENT"]
    if "XBHEAD" in props:
        xbhbonds = tuple(
            getCanonicalBondRep(mol.GetBondWithIdx(x), atRanks)
            for x in props["XBHEAD"])
        if sortAtomAndBondOrder:
            xbhbonds = tuple(sorted(xbhbonds))
        res["XBHEAD"] = xbhbonds
    if "XBCORR" in props:
        xbcorrbonds = tuple(
            getCanonicalBondRep(mol.GetBondWithIdx(x), atRanks)
            for x in props["XBCORR"])
        if len(xbcorrbonds) % 2:
            raise ValueError("XBCORR should have 2N bonds")
        if sortAtomAndBondOrder:
            # these are pairs, so we need to sort them as such
            tmp = []
            for i in range(0, len(xbcorrbonds), 2):
                b1, b2 = xbcorrbonds[i], xbcorrbonds[i + 1]
                if b1 > b2:
                    b1, b2 = b2, b1
                tmp.append((b1, b2))
            xbcorrbonds = tuple(sorted(tmp))
        res["XBCORR"] = xbcorrbonds
    return res


[docs]def canonicalize_cop_sgroup(mol, sg, atRanks, bndOrder, sortAtomAndBondOrder):
    """
    NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted.
    This assumes that the ordering of those lists is not important
    """
    if sg.GetProp("TYPE") != "COP":
        return None
    ats = tuple(atRanks[x] for x in sg.GetAtoms())
    if sortAtomAndBondOrder:
        ats = tuple(sorted(ats))
    props = sg.GetPropsAsDict()
    # we are intentionally not doing the label here. Marvin adds one, biovia draw does not
    res = dict(type="COP", atoms=ats, index=props.get("index", 0))
    return res


[docs]def canonicalize_sgroups(mol, dataFieldNames=None, sortAtomAndBondOrder=True):
    """
    NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted.
    This assumes that the ordering of those lists is not important
    """
    dataFieldNames = dataFieldNames or ["Atrop"]
    atRanks, bndOrder = get_canonical_atom_ranks_and_bonds(mol)
    res = []
    for sg in Chem.GetMolSubstanceGroups(mol):
        lres = None
        if sg.GetProp("TYPE") == "DAT":
            lres = canonicalize_data_sgroup(sg, atRanks, bndOrder,
                                            dataFieldNames,
                                            sortAtomAndBondOrder)
        elif sg.GetProp("TYPE") == "SRU":
            lres = canonicalize_sru_sgroup(mol, sg, atRanks, bndOrder,
                                           sortAtomAndBondOrder)
        elif sg.GetProp("TYPE") == "COP":
            lres = canonicalize_cop_sgroup(mol, sg, atRanks, bndOrder,
                                           sortAtomAndBondOrder)

        if lres is not None:
            res.append(lres)
    if len(res) > 1:
        # we need to sort them, but dicts don't let you do that directly:
        tres = sorted(tuple(x.items()) for x in res)
        res = tuple(dict(x) for x in tres)
        # update "index" and "PARENT" props
        idxmap = {}
        for i, itm in enumerate(res):
            if "index" in itm:
                idxmap[itm["index"]] = i + 1
                itm["index"] = i + 1
        for i, itm in enumerate(res):
            if "PARENT" in itm:
                itm["PARENT"] = idxmap[itm["PARENT"]]

    return json.dumps(res)


[docs]class EnhancedStereoUpdateMode(enum.Enum):
    ADD_WEIGHTS = enum.auto()
    REMOVE_WEIGHTS = enum.auto()


[docs]def update_enhanced_stereo_group_weights(mol, mode):
    if mode == EnhancedStereoUpdateMode.ADD_WEIGHTS:
        factor = 1
    elif mode == EnhancedStereoUpdateMode.REMOVE_WEIGHTS:
        factor = -1
    else:
        raise ValueError('Invalid Enhanced Stereo weight update mode')

    isotopesModified = False
    stgs = mol.GetStereoGroups()
    for stg in stgs:
        stgt = stg.GetGroupType()
        weight = factor * ENHANCED_STEREO_GROUP_WEIGHTS[stgt]
        for at in stg.GetAtoms():

            # Make sure the isotope is reasonable and is not present in more than
            # one stereo group, and we are not messing it up
            isotope = at.GetIsotope()
            if mode == EnhancedStereoUpdateMode.ADD_WEIGHTS and isotope >= 100:
                raise ValueError(
                    f'Atom {at.GetIdx()} is an unusual isotope ({isotope})')

            at.SetIsotope(isotope + weight)
            isotopesModified = True

    return mol, isotopesModified


[docs]def canonicalize_stereo_groups(mol):
    """
    Returns canonical CXSmiles and the corresponding molecule with the
    stereo groups canonicalized.

    The RDKit canonicalization code does not currently take stereo groups into
    account. We work around that by using EnumerateStereoisomers() to generate
    all possible instances of the molecule's stereogroups and then lexically
    compare the CXSMILES of those.
    """

    if not len(mol.GetStereoGroups()):
        return Chem.MolToCXSmiles(mol), mol

    mol = Chem.Mol(mol)

    # To solve the problem we add isotope tags to the atoms involved in stereo
    # groups. These allow the canonicalization to tell the difference between
    # AND and OR (and have the happy side effect of allowing the
    # presence/absence of a stereo group to affect the canonicalization)
    mol, isotopesModified = update_enhanced_stereo_group_weights(
        mol, EnhancedStereoUpdateMode.ADD_WEIGHTS)

    # We're going to be generating canonical SMILES here anyway, so skip the
    # "unique" option for the sake of efficiency
    opts = EnumerateStereoisomers.StereoEnumerationOptions()
    opts.onlyStereoGroups = True
    opts.unique = False

    res = None
    res_csmi = ''
    for isomer in EnumerateStereoisomers.EnumerateStereoisomers(mol, opts):
        # We need to generate the canonical CXSMILES for the molecule with
        # the isotope tags.
        csmi = Chem.MolToCXSmiles(isomer)
        if res is None or csmi < res_csmi:
            res = isomer
            res_csmi = csmi

    res_csmi = EXTRA_ISOTOPE_REMOVAL_REGEX.sub(r'[\1@', res_csmi)

    if isotopesModified:
        res, _ = update_enhanced_stereo_group_weights(
            res, EnhancedStereoUpdateMode.REMOVE_WEIGHTS)

    return res_csmi, res