import enum
import hashlib
import json
import logging
import re
from rdkit import Chem
from rdkit.Chem import EnumerateStereoisomers
from rdkit.Chem import rdDepictor
from rdkit.Chem import rdMolHash
ATOM_PROP_MAP_NUMBER = 'molAtomMapNumber'
logger = logging.getLogger(__name__)
rdDepictor.SetPreferCoordGen(True)
EXTRA_ISOTOPE_REMOVAL_REGEX = re.compile(r'\[[1-3]0*([1-9]?[0-9]*[A-Z][a-z]?)@')
ENHANCED_STEREO_GROUP_REGEX = re.compile(r'((?:a|[&o]\d+):\d+(?:,\d+)*)')
ENHANCED_STEREO_GROUP_WEIGHTS = {
Chem.StereoGroupType.STEREO_AND: 1000,
Chem.StereoGroupType.STEREO_OR: 2000,
Chem.StereoGroupType.STEREO_ABSOLUTE: 3000,
}
[docs]class HashLayer(enum.Enum):
CANONICAL_SMILES = enum.auto()
ENHANCED_STEREO_LABEL = enum.auto()
ESCAPE = enum.auto()
FORMULA = enum.auto()
NO_STEREO_SMILES = enum.auto()
NO_STEREO_TAUTOMER_HASH = enum.auto()
SGROUP_DATA = enum.auto()
TAUTOMER_HASH = enum.auto()
[docs]@enum.unique
class HashScheme(enum.Enum):
ALL_LAYERS = tuple(HashLayer)
NO_STEREO_LAYERS = (HashLayer.ESCAPE, HashLayer.FORMULA,
HashLayer.NO_STEREO_SMILES,
HashLayer.NO_STEREO_TAUTOMER_HASH)
TAUTOMERIC_LAYERS = (HashLayer.ESCAPE, HashLayer.FORMULA,
HashLayer.NO_STEREO_TAUTOMER_HASH,
HashLayer.TAUTOMER_HASH)
[docs]def get_molhash(all_layers, hash_scheme: HashScheme = HashScheme.ALL_LAYERS):
"""
Generates a molecular hash using a specified set of layers.
See https://docs.google.com/document/d/13MLGoCPAjD6VhytcJp-F9MHhiEdmNmpjG5blraAtI8s/edit?usp=sharing
for explanation the layer idea.
:param mol: the molecule to generate the hash for
:param hash_scheme: a tuple containing the information layers to be considered for the hash
"""
h = hashlib.sha1()
for layer in hash_scheme.value:
h.update(all_layers[layer].encode())
return h.hexdigest()
[docs]def get_mol_layers(original_molecule, sgroup_data="", escape=""):
"""returns a NamedTuple with the layers for a molecule that might be useful in a hash
*NOTE* the enhanced_stereo_label layer produced by this code uses the atom numbers from the canonical_smiles layer.
It is not suitable for combining with the tautomer layer!
TODOs:
- The sgroup_data layer is just being treated as a string (i.e. not canonicalized) here
"""
# Work on a copy with all non-stereogenic Hydrogens removed
molecule = Chem.RemoveHs(original_molecule,
updateExplicitCount=True,
sanitize=False)
molecule.UpdatePropertyCache(False)
molecule = strip_atom_map_labels(molecule)
formula = rdMolHash.MolHash(molecule, rdMolHash.HashFunction.MolFormula)
cxsmiles, canonical_mol = canonicalize_stereo_groups(molecule)
tautomer_hash = get_stereo_tautomer_hash(canonical_mol)
canonical_smiles, enhanced_stereo_label = get_canonical_smiles(cxsmiles)
sgroup_data = canonicalize_sgroups(canonical_mol)
(no_stereo_tautomer_hash,
no_stereo_smiles) = get_no_stereo_layers(canonical_mol, canonical_smiles,
tautomer_hash)
return {
HashLayer.CANONICAL_SMILES: canonical_smiles,
HashLayer.ENHANCED_STEREO_LABEL: enhanced_stereo_label,
HashLayer.ESCAPE: escape,
HashLayer.FORMULA: formula,
HashLayer.NO_STEREO_SMILES: no_stereo_smiles,
HashLayer.NO_STEREO_TAUTOMER_HASH: no_stereo_tautomer_hash,
HashLayer.SGROUP_DATA: sgroup_data,
HashLayer.TAUTOMER_HASH: tautomer_hash,
}
[docs]def strip_atom_map_labels(molecule):
for at in molecule.GetAtoms():
try:
at.ClearProp(ATOM_PROP_MAP_NUMBER)
except KeyError:
pass
return molecule
[docs]def get_stereo_tautomer_hash(molecule):
# SHARED-7909: workaround for https://github.com/rdkit/rdkit/issues/4222
# This can be removed once we update to an RDKit version without this bug.
no_h_mol = Chem.RemoveAllHs(molecule, sanitize=False)
no_h_mol.UpdatePropertyCache(False)
return rdMolHash.MolHash(no_h_mol, rdMolHash.HashFunction.HetAtomTautomer)
[docs]def get_canonical_smiles(cxsmiles):
smiles_parts = (p.strip() for p in cxsmiles.split('|'))
smiles_parts = [p for p in smiles_parts if p]
if not smiles_parts:
return '', ''
elif len(smiles_parts) > 2:
raise ValueError('Unexpected number of fragments in canonical CXSMILES')
canonical_smiles = smiles_parts[0]
stereo_groups = ''
if len(smiles_parts) == 2:
# note: as with many regex-based things, this is fragile
groups = ENHANCED_STEREO_GROUP_REGEX.findall(smiles_parts[1])
if groups:
# We might have other CXSMILES extensions that aren't stereo groups
stereo_groups = f"|{','.join(sorted(groups))}|"
return canonical_smiles, stereo_groups
[docs]def get_no_stereo_layers(molecule, canonical_smiles, tautomer):
# SHARED-7909: Strip all Hs, including stereogenic ones.
no_stereo_mol = Chem.RemoveAllHs(molecule, sanitize=False)
no_stereo_mol.UpdatePropertyCache(False)
for atom in no_stereo_mol.GetAtoms():
if atom.GetChiralTag() != Chem.ChiralType.CHI_UNSPECIFIED:
atom.SetChiralTag(Chem.ChiralType.CHI_UNSPECIFIED)
for bond in no_stereo_mol.GetBonds():
if bond.GetStereo() != Chem.BondStereo.STEREONONE:
bond.SetStereo(Chem.BondStereo.STEREONONE)
if bond.GetBondDir() != Chem.BondDir.NONE:
bond.SetBondDir(Chem.BondDir.NONE)
no_stereo_tautomer_hash = rdMolHash.MolHash(
no_stereo_mol, rdMolHash.HashFunction.HetAtomTautomer)
no_stereo_smiles = rdMolHash.MolHash(no_stereo_mol,
rdMolHash.HashFunction.CanonicalSmiles)
return (no_stereo_tautomer_hash, no_stereo_smiles)
[docs]def get_canonical_atom_ranks_and_bonds(mol, useSmilesOrdering=True):
"""
returns a 2-tuple with:
1. the canonical ranks of a molecule's atoms
2. the bonds expressed as (canonical_atom_rank_1,canonical_atom_rank_2) where
canonical_atom_rank_1 < canonical_atom_rank_2
If useSmilesOrdering is True then the atom indices here correspond to the order of
the atoms in the canonical SMILES, otherwise just the canonical atom order is used.
useSmilesOrdering=True is a bit slower, but it allows the output to be linked to the
canonical SMILES, which can be useful.
"""
if mol.GetNumAtoms() == 0:
return [], []
if not useSmilesOrdering:
atRanks = list(Chem.CanonicalRankAtoms(mol))
else:
smi = Chem.MolToSmiles(mol)
ordertxt = mol.GetProp("_smilesAtomOutputOrder")
smiOrder = [int(x) for x in ordertxt[1:-1].split(",") if x]
atRanks = [0] * len(smiOrder)
for i, idx in enumerate(smiOrder):
atRanks[idx] = i
# get the bonds in canonical form
bndOrder = []
for bnd in mol.GetBonds():
bo = atRanks[bnd.GetBeginAtomIdx()]
eo = atRanks[bnd.GetEndAtomIdx()]
if bo > eo:
bo, eo = eo, bo
bndOrder.append((bo, eo))
return atRanks, bndOrder
[docs]def canonicalize_data_sgroup(sg,
atRanks,
bndOrder,
fieldNames=None,
sortAtomOrder=True):
"""
NOTES: if sortAtomOrder is true then the atom list will be sorted.
This assumes that the order of the atoms in that list is not important
"""
if sg.GetProp("TYPE") != "DAT" or not sg.HasProp("FIELDNAME"):
return None
canonicalizableFieldNames = fieldNames or ["Atrop"]
fieldName = sg.GetProp("FIELDNAME")
if fieldName not in canonicalizableFieldNames:
return None
data = sg.GetStringVectProp("DATAFIELDS")
if len(data) > 1:
raise ValueError(
"cannot canonicalize data groups with multiple data fields")
data = data[0]
ats = tuple(atRanks[x] for x in sg.GetAtoms())
if sortAtomOrder:
ats = tuple(sorted(ats))
bnds = tuple(bndOrder[x] for x in sg.GetBonds())
res = dict(fieldName=fieldName, atom=ats, bonds=bnds, value=data)
return res
[docs]def getCanonicalBondRep(bond, atomRanks):
aid1 = bond.GetBeginAtomIdx()
aid2 = bond.GetEndAtomIdx()
if atomRanks[aid1] > atomRanks[aid2] or (atomRanks[aid1] == atomRanks[aid2]
and aid1 > aid2):
aid1, aid2 = aid2, aid1
return (aid1, aid2)
[docs]def canonicalize_sru_sgroup(mol, sg, atRanks, bndOrder, sortAtomAndBondOrder):
"""
NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted.
This assumes that the ordering of those lists is not important
"""
if sg.GetProp("TYPE") != "SRU":
return None
ats = tuple(atRanks[x] for x in sg.GetAtoms())
bnds = tuple(bndOrder[x] for x in sg.GetBonds())
if sortAtomAndBondOrder:
ats = tuple(sorted(ats))
bnds = tuple(sorted(bnds))
props = sg.GetPropsAsDict()
res = dict(
type="SRU",
atoms=ats,
bonds=bnds,
index=props.get("index", 0),
connect=props.get("CONNECT", ""),
label=props.get("LABEL", ""),
)
if "PARENT" in props:
res["PARENT"] = props["PARENT"]
if "XBHEAD" in props:
xbhbonds = tuple(
getCanonicalBondRep(mol.GetBondWithIdx(x), atRanks)
for x in props["XBHEAD"])
if sortAtomAndBondOrder:
xbhbonds = tuple(sorted(xbhbonds))
res["XBHEAD"] = xbhbonds
if "XBCORR" in props:
xbcorrbonds = tuple(
getCanonicalBondRep(mol.GetBondWithIdx(x), atRanks)
for x in props["XBCORR"])
if len(xbcorrbonds) % 2:
raise ValueError("XBCORR should have 2N bonds")
if sortAtomAndBondOrder:
# these are pairs, so we need to sort them as such
tmp = []
for i in range(0, len(xbcorrbonds), 2):
b1, b2 = xbcorrbonds[i], xbcorrbonds[i + 1]
if b1 > b2:
b1, b2 = b2, b1
tmp.append((b1, b2))
xbcorrbonds = tuple(sorted(tmp))
res["XBCORR"] = xbcorrbonds
return res
[docs]def canonicalize_cop_sgroup(mol, sg, atRanks, bndOrder, sortAtomAndBondOrder):
"""
NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted.
This assumes that the ordering of those lists is not important
"""
if sg.GetProp("TYPE") != "COP":
return None
ats = tuple(atRanks[x] for x in sg.GetAtoms())
if sortAtomAndBondOrder:
ats = tuple(sorted(ats))
props = sg.GetPropsAsDict()
# we are intentionally not doing the label here. Marvin adds one, biovia draw does not
res = dict(type="COP", atoms=ats, index=props.get("index", 0))
return res
[docs]def canonicalize_sgroups(mol, dataFieldNames=None, sortAtomAndBondOrder=True):
"""
NOTES: if sortAtomAndBondOrder is true then the atom and bond lists will be sorted.
This assumes that the ordering of those lists is not important
"""
dataFieldNames = dataFieldNames or ["Atrop"]
atRanks, bndOrder = get_canonical_atom_ranks_and_bonds(mol)
res = []
for sg in Chem.GetMolSubstanceGroups(mol):
lres = None
if sg.GetProp("TYPE") == "DAT":
lres = canonicalize_data_sgroup(sg, atRanks, bndOrder,
dataFieldNames,
sortAtomAndBondOrder)
elif sg.GetProp("TYPE") == "SRU":
lres = canonicalize_sru_sgroup(mol, sg, atRanks, bndOrder,
sortAtomAndBondOrder)
elif sg.GetProp("TYPE") == "COP":
lres = canonicalize_cop_sgroup(mol, sg, atRanks, bndOrder,
sortAtomAndBondOrder)
if lres is not None:
res.append(lres)
if len(res) > 1:
# we need to sort them, but dicts don't let you do that directly:
tres = sorted(tuple(x.items()) for x in res)
res = tuple(dict(x) for x in tres)
# update "index" and "PARENT" props
idxmap = {}
for i, itm in enumerate(res):
if "index" in itm:
idxmap[itm["index"]] = i + 1
itm["index"] = i + 1
for i, itm in enumerate(res):
if "PARENT" in itm:
itm["PARENT"] = idxmap[itm["PARENT"]]
return json.dumps(res)
[docs]class EnhancedStereoUpdateMode(enum.Enum):
ADD_WEIGHTS = enum.auto()
REMOVE_WEIGHTS = enum.auto()
[docs]def update_enhanced_stereo_group_weights(mol, mode):
if mode == EnhancedStereoUpdateMode.ADD_WEIGHTS:
factor = 1
elif mode == EnhancedStereoUpdateMode.REMOVE_WEIGHTS:
factor = -1
else:
raise ValueError('Invalid Enhanced Stereo weight update mode')
isotopesModified = False
stgs = mol.GetStereoGroups()
for stg in stgs:
stgt = stg.GetGroupType()
weight = factor * ENHANCED_STEREO_GROUP_WEIGHTS[stgt]
for at in stg.GetAtoms():
# Make sure the isotope is reasonable and is not present in more than
# one stereo group, and we are not messing it up
isotope = at.GetIsotope()
if mode == EnhancedStereoUpdateMode.ADD_WEIGHTS and isotope >= 100:
raise ValueError(
f'Atom {at.GetIdx()} is an unusual isotope ({isotope})')
at.SetIsotope(isotope + weight)
isotopesModified = True
return mol, isotopesModified
[docs]def canonicalize_stereo_groups(mol):
"""
Returns canonical CXSmiles and the corresponding molecule with the
stereo groups canonicalized.
The RDKit canonicalization code does not currently take stereo groups into
account. We work around that by using EnumerateStereoisomers() to generate
all possible instances of the molecule's stereogroups and then lexically
compare the CXSMILES of those.
"""
if not len(mol.GetStereoGroups()):
return Chem.MolToCXSmiles(mol), mol
mol = Chem.Mol(mol)
# To solve the problem we add isotope tags to the atoms involved in stereo
# groups. These allow the canonicalization to tell the difference between
# AND and OR (and have the happy side effect of allowing the
# presence/absence of a stereo group to affect the canonicalization)
mol, isotopesModified = update_enhanced_stereo_group_weights(
mol, EnhancedStereoUpdateMode.ADD_WEIGHTS)
# We're going to be generating canonical SMILES here anyway, so skip the
# "unique" option for the sake of efficiency
opts = EnumerateStereoisomers.StereoEnumerationOptions()
opts.onlyStereoGroups = True
opts.unique = False
res = None
res_csmi = ''
for isomer in EnumerateStereoisomers.EnumerateStereoisomers(mol, opts):
# We need to generate the canonical CXSMILES for the molecule with
# the isotope tags.
csmi = Chem.MolToCXSmiles(isomer)
if res is None or csmi < res_csmi:
res = isomer
res_csmi = csmi
res_csmi = EXTRA_ISOTOPE_REMOVAL_REGEX.sub(r'[\1@', res_csmi)
if isotopesModified:
res, _ = update_enhanced_stereo_group_weights(
res, EnhancedStereoUpdateMode.REMOVE_WEIGHTS)
return res_csmi, res