Source code for schrodinger.livedesign.preprocessor

import argparse
import copy
import enum
import json
import math
import re
from collections import defaultdict
from io import BytesIO
from io import StringIO
from typing import NamedTuple
from typing import Optional
from typing import Tuple

import numpy as np
import rdkit
from rdkit import Chem
from rdkit import Geometry
from rdkit.Chem import SaltRemover
from rdkit.Chem import rdChemReactions
from rdkit.Chem import rdMolAlign
from rdkit.Chem import rdmolops
from rdkit.Chem.Descriptors import MolWt
# we can statically store the MolVS classes used in TAUTOMERIZE and NEUTRALIZE here
# so they are only created and initialized once:
from rdkit.Chem.MolStandardize import rdMolStandardize

from schrodinger.infra import structure as infrastructure
from schrodinger.job.driver_decorator import main_wrapper
from schrodinger.thirdparty import rdkit_adapter
from schrodinger.utils import log

try:
    from schrodinger.application.epik import epik_tautomer_canonicalizer
except ImportError as e:
    epik_tautomer_canonicalizer = e

logger = log.get_output_logger(__file__)

uncharger = rdMolStandardize.Uncharger(canonicalOrder=True)

# string constants for mol props
MOL_PROP_CHIRAL_FLAG = "_MolFileChiralFlag"
MOL_PROP_UNKNOWN_STEREO = "_UnknownStereo"
MOL_PROP_ON = 1
MOL_PROP_OFF = 0

STD_DEFAULT_SALTS = (
    "[Cl,Br,I]",
    "[Li,Na,K,Ca,Mg]",
    "[O,N]",
    "[N](=O)(O)O",
    "[P](=O)(O)(O)O",
    "[P](F)(F)(F)(F)(F)F",
    "[S](=O)(=O)(O)O",
    "[CH3][S](=O)(=O)(O)",
    # p-Toluene sulfonate
    "c1cc([CH3])ccc1[S](=O)(=O)(O)",
    # Acetic acid
    "[CH3]C(=O)O",
    # TFA
    "FC(F)(F)C(=O)O",
    # Fumarate/Maleate
    "OC(=O)C=CC(=O)O",
    # Oxalate
    "OC(=O)C(=O)O",
    # Tartrate
    "OC(=O)C(O)C(O)C(=O)O",
    # Dicylcohexylammonium
    "C1CCCCC1[NH]C1CCCCC1",
)

STD_TRANSFORMATIONS = (
    # Nitro
    "[#8:2]=[#7:1]=[#8:3]>>[#8-:2]-[#7+:1]=[#8:3]",
    # Nitroso
    "[#6:3]-[#7H2:1]=[#8:2]>>[#6:3]-[#7H2+:1]-[#8-:2]",
    # Phosphonium Ylide
    "[#6:3][P-:1]([#6:4])([#6:5])[#6+:2]>>[#6:3][P-0:1]([#6:4])([#6:5])=[#6+0:2]",
    # Sulfon
    "[#6:3][S;X3+0:1]([#6:4])=[#8-0:2]>>[#6:3][S+:1]([#6:4])-[#8-:2]",
    # Phosphonic
    "[#6:3][P+:1]([#8;X2:4])([#8;X2:5])[#8-:2]>>[#6:3][P+0:1]([#8:4])([#8:5])=[#8-0:2]",
    # Sulfoxide
    "[#6:3][S+:1]([#6:4])([#8-:2])=[O:5]>>[#6:3][S+0:1]([#6:4])(=[#8-0:2])=[O:5]",
    # Azide
    "[#7;A;X2-:1][N;X2+:2]#[N;X1:3]>>[#7-0:1]=[N+:2]=[#7-:3]",
    # Diazo
    "[#6;X3-:1][N;X2+:2]#[N;X1:3]>>[#6-0;A:1]=[N+:2]=[#7-:3]",
)


[docs]class ExplicitHydrogens(enum.Enum): REMOVE_ALL = enum.auto() KEEP_WEDGED = enum.auto() ADD_ALL = enum.auto() AS_IS = enum.auto()
[docs]class GenerateCoordinates(enum.Enum): NONE = enum.auto() FULL = enum.auto() FULL_ALIGNED = enum.auto()
[docs]class DoubleBondStereoStandard(enum.Enum): NONE = enum.auto() CROSSED = enum.auto() WIGGLY = enum.auto()
[docs]class RingRepresentation(enum.Enum): KEKULE = enum.auto() AROMATIC = enum.auto()
[docs]class RemoveSGroupData(enum.Enum): NONE = enum.auto() ALL = enum.auto() DAT_ONLY = enum.auto()
[docs]class ChiralFlagTreatment(enum.Enum): IGNORE = enum.auto() CLEAN = enum.auto() FORCE_ON = enum.auto() FORCE_OFF = enum.auto()
[docs]class PreprocessorOptions(NamedTuple): """ Options dictating preprocessor actions; all options default to no-ops unless otherwise specified. """ KEEP_ONLY_LARGEST_STRUCTURE: bool = True REMOVE_PROPERTIES: bool = False STRIP_SALTS: Tuple[str] = STD_DEFAULT_SALTS CLEAN_WEDGE_ORIENTATION: bool = True CHOOSE_CANONICAL_TAUTOMER: bool = False TRANSFORMATIONS: Tuple[str] = STD_TRANSFORMATIONS NEUTRALIZE: bool = True EXPLICIT_HYDROGENS: ExplicitHydrogens = ExplicitHydrogens.REMOVE_ALL GENERATE_COORDINATES: GenerateCoordinates = GenerateCoordinates.FULL_ALIGNED DOUBLE_BOND_STEREO_STANDARD: DoubleBondStereoStandard = DoubleBondStereoStandard.NONE CHIRAL_FLAG_TREATMENT: ChiralFlagTreatment = ChiralFlagTreatment.CLEAN HEAVY_HYDROGEN_DT: bool = False RING_REPRESENTATION: RingRepresentation = RingRepresentation.KEKULE GENERATE_V3K_SDF: bool = True REMOVE_SGROUP_DATA: RemoveSGroupData = RemoveSGroupData.NONE CLEAR_INVALID_WEDGE_BONDS: bool = True STRIP_STEREO_ABSOLUTE_GROUP: bool = True STRIP_AND_GROUPS_ON_SINGLE_ATOM: bool = True @staticmethod def _updateConfig(config: dict): """ :param dict: configuration to validate/update deprecated options """ # Validate/fix deprecated options key = "CORE_SUITE_NEUTRALIZE" if config.get(key): raise KeyError(f"{key} is not supported") config.pop(key, None) key = "FORCE_ABSOLUTE_STEREOCHEMISTRY" value = config.get(key) if value is not None: config["CHIRAL_FLAG_TREATMENT"] = "FORCE_ON" if value else "IGNORE" config.pop(key, None) # Update deprecated option formats key = "STRIP_SALTS" value = config.get(key) if isinstance(value, dict): config[key] = value["SALTS"].split() if value["STRIP"] else [] key = "TRANSFORMATIONS" value = config.get(key) if value and isinstance(next(iter(value), None), dict): config[key] = [transform["STRUCTURE"] for transform in value]
[docs] @staticmethod def fromConfig(config: dict): """ :param config: configuration from which to build options :raise KeyError: if an unknown key is present :raise ValueError: if an unknown value is present """ PreprocessorOptions._updateConfig(config) key_to_enum = { "EXPLICIT_HYDROGENS": ExplicitHydrogens, "GENERATE_COORDINATES": GenerateCoordinates, "DOUBLE_BOND_STEREO_STANDARD": DoubleBondStereoStandard, "RING_REPRESENTATION": RingRepresentation, "REMOVE_SGROUP_DATA": RemoveSGroupData, "CHIRAL_FLAG_TREATMENT": ChiralFlagTreatment, } config = copy.deepcopy(config) for key, value in config.items(): if key in key_to_enum: try: config[key] = key_to_enum[key][value] except KeyError: raise ValueError(f"{key} does not support {value}") try: return PreprocessorOptions(**config) except TypeError as e: raise KeyError(e)
[docs] def toConfig(self) -> dict: config = self._asdict() for key, value in config.items(): if isinstance(value, enum.Enum): config[key] = value.name return config
[docs]def atom_has_non_standard_query(atom) -> bool: """ Checks if the atom has a non-standard query feature like M which rdkit doesn't consider as a query :param atom: the atom to check :returns whether or not the atom has a non standard query feature """ return atom.HasProp("molFileAlias") and atom.GetProp("molFileAlias") == "M"
[docs]def s_group_has_non_standard_query(s_group) -> bool: """ Checks if the s-group has a non-standard query feature like M which rdkit doesn't consider as a query :param s_group: the s-group to check :returns whether or not the s-group has a non standard query feature """ return s_group.HasProp("LABEL") and s_group.GetProp("LABEL") == "M"
[docs]def is_queryatom_exception(atom): """ Normally we raise an exception if query atoms are in the molecule to be preprocessed. This function returns True for query atoms which are allowed. :param atom: the atom to check :returns whether or not the atom is allowed in the preprocessor """ if not atom.HasQuery(): return True # "*" atoms from mol blocks: if atom.HasQuery() and atom.DescribeQuery().strip() == "AtomNull": return True for bond in atom.GetBonds(): if bond.HasProp("_MolFileBondAttach"): # this indicates that the atom is actually an attachment point, # which are allowed even though we don't accept other # features return True return False
[docs]def coords_all_zero(conf): """ Returns whether or not all atom positions in a conformer are zero """ ps = conf.GetPositions() return np.alltrue(np.abs(ps) < 1e-4)
[docs]def setup_mol(mol): """ Setup on a molecule that is always done regardless of configuration. :param mol: An unsanitized RDKit Mol :returns: A partially sanitized RDKit mol, ready for the standardizer. """ Chem.SanitizeMol( mol, sanitizeOps=Chem.SANITIZE_SETAROMATICITY | Chem.SANITIZE_ADJUSTHS) mol, modified = handle_pentavalent_nitrogens(mol) mol.UpdatePropertyCache(strict=False) mol, modified = handle_aromatic_heteroatoms_with_attachment_points(mol) if mol.GetNumConformers() and coords_all_zero(mol.GetConformer()): Chem.AssignAtomChiralTagsFromMolParity(mol) elif mol.GetNumConformers() and mol.GetConformer().Is3D(): Chem.AssignStereochemistryFrom3D(mol) else: # make sure we set cis/trans on double bonds (was SS-30467) Chem.AssignStereochemistry(mol) # check for the presence of query features on atoms or bonds for atom in mol.GetAtoms(): if atom.HasQuery(): if is_queryatom_exception(atom): # special case for some query atoms continue raise ValueError( f"Query feature found on atom {atom.GetIdx()+1}. Molecules with query features cannot be preprocessed." ) if atom_has_non_standard_query(atom): raise ValueError( f"Non-standard Query feature found on atom {atom.GetIdx()+1}. Molecules with query features cannot be preprocessed." ) for bond in mol.GetBonds(): if bond.HasQuery(): raise ValueError( f"Query feature found on bond {bond.GetIdx()+1}. Molecules with query features cannot be preprocessed." ) for s_group in Chem.GetMolSubstanceGroups(mol): if s_group_has_non_standard_query(s_group): raise ValueError( "Non-standard Query feature found on s-group. Molecules with query features cannot be preprocessed." ) return mol
[docs]def do_final_corrections(molblock, generated_coordinates, v3000): # if we generated coordinates make sure FIELDDISP Sgroups are using relative coords if generated_coordinates and "FIELDDISP=" in molblock: # we can only correct V3000s here. That's ok since V2000 is only there for testing if v3000: # start by converting all FIELDISP lines to single lines # note that this regex is not robust with respect to all possible # pathological formulations. It handles what (at least) the RDKit, MarvinSketch, # and BioVia produce. In this case we know that the molblock we're working with # originated with the RDKit, so it'll be fine as long as other parts of the # preprocessor invoked after the call to convert_to_molblock() don't do violence # to the FIELDISP line. txt = re.sub( r'(FIELDDISP="[^"]*)-\nM V30 ([^"]*")', r"-\nM V30 \1\2 -\nM V30", molblock, ) # now do the substitution: we zero out the coordinates and mark them as relative - "DRU" instead of "DAU" # addendum post-testing: it looks like there are also occurencs of "DA " in the wild, so we also replace # "DA " with "DR ". omolblock = re.sub( r'M V30 FIELDDISP="[0-9\- .]*D[AR]([U ])', r'M V30 FIELDDISP=" 0.0 0.0 DR\1', txt, ) if omolblock != molblock: logger.debug("FINAL_CORRECTIONS") molblock = omolblock return molblock
[docs]def preprocess_molblock(molblock: str, config: Optional[dict] = None) -> str: """ Process molecule based on config objects NOTE: transforms are done before neutralization and tautomer canonicalization """ options = PreprocessorOptions.fromConfig(config) if config else None sio = BytesIO(molblock.encode()) suppl = Chem.ForwardSDMolSupplier( sio, sanitize=False, removeHs=False, strictParsing=False) mol = next(suppl) return preprocess(mol, options)
[docs]@rdkit_adapter.suppress_rdkit_log() def preprocess(mol: rdkit.Chem.Mol, options: Optional[PreprocessorOptions] = None) -> str: options = options or PreprocessorOptions() mol = setup_mol(mol) if options.STRIP_STEREO_ABSOLUTE_GROUP: mol = strip_stereo_abs(mol) if options.STRIP_AND_GROUPS_ON_SINGLE_ATOM: mol = strip_stereo_and(mol) # make a copy of where we started to use later orig_mol = Chem.Mol(mol) # clarification as to why we're removing Hs when ADD_ALL is set: # if we are going to be adding Hs later anyway, we might as # well remove them now in order to simplify the rest of the process # # In order to maintain the current Hs, EXPLICIT_HYDROGENS should be # set to "AS_IS" if options.EXPLICIT_HYDROGENS is ExplicitHydrogens.KEEP_WEDGED: mol = remove_explicit_hydrogens(mol, keep_wedged=True) elif options.EXPLICIT_HYDROGENS in (ExplicitHydrogens.REMOVE_ALL, ExplicitHydrogens.ADD_ALL): mol = remove_explicit_hydrogens(mol, keep_wedged=False) elif options.EXPLICIT_HYDROGENS is ExplicitHydrogens.AS_IS: # keep hydrogens as is, don't touch! pass if options.KEEP_ONLY_LARGEST_STRUCTURE: mol = remove_fragments(mol) coordgen_enabled = options.GENERATE_COORDINATES in ( GenerateCoordinates.FULL, GenerateCoordinates.FULL_ALIGNED) if coordgen_enabled: mol = clear_brackets_from_sgroups(mol) if options.REMOVE_PROPERTIES: mol = remove_properties(mol) if options.REMOVE_SGROUP_DATA is not RemoveSGroupData.NONE: mol = remove_sgroups(mol, which=options.REMOVE_SGROUP_DATA) if options.STRIP_SALTS: mol = strip_salts(mol, salt_list=options.STRIP_SALTS) for transformation in options.TRANSFORMATIONS: mol, _ = transform(mol, transformation) if options.NEUTRALIZE: mol = neutralize(mol, options.EXPLICIT_HYDROGENS is ExplicitHydrogens.AS_IS) if options.CHOOSE_CANONICAL_TAUTOMER: mol = generate_canonical_tautomer(mol) if options.EXPLICIT_HYDROGENS is ExplicitHydrogens.ADD_ALL: mol = add_explicit_hydrogens(mol) if options.CLEAN_WEDGE_ORIENTATION: # if we are going to be setting the wedge orientation, we need # to add chiral Hs now, before coordinates are generated. Then # we set the bond wedging after generating coordinates mol = add_chiral_hs(mol) else: mol = reapply_molblock_wedging(mol) if options.CLEAR_INVALID_WEDGE_BONDS: mol = clear_wedge_bonds_from_achiral_centers(mol) # Adjusting the stereochemistry flag should happen # after clear_wedge_bonds_from_achiral_centers(), since it does # recalculate stereo, and we might find/remove some chiral centers mol = chiral_flag_treatment(options.CHIRAL_FLAG_TREATMENT, orig_mol, mol) if coordgen_enabled: align = options.GENERATE_COORDINATES is GenerateCoordinates.FULL_ALIGNED mol = generate_coordinates(mol, align) if options.CLEAN_WEDGE_ORIENTATION: # now that coordinates are set, we need to assign bond wedging mol = wedge_clean(mol) if options.DOUBLE_BOND_STEREO_STANDARD is DoubleBondStereoStandard.CROSSED: remove_wiggly_bonds_around_double_bonds(mol) # FIXME: Below calls all operate on molblock text directly v3000 = options.GENERATE_V3K_SDF kekulize = options.RING_REPRESENTATION is RingRepresentation.KEKULE processed_molblock = convert_to_molblock(mol, v3000, kekulize) if options.HEAVY_HYDROGEN_DT: processed_molblock = convert_heavy_hydrogens(processed_molblock, v3000) # do this at the very end because we need the mol block and the molecule if not v3000 and options.DOUBLE_BOND_STEREO_STANDARD is not DoubleBondStereoStandard.CROSSED: raise ValueError( "V2000 output only supports DOUBLE_BOND_STEREO_STANDARD=CROSSED") processed_molblock = set_double_bond_stereo( mol, processed_molblock, options.DOUBLE_BOND_STEREO_STANDARD) processed_molblock = do_final_corrections(processed_molblock, coordgen_enabled, v3000) return processed_molblock
[docs]def handle_pentavalent_nitrogens(mol): """ Pentavalent nitrogens are usually the incorrect representation and will break most of the downstream functions. We will allow pentavalent nitrogens form for Nitro groups (-NO2), but other instances will be transformed to the charge separated form. """ modified = False if mol.HasSubstructMatch(Chem.MolFromSmarts("[#7;X3,X4]=O")): logger.debug( "Found a pentavalent nitrogen! Trying to TRANSFORM it away...") mol, tf_modified = transform(mol, "[n;H0+0:1]=[O+0:2]>>[n+1:1]-[O-1:2]") if tf_modified: modified = True mol, tf_modified = transform( mol, "[!O:3]-[N+0;H1,H0:1](=[O+0:2])-[!O:4]>>[*:3]-[N+1:1](-[O-1:2])-[*:4]", ) if tf_modified: modified = True return mol, modified
[docs]def handle_aromatic_heteroatoms_with_attachment_points(mol): """ Aromatic atoms with attachment points are impossible to kekulize, clear that up here by making it explicit that they have an implicit H attached (This was SS-31328) """ modified = False res = Chem.Mol(mol) for atom in res.GetAtoms(): if (atom.GetIsAromatic() and atom.GetAtomicNum() != 6 and atom.HasProp("molAttchpt")): modified = True atom.SetNumExplicitHs(1) atom.SetNoImplicit(True) return res, modified
[docs]def add_explicit_hydrogens(mol): orig_num_atoms = mol.GetNumAtoms() mol = rdmolops.AddHs(mol, explicitOnly=False, addCoords=True) if mol.GetNumAtoms() != orig_num_atoms: logger.debug("EXPLICIT_HYDROGENS_ADD_ALL") return mol
[docs]def remove_explicit_hydrogens(mol, keep_wedged=False): # we use this to flag Hs that should not be removed. # there's no reasonable scenario in which an H atom # in a molecule could have an isotope value of 10, so # this should be safe save_isotope = 10 ps = Chem.RemoveHsParameters() ps.removeWithWedgedBond = not keep_wedged # since we likely do not have information about wedging still on the bonds, the above check # may not save us. Let's explicitly protect Hs at the end of wedged bonds from # being removed: saved_hs = False if keep_wedged: for atom in mol.GetAtoms(): if atom.GetAtomicNum() == 1 and atom.GetIsotope() == 0: for bnd in atom.GetBonds(): # why this is as complex as it is: # Information about bond wedging is stored as a bond property after the CTAB # is parsed. Because the values have different meanings, the V2000 and V3000 # parsers use different property names. # The values here correspond to bonds which are wedged, hashed, or "squiggled" # above comment valid for at least the 2020.03 RDKit release if (bnd.HasProp("_MolFileBondStereo") and bnd.GetIntProp("_MolFileBondStereo") in (1, 6, 4) ) or (bnd.HasProp("_MolFileBondCfg") and bnd.GetUnsignedProp("_MolFileBondCfg") in (1, 2, 3)): atom.SetIsotope(save_isotope) saved_hs = True break ps.updateExplicitCount = True res = rdmolops.RemoveHs(mol, ps, sanitize=False) if res.GetNumAtoms() != res.GetNumAtoms(): logger.debug(f"EXPLICIT_HYDROGENS: keep_wedged={keep_wedged}") if saved_hs: for atom in res.GetAtoms(): if atom.GetAtomicNum() == 1 and atom.GetIsotope() == save_isotope: atom.SetIsotope(0) return res
[docs]def convert_to_molblock(mol, v3000, kekulize): if kekulize: for bond in mol.GetBonds(): if bond.GetIsAromatic(): logger.debug("RING_REPRESENTATION_KEKULE") break sio = StringIO() writer = Chem.SDWriter(sio) try: writer.SetKekulize(kekulize) writer.SetForceV3000(v3000) writer.write(mol) writer.flush() return sio.getvalue() finally: writer.close()
[docs]def convert_heavy_hydrogens(molblock, v3000): """ NOTE that this operates on a molblock, not a molecule The RDKit does not currently (v2020.03) support writing D or T to mol blocks, so we need to post-process the text. Fortunately it's an easy regex in v3000 mol blocks. This does not work with V2000 mol blocks, so we throw a ValueError there. This doesn't seem like a big deal since V2000 support is primarly being kept around for debugging purposes. If we need to eventually support V2000+HEAVY_HYDROGEN_DT, some not-completely-trivial code will need to be written. """ if not v3000: raise ValueError( "HEAVY_HYDROGEN_DT option only supported for V3000 output") d_expr = re.compile( r"^(M V30 [0-9]+) H (.+?) MASS=2([^\n]*)", flags=re.MULTILINE) out_molblock = d_expr.sub(r"\1 D \2\3", molblock) t_expr = re.compile( r"^(M V30 [0-9]+) H (.+?) MASS=3([^\n]*)", flags=re.MULTILINE) out_molblock = t_expr.sub(r"\1 T \2\3", out_molblock) if out_molblock != molblock: logger.debug("HEAVY_HYDROGEN_DT") return out_molblock
[docs]def neutralize(mol, checkForProblematicHs=False): res = uncharger.uncharge(mol) res.UpdatePropertyCache(strict=False) if res.GetNumAtoms() != mol.GetNumAtoms(): logger.debug("NEUTRALIZE") else: for orig_at, new_at in zip(mol.GetAtoms(), res.GetAtoms()): if orig_at.GetFormalCharge() != new_at.GetFormalCharge(): logger.debug("NEUTRALIZE") break if checkForProblematicHs: # NOTE the logic here really only applies to the organic subset. # it's not going to do well with other element types, but hopefully # no-one is doing neutralization on species with non-organic atoms # and EXPLICIT_HS set to "AS_IS" if res.GetNumAtoms() != mol.GetNumAtoms(): raise RuntimeError("uncharging changed the number of atoms") for i in range(res.GetNumAtoms()): oAt = mol.GetAtomWithIdx(i) oChg = oAt.GetFormalCharge() if oChg > 0: nAt = res.GetAtomWithIdx(i) nChg = nAt.GetFormalCharge() if nChg != oChg: # positive charges on atoms with Hs should have had an H removed, but # we can't currently do that if oAt.GetTotalNumHs( includeNeighbors=True) - oAt.GetTotalNumHs(): raise ValueError( "could not neutralize positive charged atom with explicit H neighbors" ) return res
[docs]def unicode_to_str(unicode_str): """ Takes a unicode object and converts it to a str (utf-8). If the arg is already a str, returns unicode_str (i.e. if run with python 3). Needed to support python 2/3 with unicode_literals. py2: type<unicode> -> type<str utf-8> py3: type<str utf-8> (no unicode type exists) :type unicode_str: unicode (py2) or str (py3) :param unicode_str: the unicode that potentially needs converting (i.e. if run with python 2) :return: str """ if isinstance(unicode_str, str): return unicode_str return unicode_str.encode("utf-8")
[docs]def transform(mol, transformation, maxTransformations=1000): """ apply the transformation to the molecule repeatedly until it no longer applies. the maxTransformations argument is just there to prevent us from ending up in an infinite loop due to a bogus transformation """ modified = False smarts = unicode_to_str(transformation) rxn = rdChemReactions.ReactionFromSmarts(smarts) for _ in range(maxTransformations): output = rxn.RunReactants((mol,)) if output: output[0][0].UpdatePropertyCache(strict=False) mol = output[0][0] modified = True else: # if we get here, the reaction failed because the substructure to be TRANSFORMed # was not present in our *mol*. We return the original *mol* object unchanged. return mol, modified if modified: logger.debug(f"TRANSFORM: {transformation}") return mol, modified
[docs]def in_xy_plane(mol): if mol.GetNumConformers() == 0: return False return all(xyz[2] == 0 for xyz in mol.GetConformer().GetPositions())
[docs]def generate_coordinates(mol, align=False): if mol.GetNumAtoms() == 1: # we don't really need to deal with coordgen if there's only a single atom mol.GetConformer().SetAtomPosition(0, Geometry.Point3D(0, 0, 0)) mol.GetConformer().Set3D(False) return mol # keep original mol to align with original_mol = Chem.Mol(mol) Chem.rdCoordGen.AddCoords(mol) # align input mol and mol with new coords if align and in_xy_plane(mol) and in_xy_plane(original_mol): rdMolAlign.AlignMol(mol, original_mol) logger.debug("GENERATE_COORDINATES") return mol
[docs]def generate_canonical_tautomer(mol): orig_bonds = [ (bond.GetBondType(), bond.GetIsAromatic()) for bond in mol.GetBonds() ] st = rdkit_adapter.from_rdkit( mol, include_properties=False, include_stereo=False) st = epik_tautomer_canonicalizer.TautomerCanonicalizer().apply(st)[0] mol = rdkit_adapter.copy_lewis_structure_and_hydrogens(st, mol) new_bonds = [ (bond.GetBondType(), bond.GetIsAromatic()) for bond in mol.GetBonds() ] if new_bonds != orig_bonds: logger.debug("CHOOSE_CANONICAL_TAUTOMER") return mol
[docs]def clear_wedge_bonds_from_achiral_centers(mol): modified = False new_mol = Chem.Mol(mol) Chem.AssignStereochemistry(new_mol, cleanIt=True, force=True) for orig_at, new_at in zip(mol.GetAtoms(), new_mol.GetAtoms()): if orig_at.GetChiralTag() != new_at.GetChiralTag(): modified = True break if not modified: for orig_bnd, new_bnd in zip(mol.GetBonds(), new_mol.GetBonds()): if (orig_bnd.GetStereo() != new_bnd.GetStereo() or orig_bnd.GetBondDir() != new_bnd.GetBondDir()): modified = True break if modified: logger.debug("CLEAR_INVALID_WEDGE_BONDS") return new_mol
def _get_chiral_flag(mol): try: return mol.GetUnsignedProp(MOL_PROP_CHIRAL_FLAG) except KeyError: return MOL_PROP_OFF
[docs]def chiral_flag_treatment(adj_mode, src_mol, mol): if adj_mode == ChiralFlagTreatment.IGNORE: chiral_flag = _get_chiral_flag(src_mol) elif adj_mode == ChiralFlagTreatment.FORCE_OFF: chiral_flag = MOL_PROP_OFF elif adj_mode == ChiralFlagTreatment.FORCE_ON: chiral_flag = MOL_PROP_ON else: has_chiral_atoms = any( atom.GetChiralTag() != Chem.ChiralType.CHI_UNSPECIFIED for atom in mol.GetAtoms()) has_enhanced_stereo = len(mol.GetStereoGroups()) != 0 if has_chiral_atoms and not has_enhanced_stereo: chiral_flag = MOL_PROP_ON else: chiral_flag = MOL_PROP_OFF mol.SetUnsignedProp(MOL_PROP_CHIRAL_FLAG, chiral_flag) logger.debug(f"CHIRAL_FLAG_TREATMENT[{adj_mode.name}]: {chiral_flag}") return mol
[docs]def strip_stereo_abs(input_mol): """ Removes any Stereo ABS group :param input_mol: The original molecule to consider :return: post-processed molecule, if the input molecule was modified """ mol = Chem.RWMol(input_mol) keep = [] for stereo_group in mol.GetStereoGroups(): if stereo_group.GetGroupType() != Chem.StereoGroupType.STEREO_ABSOLUTE: keep.append(stereo_group) else: logger.debug("STRIP_STEREO_ABSOLUTE_GROUP") mol.SetStereoGroups(keep) return Chem.Mol(mol)
[docs]def strip_stereo_and(input_mol): """ Removes any Stereo AND groups with only one center and flattens the bonds around it :param input_mol: The original molecule to consider :return: post-processed molecule, if the input molecule was modified """ mol = Chem.RWMol(input_mol) keep = [] for stereo_group in mol.GetStereoGroups(): if stereo_group.GetGroupType() != Chem.StereoGroupType.STEREO_AND: keep.append(stereo_group) elif len(stereo_group.GetAtoms()) > 1: keep.append(stereo_group) else: logger.debug("STRIP_AND_GROUPS_ON_SINGLE_ATOM") # flatten the atom, At this point the stereo_group only has one atom atom = stereo_group.GetAtoms()[0] atom.SetChiralTag(Chem.ChiralType.CHI_UNSPECIFIED) # make sure that all bond wedging information has been removed around that atom too: for bond in atom.GetBonds(): if bond.GetBeginAtomIdx() == atom.GetIdx(): bond.ClearProp("_MolFileBondStereo" ) # what we'd get from a V2000 mol file bond.ClearProp("_MolFileBondCfg" ) # what we'd get from a V3000 mol file mol.SetStereoGroups(keep) return Chem.Mol(mol)
[docs]def frag_is_smaller(atoms, largest_atoms, weight, largest_weight, smiles, largest_smiles): """ A fragment is considered larger if its atoms/weight are larger, the length of the smiles string is larger, or the smiles string is lexicographically *smaller* if they are equal length. ie, 'AAA' is larger than 'AAB'.. hence the final smiles > largest_smiles check here to reject """ if atoms < largest_atoms: return True if atoms > largest_atoms: return False if weight < largest_weight: return True if weight > largest_weight: return False if len(smiles) < len(largest_smiles): return True if len(smiles) > len(largest_smiles): return False return smiles > largest_smiles
[docs]def connect_variable_attachment_points(mol): """ forms zero-order bonds between one of the atoms of a bond with an ATTACH property to the "main" molecule in order to have the molecule+variable attachment point treated as a single fragment returns a 2-tuple with: 1. the modified molecule 2. whether or not the molecule was modified """ bonds_added = False res = None for bond in list(mol.GetBonds()): if bond.HasProp("_MolFileBondAttach") and bond.HasProp( "_MolFileBondEndPts"): # these should look like: ENDPTS=(3 4 6 5) endp_txt = bond.GetProp("_MolFileBondEndPts") # do a bit of validation that the format is correct: if endp_txt[0] != "(" or endp_txt[-1] != ")": logger.warning("invalid ENDPTS format ignored") continue endps = [int(x) for x in endp_txt[1:-1].split(" ")] # do a bit of validation that the format is correct: if len(endps) != endps[0] + 1 or endps[0] == 1: logger.warning("invalid ENDPTS format ignored") continue batom_idx = bond.GetBeginAtomIdx() oatom_idx = endps[1] - 1 if res is None: res = Chem.RWMol(mol) if res.GetBondBetweenAtoms(batom_idx, oatom_idx): logger.warning("variable attachment point already connected") continue res.AddBond(batom_idx, oatom_idx, Chem.BondType.ZERO) # set a property on the bond so that we can find it again: res.GetBondBetweenAtoms(batom_idx, oatom_idx).SetIntProp( "variable_attach_bond", 1) bonds_added = True if res is None: res = mol return res, bonds_added
[docs]def remove_fragments(mol): """ Fragments are not removed if the molecule contains any SGroups which are associated with polymers Use the following criteria to remove unwanted fragements from *mol*: 1. keep only the fragment which has the most number of atoms 2. break ties by keeping only fragments with the greatest molecular weight 3. break ties with the longest smiles string 4. break additional ties by keeping the fragment with the earliest alpha sorted SMILES string If two or more identical fragments remain after 1-4, we will throw a fatal error. """ sgs = Chem.GetMolSubstanceGroups(mol) for sg in sgs: props = sg.GetPropsAsDict() if "TYPE" in props and props["TYPE"] in ( "MUL", "SRU", "MON", "COP", "CRO", "GRA", "MIX", "FOR", "MOD", ): logger.debug( "Molecule has polymeric SGroups, skipping remove_fragments.") return mol nmol, bonds_added = connect_variable_attachment_points(mol) frags = Chem.GetMolFrags(nmol, asMols=True, sanitizeFrags=False) if len(frags) == 1: return mol largest_frag = None largest_atoms = math.inf largest_weight = math.inf largest_smiles = None duplicates = 0 for frag in frags: [atoms, weight, smiles] = [ frag.GetNumHeavyAtoms(), MolWt(frag), Chem.MolToSmiles(frag), ] if largest_frag is not None: if frag_is_smaller(atoms, largest_atoms, weight, largest_weight, smiles, largest_smiles): continue # If all properties are equivalent, we have a *potential* duplicate of the largest known fragment # but if they are not, we've discovered a new, bigger fragment, and may discard any known dupes if (atoms == largest_atoms and weight == largest_weight and smiles == largest_smiles): duplicates += 1 else: duplicates = 0 largest_frag = frag largest_atoms = atoms largest_weight = weight largest_smiles = smiles else: largest_frag = frag largest_atoms = atoms largest_weight = weight largest_smiles = smiles if duplicates: message = ( "At the end of REMOVE_FRAGMENTS, we had {duplicates} identical " "fragments left over, returning only one.") logger.debug(message.format(duplicates=duplicates)) if bonds_added: largest_frag = Chem.RWMol(largest_frag) bnds = list(largest_frag.GetBonds()) for bnd in bnds: if bnd.HasProp("variable_attach_bond"): largest_frag.RemoveBond(bnd.GetBeginAtomIdx(), bnd.GetEndAtomIdx()) logger.debug("KEEP_ONLY_LARGEST_STRUCTURE") return largest_frag
[docs]def clear_brackets_from_sgroups(mol): """ Removes brackets from any s-groups """ sgs = Chem.GetMolSubstanceGroups(mol) Chem.ClearMolSubstanceGroups(mol) for sg in sgs: sg.ClearBrackets() logger.debug("CLEAR_BRACKETS_FROM_SGROUPS") Chem.AddMolSubstanceGroup(mol, sg) return mol
[docs]def remove_properties(mol): for prop_name in mol.GetPropNames(): mol.ClearProp(prop_name) logger.debug("REMOVE_PROPERTIES") return mol
[docs]def remove_sgroups(mol, which): res = Chem.Mol(mol) if which is RemoveSGroupData.NONE: return res Chem.ClearMolSubstanceGroups(res) if which is RemoveSGroupData.ALL: if len(Chem.GetMolSubstanceGroups(mol)) != 0: logger.debug("REMOVE_SGROUP_DATA") return res if which is RemoveSGroupData.DAT_ONLY: for sg in Chem.GetMolSubstanceGroups(mol): if sg.GetProp("TYPE") != "DAT": Chem.AddMolSubstanceGroup(res, sg) if len(Chem.GetMolSubstanceGroups(mol)) != len( Chem.GetMolSubstanceGroups(res)): logger.debug("REMOVE_SGROUP_DATA") return res # we shouldn't get here assert False
[docs]def strip_salts(mol, salt_list): salt_list_str = "\n".join(salt_list) remover = SaltRemover.SaltRemover(defnData=salt_list_str) res = remover.StripMol(mol) if res.GetNumAtoms() != mol.GetNumAtoms(): logger.debug("STRIP_SALTS") return res
[docs]def add_chiral_hs(mol): # need to filter chiral_atoms to get only atoms with endocyclic stereobonds # since we don't want to add hydrogens to chiral atoms that already have # exocyclic stereobonds chiral_atoms = [center[0] for center in Chem.FindMolChiralCenters(mol)] chiral_atoms_needing_hydrogens = [] for atom_idx in chiral_atoms: atom = mol.GetAtomWithIdx(atom_idx) # only mark this for adding hydrogens if there are no exocyclic bonds add_atom = all(bond.IsInRing() for bond in atom.GetBonds()) if add_atom: chiral_atoms_needing_hydrogens.append(atom_idx) # return mol if there are no chiral atoms to be adjusted if not chiral_atoms_needing_hydrogens: return mol res = Chem.AddHs( mol, explicitOnly=False, addCoords=True, onlyOnAtoms=chiral_atoms_needing_hydrogens, ) if res.GetNumAtoms() != mol.GetNumAtoms(): logger.debug("ADD_CHIRAL_HS") return res
[docs]def wedge_clean(mol): # resetting the BondDir and UnknownStereo setting on each bond # so WedgeMolBonds can do its magic for bond in mol.GetBonds(): if bond.GetBondType() == Chem.BondType.SINGLE and bond.IsInRing(): if bond.GetBondDir() == Chem.BondDir.UNKNOWN: bond.SetIntProp(MOL_PROP_UNKNOWN_STEREO, MOL_PROP_ON) bond.SetBondDir(Chem.BondDir.NONE) # bonds which were explicitly marked as unknown: if (bond.HasProp("_MolFileBondStereo") and bond.GetProp("_MolFileBondStereo") == "4") or ( bond.HasProp("_MolFileBondCfg") and bond.GetProp("_MolFileBondCfg") == "2"): # ensure that the bond doesn't start at an atom with an specified double # bond disqualify = False for obond in bond.GetBeginAtom().GetBonds(): if (obond.GetBondType() == Chem.BondType.DOUBLE and obond.GetStereo() != Chem.BondStereo.STEREONONE): disqualify = True break if not disqualify: bond.SetBondDir(Chem.BondDir.UNKNOWN) # NOTE: the logic here of when to indicate that the molecule has been modified is not # 100% obvious orig_dirs = [x.GetBondDir() for x in mol.GetBonds()] Chem.WedgeMolBonds(mol, mol.GetConformer(0)) new_dirs = [x.GetBondDir() for x in mol.GetBonds()] if orig_dirs != new_dirs: logger.debug("CLEAN_WEDGE_ORIENTATION") return mol
[docs]def reapply_molblock_wedging(mol): for bnd in mol.GetBonds(): # only change the wedging if the bond was wedged in the input data (we # recognize that by looking for the properties the mol file parser # sets): if bnd.HasProp("_MolFileBondStereo"): # V2000 val = bnd.GetProp("_MolFileBondStereo") if val == "1": bnd.SetBondDir(Chem.BondDir.BEGINWEDGE) elif val == "6": bnd.SetBondDir(Chem.BondDir.BEGINDASH) elif val == "4": bnd.SetBondDir(Chem.BondDir.UNKNOWN) elif bnd.HasProp("_MolFileBondCfg"): # V3000 val = bnd.GetProp("_MolFileBondCfg") if val == "1": bnd.SetBondDir(Chem.BondDir.BEGINWEDGE) elif val == "3": bnd.SetBondDir(Chem.BondDir.BEGINDASH) elif val == "2": bnd.SetBondDir(Chem.BondDir.UNKNOWN) elif bnd.GetBondDir() in (Chem.BondDir.BEGINDASH, Chem.BondDir.BEGINWEDGE, Chem.BondDir.UNKNOWN): bnd.SetBondDir(Chem.BondDir.NONE) return mol
[docs]def remove_wiggly_bonds_around_double_bonds(mol): # We just need to make sure that there any wiggly bonds starting at atoms # involved in double bonds don't end up in the output for bond in mol.GetBonds(): if bond.GetBondType() == Chem.BondType.SINGLE and ( (bond.HasProp("_MolFileBondStereo") and bond.GetProp("_MolFileBondStereo") == "4") or (bond.HasProp("_MolFileBondCfg") and bond.GetProp("_MolFileBondCfg") == "2")): # it's a wiggly bond. Check to see if it starts at an atom with a double bond startsAtDouble = False for obond in bond.GetBeginAtom().GetBonds(): if (obond.GetBondType() == Chem.BondType.DOUBLE and obond.GetStereo() != Chem.BondStereo.STEREONONE): startsAtDouble = True break if startsAtDouble: if bond.HasProp("_MolFileBondStereo"): bond.ClearProp("_MolFileBondStereo") if bond.HasProp("_MolFileBondCfg"): bond.ClearProp("_MolFileBondCfg") bond.SetBondDir(Chem.BondDir.NONE) logger.debug("DOUBLE_BOND_STEREO_STANDARD_CROSSED")
[docs]def set_double_bond_stereo(mol, mol_block, bond_type): # as of the 2020.03 release cycle the RDKit will always write double bonds with # unknown stereochemistry as crossed bonds. This is done at the C++ level and # can't be modified from Python. We work around that here by doing regex changes # in the output mol_block. # some notes on this: # - This is a problem which would ideally be solved in the core RDKit, with a # real implementation connected directly to the generation of mol blocks. # Until that's there we're kind of stuck with this hacky (though hopefully correct) # solution. # - It is, unsurprisingly, fragile. It's quite easy to construct examples # where a double bond has more than one wiggly bond to it. This is ugly, # but not wrong. # Explanation of the constants being used, this is taken from the Biovia CTAB documentation: # http://help.accelrysonline.com/ulm/onelab/1.0/content/ulm_pdfs/direct/reference/ctfileformats2016.pdf # # Marking a double bond as CROSSED to indicate unknown stereochemistry is done # by setting CFG=2 on the double bond in V3000 # the RDKit parses this field into the property "_MolFileBondCfg" # in V2000 it's by setting 3 in the "bond stereo" field # the RDKit parses this field into the property "_MolFileBondStereo" # # Marking a single bond as wiggly to indicate unknown stereochemistry on a neighboring # double bond or chiral center is done by setting CFG=2 on the single bond in V3000 # the RDKit parses this field into the property "_MolFileBondCfg" # in V2000 it's by setting 4 in the "bond stereo" field # the RDKit parses this field into the property "_MolFileBondStereo" out_mol_block = mol_block if bond_type is DoubleBondStereoStandard.CROSSED: # this is what the RDKit does by default. pass elif bond_type is DoubleBondStereoStandard.WIGGLY: # we need to create one wiggly bond per double bond. Do the one that is shared # with the least number of other double bonds # start by identifying bonds next to double bonds that have stereo specified: next_to_specified_double_bond = [0] * mol.GetNumBonds() for bond in mol.GetBonds(): if bond.GetBondType() == Chem.BondType.DOUBLE and bond.GetStereo( ) not in ( Chem.rdchem.BondStereo.STEREOANY, Chem.rdchem.BondStereo.STEREONONE, ): # mark all of our neighbors: begin_atom = bond.GetBeginAtom() end_atom = bond.GetEndAtom() for contiguous_bond in begin_atom.GetBonds( ) + end_atom.GetBonds(): next_to_specified_double_bond[contiguous_bond.GetIdx()] = 1 single_bond_scores = [0] * mol.GetNumBonds() unknown_double_bond_nbrs = defaultdict(list) for bond in mol.GetBonds(): if (bond.GetBondType() == Chem.BondType.DOUBLE and bond.GetStereo() == Chem.rdchem.BondStereo.STEREOANY): # increment the counts of our eligible neighbors: begin_atom = bond.GetBeginAtom() end_atom = bond.GetEndAtom() for contiguous_bond in begin_atom.GetBonds( ) + end_atom.GetBonds(): if (not next_to_specified_double_bond[contiguous_bond. GetIdx()] and contiguous_bond.GetBondType() == Chem.BondType.SINGLE and contiguous_bond.GetBondDir() == Chem.BondDir.NONE): single_bond_scores[contiguous_bond.GetIdx()] += 1 unknown_double_bond_nbrs[bond.GetIdx()].append( contiguous_bond.GetIdx()) for dbl_bnd_idx in unknown_double_bond_nbrs: nbr_scores = sorted( [(single_bond_scores[x], x) for x in unknown_double_bond_nbrs[dbl_bnd_idx]]) if not nbr_scores: logger.warning( f"cannot find a single bond to mark as WIGGLY for double bond {bond.GetIdx()+1}" ) continue single_bond_to_mark = nbr_scores[0][-1] # now we know which double bond and which single bond we need to change. # go ahead and change them in the mol block # we replace the crossed double bond, which looks like this: # M V30 3 2 1 2 CFG=2 # with its uncrossed form: # M V30 3 2 1 2 # and we make the single bond wiggly, which is converting it from this: # M V30 3 1 1 3 # to this: # M V30 3 1 1 3 CFG=2 # first do the double bond: out_mol_block = re.sub( r"(M V30 %d 2 [0-9]+ [0-9]+) CFG=2" % (dbl_bnd_idx + 1), r"\1", out_mol_block, ) # now the single bond out_mol_block = re.sub( r"(M V30 %d 1 [0-9]+ [0-9]+)" % (single_bond_to_mark + 1), r"\1 CFG=2", out_mol_block, ) elif bond_type is DoubleBondStereoStandard.NONE: # go back to whatever we had before for bond in mol.GetBonds(): if (bond.GetBondType() == Chem.BondType.DOUBLE and bond.GetStereo() == Chem.rdchem.BondStereo.STEREOANY): # V3000 crossed bond cfg = 0 if bond.HasProp("_MolFileBondCfg"): cfg = bond.GetUnsignedProp("_MolFileBondCfg") elif bond.HasProp("_MolFileBondStereo"): cfg = bond.GetUnsignedProp("_MolFileBondStereo") # translate v2000->v3000 if cfg == 3: cfg = 2 else: cfg = 0 if cfg == 2: # it was originally crossed, it will still be crossed, no need to do anything pass else: reset_dbl_bond = False # and find neighboring wiggly single bonds begin_atom = bond.GetBeginAtom() end_atom = bond.GetEndAtom() for contiguous_bond in begin_atom.GetBonds( ) + end_atom.GetBonds(): if (contiguous_bond == bond or contiguous_bond.GetBondType() != Chem.BondType.SINGLE): continue cfg = 0 if contiguous_bond.HasProp("_MolFileBondCfg"): cfg = contiguous_bond.GetUnsignedProp( "_MolFileBondCfg") elif contiguous_bond.HasProp("_MolFileBondStereo"): cfg = contiguous_bond.GetUnsignedProp( "_MolFileBondStereo") # translate v2000->v3000 if cfg == 4: cfg = 2 else: cfg = 0 if cfg: bnd_idx = contiguous_bond.GetIdx() if (re.search( r"M V30 %d 1 [0-9]+ [0-9]+ CFG=" % (bnd_idx + 1), mol_block, ) is None): out_mol_block = re.sub( r"(M V30 %d 1 [0-9]+ [0-9]+)" % (bnd_idx + 1), r"\1 CFG=2", out_mol_block, ) reset_dbl_bond = True else: # if we somehow already ended up with a configuration (perhaps because of wedging elsewhere) # generate a warning, but don't mess with it: logger.warning( f"unable to set single bond {bnd_idx+1} to be a wiggly bond." ) if reset_dbl_bond: # uncross the double bond: dbl_bnd_idx = bond.GetIdx() out_mol_block = re.sub( r"(M V30 %d 2 [0-9]+ [0-9]+) CFG=2" % (dbl_bnd_idx + 1), r"\1", out_mol_block, ) else: logger.warning( f"unable to write double bond {bond.GetIdx()+1} without a crossed bond." ) else: raise ValueError( f"Unrecognized value for DOUBLE_BOND_STEREO_STANDARD {bond_type}") if out_mol_block != mol_block: logger.debug(f"DOUBLE_BOND_STEREO_STANDARD_{bond_type}") return out_mol_block
[docs]@main_wrapper("LiveDesign Structure Preprocessor") def main(argv=None): """ Function to run preprocessor directly from the command line. """ parser = argparse.ArgumentParser( description="Run the preprocessor on an SD file") parser.add_argument("input", help="input SDF") parser.add_argument("-output", default="out.sdf", help="output SDF") parser.add_argument("-config", help="JSON config file") parser.add_argument("-verbose", action="store_true", help="Verbose logging") args = parser.parse_args(argv) logger.setLevel(log.logging.DEBUG if args.verbose else log.logging.INFO) config_json = None if args.config: with open(args.config) as fh: config_json = json.load(fh) logger.debug(f"Preprocesor Configuration: {config_json}") with infrastructure.TextBlockReader(args.input) as reader, \ infrastructure.TextBlockWriter(args.output) as writer: for in_txt in reader: in_txt = in_txt.decode() try: out_txt = preprocess_molblock(in_txt, config_json) writer.append(out_txt.encode()) except Exception as exc: logger.info("Failed to process structure...") logger.info(f"{in_txt}") logger.info("ERROR:") logger.exception(f"{exc}") logger.info(log.SINGLE_LINE)
if __name__ == "__main__": main()