Source code for schrodinger.livedesign.preprocessor

import argparse
import copy
import enum
import json
import math
import re
from collections import defaultdict
from io import BytesIO
from io import StringIO
from typing import NamedTuple
from typing import Optional
from typing import Tuple

import numpy as np
import rdkit
from rdkit import Chem
from rdkit import Geometry
from rdkit.Chem import SaltRemover
from rdkit.Chem import rdChemReactions
from rdkit.Chem import rdMolAlign
from rdkit.Chem import rdmolops
from rdkit.Chem.Descriptors import MolWt
# we can statically store the MolVS classes used in TAUTOMERIZE and NEUTRALIZE here
# so they are only created and initialized once:
from rdkit.Chem.MolStandardize import rdMolStandardize

from schrodinger.infra import structure as infrastructure
from schrodinger.job.driver_decorator import main_wrapper
from schrodinger.thirdparty import rdkit_adapter
from schrodinger.utils import log

try:
    from schrodinger.application.epik import epik_tautomer_canonicalizer
except ImportError as e:
    epik_tautomer_canonicalizer = e

logger = log.get_output_logger(__file__)

uncharger = rdMolStandardize.Uncharger(canonicalOrder=True)

# string constants for mol props
MOL_PROP_CHIRAL_FLAG = "_MolFileChiralFlag"
MOL_PROP_UNKNOWN_STEREO = "_UnknownStereo"
MOL_PROP_ON = 1
MOL_PROP_OFF = 0

STD_DEFAULT_SALTS = (
    "[Cl,Br,I]",
    "[Li,Na,K,Ca,Mg]",
    "[O,N]",
    "[N](=O)(O)O",
    "[P](=O)(O)(O)O",
    "[P](F)(F)(F)(F)(F)F",
    "[S](=O)(=O)(O)O",
    "[CH3][S](=O)(=O)(O)",
    # p-Toluene sulfonate
    "c1cc([CH3])ccc1[S](=O)(=O)(O)",
    # Acetic acid
    "[CH3]C(=O)O",
    # TFA
    "FC(F)(F)C(=O)O",
    # Fumarate/Maleate
    "OC(=O)C=CC(=O)O",
    # Oxalate
    "OC(=O)C(=O)O",
    # Tartrate
    "OC(=O)C(O)C(O)C(=O)O",
    # Dicylcohexylammonium
    "C1CCCCC1[NH]C1CCCCC1",
)

STD_TRANSFORMATIONS = (
    # Nitro
    "[#8:2]=[#7:1]=[#8:3]>>[#8-:2]-[#7+:1]=[#8:3]",
    # Nitroso
    "[#6:3]-[#7H2:1]=[#8:2]>>[#6:3]-[#7H2+:1]-[#8-:2]",
    # Phosphonium Ylide
    "[#6:3][P-:1]([#6:4])([#6:5])[#6+:2]>>[#6:3][P-0:1]([#6:4])([#6:5])=[#6+0:2]",
    # Sulfon
    "[#6:3][S;X3+0:1]([#6:4])=[#8-0:2]>>[#6:3][S+:1]([#6:4])-[#8-:2]",
    # Phosphonic
    "[#6:3][P+:1]([#8;X2:4])([#8;X2:5])[#8-:2]>>[#6:3][P+0:1]([#8:4])([#8:5])=[#8-0:2]",
    # Sulfoxide
    "[#6:3][S+:1]([#6:4])([#8-:2])=[O:5]>>[#6:3][S+0:1]([#6:4])(=[#8-0:2])=[O:5]",
    # Azide
    "[#7;A;X2-:1][N;X2+:2]#[N;X1:3]>>[#7-0:1]=[N+:2]=[#7-:3]",
    # Diazo
    "[#6;X3-:1][N;X2+:2]#[N;X1:3]>>[#6-0;A:1]=[N+:2]=[#7-:3]",
)


[docs]class ExplicitHydrogens(enum.Enum):
    REMOVE_ALL = enum.auto()
    KEEP_WEDGED = enum.auto()
    ADD_ALL = enum.auto()
    AS_IS = enum.auto()


[docs]class GenerateCoordinates(enum.Enum):
    NONE = enum.auto()
    FULL = enum.auto()
    FULL_ALIGNED = enum.auto()


[docs]class DoubleBondStereoStandard(enum.Enum):
    NONE = enum.auto()
    CROSSED = enum.auto()
    WIGGLY = enum.auto()


[docs]class RingRepresentation(enum.Enum):
    KEKULE = enum.auto()
    AROMATIC = enum.auto()


[docs]class RemoveSGroupData(enum.Enum):
    NONE = enum.auto()
    ALL = enum.auto()
    DAT_ONLY = enum.auto()


[docs]class ChiralFlagTreatment(enum.Enum):
    IGNORE = enum.auto()
    CLEAN = enum.auto()
    FORCE_ON = enum.auto()
    FORCE_OFF = enum.auto()


[docs]class PreprocessorOptions(NamedTuple):
    """
    Options dictating preprocessor actions; all options default to no-ops unless
    otherwise specified.
    """
    KEEP_ONLY_LARGEST_STRUCTURE: bool = True
    REMOVE_PROPERTIES: bool = False
    STRIP_SALTS: Tuple[str] = STD_DEFAULT_SALTS
    CLEAN_WEDGE_ORIENTATION: bool = True
    CHOOSE_CANONICAL_TAUTOMER: bool = False
    TRANSFORMATIONS: Tuple[str] = STD_TRANSFORMATIONS
    NEUTRALIZE: bool = True
    EXPLICIT_HYDROGENS: ExplicitHydrogens = ExplicitHydrogens.REMOVE_ALL
    GENERATE_COORDINATES: GenerateCoordinates = GenerateCoordinates.FULL_ALIGNED
    DOUBLE_BOND_STEREO_STANDARD: DoubleBondStereoStandard = DoubleBondStereoStandard.NONE
    CHIRAL_FLAG_TREATMENT: ChiralFlagTreatment = ChiralFlagTreatment.CLEAN
    HEAVY_HYDROGEN_DT: bool = False
    RING_REPRESENTATION: RingRepresentation = RingRepresentation.KEKULE
    GENERATE_V3K_SDF: bool = True
    REMOVE_SGROUP_DATA: RemoveSGroupData = RemoveSGroupData.NONE
    CLEAR_INVALID_WEDGE_BONDS: bool = True
    STRIP_STEREO_ABSOLUTE_GROUP: bool = True
    STRIP_AND_GROUPS_ON_SINGLE_ATOM: bool = True

    @staticmethod
    def _updateConfig(config: dict):
        """
        :param dict: configuration to validate/update deprecated options
        """

        # Validate/fix deprecated options
        key = "CORE_SUITE_NEUTRALIZE"
        if config.get(key):
            raise KeyError(f"{key} is not supported")
        config.pop(key, None)

        key = "FORCE_ABSOLUTE_STEREOCHEMISTRY"
        value = config.get(key)
        if value is not None:
            config["CHIRAL_FLAG_TREATMENT"] = "FORCE_ON" if value else "IGNORE"
        config.pop(key, None)

        # Update deprecated option formats
        key = "STRIP_SALTS"
        value = config.get(key)
        if isinstance(value, dict):
            config[key] = value["SALTS"].split() if value["STRIP"] else []

        key = "TRANSFORMATIONS"
        value = config.get(key)
        if value and isinstance(next(iter(value), None), dict):
            config[key] = [transform["STRUCTURE"] for transform in value]

[docs]    @staticmethod
    def fromConfig(config: dict):
        """
        :param config: configuration from which to build options
        :raise KeyError: if an unknown key is present
        :raise ValueError: if an unknown value is present
        """
        PreprocessorOptions._updateConfig(config)

        key_to_enum = {
            "EXPLICIT_HYDROGENS": ExplicitHydrogens,
            "GENERATE_COORDINATES": GenerateCoordinates,
            "DOUBLE_BOND_STEREO_STANDARD": DoubleBondStereoStandard,
            "RING_REPRESENTATION": RingRepresentation,
            "REMOVE_SGROUP_DATA": RemoveSGroupData,
            "CHIRAL_FLAG_TREATMENT": ChiralFlagTreatment,
        }
        config = copy.deepcopy(config)
        for key, value in config.items():
            if key in key_to_enum:
                try:
                    config[key] = key_to_enum[key][value]
                except KeyError:
                    raise ValueError(f"{key} does not support {value}")
        try:
            return PreprocessorOptions(**config)
        except TypeError as e:
            raise KeyError(e)

[docs]    def toConfig(self) -> dict:
        config = self._asdict()
        for key, value in config.items():
            if isinstance(value, enum.Enum):
                config[key] = value.name
        return config


[docs]def atom_has_non_standard_query(atom) -> bool:
    """
    Checks if the atom has a non-standard query feature like M which rdkit doesn't consider as a query

    :param atom: the atom to check
    :returns whether or not the atom has a non standard query feature
    """
    return atom.HasProp("molFileAlias") and atom.GetProp("molFileAlias") == "M"


[docs]def s_group_has_non_standard_query(s_group) -> bool:
    """
    Checks if the s-group has a non-standard query feature like M which rdkit doesn't consider as a query

    :param s_group: the s-group to check
    :returns whether or not the s-group has a non standard query feature
    """
    return s_group.HasProp("LABEL") and s_group.GetProp("LABEL") == "M"


[docs]def is_queryatom_exception(atom):
    """
    Normally we raise an exception if query atoms are in the molecule to be preprocessed.
    This function returns True for query atoms which are allowed.

    :param atom: the atom to check
    :returns whether or not the atom is allowed in the preprocessor
    """
    if not atom.HasQuery():
        return True

    # "*" atoms from mol blocks:
    if atom.HasQuery() and atom.DescribeQuery().strip() == "AtomNull":
        return True

    for bond in atom.GetBonds():
        if bond.HasProp("_MolFileBondAttach"):
            # this indicates that the atom is actually an attachment point,
            # which are allowed even though we don't accept other
            # features
            return True
    return False


[docs]def coords_all_zero(conf):
    """

    Returns whether or not all atom positions in a conformer are zero
    """
    ps = conf.GetPositions()
    return np.alltrue(np.abs(ps) < 1e-4)


[docs]def setup_mol(mol):
    """
    Setup on a molecule that is always done regardless of configuration.

    :param mol: An unsanitized RDKit Mol
    :returns: A partially sanitized RDKit mol, ready for the standardizer.

    """
    Chem.SanitizeMol(
        mol, sanitizeOps=Chem.SANITIZE_SETAROMATICITY | Chem.SANITIZE_ADJUSTHS)
    mol, modified = handle_pentavalent_nitrogens(mol)
    mol.UpdatePropertyCache(strict=False)
    mol, modified = handle_aromatic_heteroatoms_with_attachment_points(mol)

    if mol.GetNumConformers() and coords_all_zero(mol.GetConformer()):
        Chem.AssignAtomChiralTagsFromMolParity(mol)
    elif mol.GetNumConformers() and mol.GetConformer().Is3D():
        Chem.AssignStereochemistryFrom3D(mol)
    else:
        # make sure we set cis/trans on double bonds (was SS-30467)
        Chem.AssignStereochemistry(mol)

    # check for the presence of query features on atoms or bonds
    for atom in mol.GetAtoms():
        if atom.HasQuery():
            if is_queryatom_exception(atom):
                # special case for some query atoms
                continue
            raise ValueError(
                f"Query feature found on atom {atom.GetIdx()+1}. Molecules with query features cannot be preprocessed."
            )
        if atom_has_non_standard_query(atom):
            raise ValueError(
                f"Non-standard Query feature found on atom {atom.GetIdx()+1}. Molecules with query features cannot be preprocessed."
            )
    for bond in mol.GetBonds():
        if bond.HasQuery():
            raise ValueError(
                f"Query feature found on bond {bond.GetIdx()+1}. Molecules with query features cannot be preprocessed."
            )
    for s_group in Chem.GetMolSubstanceGroups(mol):
        if s_group_has_non_standard_query(s_group):
            raise ValueError(
                "Non-standard Query feature found on s-group. Molecules with query features cannot be preprocessed."
            )

    return mol


[docs]def do_final_corrections(molblock, generated_coordinates, v3000):
    # if we generated coordinates make sure FIELDDISP Sgroups are using relative coords
    if generated_coordinates and "FIELDDISP=" in molblock:
        # we can only correct V3000s here. That's ok since V2000 is only there for testing
        if v3000:
            # start by converting all FIELDISP lines to single lines
            # note that this regex is not robust with respect to all possible
            # pathological formulations. It handles what (at least) the RDKit, MarvinSketch,
            # and BioVia produce. In this case we know that the molblock we're working with
            # originated with the RDKit, so it'll be fine as long as other parts of the
            # preprocessor invoked after the call to convert_to_molblock() don't do violence
            # to the FIELDISP line.
            txt = re.sub(
                r'(FIELDDISP="[^"]*)-\nM  V30 ([^"]*")',
                r"-\nM  V30 \1\2 -\nM  V30",
                molblock,
            )
            # now do the substitution: we zero out the coordinates and mark them as relative - "DRU" instead of "DAU"
            #   addendum post-testing: it looks like there are also occurencs of "DA " in the wild, so we also replace
            #   "DA " with "DR ".
            omolblock = re.sub(
                r'M  V30 FIELDDISP="[0-9\- .]*D[AR]([U ])',
                r'M  V30 FIELDDISP="    0.0       0.0       DR\1',
                txt,
            )
            if omolblock != molblock:
                logger.debug("FINAL_CORRECTIONS")
                molblock = omolblock
    return molblock


[docs]def preprocess_molblock(molblock: str, config: Optional[dict] = None) -> str:
    """
    Process molecule based on config objects

    NOTE: transforms are done before neutralization and tautomer canonicalization
    """

    options = PreprocessorOptions.fromConfig(config) if config else None

    sio = BytesIO(molblock.encode())
    suppl = Chem.ForwardSDMolSupplier(
        sio, sanitize=False, removeHs=False, strictParsing=False)
    mol = next(suppl)

    return preprocess(mol, options)


[docs]@rdkit_adapter.suppress_rdkit_log()
def preprocess(mol: rdkit.Chem.Mol,
               options: Optional[PreprocessorOptions] = None) -> str:

    options = options or PreprocessorOptions()

    mol = setup_mol(mol)

    if options.STRIP_STEREO_ABSOLUTE_GROUP:
        mol = strip_stereo_abs(mol)

    if options.STRIP_AND_GROUPS_ON_SINGLE_ATOM:
        mol = strip_stereo_and(mol)

    # make a copy of where we started to use later
    orig_mol = Chem.Mol(mol)

    # clarification as to why we're removing Hs when ADD_ALL is set:
    # if we are going to be adding Hs later anyway, we might as
    # well remove them now in order to simplify the rest of the process
    #
    # In order to maintain the current Hs, EXPLICIT_HYDROGENS should be
    #   set to "AS_IS"
    if options.EXPLICIT_HYDROGENS is ExplicitHydrogens.KEEP_WEDGED:
        mol = remove_explicit_hydrogens(mol, keep_wedged=True)

    elif options.EXPLICIT_HYDROGENS in (ExplicitHydrogens.REMOVE_ALL,
                                        ExplicitHydrogens.ADD_ALL):
        mol = remove_explicit_hydrogens(mol, keep_wedged=False)

    elif options.EXPLICIT_HYDROGENS is ExplicitHydrogens.AS_IS:
        # keep hydrogens as is, don't touch!
        pass

    if options.KEEP_ONLY_LARGEST_STRUCTURE:
        mol = remove_fragments(mol)

    coordgen_enabled = options.GENERATE_COORDINATES in (
        GenerateCoordinates.FULL, GenerateCoordinates.FULL_ALIGNED)
    if coordgen_enabled:
        mol = clear_brackets_from_sgroups(mol)

    if options.REMOVE_PROPERTIES:
        mol = remove_properties(mol)

    if options.REMOVE_SGROUP_DATA is not RemoveSGroupData.NONE:
        mol = remove_sgroups(mol, which=options.REMOVE_SGROUP_DATA)

    if options.STRIP_SALTS:
        mol = strip_salts(mol, salt_list=options.STRIP_SALTS)

    for transformation in options.TRANSFORMATIONS:
        mol, _ = transform(mol, transformation)

    if options.NEUTRALIZE:
        mol = neutralize(mol,
                         options.EXPLICIT_HYDROGENS is ExplicitHydrogens.AS_IS)

    if options.CHOOSE_CANONICAL_TAUTOMER:
        mol = generate_canonical_tautomer(mol)

    if options.EXPLICIT_HYDROGENS is ExplicitHydrogens.ADD_ALL:
        mol = add_explicit_hydrogens(mol)

    if options.CLEAN_WEDGE_ORIENTATION:
        # if we are going to be setting the wedge orientation, we need
        # to add chiral Hs now, before coordinates are generated. Then
        # we set the bond wedging after generating coordinates
        mol = add_chiral_hs(mol)

    else:
        mol = reapply_molblock_wedging(mol)

    if options.CLEAR_INVALID_WEDGE_BONDS:
        mol = clear_wedge_bonds_from_achiral_centers(mol)

    # Adjusting the stereochemistry flag should happen
    # after clear_wedge_bonds_from_achiral_centers(), since it does
    # recalculate stereo, and we might find/remove some chiral centers
    mol = chiral_flag_treatment(options.CHIRAL_FLAG_TREATMENT, orig_mol, mol)

    if coordgen_enabled:
        align = options.GENERATE_COORDINATES is GenerateCoordinates.FULL_ALIGNED
        mol = generate_coordinates(mol, align)

    if options.CLEAN_WEDGE_ORIENTATION:
        # now that coordinates are set, we need to assign bond wedging
        mol = wedge_clean(mol)

    if options.DOUBLE_BOND_STEREO_STANDARD is DoubleBondStereoStandard.CROSSED:
        remove_wiggly_bonds_around_double_bonds(mol)

    # FIXME: Below calls all operate on molblock text directly
    v3000 = options.GENERATE_V3K_SDF

    kekulize = options.RING_REPRESENTATION is RingRepresentation.KEKULE
    processed_molblock = convert_to_molblock(mol, v3000, kekulize)

    if options.HEAVY_HYDROGEN_DT:
        processed_molblock = convert_heavy_hydrogens(processed_molblock, v3000)

    # do this at the very end because we need the mol block and the molecule
    if not v3000 and options.DOUBLE_BOND_STEREO_STANDARD is not DoubleBondStereoStandard.CROSSED:
        raise ValueError(
            "V2000 output only supports DOUBLE_BOND_STEREO_STANDARD=CROSSED")

    processed_molblock = set_double_bond_stereo(
        mol, processed_molblock, options.DOUBLE_BOND_STEREO_STANDARD)

    processed_molblock = do_final_corrections(processed_molblock,
                                              coordgen_enabled, v3000)

    return processed_molblock


[docs]def handle_pentavalent_nitrogens(mol):
    """
    Pentavalent nitrogens are usually the incorrect representation and will
    break most of the downstream functions. We will allow pentavalent nitrogens
    form for Nitro groups (-NO2), but other instances will be transformed to the
    charge separated form.
    """
    modified = False
    if mol.HasSubstructMatch(Chem.MolFromSmarts("[#7;X3,X4]=O")):
        logger.debug(
            "Found a pentavalent nitrogen! Trying to TRANSFORM it away...")
        mol, tf_modified = transform(mol, "[n;H0+0:1]=[O+0:2]>>[n+1:1]-[O-1:2]")
        if tf_modified:
            modified = True
        mol, tf_modified = transform(
            mol,
            "[!O:3]-[N+0;H1,H0:1](=[O+0:2])-[!O:4]>>[*:3]-[N+1:1](-[O-1:2])-[*:4]",
        )
        if tf_modified:
            modified = True
    return mol, modified


[docs]def handle_aromatic_heteroatoms_with_attachment_points(mol):
    """
    Aromatic atoms with attachment points are impossible to kekulize,
    clear that up here by making it explicit that they have an implicit H attached
    (This was SS-31328)
    """
    modified = False
    res = Chem.Mol(mol)
    for atom in res.GetAtoms():
        if (atom.GetIsAromatic() and atom.GetAtomicNum() != 6 and
                atom.HasProp("molAttchpt")):
            modified = True
            atom.SetNumExplicitHs(1)
            atom.SetNoImplicit(True)
    return res, modified


[docs]def add_explicit_hydrogens(mol):
    orig_num_atoms = mol.GetNumAtoms()
    mol = rdmolops.AddHs(mol, explicitOnly=False, addCoords=True)
    if mol.GetNumAtoms() != orig_num_atoms:
        logger.debug("EXPLICIT_HYDROGENS_ADD_ALL")
    return mol


[docs]def remove_explicit_hydrogens(mol, keep_wedged=False):
    # we use this to flag Hs that should not be removed.
    # there's no reasonable scenario in which an H atom
    # in a molecule could have an isotope value of 10, so
    # this should be safe
    save_isotope = 10
    ps = Chem.RemoveHsParameters()
    ps.removeWithWedgedBond = not keep_wedged
    # since we likely do not have information about wedging still on the bonds, the above check
    # may not save us. Let's explicitly protect Hs at the end of wedged bonds from
    # being removed:
    saved_hs = False
    if keep_wedged:
        for atom in mol.GetAtoms():
            if atom.GetAtomicNum() == 1 and atom.GetIsotope() == 0:
                for bnd in atom.GetBonds():
                    # why this is as complex as it is:
                    # Information about bond wedging is stored as a bond property after the CTAB
                    # is parsed. Because the values have different meanings, the V2000 and V3000
                    # parsers use different property names.
                    # The values here correspond to bonds which are wedged, hashed, or "squiggled"
                    #  above comment valid for at least the 2020.03 RDKit release
                    if (bnd.HasProp("_MolFileBondStereo") and
                            bnd.GetIntProp("_MolFileBondStereo") in (1, 6, 4)
                       ) or (bnd.HasProp("_MolFileBondCfg") and
                             bnd.GetUnsignedProp("_MolFileBondCfg") in (1, 2,
                                                                        3)):
                        atom.SetIsotope(save_isotope)
                        saved_hs = True
                        break

    ps.updateExplicitCount = True
    res = rdmolops.RemoveHs(mol, ps, sanitize=False)
    if res.GetNumAtoms() != res.GetNumAtoms():
        logger.debug(f"EXPLICIT_HYDROGENS: keep_wedged={keep_wedged}")

    if saved_hs:
        for atom in res.GetAtoms():
            if atom.GetAtomicNum() == 1 and atom.GetIsotope() == save_isotope:
                atom.SetIsotope(0)

    return res


[docs]def convert_to_molblock(mol, v3000, kekulize):
    if kekulize:
        for bond in mol.GetBonds():
            if bond.GetIsAromatic():
                logger.debug("RING_REPRESENTATION_KEKULE")
                break
    sio = StringIO()
    writer = Chem.SDWriter(sio)
    try:
        writer.SetKekulize(kekulize)
        writer.SetForceV3000(v3000)
        writer.write(mol)
        writer.flush()
        return sio.getvalue()
    finally:
        writer.close()


[docs]def convert_heavy_hydrogens(molblock, v3000):
    """
    NOTE that this operates on a molblock, not a molecule

    The RDKit does not currently (v2020.03) support writing D or T to
    mol blocks, so we need to post-process the text. Fortunately it's
    an easy regex in v3000 mol blocks. This does not work with V2000
    mol blocks, so we throw a ValueError there. This doesn't seem like
    a big deal since V2000 support is primarly being kept around for
    debugging purposes.
    If we need to eventually support V2000+HEAVY_HYDROGEN_DT, some
    not-completely-trivial code will need to be written.
    """
    if not v3000:
        raise ValueError(
            "HEAVY_HYDROGEN_DT option only supported for V3000 output")
    d_expr = re.compile(
        r"^(M  V30 [0-9]+) H (.+?) MASS=2([^\n]*)", flags=re.MULTILINE)
    out_molblock = d_expr.sub(r"\1 D \2\3", molblock)
    t_expr = re.compile(
        r"^(M  V30 [0-9]+) H (.+?) MASS=3([^\n]*)", flags=re.MULTILINE)
    out_molblock = t_expr.sub(r"\1 T \2\3", out_molblock)

    if out_molblock != molblock:
        logger.debug("HEAVY_HYDROGEN_DT")

    return out_molblock


[docs]def neutralize(mol, checkForProblematicHs=False):
    res = uncharger.uncharge(mol)
    res.UpdatePropertyCache(strict=False)

    if res.GetNumAtoms() != mol.GetNumAtoms():
        logger.debug("NEUTRALIZE")
    else:
        for orig_at, new_at in zip(mol.GetAtoms(), res.GetAtoms()):
            if orig_at.GetFormalCharge() != new_at.GetFormalCharge():
                logger.debug("NEUTRALIZE")
                break

    if checkForProblematicHs:
        # NOTE the logic here really only applies to the organic subset.
        # it's not going to do well with other element types, but hopefully
        # no-one is doing neutralization on species with non-organic atoms
        # and EXPLICIT_HS set to "AS_IS"
        if res.GetNumAtoms() != mol.GetNumAtoms():
            raise RuntimeError("uncharging changed the number of atoms")
        for i in range(res.GetNumAtoms()):
            oAt = mol.GetAtomWithIdx(i)
            oChg = oAt.GetFormalCharge()
            if oChg > 0:
                nAt = res.GetAtomWithIdx(i)
                nChg = nAt.GetFormalCharge()
                if nChg != oChg:
                    # positive charges on atoms with Hs should have had an H removed, but
                    # we can't currently do that
                    if oAt.GetTotalNumHs(
                            includeNeighbors=True) - oAt.GetTotalNumHs():
                        raise ValueError(
                            "could not neutralize positive charged atom with explicit H neighbors"
                        )
    return res


[docs]def unicode_to_str(unicode_str):
    """
    Takes a unicode object and converts it to a str (utf-8).
    If the arg is already a str, returns unicode_str (i.e. if run with python 3).
    Needed to support python 2/3 with unicode_literals.

    py2: type<unicode> -> type<str utf-8>
    py3: type<str utf-8> (no unicode type exists)

    :type unicode_str: unicode (py2) or str (py3)
    :param unicode_str: the unicode that potentially needs converting (i.e. if run with python 2)

    :return: str
    """
    if isinstance(unicode_str, str):
        return unicode_str

    return unicode_str.encode("utf-8")


[docs]def transform(mol, transformation, maxTransformations=1000):
    """
    apply the transformation to the molecule repeatedly until it no longer applies.

    the maxTransformations argument is just there to prevent us from ending up
    in an infinite loop due to a bogus transformation

    """
    modified = False
    smarts = unicode_to_str(transformation)
    rxn = rdChemReactions.ReactionFromSmarts(smarts)
    for _ in range(maxTransformations):
        output = rxn.RunReactants((mol,))
        if output:
            output[0][0].UpdatePropertyCache(strict=False)
            mol = output[0][0]
            modified = True
        else:
            # if we get here, the reaction failed because the substructure to be TRANSFORMed
            # was not present in our *mol*. We return the original *mol* object unchanged.
            return mol, modified
    if modified:
        logger.debug(f"TRANSFORM: {transformation}")
    return mol, modified


[docs]def in_xy_plane(mol):
    if mol.GetNumConformers() == 0:
        return False
    return all(xyz[2] == 0 for xyz in mol.GetConformer().GetPositions())


[docs]def generate_coordinates(mol, align=False):
    if mol.GetNumAtoms() == 1:
        # we don't really need to deal with coordgen if there's only a single atom
        mol.GetConformer().SetAtomPosition(0, Geometry.Point3D(0, 0, 0))
        mol.GetConformer().Set3D(False)
        return mol

    # keep original mol to align with
    original_mol = Chem.Mol(mol)
    Chem.rdCoordGen.AddCoords(mol)

    # align input mol and mol with new coords
    if align and in_xy_plane(mol) and in_xy_plane(original_mol):
        rdMolAlign.AlignMol(mol, original_mol)

    logger.debug("GENERATE_COORDINATES")
    return mol


[docs]def generate_canonical_tautomer(mol):

    orig_bonds = [
        (bond.GetBondType(), bond.GetIsAromatic()) for bond in mol.GetBonds()
    ]
    st = rdkit_adapter.from_rdkit(
        mol, include_properties=False, include_stereo=False)
    st = epik_tautomer_canonicalizer.TautomerCanonicalizer().apply(st)[0]

    mol = rdkit_adapter.copy_lewis_structure_and_hydrogens(st, mol)
    new_bonds = [
        (bond.GetBondType(), bond.GetIsAromatic()) for bond in mol.GetBonds()
    ]
    if new_bonds != orig_bonds:
        logger.debug("CHOOSE_CANONICAL_TAUTOMER")

    return mol


[docs]def clear_wedge_bonds_from_achiral_centers(mol):
    modified = False
    new_mol = Chem.Mol(mol)
    Chem.AssignStereochemistry(new_mol, cleanIt=True, force=True)

    for orig_at, new_at in zip(mol.GetAtoms(), new_mol.GetAtoms()):
        if orig_at.GetChiralTag() != new_at.GetChiralTag():
            modified = True
            break
    if not modified:
        for orig_bnd, new_bnd in zip(mol.GetBonds(), new_mol.GetBonds()):
            if (orig_bnd.GetStereo() != new_bnd.GetStereo() or
                    orig_bnd.GetBondDir() != new_bnd.GetBondDir()):
                modified = True
                break
    if modified:
        logger.debug("CLEAR_INVALID_WEDGE_BONDS")
    return new_mol


def _get_chiral_flag(mol):
    try:
        return mol.GetUnsignedProp(MOL_PROP_CHIRAL_FLAG)
    except KeyError:
        return MOL_PROP_OFF


[docs]def chiral_flag_treatment(adj_mode, src_mol, mol):
    if adj_mode == ChiralFlagTreatment.IGNORE:
        chiral_flag = _get_chiral_flag(src_mol)
    elif adj_mode == ChiralFlagTreatment.FORCE_OFF:
        chiral_flag = MOL_PROP_OFF
    elif adj_mode == ChiralFlagTreatment.FORCE_ON:
        chiral_flag = MOL_PROP_ON
    else:
        has_chiral_atoms = any(
            atom.GetChiralTag() != Chem.ChiralType.CHI_UNSPECIFIED
            for atom in mol.GetAtoms())
        has_enhanced_stereo = len(mol.GetStereoGroups()) != 0

        if has_chiral_atoms and not has_enhanced_stereo:
            chiral_flag = MOL_PROP_ON
        else:
            chiral_flag = MOL_PROP_OFF

    mol.SetUnsignedProp(MOL_PROP_CHIRAL_FLAG, chiral_flag)
    logger.debug(f"CHIRAL_FLAG_TREATMENT[{adj_mode.name}]: {chiral_flag}")

    return mol


[docs]def strip_stereo_abs(input_mol):
    """
    Removes any Stereo ABS group

    :param input_mol: The original molecule to consider
    :return: post-processed molecule, if the input molecule was modified
    """
    mol = Chem.RWMol(input_mol)
    keep = []

    for stereo_group in mol.GetStereoGroups():
        if stereo_group.GetGroupType() != Chem.StereoGroupType.STEREO_ABSOLUTE:
            keep.append(stereo_group)
        else:
            logger.debug("STRIP_STEREO_ABSOLUTE_GROUP")

    mol.SetStereoGroups(keep)
    return Chem.Mol(mol)


[docs]def strip_stereo_and(input_mol):
    """
    Removes any Stereo AND groups with only one center and flattens the bonds around it

    :param input_mol: The original molecule to consider
    :return: post-processed molecule, if the input molecule was modified
    """
    mol = Chem.RWMol(input_mol)
    keep = []

    for stereo_group in mol.GetStereoGroups():
        if stereo_group.GetGroupType() != Chem.StereoGroupType.STEREO_AND:
            keep.append(stereo_group)
        elif len(stereo_group.GetAtoms()) > 1:
            keep.append(stereo_group)
        else:
            logger.debug("STRIP_AND_GROUPS_ON_SINGLE_ATOM")
            # flatten the atom, At this point the stereo_group only has one atom
            atom = stereo_group.GetAtoms()[0]
            atom.SetChiralTag(Chem.ChiralType.CHI_UNSPECIFIED)
            # make sure that all bond wedging information has been removed around that atom too:
            for bond in atom.GetBonds():
                if bond.GetBeginAtomIdx() == atom.GetIdx():
                    bond.ClearProp("_MolFileBondStereo"
                                  )  # what we'd get from a V2000 mol file
                    bond.ClearProp("_MolFileBondCfg"
                                  )  # what we'd get from a V3000 mol file

    mol.SetStereoGroups(keep)
    return Chem.Mol(mol)


[docs]def frag_is_smaller(atoms, largest_atoms, weight, largest_weight, smiles,
                    largest_smiles):
    """
    A fragment is considered larger if its atoms/weight are larger, the length of the smiles string
    is larger, or the smiles string is lexicographically *smaller* if they are equal length.
    ie, 'AAA' is larger than 'AAB'.. hence the final smiles > largest_smiles check here to reject
    """
    if atoms < largest_atoms:
        return True
    if atoms > largest_atoms:
        return False
    if weight < largest_weight:
        return True
    if weight > largest_weight:
        return False
    if len(smiles) < len(largest_smiles):
        return True
    if len(smiles) > len(largest_smiles):
        return False
    return smiles > largest_smiles


[docs]def connect_variable_attachment_points(mol):
    """
    forms zero-order bonds between one of the atoms of a bond with an ATTACH
    property to the "main" molecule in order to have the molecule+variable
    attachment point treated as a single fragment

    returns a 2-tuple with:
      1. the modified molecule
      2. whether or not the molecule was modified
    """
    bonds_added = False

    res = None
    for bond in list(mol.GetBonds()):
        if bond.HasProp("_MolFileBondAttach") and bond.HasProp(
                "_MolFileBondEndPts"):
            # these should look like: ENDPTS=(3 4 6 5)
            endp_txt = bond.GetProp("_MolFileBondEndPts")
            # do a bit of validation that the format is correct:
            if endp_txt[0] != "(" or endp_txt[-1] != ")":
                logger.warning("invalid ENDPTS format ignored")
                continue
            endps = [int(x) for x in endp_txt[1:-1].split(" ")]
            # do a bit of validation that the format is correct:
            if len(endps) != endps[0] + 1 or endps[0] == 1:
                logger.warning("invalid ENDPTS format ignored")
                continue
            batom_idx = bond.GetBeginAtomIdx()
            oatom_idx = endps[1] - 1
            if res is None:
                res = Chem.RWMol(mol)
            if res.GetBondBetweenAtoms(batom_idx, oatom_idx):
                logger.warning("variable attachment point already connected")
                continue
            res.AddBond(batom_idx, oatom_idx, Chem.BondType.ZERO)
            # set a property on the bond so that we can find it again:
            res.GetBondBetweenAtoms(batom_idx, oatom_idx).SetIntProp(
                "variable_attach_bond", 1)
            bonds_added = True
    if res is None:
        res = mol
    return res, bonds_added


[docs]def remove_fragments(mol):
    """
    Fragments are not removed if the molecule contains any SGroups which are associated with
    polymers

    Use the following criteria to remove unwanted fragements from *mol*:
     1. keep only the fragment which has the most number of atoms
     2. break ties by keeping only fragments with the greatest molecular weight
     3. break ties with the longest smiles string
     4. break additional ties by keeping the fragment with the earliest alpha sorted SMILES string

    If two or more identical fragments remain after 1-4, we will throw a fatal error.
    """
    sgs = Chem.GetMolSubstanceGroups(mol)
    for sg in sgs:
        props = sg.GetPropsAsDict()
        if "TYPE" in props and props["TYPE"] in (
                "MUL",
                "SRU",
                "MON",
                "COP",
                "CRO",
                "GRA",
                "MIX",
                "FOR",
                "MOD",
        ):
            logger.debug(
                "Molecule has polymeric SGroups, skipping remove_fragments.")
            return mol

    nmol, bonds_added = connect_variable_attachment_points(mol)
    frags = Chem.GetMolFrags(nmol, asMols=True, sanitizeFrags=False)
    if len(frags) == 1:
        return mol

    largest_frag = None
    largest_atoms = math.inf
    largest_weight = math.inf
    largest_smiles = None
    duplicates = 0

    for frag in frags:
        [atoms, weight, smiles] = [
            frag.GetNumHeavyAtoms(),
            MolWt(frag),
            Chem.MolToSmiles(frag),
        ]
        if largest_frag is not None:
            if frag_is_smaller(atoms, largest_atoms, weight, largest_weight,
                               smiles, largest_smiles):
                continue

            # If all properties are equivalent, we have a *potential* duplicate of the largest known fragment
            # but if they are not, we've discovered a new, bigger fragment, and may discard any known dupes
            if (atoms == largest_atoms and weight == largest_weight and
                    smiles == largest_smiles):
                duplicates += 1
            else:
                duplicates = 0
            largest_frag = frag
            largest_atoms = atoms
            largest_weight = weight
            largest_smiles = smiles
        else:
            largest_frag = frag
            largest_atoms = atoms
            largest_weight = weight
            largest_smiles = smiles

    if duplicates:
        message = (
            "At the end of REMOVE_FRAGMENTS, we had {duplicates} identical "
            "fragments left over, returning only one.")
        logger.debug(message.format(duplicates=duplicates))

    if bonds_added:
        largest_frag = Chem.RWMol(largest_frag)
        bnds = list(largest_frag.GetBonds())
        for bnd in bnds:
            if bnd.HasProp("variable_attach_bond"):
                largest_frag.RemoveBond(bnd.GetBeginAtomIdx(),
                                        bnd.GetEndAtomIdx())

    logger.debug("KEEP_ONLY_LARGEST_STRUCTURE")
    return largest_frag


[docs]def clear_brackets_from_sgroups(mol):
    """
    Removes brackets from any s-groups
    """
    sgs = Chem.GetMolSubstanceGroups(mol)
    Chem.ClearMolSubstanceGroups(mol)
    for sg in sgs:
        sg.ClearBrackets()
        logger.debug("CLEAR_BRACKETS_FROM_SGROUPS")
        Chem.AddMolSubstanceGroup(mol, sg)

    return mol


[docs]def remove_properties(mol):
    for prop_name in mol.GetPropNames():
        mol.ClearProp(prop_name)
        logger.debug("REMOVE_PROPERTIES")
    return mol


[docs]def remove_sgroups(mol, which):
    res = Chem.Mol(mol)
    if which is RemoveSGroupData.NONE:
        return res
    Chem.ClearMolSubstanceGroups(res)
    if which is RemoveSGroupData.ALL:
        if len(Chem.GetMolSubstanceGroups(mol)) != 0:
            logger.debug("REMOVE_SGROUP_DATA")
        return res
    if which is RemoveSGroupData.DAT_ONLY:
        for sg in Chem.GetMolSubstanceGroups(mol):
            if sg.GetProp("TYPE") != "DAT":
                Chem.AddMolSubstanceGroup(res, sg)
        if len(Chem.GetMolSubstanceGroups(mol)) != len(
                Chem.GetMolSubstanceGroups(res)):
            logger.debug("REMOVE_SGROUP_DATA")
        return res
    # we shouldn't get here
    assert False


[docs]def strip_salts(mol, salt_list):
    salt_list_str = "\n".join(salt_list)
    remover = SaltRemover.SaltRemover(defnData=salt_list_str)
    res = remover.StripMol(mol)
    if res.GetNumAtoms() != mol.GetNumAtoms():
        logger.debug("STRIP_SALTS")
    return res


[docs]def add_chiral_hs(mol):
    # need to filter chiral_atoms to get only atoms with endocyclic stereobonds
    # since we don't want to add hydrogens to chiral atoms that already have
    # exocyclic stereobonds
    chiral_atoms = [center[0] for center in Chem.FindMolChiralCenters(mol)]
    chiral_atoms_needing_hydrogens = []
    for atom_idx in chiral_atoms:
        atom = mol.GetAtomWithIdx(atom_idx)
        # only mark this for adding hydrogens if there are no exocyclic bonds
        add_atom = all(bond.IsInRing() for bond in atom.GetBonds())
        if add_atom:
            chiral_atoms_needing_hydrogens.append(atom_idx)

    # return mol if there are no chiral atoms to be adjusted
    if not chiral_atoms_needing_hydrogens:
        return mol

    res = Chem.AddHs(
        mol,
        explicitOnly=False,
        addCoords=True,
        onlyOnAtoms=chiral_atoms_needing_hydrogens,
    )
    if res.GetNumAtoms() != mol.GetNumAtoms():
        logger.debug("ADD_CHIRAL_HS")
    return res


[docs]def wedge_clean(mol):
    # resetting the BondDir and UnknownStereo setting on each bond
    # so WedgeMolBonds can do its magic
    for bond in mol.GetBonds():
        if bond.GetBondType() == Chem.BondType.SINGLE and bond.IsInRing():
            if bond.GetBondDir() == Chem.BondDir.UNKNOWN:
                bond.SetIntProp(MOL_PROP_UNKNOWN_STEREO, MOL_PROP_ON)
            bond.SetBondDir(Chem.BondDir.NONE)
        # bonds which were explicitly marked as unknown:
        if (bond.HasProp("_MolFileBondStereo") and
                bond.GetProp("_MolFileBondStereo") == "4") or (
                    bond.HasProp("_MolFileBondCfg") and
                    bond.GetProp("_MolFileBondCfg") == "2"):
            # ensure that the bond doesn't start at an atom with an specified double
            # bond
            disqualify = False
            for obond in bond.GetBeginAtom().GetBonds():
                if (obond.GetBondType() == Chem.BondType.DOUBLE and
                        obond.GetStereo() != Chem.BondStereo.STEREONONE):
                    disqualify = True
                    break
            if not disqualify:
                bond.SetBondDir(Chem.BondDir.UNKNOWN)
    # NOTE: the logic here of when to indicate that the molecule has been modified is not
    # 100% obvious
    orig_dirs = [x.GetBondDir() for x in mol.GetBonds()]

    Chem.WedgeMolBonds(mol, mol.GetConformer(0))

    new_dirs = [x.GetBondDir() for x in mol.GetBonds()]
    if orig_dirs != new_dirs:
        logger.debug("CLEAN_WEDGE_ORIENTATION")
    return mol


[docs]def reapply_molblock_wedging(mol):
    for bnd in mol.GetBonds():
        # only change the wedging if the bond was wedged in the input data (we
        # recognize that by looking for the properties the mol file parser
        # sets):
        if bnd.HasProp("_MolFileBondStereo"):
            # V2000
            val = bnd.GetProp("_MolFileBondStereo")
            if val == "1":
                bnd.SetBondDir(Chem.BondDir.BEGINWEDGE)
            elif val == "6":
                bnd.SetBondDir(Chem.BondDir.BEGINDASH)
            elif val == "4":
                bnd.SetBondDir(Chem.BondDir.UNKNOWN)
        elif bnd.HasProp("_MolFileBondCfg"):
            # V3000
            val = bnd.GetProp("_MolFileBondCfg")
            if val == "1":
                bnd.SetBondDir(Chem.BondDir.BEGINWEDGE)
            elif val == "3":
                bnd.SetBondDir(Chem.BondDir.BEGINDASH)
            elif val == "2":
                bnd.SetBondDir(Chem.BondDir.UNKNOWN)
        elif bnd.GetBondDir() in (Chem.BondDir.BEGINDASH,
                                  Chem.BondDir.BEGINWEDGE,
                                  Chem.BondDir.UNKNOWN):
            bnd.SetBondDir(Chem.BondDir.NONE)
    return mol


[docs]def remove_wiggly_bonds_around_double_bonds(mol):
    # We just need to make sure that there any wiggly bonds starting at atoms
    # involved in double bonds don't end up in the output
    for bond in mol.GetBonds():
        if bond.GetBondType() == Chem.BondType.SINGLE and (
            (bond.HasProp("_MolFileBondStereo") and
             bond.GetProp("_MolFileBondStereo") == "4") or
            (bond.HasProp("_MolFileBondCfg") and
             bond.GetProp("_MolFileBondCfg") == "2")):
            # it's a wiggly bond. Check to see if it starts at an atom with a double bond
            startsAtDouble = False
            for obond in bond.GetBeginAtom().GetBonds():
                if (obond.GetBondType() == Chem.BondType.DOUBLE and
                        obond.GetStereo() != Chem.BondStereo.STEREONONE):
                    startsAtDouble = True
                    break
            if startsAtDouble:
                if bond.HasProp("_MolFileBondStereo"):
                    bond.ClearProp("_MolFileBondStereo")
                if bond.HasProp("_MolFileBondCfg"):
                    bond.ClearProp("_MolFileBondCfg")
                bond.SetBondDir(Chem.BondDir.NONE)
                logger.debug("DOUBLE_BOND_STEREO_STANDARD_CROSSED")


[docs]def set_double_bond_stereo(mol, mol_block, bond_type):
    # as of the 2020.03 release cycle the RDKit will always write double bonds with
    # unknown stereochemistry as crossed bonds. This is done at the C++ level and
    # can't be modified from Python. We work around that here by doing regex changes
    # in the output mol_block.

    # some notes on this:
    #  - This is a problem which would ideally be solved in the core RDKit, with a
    #    real implementation connected directly to the generation of mol blocks.
    #    Until that's there we're kind of stuck with this hacky (though hopefully correct)
    #    solution.
    #  - It is, unsurprisingly, fragile. It's quite easy to construct examples
    #    where a double bond has more than one wiggly bond to it. This is ugly,
    #    but not wrong.

    # Explanation of the constants being used, this is taken from the Biovia CTAB documentation:
    #  http://help.accelrysonline.com/ulm/onelab/1.0/content/ulm_pdfs/direct/reference/ctfileformats2016.pdf
    #
    #  Marking a double bond as CROSSED to indicate unknown stereochemistry is done
    #    by setting CFG=2 on the double bond in V3000
    #         the RDKit parses this field into the property "_MolFileBondCfg"
    #    in V2000 it's by setting 3 in the "bond stereo" field
    #         the RDKit parses this field into the property "_MolFileBondStereo"
    #
    #  Marking a single bond as wiggly to indicate unknown stereochemistry on a neighboring
    #  double bond or chiral center is done by setting CFG=2 on the single bond in V3000
    #         the RDKit parses this field into the property "_MolFileBondCfg"
    #    in V2000 it's by setting 4 in the "bond stereo" field
    #         the RDKit parses this field into the property "_MolFileBondStereo"
    out_mol_block = mol_block
    if bond_type is DoubleBondStereoStandard.CROSSED:
        # this is what the RDKit does by default.
        pass
    elif bond_type is DoubleBondStereoStandard.WIGGLY:
        # we need to create one wiggly bond per double bond. Do the one that is shared
        # with the least number of other double bonds

        # start by identifying bonds next to double bonds that have stereo specified:
        next_to_specified_double_bond = [0] * mol.GetNumBonds()
        for bond in mol.GetBonds():
            if bond.GetBondType() == Chem.BondType.DOUBLE and bond.GetStereo(
            ) not in (
                    Chem.rdchem.BondStereo.STEREOANY,
                    Chem.rdchem.BondStereo.STEREONONE,
            ):
                # mark all of our neighbors:
                begin_atom = bond.GetBeginAtom()
                end_atom = bond.GetEndAtom()
                for contiguous_bond in begin_atom.GetBonds(
                ) + end_atom.GetBonds():
                    next_to_specified_double_bond[contiguous_bond.GetIdx()] = 1

        single_bond_scores = [0] * mol.GetNumBonds()
        unknown_double_bond_nbrs = defaultdict(list)
        for bond in mol.GetBonds():
            if (bond.GetBondType() == Chem.BondType.DOUBLE and
                    bond.GetStereo() == Chem.rdchem.BondStereo.STEREOANY):
                # increment the counts of our eligible neighbors:
                begin_atom = bond.GetBeginAtom()
                end_atom = bond.GetEndAtom()
                for contiguous_bond in begin_atom.GetBonds(
                ) + end_atom.GetBonds():
                    if (not next_to_specified_double_bond[contiguous_bond.
                                                          GetIdx()] and
                            contiguous_bond.GetBondType() ==
                            Chem.BondType.SINGLE and
                            contiguous_bond.GetBondDir() == Chem.BondDir.NONE):
                        single_bond_scores[contiguous_bond.GetIdx()] += 1
                        unknown_double_bond_nbrs[bond.GetIdx()].append(
                            contiguous_bond.GetIdx())

        for dbl_bnd_idx in unknown_double_bond_nbrs:
            nbr_scores = sorted(
                [(single_bond_scores[x], x)
                 for x in unknown_double_bond_nbrs[dbl_bnd_idx]])
            if not nbr_scores:
                logger.warning(
                    f"cannot find a single bond to mark as WIGGLY for double bond {bond.GetIdx()+1}"
                )
                continue
            single_bond_to_mark = nbr_scores[0][-1]

            # now we know which double bond and which single bond we need to change.
            # go ahead and change them in the mol block
            # we replace the crossed double bond, which looks like this:
            # M  V30 3 2 1 2 CFG=2
            # with its uncrossed form:
            # M  V30 3 2 1 2
            # and we make the single bond wiggly, which is converting it from this:
            # M  V30 3 1 1 3
            # to this:
            # M  V30 3 1 1 3 CFG=2

            # first do the double bond:
            out_mol_block = re.sub(
                r"(M  V30 %d 2 [0-9]+ [0-9]+) CFG=2" % (dbl_bnd_idx + 1),
                r"\1",
                out_mol_block,
            )

            # now the single bond
            out_mol_block = re.sub(
                r"(M  V30 %d 1 [0-9]+ [0-9]+)" % (single_bond_to_mark + 1),
                r"\1 CFG=2",
                out_mol_block,
            )
    elif bond_type is DoubleBondStereoStandard.NONE:
        # go back to whatever we had before
        for bond in mol.GetBonds():
            if (bond.GetBondType() == Chem.BondType.DOUBLE and
                    bond.GetStereo() == Chem.rdchem.BondStereo.STEREOANY):
                # V3000 crossed bond
                cfg = 0
                if bond.HasProp("_MolFileBondCfg"):
                    cfg = bond.GetUnsignedProp("_MolFileBondCfg")
                elif bond.HasProp("_MolFileBondStereo"):
                    cfg = bond.GetUnsignedProp("_MolFileBondStereo")
                    # translate v2000->v3000
                    if cfg == 3:
                        cfg = 2
                    else:
                        cfg = 0
                if cfg == 2:
                    # it was originally crossed, it will still be crossed, no need to do anything
                    pass
                else:
                    reset_dbl_bond = False
                    # and find neighboring wiggly single bonds
                    begin_atom = bond.GetBeginAtom()
                    end_atom = bond.GetEndAtom()
                    for contiguous_bond in begin_atom.GetBonds(
                    ) + end_atom.GetBonds():
                        if (contiguous_bond == bond or
                                contiguous_bond.GetBondType() !=
                                Chem.BondType.SINGLE):
                            continue
                        cfg = 0
                        if contiguous_bond.HasProp("_MolFileBondCfg"):
                            cfg = contiguous_bond.GetUnsignedProp(
                                "_MolFileBondCfg")
                        elif contiguous_bond.HasProp("_MolFileBondStereo"):
                            cfg = contiguous_bond.GetUnsignedProp(
                                "_MolFileBondStereo")
                            # translate v2000->v3000
                            if cfg == 4:
                                cfg = 2
                            else:
                                cfg = 0
                        if cfg:
                            bnd_idx = contiguous_bond.GetIdx()
                            if (re.search(
                                    r"M  V30 %d 1 [0-9]+ [0-9]+ CFG=" %
                                (bnd_idx + 1),
                                    mol_block,
                            ) is None):
                                out_mol_block = re.sub(
                                    r"(M  V30 %d 1 [0-9]+ [0-9]+)" %
                                    (bnd_idx + 1),
                                    r"\1 CFG=2",
                                    out_mol_block,
                                )
                                reset_dbl_bond = True
                            else:
                                # if we somehow already ended up with a configuration (perhaps because of wedging elsewhere)
                                # generate a warning, but don't mess with it:
                                logger.warning(
                                    f"unable to set single bond {bnd_idx+1} to be a wiggly bond."
                                )

                    if reset_dbl_bond:
                        # uncross the double bond:
                        dbl_bnd_idx = bond.GetIdx()
                        out_mol_block = re.sub(
                            r"(M  V30 %d 2 [0-9]+ [0-9]+) CFG=2" %
                            (dbl_bnd_idx + 1),
                            r"\1",
                            out_mol_block,
                        )
                    else:
                        logger.warning(
                            f"unable to write double bond {bond.GetIdx()+1} without a crossed bond."
                        )

    else:
        raise ValueError(
            f"Unrecognized value for DOUBLE_BOND_STEREO_STANDARD {bond_type}")

    if out_mol_block != mol_block:
        logger.debug(f"DOUBLE_BOND_STEREO_STANDARD_{bond_type}")

    return out_mol_block


[docs]@main_wrapper("LiveDesign Structure Preprocessor")
def main(argv=None):
    """
    Function to run preprocessor directly from the command line.
    """

    parser = argparse.ArgumentParser(
        description="Run the preprocessor on an SD file")
    parser.add_argument("input", help="input SDF")
    parser.add_argument("-output", default="out.sdf", help="output SDF")
    parser.add_argument("-config", help="JSON config file")
    parser.add_argument("-verbose", action="store_true", help="Verbose logging")
    args = parser.parse_args(argv)

    logger.setLevel(log.logging.DEBUG if args.verbose else log.logging.INFO)

    config_json = None
    if args.config:
        with open(args.config) as fh:
            config_json = json.load(fh)
            logger.debug(f"Preprocesor Configuration: {config_json}")

    with infrastructure.TextBlockReader(args.input) as reader, \
            infrastructure.TextBlockWriter(args.output) as writer:
        for in_txt in reader:
            in_txt = in_txt.decode()
            try:
                out_txt = preprocess_molblock(in_txt, config_json)
                writer.append(out_txt.encode())
            except Exception as exc:
                logger.info("Failed to process structure...")
                logger.info(f"{in_txt}")
                logger.info("ERROR:")
                logger.exception(f"{exc}")
                logger.info(log.SINGLE_LINE)


if __name__ == "__main__":
    main()