Source code for schrodinger.application.steps.scorers
import csv
import os
from typing import Dict
from rdkit import Chem
from schrodinger.models import parameters
from schrodinger.tasks import stepper
from . import utils
from .basesteps import MolMapStep
from .basesteps import MolReduceStep
from .dataclasses import ScoredMol
from .dataclasses import ScorerMixin
# ligand_ml is an optional dependency: when it cannot be imported, Smasher is
# set to None so that this module can still be imported.
try:
    from ligand_ml.smasher import Smasher
except Exception:
    # Any import-time failure falls back to None, but KeyboardInterrupt and
    # SystemExit are no longer swallowed as they were with a bare `except:`.
    Smasher = None

# Default upper bound for scores, i.e. no limit.
INF = float('inf')
class LigandMLScorer(ScorerMixin, MolReduceStep):
    """
    Yield scored molecules where the score is the value predicted by the model.

    The only setting is the required `ml_file`: the path to the ML qzip file.
    """

    class Settings(parameters.CompoundParam):
        ml_file: stepper.StepperFile = None

    def validateSettings(self, what='ml_file'):
        """
        Check that the model file is defined, exists and is a valid model file.

        :param what: the name to use for the setting in the issue messages
        :type what: str
        :return: the issues found, empty if the settings are fine
        :rtype: list
        """
        issues = []
        ml_file = self.settings.ml_file
        if ml_file is None:
            issues.append(stepper.SettingsError(self, f'{what} not defined'))
        elif not os.path.isfile(ml_file):
            issues.append(stepper.SettingsError(self, f'{what} not found'))
        else:
            # Only inspect the file contents once we know the file exists.
            issue = utils.validate_smasher_file(ml_file)
            if issue:
                issues.append(stepper.SettingsError(self, issue))
        return issues

    def reduceFunction(self, inputs):
        """
        Predict the score of all input molecules with the `ml_file` model.

        Note that `Smasher` is None when the module-level import of
        `ligand_ml` failed, in which case loading the model will fail.

        :param inputs: the molecules to score
        :type inputs: iterable
        :return: generator of scored molecules; the score is the first element
            of the prediction result for the corresponding molecule
        :rtype: Iterator[ScoredMol]
        """
        # For performance reasons, this step combines all inputs into a list
        # for the score prediction, but yields every scored molecule one by one.
        mols = list(inputs)
        if not mols:
            # Nothing to score: do not load the model nor ask it to predict
            # on an empty batch.
            return
        smasher = Smasher.load(self.settings.ml_file)
        results = smasher.predict_on_mols(mols)
        # presumably `results` has one entry per molecule, in input order
        for result, mol in zip(results, mols):
            yield ScoredMol(mol=mol, score=result[0])
class PatternScore:
    """
    A SMARTS pattern and the associated score
    """

    def __init__(self, pattern, score):
        """
        :param pattern: the SMARTS pattern
        :type pattern: str
        :param score: the score for the SMARTS presence in molecules
        :type score: float castable object
        :raises ValueError: if `pattern` cannot be parsed as SMARTS or if
            `score` cannot be cast to float
        """
        query = Chem.MolFromSmarts(pattern)
        if query is None:
            # MolFromSmarts signals a parse failure by returning None; fail
            # here rather than later, when the first molecule is scored.
            raise ValueError(f'invalid SMARTS pattern: {pattern!r}')
        self._pattern = query
        self._score = float(score)

    def mol_score(self, mol):
        """
        :param mol: the molecule to score
        :type mol: Chem.Mol
        :return: the structure score for the mol: the pattern score multiplied
            by the number of substructure matches of the pattern in the mol
        :rtype: float
        """
        num_matches = len(mol.GetSubstructMatches(self._pattern))
        return num_matches * self._score
class PatternScorer(ScorerMixin, MolMapStep):
    """
    Only allows molecules through whose score is less than `max_value`.

    The score itself is the sum of the scores for every occurrence of the SMARTS
    patterns in the molecule. Note that if the molecule to score does not have
    explicit H's and the SMARTS pattern does, no match will be found and the
    score would be 0 even if all heavy atoms match.

    The csv file should have headers and at least the columns for
    the SMARTS pattern (`pattern_hdr`) and score value (`score_hdr`).

    If the settings has `score_file` defined, it will be considered to be the
    path from which to create the pattern scores to use.
    """

    class Settings(parameters.CompoundParam):
        max_value: float = INF
        pattern_scores: Dict[str, PatternScore]
        score_file: stepper.StepperFile
        pattern_hdr: str = 'Pattern'
        score_hdr: str = 'Score'

    def _readCSV(self):
        """
        Replace the pattern scores with the ones read from `score_file`.

        :raises KeyError: if a row lacks the `pattern_hdr` or `score_hdr` column
        :raises ValueError: if a score value cannot be cast to float
        """
        self._pattern_scores = dict()
        # The csv module requires files to be opened with newline=''
        with open(self.settings.score_file, 'r', newline='') as fh:
            for row in csv.DictReader(fh):
                pattern = row[self.settings.pattern_hdr]
                self._pattern_scores[pattern] = PatternScore(
                    pattern, row[self.settings.score_hdr])

    def _setPatternScores(self):
        """
        Set the pattern scores from the settings' `pattern_scores`, unless
        `score_file` is an existing file, in which case they are read from it.
        """
        self._pattern_scores = self.settings.pattern_scores
        score_file = self.settings.score_file
        if score_file and os.path.isfile(score_file):
            self._readCSV()

    def validateSettings(self):
        """
        :return: the issues found, empty if the settings are fine
        :rtype: list
        """
        issues = utils.validate_file(self, 'score_file')
        try:
            self._setPatternScores()
        except (KeyError, ValueError) as err:
            # A missing column or an invalid value in the file is a settings
            # problem: report it as such instead of raising during validation.
            issues.append(
                stepper.SettingsError(
                    self, f'could not read pattern scores: {err!r}'))
            return issues
        if not self._pattern_scores:
            issues.append(
                stepper.SettingsError(self, 'no pattern scores defined'))
        return issues

    def setUp(self):
        """
        Set the pattern scores to use and log how many there are.
        """
        super().setUp()
        self._setPatternScores()
        self.logger.info(
            f'{self.getStepId()} {len(self._pattern_scores)} pattern scores')

    def _score(self, mol):
        """
        :param mol: the molecule to score
        :type mol: Chem.Mol
        :return: the first computed score above the settings' max_value
            or, if it is never reached, the sum of all the pattern scores
        :rtype: float
        """
        score = 0.0
        for pattern_score in self._pattern_scores.values():
            score += pattern_score.mol_score(mol)
            if score >= self.settings.max_value:
                # The molecule will be rejected by mapFunction, so there is no
                # need to evaluate the remaining patterns.
                return score
        return score

    def mapFunction(self, mol):
        """
        :param mol: the molecule to score
        :type mol: Chem.Mol
        :return: generator with the scored molecule if its score is less than
            the settings' `max_value`, empty otherwise
        :rtype: Iterator[ScoredMol]
        """
        score = self._score(mol)
        if score < self.settings.max_value:
            yield ScoredMol(mol=mol, score=score)