"""
This module contains the CombinatorialScreener class, which employs a
heuristic approach to identify subsets of combinatorial reactants that
are most likely to yield enumerated products with the highest dendritic
fingerprint similarities to a query.
Basic Algorithm
---------------
For the example reaction A + B + C --> ABC, the reactants in each of the 3
groups are ranked by decreasing Tversky similarity to the query, where the
Tversky weight for the reactant R is 1 and the Tversky weight for the query
Q is 0. In other words,
rank_score(R, Q) = ON(R & Q) / ON(R)
Where:
ON(R & Q) = Number of 'on' bits shared by reactant and query
ON(R) = Number of 'on' bits in reactant
This quantity is maximized when R is a substructure of Q.
Once the reactants have been ranked, limits NA, NB, NC on the ranked lists
are assigned to yield the subset S(NA, NB, NC) = A[0:NA] x B[0:NB] x C[0:NC],
where [0:N] is a Python slice.
If the minimum number of enumerated products desired is min_products, the
limits must be chosen such that NA * NB * NC >= min_products.
NA, NB, NC are arrived by setting them to 1 and then performing a systematic
exploration of larger values with the goal of identifying combinations of
reactants whose logical OR fingerprints yield the highest similarities to
the query. In the case of dendritic fingerprints, the logical OR similarities
correlate strongly with similarities computed from the enumerated products,
so this is a good approximation that avoids enumeration of S(NA, NB, NC) as
the limits are varied.
A rough outline of the procedure is as follows::
1. Set NA = NB = NC = 1
2. if NA * NB * NC >= min_products, we are done
3. sim_best = 0, R_best = None
4. for R in (A, B, C):
NR += 1 # Temporarily add new reactant R_new
for each (a, b, c) in S(NA, NB, NC), where R_new is in (a, b, c)
FP_abc = FP(a) | FP(b) | FP(c) # Logical OR fingerprint
sim = Tanimoto(FP_abc, FP_query)
if sim > sim_best:
sim_best = sim
R_best = R
NR -= 1 # Remove new reactant
5. NR_best += 1 # Expand limit to include best new reactant
6. Go to step 2
This approach is superior to assigning equal limits, such as (10, 10, 10)
if 1000 products are desired. In many cases, the algorithm finds limits that
are quite ragged, such as (2, 50, 10), and the enumerated compound with the
highest similiarty to the query is found at some non-obvious position, such
as (1, 45, 7).
Copyright Schrodinger LLC, All Rights Reserved.
"""
import itertools
import numpy
from schrodinger.application.combinatorial_screen import \
fingerprint_utils as fp_utils
# MAX_COMBOS is used to assign a default upper bound on the size of each of
# the N reactant subsets:
#
# max_reactants = MAX_COMBOS ** (1/N)
#
# For example, if there are 3 reactants, the upper bound would be 100, which
# means the reactant subset limits would top out at (100, 100, 100). This is
# done to force the exploration of all dimensions of the reactant subspace in
# cases where there's a natural tendency to keep one or more dimensions very
# small. So instead of getting limits like (1, 320, 313) when 100,000 products
# are requested, we would get limits like (10, 100, 100). The advantages of
# of this approach are that it prevents step 4 in the above algorithm from
# becoming painfully slow, and it tends to increase the number of successfully
# enumerated products for the same total number of reactant combinations.
MAX_COMBOS = 1000000
[docs]class CombinatorialScreener:
"""
Identifies subsets of combinatorial reactants that are most likely to
yield products with the highest dendritic Tanimoto similarities to a
query.
"""
[docs] def __init__(self, reactant_fp_files, query_smiles, max_reactants=None):
"""
Constructor taking the names of dendritic fingerprint files for one
or more sets of reactants, the SMILES string for a query, and an
optional cap on the reactant subset size.
Each fingerprint file must contain a single extra data column that
holds the SMILES of the reactants. If the same fingerprint file is
supplied more than once, each instance is treated as a separate
reactant set, but the file is read only once, and all screens will
yield an identical reactant subset size for all instances of that
reactant.
:param reactant_fp_files: List of reactant fingerprint files.
:type reactant_fp_files: list(str)
:param query_smiles: SMILES string for query.
:type query_smiles: str
:param max_reactants: Maximum allowed size of each reactant subset
when a query is screened. The default is MAX_COMBOS ** 1/N,
where N is the number of reactant groups.
:type max_reactants: int
"""
nr = len(reactant_fp_files)
if nr == 0:
raise RuntimeError("No reactant fingerprint files supplied")
self.reactant_count = nr
self._query_fp = fp_utils.smiles_to_fingerprint(query_smiles)
if max_reactants is not None:
if max_reactants < 1:
raise ValueError("Maximum number of reactants must be >= 1")
else:
max_reactants = MAX_COMBOS**(1 / self.reactant_count)
self.max_reactants = int(round(max_reactants))
self.reactant_fps = nr * [None]
self.reactant_titles = nr * [None]
self.reactant_smiles = nr * [None]
self.reactant_indices = nr * [None] # 0-based reactant numbers
self.max_products = 1
self.max_reactants_reached = False
# Keep track of duplicate reactant files while reading and scoring.
prev_reactants = {}
self._reactant_equiv = [[] for i in range(nr)]
for i in range(nr):
if reactant_fp_files[i] in prev_reactants:
iprev = prev_reactants[reactant_fp_files[i]]
self._reactant_equiv[i].append(iprev)
self._reactant_equiv[iprev].append(i)
self.reactant_titles[i] = self.reactant_titles[iprev]
self.reactant_smiles[i] = self.reactant_smiles[iprev]
self.reactant_indices[i] = self.reactant_indices[iprev]
self.reactant_fps[i] = self.reactant_fps[iprev]
else:
self.reactant_titles[i], self.reactant_smiles[
i], self.reactant_indices[i], self.reactant_fps[
i] = fp_utils.rank_reactants(reactant_fp_files[i],
self._query_fp,
self.max_reactants)
prev_reactants[reactant_fp_files[i]] = i
self.max_products *= len(self.reactant_fps[i])
[docs] def screen(self, min_products):
"""
Performs a similarity screen against the query to determine the
number of reactants in each sorted group that are required to make
the minimum number of enumerated products. These reactant counts
are stored in self.reactant_limits.
:param min_products: Minimum number of theoretical products
:type min_products: int
"""
if min_products > self.max_products:
raise ValueError("Too many products requested")
self.max_reactants_reached = False
self.reactant_limits = self.reactant_count * [1]
while True:
if numpy.prod(self.reactant_limits) >= min_products:
break
if not self._add_best_reactant():
self.max_reactants_reached = True
break
def _add_best_reactant(self):
"""
Increments the reactant limit which yields the combination of
reactants with the highest logical OR similarity to the query.
Returns True if a reactant was added and False if a limit was
reached that prevented a reactant from being added.
:return: True if a reactant was added; False if not
:rtype: bool
"""
sim_best = 0.0
r_best = None
nr = self.reactant_count
for r in range(nr):
if self.reactant_limits[r] == len(self.reactant_fps[r]):
# No more reactants left in this group.
continue
if self.reactant_limits[r] >= self.max_reactants:
# Reached the self-imposed upper bound on this group.
continue
pools = []
for i in range(nr):
ilimit = self.reactant_limits[i]
if i == r:
pools.append([ilimit]) # New reactant
else:
pools.append(list(range(0, ilimit))) # Other reactants
# Consider all resulting combinations of ranked reactants.
for combo in itertools.product(*pools):
sim = fp_utils.get_reactant_combo_sim(self._query_fp,
self.reactant_fps, combo)
if sim > sim_best:
sim_best = sim
r_best = r
if r_best is not None:
self.reactant_limits[r_best] += 1
# Increment any duplicates of this reactant to preserve symmetry.
for r_dup in self._reactant_equiv[r_best]:
self.reactant_limits[r_dup] += 1
return r_best is not None