Source code for schrodinger.application.matsci.packmol

"""
Utilities for working with packmol.

Copyright Schrodinger, LLC. All rights reserved.
"""

import os
import string
import sys
import time
from collections import namedtuple

from schrodinger import structure
from schrodinger.application.desmond import cms
from schrodinger.application.matsci import cgforcefield as cgff
from schrodinger.application.matsci import coarsegrain
from schrodinger.application.matsci import desmondutils
from schrodinger.application.matsci import jobutils
from schrodinger.application.matsci.nano import xtal
from schrodinger.infra import mm
from schrodinger.job import jobcontrol
from schrodinger.job import queue
from schrodinger.structutils import color
from schrodinger.test import ioredirect
from schrodinger.utils import fileutils
from schrodinger.utils import imputils
from schrodinger.utils import subprocess

from .msutils import add_or_update_bond_order

path = os.path.join(fileutils.get_mmshare_scripts_dir(),
                    'prepare_for_md_gui_dir', 'prepare_for_md_driver.py')
prep_md = imputils.import_module_from_file(path)

PDB_EXT = '.pdb'

PDBdata = namedtuple('PDBdata',
                     ['pdbname', 'pdbres', 'resnum', 'chain_name', 'order'])

# since pdbres names are taken to be in a field of 4 chars use the following
# short and rare character to uniqueify them
UNIQUE_PDB_RES_TAG = '#'


[docs]def get_pdb_data(pdbname, pdbres, resnum=None, chain_name=None, order=None):
    """
    Return a PDBdata.

    :type pdbname: str
    :param pdbname: the PDB atom name

    :type pdbres: str
    :param pdbres: the PDB residue name

    :type resnum: int or None
    :param resnum: the PDB residue number if needed

    :type chain_name: str or None
    :param chain_name: the PDB chain name if needed

    :type order: int or None
    :param order: a bond order to be used when bonding to the atom
        corresponding to this object

    :rtype: PDBdata
    :return: the PDB data
    """

    if chain_name:
        chain_name = chain_name.strip()
    return PDBdata(
        pdbname=pdbname.strip(),
        pdbres=pdbres.strip(),
        resnum=resnum,
        chain_name=chain_name,
        order=order)


[docs]class Job(queue.SubprocessJob):

[docs]    def __init__(self, *args, **kwargs):
        """
        See parent class for documentation.
        """

        self.output_file = kwargs.pop('output_file', None)
        super().__init__(*args, **kwargs)
        self.name = fileutils.get_basename(self.output_file) + '_packmol'

[docs]    def doCommand(self, *args, **kwargs):
        """
        See parent class for documentation.
        """

        # typically ran as 'packmol < input.inp',
        # to avoid shell command pass script and
        # input separately

        self._start_time = time.time()
        script, input_file = self._command
        self._stdin = open(input_file, 'r')
        self._subprocess = subprocess.Popen(
            [script],
            stdin=self._stdin,
            stdout=self._stdout,
            stderr=self._stderr,
            universal_newlines=True)

[docs]    def postCommand(self):
        """
        See parent class for documentation.
        """

        super().postCommand()
        self._stdin.close()

[docs]    def update(self):
        """
        See parent class for documentation.
        """

        # packmol failures do not return
        # an error code so manually handle it here

        super().update()
        if self.state == queue.JobState.DONE and self.output_file and \
            not os.path.exists(self.output_file):
            self.state = queue.JobState.FAILED


[docs]def run(input_files, allow_failures=True):
    """
    Run.

    :type input_files: list
    :param input_files: packmol input files

    :type allow_failures: bool
    :param allow_failures: if True then all but a single subjob are
        allowed to fail without causing the main job to fail

    :raise RuntimeError: if all subjobs fail, if allow_failures is False
        then if a single subjob fails then the main job fails

    :rtype: dict
    :return: keys are input files, values are structure output files
    """

    # structure files referenced in the given packmol input files
    # are assumed to exist in the directory from which this function
    # is called

    if allow_failures:
        max_failures = len(input_files) - 1
    else:
        max_failures = 0
    job_dj = queue.JobDJ(
        verbosity='normal', max_failures=max_failures, max_retries=0)

    # get binary name
    if sys.platform == 'win32':
        packmol_bin = 'packmol.exe'
    else:
        packmol_bin = 'packmol'

    job_be = jobcontrol.get_backend()

    log_fhs = []
    output_files = {}
    for input_file in input_files:
        with open(input_file, 'r') as afile:
            for line in afile:
                tokens = line.strip().split()
                if tokens and tokens[0].lower() == 'output':
                    output_file = tokens[1]
                    break
            else:
                # an input file that does not specify an
                # output file will fail anyway and is handled
                # using max_failures
                output_file = fileutils.get_basename(input_file) + '.out'
        base_name = fileutils.get_basename(output_file)
        log_file = base_name + '.log'
        if job_be:
            job_be.addLogFile(log_file)
        log_fh = open(log_file, 'w')
        log_fhs.append(log_fh)
        output_files[input_file] = output_file
        cmd = [packmol_bin, input_file]
        job = Job(cmd, stdout=log_fh, stderr=log_fh, output_file=output_file)
        job_dj.addJob(job)

    try:
        job_dj.run()
    finally:
        for log_fh in log_fhs:
            log_fh.close()

    for input_file, output_file in output_files.items():
        if not os.path.exists(output_file):
            output_files[input_file] = None
        elif job_be:
            job_be.addOutputFile(output_file)

    return output_files


[docs]class PDBWriter(structure.PDBWriter):

[docs]    def write(self, ct):
        """
        See parent class for documentation.
        """

        # First check CT is able to be written. TexualStructure objects
        # are not:
        if isinstance(ct, structure.TextualStructure):
            raise Exception("TextualStructure objects can not be written to "
                            "an PDB format file")

        mm.mmpdb_initialize(mm.error_handler)
        fh = mm.mmpdb_new()

        # by default structure.PDBWriter reorders atoms, it is critical
        # that this is avoided due to atom index references in the packmol
        # input file
        mm.mmpdb_set(fh, mm.MMPDB_NO_REORDER_ATOMS)

        mm.mmpdb_write(fh, ct, self.filename)
        mm.mmpdb_delete(fh)


def _set_atom_properties(st, key, available_values, justify=False):
    """
    Set atom properties.

    :type st: schrodinger.structure.Structure
    :param st: the structure

    :type key: str
    :param key: the property key

    :type available_values: list
    :param available_values: unique available values for the property

    :type justify: bool
    :param justify: whether to left justify string values in a field
        of length 4
    """

    values = []
    for atom in st.atom:
        value = atom.property.get(key, None)
        if (isinstance(value, str) and value.strip() in ['', 'UNK']) or \
            (isinstance(value, int) and not value):
            value = None
        values.append(value)

    if None not in values:
        return

    new_value = available_values.pop(0)
    if justify:
        new_value = new_value.ljust(4)
    for idx, value in enumerate(values, 1):
        if value is None:
            st.atom[idx].property[key] = new_value


[docs]def set_unique_pdb_atom_names(st):
    """
    Set unique PDB atom names on the given structure.

    :type st: schrodinger.structure.Structure
    :param st: the structure
    """

    adict = {}
    for atom in st.atom:
        idxs = adict.setdefault(atom.pdbres, {}).setdefault(atom.element, [])
        idxs.append(atom.index)
        idx = str(len(idxs))
        atom.pdbname = f'{atom.element}{idx}'.ljust(4)


def _fix_bonding(st, sts, bond_dict=None):
    """
    Fix the bonding output from packmol.

    :type st: schrodinger.structure.Structure
    :param st: the structure to fix

    :type sts: dict
    :param sts: keys are file names (referenced in the given packmol input files),
        values are schrodinger.structure.Structure

    :type bond_dict: dict or None
    :param bond_dict: keys are pair tuples containing PDB atom names and residue
        names, values are triple tuples containing PDB atom names, residue names,
        and bond orders, or None if it is to be determined

    :rtype: dict
    :return: the bond_dict
    """

    # the structure to fix comes from using the PDBReader class
    # on the packmol output pdb file, even though the input pdb
    # files of comoponents have connections the packmol output pdb
    # does not, the PDBReader class by default assigns bonds, yet
    # the assignment can be missing bonds, for both atomistic and
    # CG structures, or the assignment can be wrong, for example
    # due to packmol exiting with a FORCED status which results
    # in non-bonding atoms that are too close to each other,
    # the PDBReader class lacks a clean API to prevent bonding
    # so remove bonds here and rebond

    pairs = [(bond.atom1, bond.atom2) for bond in st.bond]
    for pair in pairs:
        st.deleteBond(*pair)

    if not bond_dict:
        bond_dict = {}
        for file_name, mae_st in sts.items():
            # read the PDB to get the PDB atom names that were
            # used during writing
            pdb_st = structure.Structure.read(file_name)
            # ordering is enforced
            for mae_atom, pdb_atom in zip(mae_st.atom, pdb_st.atom):
                # this tuple will be unique over the entire system
                atuple = get_pdb_data(pdb_atom.pdbname, pdb_atom.pdbres)
                btuples = []
                for mae_neighbor in mae_atom.bonded_atoms:
                    order = mae_st.getBond(mae_atom, mae_neighbor).order
                    pdb_neighbor = pdb_st.atom[mae_neighbor.index]
                    btuple = get_pdb_data(
                        pdb_neighbor.pdbname, pdb_neighbor.pdbres, order=order)
                    btuples.append(btuple)
                bond_dict[atuple] = btuples

    idx_dict = {}
    for atom in st.atom:
        atuple = get_pdb_data(
            atom.pdbname,
            atom.pdbres,
            resnum=atom.resnum,
            chain_name=atom.chain_name)
        idx_dict[atuple] = atom.index

    # see MATSCI-7720 - since only a single instance of each residue type
    # is supported per component there will never be bonds between atoms with
    # the same residue name but different residue numbers, atoms in a
    # component all have the same chain name
    for atuple, idx in idx_dict.items():
        a_key = get_pdb_data(atuple.pdbname, atuple.pdbres)
        for btuple in bond_dict[a_key]:
            b_key = get_pdb_data(
                btuple.pdbname,
                btuple.pdbres,
                resnum=atuple.resnum,
                chain_name=atuple.chain_name)
            jdx = idx_dict[b_key]
            add_or_update_bond_order(st, idx, jdx, btuple.order)

    return bond_dict


def _fix_coarse_grained_st(st, sts, cg_info_dict=None):
    """
    Fix the coarse grained structure output from packmol.

    :type st: schrodinger.structure.Structure
    :param st: the structure to fix

    :type sts: dict
    :param sts: keys are file names (referenced in the given packmol input files),
        values are schrodinger.structure.Structure

    :type cg_info_dict: dict or None
    :param cg_info_dict: keys are pair tuples containing PDB atom names and
        residue names, values are coarsegrain.ParticleInfo, or None if it is
        to be determined

    :raise RuntimeError: if there is a problem with the input

    :rtype: dict
    :return: the cg_info_dict
    """

    try:
        is_coarse_grain = coarsegrain.is_coarse_grain_input(
            input_sts=sts.values())
    except ValueError as err:
        raise RuntimeError(str(err))

    if not is_coarse_grain:
        return

    # the packmol input and output pdb files lack coarse-grained particle
    # attributes, since the s_m_pdb_atom_name is limited store that
    # information here so that it can be applied later
    if not cg_info_dict:
        cg_info_dict = {}
        for file_name, mae_st in sts.items():
            # read the PDB to get the PDB atom names that were
            # used during writing
            pdb_st = structure.Structure.read(file_name)
            # ordering is enforced
            for mae_atom, pdb_atom in zip(mae_st.atom, pdb_st.atom):
                key = mae_atom.name
                name = key
                rgb = mae_atom.color.rgb
                radius = mm.mmct_atom_get_display_radius(mae_st, mae_atom.index)
                mass = mm.mmct_atom_get_coarse_grain_mass(
                    mae_st, mae_atom.index)
                info = coarsegrain.ParticleInfo(
                    key=key, name=name, rgb=rgb, radius=radius, mass=mass)
                # this tuple will be unique over the entire system
                atuple = get_pdb_data(pdb_atom.pdbname, pdb_atom.pdbres)
                cg_info_dict[atuple] = info

    coarsegrain.set_as_coarse_grain(st)
    for atom in st.atom:
        atuple = get_pdb_data(atom.pdbname, atom.pdbres)
        info = cg_info_dict[atuple]
        coarsegrain.set_atom_coarse_grain_properties(
            st,
            atom,
            info.name,
            rgb=info.rgb,
            radius=info.radius,
            mass=info.mass)

    st.applyStyle(
        atoms=structure.ATOM_BALLNSTICK, bonds=structure.BOND_BALLNSTICK)

    return cg_info_dict


[docs]def get_cells(input_files, sts, allow_failures=True):
    """
    Return cells.

    :type input_files: dict
    :param input_files: keys are packmol input files, values are tuples of the
        3 box lengths (Angstrom) defining the PBC

    :type sts: dict
    :param sts: keys are file names (referenced in the given packmol input files),
        values are schrodinger.structure.Structure

    :type allow_failures: bool
    :param allow_failures: if True then all but a single subjob are
        allowed to fail without causing the main job to fail

    :raise RuntimeError: if there is a problem with the input

    :rtype: dict
    :return: keys are input files, values are schrodinger.structure.Structure
    """

    # 26 identifiers should be enough (potentially 1 for each componentent that
    # is missing data)
    residue_numbers = list(range(1, 27))
    chain_names = list(string.ascii_uppercase)
    pdb_residue_names = [f'R{i}' for i in residue_numbers]
    pairs = [('i_m_residue_number', residue_numbers),
             ('s_m_chain_name', chain_names), ('s_m_pdb_residue_name',
                                               pdb_residue_names)]

    # uniqueify
    for st in sts.values():
        for atom in st.atom:
            for key, collection in pairs:
                value = atom.property.get(key)
                if isinstance(value, str):
                    value = value.strip()
                try:
                    collection.remove(value)
                except ValueError:
                    pass

    for file_name, st in sts.items():
        # use the the run function for non-pdb structures, packmol
        # supports pdb, tinker, xyz, and moldy, with pdb the default
        if fileutils.splitext(file_name)[1] != PDB_EXT:
            raise RuntimeError('Only PDB structure files are supported.')

        _set_atom_properties(st, 'i_m_residue_number', residue_numbers)
        _set_atom_properties(st, 's_m_chain_name', chain_names)
        _set_atom_properties(
            st, 's_m_pdb_residue_name', pdb_residue_names, justify=True)

        # warnings regarding nonunique PDB atom names can be thrown,
        # yet in general uniqueifying them within the allowed 4 character
        # max is not possible, for example the packmol source example
        # protein.pdb has multiple atoms with PDB atom name of 'HH11',
        # if the structure lacks s_m_pdb_atom_name properties then the
        # PDBWriter creates them by numbering by element per residue per chain,
        # for example O1, H1, H2, (residue R1, chain A), O1, O2,
        # H1, (residue R2, chain A), O1, H1, H2 (residue R1, chain B), etc.,
        # since unique PDB atom names are required for post-processing use
        # the PDBWriter convention if they aren't already unique

        if len(set(atom.pdbname for atom in st.atom)) < st.atom_total:
            # the PDBWriter PDB atom name assignment convention requires that atoms
            # in a residue have contiguous indexing so instead do it by hand here
            set_unique_pdb_atom_names(st)

        with ioredirect.IOSilence():
            with PDBWriter(file_name) as writer:
                writer.write(st)

    output_files = run(input_files.keys(), allow_failures=allow_failures)

    bond_dict = None
    cg_info_dict = None
    out_sts = {}
    for input_file, output_file in output_files.items():
        if output_file is None:
            continue
        st = structure.Structure.read(output_file)
        box_lengths = input_files[input_file]
        chorus = (box_lengths[0], 0., 0., 0., box_lengths[1], 0., 0., 0.,
                  box_lengths[2])
        xtal.set_pbc_properties(st, chorus)
        if not st.title:
            st.title = fileutils.get_basename(input_file)
        bond_dict = _fix_bonding(st, sts, bond_dict=bond_dict)
        cg_info_dict = _fix_coarse_grained_st(
            st, sts, cg_info_dict=cg_info_dict)
        if not coarsegrain.is_coarse_grain_input(input_sts=sts.values()):
            color.apply_color_scheme(st, 'element')
        # see MATSCI-9010 - allow polymer builder output which features multiple
        # residues by the same name
        unset_unique_pdb_res_names(st)
        out_sts[input_file] = st

    return out_sts


[docs]def write_desmond_cells(input_files,
                        sts,
                        allow_failures=True,
                        force_field=mm.OPLS_NAME_F16,
                        water_force_field=desmondutils.SPC,
                        cg_ff_loc_type=cgff.LOCAL_FF_LOCATION_TYPE):
    """
    Write Desmond cells.

    :type input_files: dict
    :param input_files: keys are packmol input files, values are tuples of the
        3 box lengths (Angstrom) defining the PBC

    :type sts: dict
    :param sts: keys are file names (referenced in the given packmol input files),
        values are schrodinger.structure.Structure

    :type allow_failures: bool
    :param allow_failures: if True then all but a single subjob are
        allowed to fail without causing the main job to fail

    :type force_field: str
    :param force_field: name of FF to apply

    :type water_force_field: str
    :param water_force_field: name of the water force field to apply,
        options are available in desmondutils

    :type cg_ff_loc_type: str
    :param cg_ff_loc_type: specifies the location to which
        to look for coarse-grained force field files,
        one of cgff.INSTALLED_FF_LOCATION_TYPE or cgff.LOCAL_FF_LOCATION_TYPE

    :raise RuntimeError: if there is a problem with the input

    :rtype: dict
    :return: keys are input files, values are names of written Desmond *cms files
    """

    cells = get_cells(input_files, sts, allow_failures=allow_failures)

    if allow_failures:
        max_failures = len(input_files) - 1
    else:
        max_failures = 0
    job_dj = queue.JobDJ(
        verbosity='normal', max_failures=max_failures, max_retries=0)

    cleaner = jobutils.StringCleaner()

    output_files = {}
    for input_file, st in cells.items():
        base_name = fileutils.get_basename(input_file)
        input_file = base_name + '_out.mae'

        st.write(input_file)
        title = cleaner.cleanAndUniquify(st.title)
        output_files[input_file] = title + '_system-out.cms'

        cmd = ['prepare_for_md_gui_dir/prepare_for_md_driver.py']
        cmd.extend([prep_md.FLAG_FORCEFIELD, force_field])
        cmd.extend([prep_md.CGFFLD_LOCATION_TYPE_FLAG, cg_ff_loc_type])
        cmd.append(jobutils.SPLIT_COMPONENTS_FLAG)
        cmd.extend([jobutils.WATER_FF_TYPE_FLAG, water_force_field])
        cmd.extend(['-JOBNAME', base_name + '_prep_md'])
        cmd.append(input_file)

        job = queue.JobControlJob(cmd)
        job_dj.addJob(job)

    job_dj.run()

    job_be = jobcontrol.get_backend()
    for input_file, output_file in output_files.items():
        if not os.path.exists(output_file):
            msg = (f'Prepare for MD output file {output_file} '
                   'can not be found.')
            raise RuntimeError(msg)
        model = cms.Cms(file=output_file)
        try:
            desmondutils.assign_lipid_ff(model)
        except ValueError:
            # this means no known lipid template is present, so
            # nothing to do
            pass
        else:
            model.write(output_file)
        if job_be:
            job_be.addOutputFile(output_file)

    return output_files


[docs]def set_unique_pdb_res_names(st):
    """
    Set unique PDB residue names on the given structure.

    :raise RuntimeError: if there is a problem with the input

    :type st: schrodinger.structure.Structure
    :param st: the structure
    """

    adict = {}
    for residue in st.residue:
        adict.setdefault(residue.pdbres.strip(), []).append(residue)
    for name, residues in adict.items():
        if len(residues) == 1:
            continue
        for tag_idx, residue in enumerate(residues, 1):
            pdbres = f'{name}{UNIQUE_PDB_RES_TAG}{tag_idx}'
            if len(pdbres) > 4:
                raise RuntimeError(
                    'The uniqueified PDB residue name '
                    f'of {pdbres} for residue {name} has more than 4 '
                    'characters which is not supported.')
            residue.pdbres = pdbres.ljust(4)


[docs]def unset_unique_pdb_res_names(st):
    """
    Unset unique PDB residue names on the given structure.

    :type st: schrodinger.structure.Structure
    :param st: the structure
    """

    for residue in st.residue:
        try:
            name, tag_idx = residue.pdbres.strip().split(UNIQUE_PDB_RES_TAG)
        except ValueError:
            continue
        residue.pdbres = name.ljust(4)