Source code for prepmd.ligand

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 27 16:49:54 2026

@author: rob
"""

# setup
import MDAnalysis as mda
from rdkit import Chem
import copy

# run
from openff.toolkit import Molecule, Topology, ForceField
from openff.interchange import Interchange
from openff.interchange.components._packmol import UNIT_CUBE, pack_box
from openff.units import unit
from openmm.app import pdbxfile
from MDAnalysis.converters.OpenMMParser import OpenMMTopologyParser
from MDAnalysis.coordinates.memory import MemoryReader
from openmm.app.modeller import Modeller
from Bio.PDB import PDBParser, MMCIFIO
import numpy as np
import os

"""
Okay, here's my full laundry list of complaints about rdkit
- the initial model generated by mda has no hydrogens. if you try and count
the hydrogens and free radicals in the structure it outputs zero for all of
them. however, if you run AddHs it will not add any hydrogens. the only way
to make it add hydrogens is to run RemoveHs first. despite the fact that the
structure contains no hydrogens.

- second, the structure it generates is fine with the exception of a single
atom which is horribly misplaced for seemingly no reason.
the only way to fix this is to run EmbedMolecule. however, doing that will
change the position and rotation of the whole molecule.

- the only way to fix that is to then run AlignMol to align the rotated
molecule to the original molecule. however if you try to run this with the
default settings it will claim there's 'no substructure match' despite the
two molecules being the same molecule. to fix this you feed it a list of
atom indices
"""



[docs]
def load_universe(pdb_path):
    """
    Create an mdanalysis universe from a structure file.
    This uses a quick and dirty hack - going via openmm - to get an mmcif
    file into mdanalysis, as mdanalysis does not natively support mmCif files,
    despite the fact that the pdb format has been deprecated for two years (!)
    at time of writing.
    
    Args:
        pdb_path: path to a pdb or cif file
    Returns:
        an mdanalysis Universe object
    """
    if ".cif" in pdb_path or ".mmcif" in pdb_path:
        pdbx = pdbxfile.PDBxFile(pdb_path)
        openmmtopol = OpenMMTopologyParser(pdbx.topology)
        positions = pdbx.getPositions(asNumpy=True) * 10  # angstrom conversion
        mdatopol = openmmtopol._mda_topology_from_omm_topology(pdbx.topology)
        u = mda.Universe(mdatopol, positions, format=MemoryReader)
    else:
        u = mda.Universe(pdb_path)
    return u



[docs]
def pdb_back_to_mmcif(inpdb, outcif):
        p = PDBParser()
        struc = p.get_structure("", inpdb)
        io = MMCIFIO()
        io.set_structure(struc)
        io.save(outcif)


# setup ligand system


[docs]
def split_pdb_ligand(pdb_path, new_pdb_output=None, neutralise_radicals=True):
    """
    Extract ligands from a structure file (pdb or mmcif) and write out a new,
    ligand/hetatm-free structure file and sdf files containing the ligands.
    
    Args:
        pdb_path - path to input pdb or mmcif file, a string
        new_pdb_output - filename for the new pdb file, a string. Otherwise the
        existing pdb file will be overwritten.
        neutralise_radicals - whether to remove free radicals. A bool.
    Returns:
        a list of strings, where each string is the filename of an sdf file
        that has been written, which are named according to their residue.
    """

    u = load_universe(pdb_path)
    protein = u.select_atoms('protein')
    ligand = u.select_atoms('not protein and not water')
    if len(ligand) == None:
        print("No hetatms/ligands found.")
        return None
    else:
        print("Extracting hetatms/ligands...")

    ligands = ligand.split("residue")

    if not new_pdb_output:
        output_path = pdb_path
    else:
        output_path = new_pdb_output

    if ".cif" in pdb_path or ".mmcif" in pdb_path:
        print("Warning: mdanalysis inexplicably  does not support writing "
              "mmcif files, so your structure will be written to a temporary "
              "pdb file and then converted to cif. This process may not work "
              "with all structures, make sure to so double-check the output!")
        protein.write(output_path+".pdb")
        p = PDBParser()
        struc = p.get_structure("", output_path+".pdb")
        io = MMCIFIO()
        io.set_structure(struc)
        io.save(output_path) # todo: replace w call to pdb_back_to_mmcif

    else:
        protein.write(output_path)

    ligand_filenames = []
    ligands_written = {}
    for ligand in ligands:
        resname = "".join(list(set(ligand.resnames)))
        if resname in ligands_written.keys():
            ligands_written[resname] += 1
            resname += "_"+str(ligands_written[resname])
        else:
            ligands_written[resname] = 1
        mol = ligand.convert_to('RDKIT', inferrer=None)

        # need to create a coordmap or rdkit rotates and translates the ligand
        # for absolutely no reason. luckily getting the positions of atoms in
        # rdkit is very easy. just make a conformer object and then iterate
        # over it to get an object that has x y and z as member variables
        # someone should maybe tell the rdkit developers about arrays
        coordmap = {}
        atommap = []
        conf = mol.GetConformer()
        for atom in mol.GetAtoms():
            idx = atom.GetIdx()
            pos = conf.GetAtomPosition(idx)
            coordmap[idx] = pos
            atommap.append([idx, idx])

        # do not touch this or it WILL break
        # the molecule does not contain any hydrogens but in order to add
        # hydrogens you have to first run the removehs function, if you don't
        # do this it won't add any hydrogens but it definitely won't tell you
        # that anything is wrong
        try:
            mol = Chem.RemoveHs(mol)
            mol = Chem.AddHs(mol, addCoords=True)
        except Exception as e:
            print("WARNING: Couldn't extract "+resname+" ("+str(e)+").")
            continue

        original_mol = copy.copy(mol)

        # remove radicals
        radicals_neutralised = 0
        if neutralise_radicals:
            for a in mol.GetAtoms():
                rad = a.GetNumRadicalElectrons()
                if rad:
                    a.SetNumExplicitHs(rad)
                    a.SetNumRadicalElectrons(0)
                    radicals_neutralised += rad

        mol = Chem.AddHs(mol, addCoords=True)

        # if you don't call embedmolecule here then the molecule will have
        # exactly one atom in completely the wrong place that will cause the
        # simulation to immediately detonate. however, if you call
        # embedmolecule without a coordmap it will rotate and translate the
        # ligand. it does this out of spite and hatred for molecular modellers
        Chem.AllChem.EmbedMolecule(mol, coordMap=coordmap)

        # i have to pass this stupid atommap rdMolAlign otherwise it won't
        # align the molecules because there's 'no substructure match' despite
        # them being literally the SAME MOLECULE
        rmsd = Chem.rdMolAlign.AlignMol(mol, original_mol, atomMap=atommap)
        #print("rmsd: "+str(rmsd))

        Chem.AllChem.MMFFOptimizeMolecule(mol)
        Chem.AllChem.UFFOptimizeMolecule(mol)

        writer = Chem.SDWriter(resname+".sdf")
        writer.write(mol)
        writer.close()
        ligand_filenames.append(resname+".sdf")
    return ligand_filenames



waters = {
    "tip3p": "tip3p.offxml",
    "tip4pew": "tip4p_ew.offxml"
}



[docs]
def solvate_openff(interchange, protein, ligands, n_water=None, gutter_nm=1.5,
                   positive_ion_smile="[Na+]", negative_ion_smile="[Cl-]",
                   desired_charge_elementary=0, water_ff="tip3p",
                   solvent_smiles="O", ff="openff-2.2.0.offxml",
                   solvent_name="HOH"):
    """
    Solvate an openff system.
    Args:
        interchange: an openff interchange containing the protein AND ligand.
        protein: protein loaded as an openff topology
        ligands: a list of openff molecule objects
        n_water: number of water molecules, an int. Will calculate if this is
        not set.
        gutter_nm: size of the gap between the protein and box edge in nm, a
        float.
        positive_ion_smile: smiles string for the positive ions
        positive_ion_smile: smiles string for the negative ions
        desired_charge_elementary: desired charge in units of the elementary
        charge
        water_ff: force field to use for the water. can be tip3p or tip4pew.
        solvent_smiles: smiles string for the solvent molecule.
        ff: force field to use, A string. Can be any openff force field.
        solvent name: name of solvent molecule (for topology!) - a string
    Returns:
        openff topology and an interchange for just the waters.
    """

    all_molecules = [protein]+ligands

    total_charge = round(
        sum(interchange["Electrostatics"].charges.values()), 3)

    ligand_charge = 0
    for ligand in ligands:
        ligand_charge += ligand.total_charge
    assert total_charge == protein.total_charge + ligand_charge, (
        f"Total charge of the system is {total_charge}, not {protein.total_charge + ligand.total_charge}"
    )

    water = Molecule.from_smiles(solvent_smiles)
    water.name = solvent_name
    water.generate_conformers(n_conformers=1)

    if total_charge > desired_charge_elementary * unit.elementary_charge:
        ion = Molecule.from_smiles(negative_ion_smile)
    elif total_charge <= desired_charge_elementary * unit.elementary_charge:
        ion = Molecule.from_smiles(positive_ion_smile)

    num_ions = abs(int((total_charge/unit.elementary_charge) -
                   (desired_charge_elementary)))

    ion.generate_conformers(n_conformers=1)

    xyz = protein.conformers[0]
    centroid = xyz.sum(axis=0) / xyz.shape[0]
    protein_radius = np.sqrt(((xyz - centroid) ** 2).sum(axis=-1).max())
    box_vectors = UNIT_CUBE * (protein_radius * 2 + gutter_nm * unit.nanometer)

    if not n_water:
        box_vol_angstroms = np.array(box_vectors.to("angstrom").m.diagonal())
        # one water per 30A^3, old JAWS default
        n_water = int(box_vol_angstroms.prod() / 70)
        print("Adding "+str(n_water)+" water molecules.")

    docked_topology = Topology.from_molecules(all_molecules)

    packed_topology = pack_box(
        molecules=[water, ion],
        number_of_copies=[n_water, num_ions],
        solute=docked_topology,
        box_vectors=box_vectors,
        center_solute=True,
    )

    water_interchange = Interchange.from_smirnoff(
        force_field=ForceField(ff, waters[water_ff]),
        topology=([water] * n_water) + ([ion] * num_ions),
    )

    return packed_topology, water_interchange




[docs]
def create_ligand_system(
        protein_structure,  # pdb finame
        ligand_structures,  # list of ligand filenames
        n_water=None,  # if None, calculate water automatically
        water_ff="tip3p",  # tip3p or tip4pew only supported
        solvent_smiles="O",
        gutter_nm=1,
        positive_ion_smile="[Na+]",
        negative_ion_smile="[Cl-]",
        desired_charge_elementary=0,
        ff="ff14sb_off_impropers_0.0.4.offxml",
        ligand_ff="openff-2.2.0.offxml",
        output_topology=None,
        solvent_name="HOH",
        ligands_names=None):
    """
    Set up an openmm system containing ligands using openff.
    Args:
        protein_structure: structure file path, pdb or mmcif, a string
        ligand_structure: a list of strings where each string is a path to
        an sdf file
        n_water = None, # if None, calculate water automatically
        positive_ion_smile: smiles string for the positive ions
        positive_ion_smile: smiles string for the negative ions
        desired_charge_elementary: desired charge in units of the elementary
        charge
        water_ff: force field to use for the water. can be tip3p or tip4pew.
        solvent_smiles: smiles string for the solvent molecule.
        ff: force field. For now, this is ff14sb_off_impropers_0.0.4.offxml as
        it's the only one openff properly supports.
        ff: force field to use for the ligand, A string. Can be any small
        molecule force field supported by openff.
        solvent_name - solvent name to appear in the topology, a string
        ligands_names: names of ligands to be written to the topology. a list
        of strings
    Returns:
        an openmm system, topology and positions object.
    """

    print("Setting up ligand simulation...")
    if water_ff not in waters.keys():
        raise ValueError("Water must be one of: "+",".join(waters.keys()))

    protein_with_crystal_water = Topology.from_pdb(protein_structure)
    protein = protein_with_crystal_water.molecule(0)

    ff14sb = ForceField(ff, waters[water_ff])

    protein_intrcg = Interchange.from_smirnoff(
        force_field=ff14sb,
        topology=protein.to_topology(),
    )
    
    if not ligands_names:
        ligands_names = [None]*len(ligand_structures)
        for ligand_fname in range(len(ligand_structures)):
            try:
                basename = os.path.basename(ligand_structures[ligand_fname])
                resname = basename.split(".")[0]
                if "_" in resname:
                    resname = resname.split("_")[0]
                ligands_names[ligand_fname] = resname
            except:
                pass
        if None in ligands_names:
            print("Warning: missing ligand name(s)! Will appear in topology "
                  "as 'UNK'.")

    ligands = []
    for ligand, name in zip(ligand_structures, ligands_names):
        current_ligand = Molecule.from_file(ligand)
        if name:
            current_ligand.name = name # ADD THIS
            for atom in current_ligand.atoms:
                atom.metadata["residue_name"] = name
        ligands.append(current_ligand)

    ligand_topology = Topology.from_molecules(ligands)

    ligand_force_field = ForceField(
        ligand_ff, waters[water_ff])  # includes tip3p

    ligand_intrcg = Interchange.from_smirnoff(
        ligand_force_field, ligand_topology)

    interchange = protein_intrcg.combine(ligand_intrcg)

    print("Solvating...")
    packed_topology, water_interchange = solvate_openff(interchange, protein,
                                                        ligands, n_water=n_water,
                                                        gutter_nm=gutter_nm,
                                                        positive_ion_smile=positive_ion_smile, negative_ion_smile=negative_ion_smile,
                                                        desired_charge_elementary=desired_charge_elementary, water_ff=water_ff,
                                                        solvent_smiles=solvent_smiles,
                                                        ff=ligand_ff,
                                                        solvent_name=solvent_name)

    solvated_interchange = interchange.combine(water_interchange)
    solvated_interchange.positions = packed_topology.get_positions()
    solvated_interchange.box = packed_topology.box_vectors

    if output_topology and ".pdb" in output_topology:
        solvated_interchange.to_pdb(output_topology)
    if output_topology:
        if ".mmcif" in output_topology or ".mmCif" in output_topology:
            print("WARNING: OpenMM can't write mmCif topologies. Writing a pdb"
                  " and converting it back to mmCif. Sorry!")
            solvated_interchange.to_pdb(output_topology+".pdb")
            pdb_back_to_mmcif(output_topology+".pdb", output_topology)

    openmm_system = solvated_interchange.to_openmm()
    openmm_topology = solvated_interchange.to_openmm_topology()
    openmm_positions = solvated_interchange.positions.to_openmm()

    # note: recommended settings:
    # intergrator - variable langevin, tolerance: 0.0005 (or timestep < 0.001ps)
    # minimisation - tolerance = 2.5 kilojoule/(nanometer*mole)
    return openmm_system, openmm_topology, openmm_positions