#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 27 16:49:54 2026
@author: rob
"""
# setup
import MDAnalysis as mda
from rdkit import Chem
import copy
# run
from openff.toolkit import Molecule, Topology, ForceField
from openff.interchange import Interchange
from openff.interchange.components._packmol import UNIT_CUBE, pack_box
from openff.units import unit
from openmm.app import pdbxfile
from MDAnalysis.converters.OpenMMParser import OpenMMTopologyParser
from MDAnalysis.coordinates.memory import MemoryReader
from openmm.app.modeller import Modeller
from Bio.PDB import PDBParser, MMCIFIO
import numpy as np
import os
"""
Okay, here's my full laundry list of complaints about rdkit
- the initial model generated by mda has no hydrogens. if you try and count
the hydrogens and free radicals in the structure it outputs zero for all of
them. however, if you run AddHs it will not add any hydrogens. the only way
to make it add hydrogens is to run RemoveHs first. despite the fact that the
structure contains no hydrogens.
- second, the structure it generates is fine with the exception of a single
atom which is horribly misplaced for seemingly no reason.
the only way to fix this is to run EmbedMolecule. however, doing that will
change the position and rotation of the whole molecule.
- the only way to fix that is to then run AlignMol to align the rotated
molecule to the original molecule. however if you try to run this with the
default settings it will claim there's 'no substructure match' despite the
two molecules being the same molecule. to fix this you feed it a list of
atom indices
"""
[docs]
def load_universe(pdb_path):
"""
Create an mdanalysis universe from a structure file.
This uses a quick and dirty hack - going via openmm - to get an mmcif
file into mdanalysis, as mdanalysis does not natively support mmCif files,
despite the fact that the pdb format has been deprecated for two years (!)
at time of writing.
Args:
pdb_path: path to a pdb or cif file
Returns:
an mdanalysis Universe object
"""
if ".cif" in pdb_path or ".mmcif" in pdb_path:
pdbx = pdbxfile.PDBxFile(pdb_path)
openmmtopol = OpenMMTopologyParser(pdbx.topology)
positions = pdbx.getPositions(asNumpy=True) * 10 # angstrom conversion
mdatopol = openmmtopol._mda_topology_from_omm_topology(pdbx.topology)
u = mda.Universe(mdatopol, positions, format=MemoryReader)
else:
u = mda.Universe(pdb_path)
return u
[docs]
def pdb_back_to_mmcif(inpdb, outcif):
p = PDBParser()
struc = p.get_structure("", inpdb)
io = MMCIFIO()
io.set_structure(struc)
io.save(outcif)
# setup ligand system
[docs]
def split_pdb_ligand(pdb_path, new_pdb_output=None, neutralise_radicals=True):
"""
Extract ligands from a structure file (pdb or mmcif) and write out a new,
ligand/hetatm-free structure file and sdf files containing the ligands.
Args:
pdb_path - path to input pdb or mmcif file, a string
new_pdb_output - filename for the new pdb file, a string. Otherwise the
existing pdb file will be overwritten.
neutralise_radicals - whether to remove free radicals. A bool.
Returns:
a list of strings, where each string is the filename of an sdf file
that has been written, which are named according to their residue.
"""
u = load_universe(pdb_path)
protein = u.select_atoms('protein')
ligand = u.select_atoms('not protein and not water')
if len(ligand) == None:
print("No hetatms/ligands found.")
return None
else:
print("Extracting hetatms/ligands...")
ligands = ligand.split("residue")
if not new_pdb_output:
output_path = pdb_path
else:
output_path = new_pdb_output
if ".cif" in pdb_path or ".mmcif" in pdb_path:
print("Warning: mdanalysis inexplicably does not support writing "
"mmcif files, so your structure will be written to a temporary "
"pdb file and then converted to cif. This process may not work "
"with all structures, make sure to so double-check the output!")
protein.write(output_path+".pdb")
p = PDBParser()
struc = p.get_structure("", output_path+".pdb")
io = MMCIFIO()
io.set_structure(struc)
io.save(output_path) # todo: replace w call to pdb_back_to_mmcif
else:
protein.write(output_path)
ligand_filenames = []
ligands_written = {}
for ligand in ligands:
resname = "".join(list(set(ligand.resnames)))
if resname in ligands_written.keys():
ligands_written[resname] += 1
resname += "_"+str(ligands_written[resname])
else:
ligands_written[resname] = 1
mol = ligand.convert_to('RDKIT', inferrer=None)
# need to create a coordmap or rdkit rotates and translates the ligand
# for absolutely no reason. luckily getting the positions of atoms in
# rdkit is very easy. just make a conformer object and then iterate
# over it to get an object that has x y and z as member variables
# someone should maybe tell the rdkit developers about arrays
coordmap = {}
atommap = []
conf = mol.GetConformer()
for atom in mol.GetAtoms():
idx = atom.GetIdx()
pos = conf.GetAtomPosition(idx)
coordmap[idx] = pos
atommap.append([idx, idx])
# do not touch this or it WILL break
# the molecule does not contain any hydrogens but in order to add
# hydrogens you have to first run the removehs function, if you don't
# do this it won't add any hydrogens but it definitely won't tell you
# that anything is wrong
try:
mol = Chem.RemoveHs(mol)
mol = Chem.AddHs(mol, addCoords=True)
except Exception as e:
print("WARNING: Couldn't extract "+resname+" ("+str(e)+").")
continue
original_mol = copy.copy(mol)
# remove radicals
radicals_neutralised = 0
if neutralise_radicals:
for a in mol.GetAtoms():
rad = a.GetNumRadicalElectrons()
if rad:
a.SetNumExplicitHs(rad)
a.SetNumRadicalElectrons(0)
radicals_neutralised += rad
mol = Chem.AddHs(mol, addCoords=True)
# if you don't call embedmolecule here then the molecule will have
# exactly one atom in completely the wrong place that will cause the
# simulation to immediately detonate. however, if you call
# embedmolecule without a coordmap it will rotate and translate the
# ligand. it does this out of spite and hatred for molecular modellers
Chem.AllChem.EmbedMolecule(mol, coordMap=coordmap)
# i have to pass this stupid atommap rdMolAlign otherwise it won't
# align the molecules because there's 'no substructure match' despite
# them being literally the SAME MOLECULE
rmsd = Chem.rdMolAlign.AlignMol(mol, original_mol, atomMap=atommap)
#print("rmsd: "+str(rmsd))
Chem.AllChem.MMFFOptimizeMolecule(mol)
Chem.AllChem.UFFOptimizeMolecule(mol)
writer = Chem.SDWriter(resname+".sdf")
writer.write(mol)
writer.close()
ligand_filenames.append(resname+".sdf")
return ligand_filenames
waters = {
"tip3p": "tip3p.offxml",
"tip4pew": "tip4p_ew.offxml"
}
[docs]
def solvate_openff(interchange, protein, ligands, n_water=None, gutter_nm=1.5,
positive_ion_smile="[Na+]", negative_ion_smile="[Cl-]",
desired_charge_elementary=0, water_ff="tip3p",
solvent_smiles="O", ff="openff-2.2.0.offxml",
solvent_name="HOH"):
"""
Solvate an openff system.
Args:
interchange: an openff interchange containing the protein AND ligand.
protein: protein loaded as an openff topology
ligands: a list of openff molecule objects
n_water: number of water molecules, an int. Will calculate if this is
not set.
gutter_nm: size of the gap between the protein and box edge in nm, a
float.
positive_ion_smile: smiles string for the positive ions
positive_ion_smile: smiles string for the negative ions
desired_charge_elementary: desired charge in units of the elementary
charge
water_ff: force field to use for the water. can be tip3p or tip4pew.
solvent_smiles: smiles string for the solvent molecule.
ff: force field to use, A string. Can be any openff force field.
solvent name: name of solvent molecule (for topology!) - a string
Returns:
openff topology and an interchange for just the waters.
"""
all_molecules = [protein]+ligands
total_charge = round(
sum(interchange["Electrostatics"].charges.values()), 3)
ligand_charge = 0
for ligand in ligands:
ligand_charge += ligand.total_charge
assert total_charge == protein.total_charge + ligand_charge, (
f"Total charge of the system is {total_charge}, not {protein.total_charge + ligand.total_charge}"
)
water = Molecule.from_smiles(solvent_smiles)
water.name = solvent_name
water.generate_conformers(n_conformers=1)
if total_charge > desired_charge_elementary * unit.elementary_charge:
ion = Molecule.from_smiles(negative_ion_smile)
elif total_charge <= desired_charge_elementary * unit.elementary_charge:
ion = Molecule.from_smiles(positive_ion_smile)
num_ions = abs(int((total_charge/unit.elementary_charge) -
(desired_charge_elementary)))
ion.generate_conformers(n_conformers=1)
xyz = protein.conformers[0]
centroid = xyz.sum(axis=0) / xyz.shape[0]
protein_radius = np.sqrt(((xyz - centroid) ** 2).sum(axis=-1).max())
box_vectors = UNIT_CUBE * (protein_radius * 2 + gutter_nm * unit.nanometer)
if not n_water:
box_vol_angstroms = np.array(box_vectors.to("angstrom").m.diagonal())
# one water per 30A^3, old JAWS default
n_water = int(box_vol_angstroms.prod() / 70)
print("Adding "+str(n_water)+" water molecules.")
docked_topology = Topology.from_molecules(all_molecules)
packed_topology = pack_box(
molecules=[water, ion],
number_of_copies=[n_water, num_ions],
solute=docked_topology,
box_vectors=box_vectors,
center_solute=True,
)
water_interchange = Interchange.from_smirnoff(
force_field=ForceField(ff, waters[water_ff]),
topology=([water] * n_water) + ([ion] * num_ions),
)
return packed_topology, water_interchange
[docs]
def create_ligand_system(
protein_structure, # pdb finame
ligand_structures, # list of ligand filenames
n_water=None, # if None, calculate water automatically
water_ff="tip3p", # tip3p or tip4pew only supported
solvent_smiles="O",
gutter_nm=1,
positive_ion_smile="[Na+]",
negative_ion_smile="[Cl-]",
desired_charge_elementary=0,
ff="ff14sb_off_impropers_0.0.4.offxml",
ligand_ff="openff-2.2.0.offxml",
output_topology=None,
solvent_name="HOH",
ligands_names=None):
"""
Set up an openmm system containing ligands using openff.
Args:
protein_structure: structure file path, pdb or mmcif, a string
ligand_structure: a list of strings where each string is a path to
an sdf file
n_water = None, # if None, calculate water automatically
positive_ion_smile: smiles string for the positive ions
positive_ion_smile: smiles string for the negative ions
desired_charge_elementary: desired charge in units of the elementary
charge
water_ff: force field to use for the water. can be tip3p or tip4pew.
solvent_smiles: smiles string for the solvent molecule.
ff: force field. For now, this is ff14sb_off_impropers_0.0.4.offxml as
it's the only one openff properly supports.
ff: force field to use for the ligand, A string. Can be any small
molecule force field supported by openff.
solvent_name - solvent name to appear in the topology, a string
ligands_names: names of ligands to be written to the topology. a list
of strings
Returns:
an openmm system, topology and positions object.
"""
print("Setting up ligand simulation...")
if water_ff not in waters.keys():
raise ValueError("Water must be one of: "+",".join(waters.keys()))
protein_with_crystal_water = Topology.from_pdb(protein_structure)
protein = protein_with_crystal_water.molecule(0)
ff14sb = ForceField(ff, waters[water_ff])
protein_intrcg = Interchange.from_smirnoff(
force_field=ff14sb,
topology=protein.to_topology(),
)
if not ligands_names:
ligands_names = [None]*len(ligand_structures)
for ligand_fname in range(len(ligand_structures)):
try:
basename = os.path.basename(ligand_structures[ligand_fname])
resname = basename.split(".")[0]
if "_" in resname:
resname = resname.split("_")[0]
ligands_names[ligand_fname] = resname
except:
pass
if None in ligands_names:
print("Warning: missing ligand name(s)! Will appear in topology "
"as 'UNK'.")
ligands = []
for ligand, name in zip(ligand_structures, ligands_names):
current_ligand = Molecule.from_file(ligand)
if name:
current_ligand.name = name # ADD THIS
for atom in current_ligand.atoms:
atom.metadata["residue_name"] = name
ligands.append(current_ligand)
ligand_topology = Topology.from_molecules(ligands)
ligand_force_field = ForceField(
ligand_ff, waters[water_ff]) # includes tip3p
ligand_intrcg = Interchange.from_smirnoff(
ligand_force_field, ligand_topology)
interchange = protein_intrcg.combine(ligand_intrcg)
print("Solvating...")
packed_topology, water_interchange = solvate_openff(interchange, protein,
ligands, n_water=n_water,
gutter_nm=gutter_nm,
positive_ion_smile=positive_ion_smile, negative_ion_smile=negative_ion_smile,
desired_charge_elementary=desired_charge_elementary, water_ff=water_ff,
solvent_smiles=solvent_smiles,
ff=ligand_ff,
solvent_name=solvent_name)
solvated_interchange = interchange.combine(water_interchange)
solvated_interchange.positions = packed_topology.get_positions()
solvated_interchange.box = packed_topology.box_vectors
if output_topology and ".pdb" in output_topology:
solvated_interchange.to_pdb(output_topology)
if output_topology:
if ".mmcif" in output_topology or ".mmCif" in output_topology:
print("WARNING: OpenMM can't write mmCif topologies. Writing a pdb"
" and converting it back to mmCif. Sorry!")
solvated_interchange.to_pdb(output_topology+".pdb")
pdb_back_to_mmcif(output_topology+".pdb", output_topology)
openmm_system = solvated_interchange.to_openmm()
openmm_topology = solvated_interchange.to_openmm_topology()
openmm_positions = solvated_interchange.positions.to_openmm()
# note: recommended settings:
# intergrator - variable langevin, tolerance: 0.0005 (or timestep < 0.001ps)
# minimisation - tolerance = 2.5 kilojoule/(nanometer*mole)
return openmm_system, openmm_topology, openmm_positions