#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Calls to the MODELLER API
"""
NO_MODELLER = False
try:
import modeller
from modeller.automodel import *
except:
NO_MODELLER = True
from Bio import Align
import os
import pathlib
import shutil
from prepmd import get_residues
from prepmd import point_cloud
import sys
placeholder = "sequence:::::::::"
[docs]
def retitle_alignment(alignmentfile, title1, title2,
metadata1=placeholder, metadata2=placeholder):
"""
Alignment files generated by BioPython aren't formatted in a way that
modeller can easily read, so this function reformats them with correct
titles and header info.
Args:
alignmentfile: path to alignment file, a string
title1: a string, title of the first sequence
title2: a string, title of the second sequence
metadata1: a string, metadata for the first sequence
metadata2: a string, metadata for the second sequence.
The metadata strings must contain at least the substring ':::::::::'.
Returns:
none, but overwrites alignmentfile
"""
titles = [title1, title2]
metadata = [metadata1, metadata2]
with open(alignmentfile, "r") as file:
alignment = file.readlines()
new_alignment = []
for line_no in range(len(alignment)):
modified_line = False
if ">" in alignment[line_no]:
new_alignment.append(alignment[line_no].replace(
">", ">"+"P1;"+titles.pop(0)))
modified_line = True
curr_metadata = metadata.pop(0)
if curr_metadata is not None:
new_alignment.append(curr_metadata+os.linesep)
line = alignment[line_no]
if ">" not in line and ":" not in line and ";" not in line:
if not line.strip().endswith("*"):
new_alignment.append(line.strip()+"*"+os.linesep)
modified_line = True
if modified_line is False:
new_alignment.append(alignment[line_no])
with open(alignmentfile, "w") as file:
file.writelines(new_alignment)
[docs]
def fasta(fasta_name, include_metadata=False):
"""
Read data in fasta sequence.
Args:
fasta_name: filename of fasta file, a string
include_metadata: bool, reads metadata if true
Returns:
a list of lists, the first list is the sequence in the file, the
second are the different blocks in the file: first the name/id, then
the sequence, and the metadata (if requested)
"""
out = []
fastadata = []
metadata = None
with open(fasta_name) as file:
for line in file:
if line[0] == ">":
header = line.strip()
if fastadata != [] and fastadata != ['']:
out.append([header, fastadata])
fastadata = []
if ":" in line and ">" not in line:
if include_metadata:
metadata = line.strip()
else:
continue
if ":" not in line and ">" not in line:
fastadata.append(line.strip())
if fastadata != [] and fastadata != ['']:
out.append([header, "".join(fastadata), metadata])
if out == []:
raise IOError("Couldn't read FASTA data in "+fasta_name)
return out
[docs]
def get_alignment_info(alignmentout):
"""
For a FASTA-formatted alignment file, get the number of residues filled,
number of gaps, and largest gap.
Args:
alignmentout: path to an alignment file, a string
Returns:
total_resididues_filled - total number of residues to be added
total_gaps_filled - number of gaps to be filled
filled_residues - missing residues
filled_gaps - gaps
max_gap - largest gap
"""
def get_info(aln):
total_missing = 0
total_gaps = 0
lengths = []
aln = "".join(aln.split("\n")[2:])
prev_char = "?"
curr_length = 0
for char in aln:
if char == "-":
total_missing += 1
if prev_char != "-":
total_gaps += 1
lengths.append(curr_length)
curr_length = 0
else:
curr_length += 1
prev_char = char
return total_missing, total_gaps, lengths
with open(alignmentout) as file:
alignment = "".join(file.readlines())
aln_missing, aln_filled = alignment.split(">")[1:]
# fix for hetatms
if ".h." in aln_missing:
aln_missing = aln_missing.replace(".h.", "")
if "---" in aln_filled[-5:]:
aln_filled = "".join(aln_filled.rsplit("---", 1))
missing_residues, missing_gaps, missing_lengths = get_info(aln_missing)
filled_residues, filled_gaps, filled_lengths = get_info(aln_filled)
try:
max_gap = max(missing_lengths)
except ValueError:
max_gap = 0 # no gaps
total_residues_filled = missing_residues - filled_residues
total_gaps_filled = missing_gaps - filled_gaps
return total_residues_filled, total_gaps_filled, filled_residues, filled_gaps, max_gap
[docs]
def get_objective_functions(pdbs):
"""
For a list of PDB or mmCif files generated by modeller, get the objective
function (which measures the quality of the model) and similarity (of the
model sequence and the sequence used to fill the missing loops) for each
pdb.
Args:
pdbs: a list of strings, paths to pdb files
Returns:
scores, similarities, two dictionaries keyed by file path containing
the scores and similarities.
"""
scores = {}
similarities = {}
for pdb in pdbs:
with open(pdb, "r") as file:
for line in file:
# pdb
if "OBJECTIVE FUNCTION" in line:
score = line.split(" ")[-1]
scores[pdb] = float(score)
# mmcif
if "_modeller.objective_function" in line:
score = line.split()[-1]
scores[pdb] = float(score)
# pdb
if "BEST TEMPLATE" in line:
similarities[pdb] = float(line.split()[-1])
# mmcif
if "_modeller.best_template_pct_seq_id" in line:
similarities[pdb] = float(line.split()[-1])
return scores, similarities
[docs]
def get_best_pdb(directory, exts=["pdb", "cif", "mmcif", "mmCif"],
em_map=None, em_contour_level=None):
"""
For a directory, find all of the pdbs generated by modeller and select
the one with the highest objective function (arbitrary metric used by
modeller).
Args:
directory: a string, the directory to scan
ext: file extensions to check for (a list of strings)
em_map: path to an EM density map file (a string). If this is set,
the best PDB will be picked based on similarity to the map.
em_contour_level: contour level for the EM map, a float.
Returns:
path to the file with the highest objective function, a string
"""
pdbs = []
for ext in exts:
pdbs += list(pathlib.Path(directory).glob('*.'+ext))
scores, similarities = get_objective_functions(pdbs)
if em_map:
em_scores = {}
for pdb in pdbs:
em_scores[pdb] = point_cloud.score_pdb_map(pdb, em_map,
em_contour_level)
max_sim = max(similarities.values())
if max_sim >= 97:
print("Similiarity: "+str(max_sim)+"%.")
if max_sim < 97 and max_sim >= 90:
print("WARNING: best sequence similarity is "+str(max_sim)+"%. "
"Modelled loops could be wrong. Please check them!")
if max_sim < 90 and max_sim >= 80:
print("SERIOUS WARNING: best sequence similarity is only "+
str(max_sim)+"%. Modelled loops may have the wrong sequence." )
if max_sim < 80:
raise ValueError("Sequences do not match. Cannot reconstruct missing "
"loops. Please double-check your sequence data (by "
"default these are the SEQRES records from the input "
"structure.")
if em_map:
em_err = min(em_scores.values())
print("EM map alignemnt error: "+str(round(em_err, 2))+"A")
if em_err > 15:
raise ValueError("EM map and PDB are not the same structure.")
return str(min(em_scores, key=em_scores.get))
else:
return str(max(scores, key=scores.get))
# note: the pdb file isn't a parameter, it must be called code.pdb
[docs]
def fix_missing_residues(code, fastafile, alignmentout, inmodel, outmodel,
wdir, num_models=1, em_map=None, em_contour=None,
keep_hetatms = False):
"""
For a given structure, fill in missing loops using modeller.
Args:
code: the pdb id of the structure, a string
fastafile: path to the fasta sequence, a string
alignmentout: filename of the temporary alignment output file, a string
inmodel: input structure. note: due to the limitations of modeller,
this must be named according to the pdb id!
outmodel: output structure file, a string
wdir: working directory, a string, will be created if it doesn't exist
num_models: how many models to generate, an int.
Returns:
nothing, but writes out outmodel and wdir.
"""
if NO_MODELLER:
raise ImportError("Can't fix without MODELLER and a valid license key")
print("Fixing missing residues...")
if not os.path.isdir(wdir):
pathlib.Path(wdir).mkdir(parents=True, exist_ok=True)
os.chdir(wdir)
try:
shutil.copy(inmodel, wdir)
except shutil.SameFileError:
pass
pdb_dirs = ['.', '../atom_files']
modeller.log.none()
if fastafile:
env = modeller.Environ()
model = modeller.Model(env, file=inmodel)
aln = modeller.Alignment(env)
aln.append_model(model, align_codes=code)
aln.write(file=code+'.seq')
print("Using provided fasta sequence to generate alignment.")
pdb_sequence = fasta(code+'.seq')
original_sequence = fasta(fastafile)
aln_metadata = fasta(code+'.seq', include_metadata=True)[0][2]
aligner = Align.PairwiseAligner()
try:
print("Aligning sequences...")
alignments = aligner.align(
pdb_sequence[0][1], original_sequence[0][1])
print("Succesfully aligned.")
Align.write(alignments[-1], alignmentout, "fasta")
# the file doesn't have names/labels for whatever reason
retitle_alignment(alignmentout, code, code +
"_fill", metadata1=aln_metadata)
except OverflowError:
print("Could not align sequences (normally this is because the "
"sequences in the PDB and the original protein are too "
"different)")
fastafile = None
print("Will try to use the sequence data in the input file...")
if not fastafile:
print("Getting missing residues from input file...")
pdb_res = get_residues.get_residues_pdb(inmodel, code,
get_hetatms = keep_hetatms)
pdb_fullseq = get_residues.get_fullseq_pdb(inmodel, code,
get_hetatms = keep_hetatms)
with open(code+".seq", "w") as file:
file.write("".join(pdb_res))
with open(code+"_fill.seq", "w") as file:
file.write(pdb_fullseq)
env = modeller.environ()
env.io.hetatm = True
print("Aligning sequences...")
aln = modeller.Alignment(env)
aln.append(file=code+".seq", align_codes=code)
aln.append(file=code+"_fill.seq", align_codes=code+"_fill")
aln.salign()
print("Succesfully aligned.", end="")
aln.write(file=alignmentout)
print(" Wrote alignment to "+alignmentout)
env.io.atom_files_directory = pdb_dirs
print("Modelling missing loops...")
if num_models > 4:
print("(Creating "+str(num_models)+" models - might be slow!)")
old_stdout = sys.stdout
f = open(os.devnull, 'w')
sys.stdout = f
a = automodel(env, alnfile=alignmentout,
knowns=code, sequence=code+"_fill")
a.auto_align()
if ".mmCif" in inmodel or ".cif" in inmodel or "Cif" in inmodel:
a.set_output_model_format("MMCIF")
if num_models > 1:
a.starting_model= 1
a.ending_model = num_models
a.make()
sys.stdout = old_stdout
if em_map:
print("Ranking PDBs by similarity to EM map...")
best_pdb = get_best_pdb(wdir, em_map=em_map, em_contour_level=em_contour)
print("Finished modelling missing loops.")
residues, gaps, remain_res, remain_gaps, max_gap = get_alignment_info(alignmentout)
print("Filled "+str(residues)+" residues in "+str(gaps)+" gaps.")
if remain_res > 0:
print("(still missing: "+str(remain_res) +
" residues in "+str(remain_gaps)+" gaps)")
if max_gap > 20:
print("Warning: big missing loop ("+str(max_gap)+" residues)")
shutil.copy2(best_pdb, os.path.basename(outmodel))