Source code for prepmd.model

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Calls to the MODELLER API
"""
NO_MODELLER = False
try:
    import modeller
    from modeller.automodel import *
except:
    NO_MODELLER = True

from Bio import Align
import os
import pathlib
import shutil
from prepmd import get_residues
from prepmd import point_cloud
import sys

placeholder = "sequence:::::::::"


[docs] def retitle_alignment(alignmentfile, title1, title2, metadata1=placeholder, metadata2=placeholder): """ Alignment files generated by BioPython aren't formatted in a way that modeller can easily read, so this function reformats them with correct titles and header info. Args: alignmentfile: path to alignment file, a string title1: a string, title of the first sequence title2: a string, title of the second sequence metadata1: a string, metadata for the first sequence metadata2: a string, metadata for the second sequence. The metadata strings must contain at least the substring ':::::::::'. Returns: none, but overwrites alignmentfile """ titles = [title1, title2] metadata = [metadata1, metadata2] with open(alignmentfile, "r") as file: alignment = file.readlines() new_alignment = [] for line_no in range(len(alignment)): modified_line = False if ">" in alignment[line_no]: new_alignment.append(alignment[line_no].replace( ">", ">"+"P1;"+titles.pop(0))) modified_line = True curr_metadata = metadata.pop(0) if curr_metadata is not None: new_alignment.append(curr_metadata+os.linesep) line = alignment[line_no] if ">" not in line and ":" not in line and ";" not in line: if not line.strip().endswith("*"): new_alignment.append(line.strip()+"*"+os.linesep) modified_line = True if modified_line is False: new_alignment.append(alignment[line_no]) with open(alignmentfile, "w") as file: file.writelines(new_alignment)
[docs] def fasta(fasta_name, include_metadata=False): """ Read data in fasta sequence. Args: fasta_name: filename of fasta file, a string include_metadata: bool, reads metadata if true Returns: a list of lists, the first list is the sequence in the file, the second are the different blocks in the file: first the name/id, then the sequence, and the metadata (if requested) """ out = [] fastadata = [] metadata = None with open(fasta_name) as file: for line in file: if line[0] == ">": header = line.strip() if fastadata != [] and fastadata != ['']: out.append([header, fastadata]) fastadata = [] if ":" in line and ">" not in line: if include_metadata: metadata = line.strip() else: continue if ":" not in line and ">" not in line: fastadata.append(line.strip()) if fastadata != [] and fastadata != ['']: out.append([header, "".join(fastadata), metadata]) if out == []: raise IOError("Couldn't read FASTA data in "+fasta_name) return out
[docs] def get_alignment_info(alignmentout): """ For a FASTA-formatted alignment file, get the number of residues filled, number of gaps, and largest gap. Args: alignmentout: path to an alignment file, a string Returns: total_resididues_filled - total number of residues to be added total_gaps_filled - number of gaps to be filled filled_residues - missing residues filled_gaps - gaps max_gap - largest gap """ def get_info(aln): total_missing = 0 total_gaps = 0 lengths = [] aln = "".join(aln.split("\n")[2:]) prev_char = "?" curr_length = 0 for char in aln: if char == "-": total_missing += 1 if prev_char != "-": total_gaps += 1 lengths.append(curr_length) curr_length = 0 else: curr_length += 1 prev_char = char return total_missing, total_gaps, lengths with open(alignmentout) as file: alignment = "".join(file.readlines()) aln_missing, aln_filled = alignment.split(">")[1:] # fix for hetatms if ".h." in aln_missing: aln_missing = aln_missing.replace(".h.", "") if "---" in aln_filled[-5:]: aln_filled = "".join(aln_filled.rsplit("---", 1)) missing_residues, missing_gaps, missing_lengths = get_info(aln_missing) filled_residues, filled_gaps, filled_lengths = get_info(aln_filled) try: max_gap = max(missing_lengths) except ValueError: max_gap = 0 # no gaps total_residues_filled = missing_residues - filled_residues total_gaps_filled = missing_gaps - filled_gaps return total_residues_filled, total_gaps_filled, filled_residues, filled_gaps, max_gap
[docs] def get_objective_functions(pdbs): """ For a list of PDB or mmCif files generated by modeller, get the objective function (which measures the quality of the model) and similarity (of the model sequence and the sequence used to fill the missing loops) for each pdb. Args: pdbs: a list of strings, paths to pdb files Returns: scores, similarities, two dictionaries keyed by file path containing the scores and similarities. """ scores = {} similarities = {} for pdb in pdbs: with open(pdb, "r") as file: for line in file: # pdb if "OBJECTIVE FUNCTION" in line: score = line.split(" ")[-1] scores[pdb] = float(score) # mmcif if "_modeller.objective_function" in line: score = line.split()[-1] scores[pdb] = float(score) # pdb if "BEST TEMPLATE" in line: similarities[pdb] = float(line.split()[-1]) # mmcif if "_modeller.best_template_pct_seq_id" in line: similarities[pdb] = float(line.split()[-1]) return scores, similarities
[docs] def get_best_pdb(directory, exts=["pdb", "cif", "mmcif", "mmCif"], em_map=None, em_contour_level=None): """ For a directory, find all of the pdbs generated by modeller and select the one with the highest objective function (arbitrary metric used by modeller). Args: directory: a string, the directory to scan ext: file extensions to check for (a list of strings) em_map: path to an EM density map file (a string). If this is set, the best PDB will be picked based on similarity to the map. em_contour_level: contour level for the EM map, a float. Returns: path to the file with the highest objective function, a string """ pdbs = [] for ext in exts: pdbs += list(pathlib.Path(directory).glob('*.'+ext)) scores, similarities = get_objective_functions(pdbs) if em_map: em_scores = {} for pdb in pdbs: em_scores[pdb] = point_cloud.score_pdb_map(pdb, em_map, em_contour_level) max_sim = max(similarities.values()) if max_sim >= 97: print("Similiarity: "+str(max_sim)+"%.") if max_sim < 97 and max_sim >= 90: print("WARNING: best sequence similarity is "+str(max_sim)+"%. " "Modelled loops could be wrong. Please check them!") if max_sim < 90 and max_sim >= 80: print("SERIOUS WARNING: best sequence similarity is only "+ str(max_sim)+"%. Modelled loops may have the wrong sequence." ) if max_sim < 80: raise ValueError("Sequences do not match. Cannot reconstruct missing " "loops. Please double-check your sequence data (by " "default these are the SEQRES records from the input " "structure.") if em_map: em_err = min(em_scores.values()) print("EM map alignemnt error: "+str(round(em_err, 2))+"A") if em_err > 15: raise ValueError("EM map and PDB are not the same structure.") return str(min(em_scores, key=em_scores.get)) else: return str(max(scores, key=scores.get))
# note: the pdb file isn't a parameter, it must be called code.pdb
[docs] def fix_missing_residues(code, fastafile, alignmentout, inmodel, outmodel, wdir, num_models=1, em_map=None, em_contour=None, keep_hetatms = False): """ For a given structure, fill in missing loops using modeller. Args: code: the pdb id of the structure, a string fastafile: path to the fasta sequence, a string alignmentout: filename of the temporary alignment output file, a string inmodel: input structure. note: due to the limitations of modeller, this must be named according to the pdb id! outmodel: output structure file, a string wdir: working directory, a string, will be created if it doesn't exist num_models: how many models to generate, an int. Returns: nothing, but writes out outmodel and wdir. """ if NO_MODELLER: raise ImportError("Can't fix without MODELLER and a valid license key") print("Fixing missing residues...") if not os.path.isdir(wdir): pathlib.Path(wdir).mkdir(parents=True, exist_ok=True) os.chdir(wdir) try: shutil.copy(inmodel, wdir) except shutil.SameFileError: pass pdb_dirs = ['.', '../atom_files'] modeller.log.none() if fastafile: env = modeller.Environ() model = modeller.Model(env, file=inmodel) aln = modeller.Alignment(env) aln.append_model(model, align_codes=code) aln.write(file=code+'.seq') print("Using provided fasta sequence to generate alignment.") pdb_sequence = fasta(code+'.seq') original_sequence = fasta(fastafile) aln_metadata = fasta(code+'.seq', include_metadata=True)[0][2] aligner = Align.PairwiseAligner() try: print("Aligning sequences...") alignments = aligner.align( pdb_sequence[0][1], original_sequence[0][1]) print("Succesfully aligned.") Align.write(alignments[-1], alignmentout, "fasta") # the file doesn't have names/labels for whatever reason retitle_alignment(alignmentout, code, code + "_fill", metadata1=aln_metadata) except OverflowError: print("Could not align sequences (normally this is because the " "sequences in the PDB and the original protein are too " "different)") fastafile = None print("Will try to use the sequence data in the input file...") if not fastafile: print("Getting missing residues from input file...") pdb_res = get_residues.get_residues_pdb(inmodel, code, get_hetatms = keep_hetatms) pdb_fullseq = get_residues.get_fullseq_pdb(inmodel, code, get_hetatms = keep_hetatms) with open(code+".seq", "w") as file: file.write("".join(pdb_res)) with open(code+"_fill.seq", "w") as file: file.write(pdb_fullseq) env = modeller.environ() env.io.hetatm = True print("Aligning sequences...") aln = modeller.Alignment(env) aln.append(file=code+".seq", align_codes=code) aln.append(file=code+"_fill.seq", align_codes=code+"_fill") aln.salign() print("Succesfully aligned.", end="") aln.write(file=alignmentout) print(" Wrote alignment to "+alignmentout) env.io.atom_files_directory = pdb_dirs print("Modelling missing loops...") if num_models > 4: print("(Creating "+str(num_models)+" models - might be slow!)") old_stdout = sys.stdout f = open(os.devnull, 'w') sys.stdout = f a = automodel(env, alnfile=alignmentout, knowns=code, sequence=code+"_fill") a.auto_align() if ".mmCif" in inmodel or ".cif" in inmodel or "Cif" in inmodel: a.set_output_model_format("MMCIF") if num_models > 1: a.starting_model= 1 a.ending_model = num_models a.make() sys.stdout = old_stdout if em_map: print("Ranking PDBs by similarity to EM map...") best_pdb = get_best_pdb(wdir, em_map=em_map, em_contour_level=em_contour) print("Finished modelling missing loops.") residues, gaps, remain_res, remain_gaps, max_gap = get_alignment_info(alignmentout) print("Filled "+str(residues)+" residues in "+str(gaps)+" gaps.") if remain_res > 0: print("(still missing: "+str(remain_res) + " residues in "+str(remain_gaps)+" gaps)") if max_gap > 20: print("Warning: big missing loop ("+str(max_gap)+" residues)") shutil.copy2(best_pdb, os.path.basename(outmodel))