Source code for prepmd.model

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Calls to the MODELLER API
"""
NO_MODELLER = False
try:
    import modeller
    from modeller.automodel import *
except:
    NO_MODELLER = True

from Bio import Align
import os
import pathlib
import shutil
from prepmd import get_residues
from prepmd import point_cloud
import sys

placeholder = "sequence:::::::::"



[docs]
def retitle_alignment(alignmentfile, title1, title2,
                      metadata1=placeholder, metadata2=placeholder):
    """
    Alignment files generated by BioPython aren't formatted in a way that
    modeller can easily read, so this function reformats them with correct
    titles and header info.
    Args:
        alignmentfile: path to alignment file, a string
        title1: a string, title of the first sequence
        title2: a string, title of the second sequence
        metadata1: a string, metadata for the first sequence
        metadata2: a string, metadata for the second sequence.
    The metadata strings must contain at least the substring ':::::::::'.
    Returns:
        none, but overwrites alignmentfile
    """
    titles = [title1, title2]
    metadata = [metadata1, metadata2]
    with open(alignmentfile, "r") as file:
        alignment = file.readlines()
    new_alignment = []
    for line_no in range(len(alignment)):
        modified_line = False
        if ">" in alignment[line_no]:
            new_alignment.append(alignment[line_no].replace(
                ">", ">"+"P1;"+titles.pop(0)))
            modified_line = True
            curr_metadata = metadata.pop(0)
            if curr_metadata is not None:
                new_alignment.append(curr_metadata+os.linesep)
        line = alignment[line_no]
        if ">" not in line and ":" not in line and ";" not in line:
            if not line.strip().endswith("*"):
                new_alignment.append(line.strip()+"*"+os.linesep)
                modified_line = True
        if modified_line is False:
            new_alignment.append(alignment[line_no])
    with open(alignmentfile, "w") as file:
        file.writelines(new_alignment)




[docs]
def fasta(fasta_name, include_metadata=False):
    """
    Read data in fasta sequence.
    Args:
        fasta_name: filename of fasta file, a string
        include_metadata: bool, reads metadata if true
    Returns:
        a list of lists, the first list is the sequence in the file, the
        second are the different blocks in the file: first the name/id, then
        the sequence, and the metadata (if requested)
    """
    out = []
    fastadata = []
    metadata = None
    with open(fasta_name) as file:
        for line in file:
            if line[0] == ">":
                header = line.strip()
                if fastadata != [] and fastadata != ['']:
                    out.append([header, fastadata])
                    fastadata = []
            if ":" in line and ">" not in line:
                if include_metadata:
                    metadata = line.strip()
                else:
                    continue
            if ":" not in line and ">" not in line:
                fastadata.append(line.strip())
        if fastadata != [] and fastadata != ['']:
            out.append([header, "".join(fastadata), metadata])
    if out == []:
        raise IOError("Couldn't read FASTA data in "+fasta_name)
    return out



[docs]
def get_alignment_info(alignmentout):
    """
    For a FASTA-formatted alignment file, get the number of residues filled,
    number of gaps, and largest gap.
    
    Args:
        alignmentout: path to an alignment file, a string
    
    Returns:
        total_resididues_filled - total number of residues to be added
        total_gaps_filled - number of gaps to be filled
        filled_residues - missing residues
        filled_gaps - gaps
        max_gap - largest gap
    """
    def get_info(aln):
        total_missing = 0
        total_gaps = 0
        lengths = []
        aln = "".join(aln.split("\n")[2:])
        prev_char = "?"
        curr_length = 0
        for char in aln:
            if char == "-":
                total_missing += 1
                if prev_char != "-":
                    total_gaps += 1
                    lengths.append(curr_length)
                    curr_length = 0
                else:
                    curr_length += 1
            prev_char = char
        return total_missing, total_gaps, lengths

    with open(alignmentout) as file:
        alignment = "".join(file.readlines())
    aln_missing, aln_filled = alignment.split(">")[1:]
    
    # fix for hetatms
    if ".h." in aln_missing:
        aln_missing = aln_missing.replace(".h.", "")
        if "---" in aln_filled[-5:]:
            aln_filled = "".join(aln_filled.rsplit("---", 1))
    
    missing_residues, missing_gaps, missing_lengths = get_info(aln_missing)
    filled_residues, filled_gaps, filled_lengths = get_info(aln_filled)
    try:
        max_gap = max(missing_lengths)
    except ValueError:
        max_gap = 0 # no gaps

    total_residues_filled = missing_residues - filled_residues
    total_gaps_filled = missing_gaps - filled_gaps
    return total_residues_filled, total_gaps_filled, filled_residues, filled_gaps, max_gap




[docs]
def get_objective_functions(pdbs):
    """
    For a list of PDB or mmCif files generated by modeller, get the objective
    function (which measures the quality of the model) and similarity (of the
    model sequence and the sequence used to fill the missing loops) for each
    pdb.
    
    Args:
        pdbs: a list of strings, paths to pdb files
    Returns:
        scores, similarities, two dictionaries keyed by file path containing
        the scores and similarities.
    """
    scores = {}
    similarities = {}
    for pdb in pdbs:
        with open(pdb, "r") as file:
            for line in file:
                # pdb
                if "OBJECTIVE FUNCTION" in line:
                    score = line.split(" ")[-1]
                    scores[pdb] = float(score)
                # mmcif
                if "_modeller.objective_function" in line:
                    score = line.split()[-1]
                    scores[pdb] = float(score)
                # pdb
                if "BEST TEMPLATE" in line:
                    similarities[pdb] = float(line.split()[-1])
                # mmcif
                if "_modeller.best_template_pct_seq_id" in line:
                    similarities[pdb] = float(line.split()[-1])
    return scores, similarities




[docs]
def get_best_pdb(directory, exts=["pdb", "cif", "mmcif", "mmCif"],
                 em_map=None, em_contour_level=None):
    """
    For a directory, find all of the pdbs generated by modeller and select
    the one with the highest objective function (arbitrary metric used by
    modeller).
    Args:
        directory: a string, the directory to scan
        ext: file extensions to check for (a list of strings)
        em_map: path to an EM density map file (a string). If this is set,
        the best PDB will be picked based on similarity to the map.
        em_contour_level: contour level for the EM map, a float.
    Returns:
        path to the file with the highest objective function, a string
    """
    
    pdbs = []
    for ext in exts:
        pdbs += list(pathlib.Path(directory).glob('*.'+ext))
    scores, similarities = get_objective_functions(pdbs)
    if em_map:
        em_scores = {}
        for pdb in pdbs:
            em_scores[pdb] = point_cloud.score_pdb_map(pdb, em_map, 
                                                       em_contour_level)
    max_sim = max(similarities.values())
    if max_sim >= 97:
        print("Similiarity: "+str(max_sim)+"%.")
    if max_sim < 97 and max_sim >= 90:
        print("WARNING: best sequence similarity is "+str(max_sim)+"%. "
              "Modelled loops could be wrong. Please check them!")
    if max_sim < 90 and max_sim >= 80:
        print("SERIOUS WARNING: best sequence similarity is only "+
              str(max_sim)+"%. Modelled loops may have the wrong sequence." )
    if max_sim < 80:
        raise ValueError("Sequences do not match. Cannot reconstruct missing "
                         "loops. Please double-check your sequence data (by "
                         "default these are the SEQRES records from the input "
                         "structure.")
    if em_map:
        em_err = min(em_scores.values())
        print("EM map alignemnt error: "+str(round(em_err, 2))+"A")
        if em_err > 15:
            raise ValueError("EM map and PDB are not the same structure.")
        return str(min(em_scores, key=em_scores.get))
    else:
        return str(max(scores, key=scores.get))



# note: the pdb file isn't a parameter, it must be called code.pdb

[docs]
def fix_missing_residues(code, fastafile, alignmentout, inmodel, outmodel,
                         wdir, num_models=1, em_map=None, em_contour=None,
                         keep_hetatms = False):
    """
    For a given structure, fill in missing loops using modeller.
    Args:
        code: the pdb id of the structure, a string
        fastafile: path to the fasta sequence, a string
        alignmentout: filename of the temporary alignment output file, a string
        inmodel: input structure. note: due to the limitations of modeller,
        this must be named according to the pdb id!
        outmodel: output structure file, a string
        wdir: working directory, a string, will be created if it doesn't exist
        num_models: how many models to generate, an int.
    Returns:
        nothing, but writes out outmodel and wdir.
    """
    if NO_MODELLER:
        raise ImportError("Can't fix without MODELLER and a valid license key")

    print("Fixing missing residues...")

    if not os.path.isdir(wdir):
        pathlib.Path(wdir).mkdir(parents=True, exist_ok=True)
    os.chdir(wdir)
    try:
        shutil.copy(inmodel, wdir)
    except shutil.SameFileError:
        pass

    pdb_dirs = ['.', '../atom_files']

    modeller.log.none()

    if fastafile:
        env = modeller.Environ()
        model = modeller.Model(env, file=inmodel)
        aln = modeller.Alignment(env)
        aln.append_model(model, align_codes=code)
        aln.write(file=code+'.seq')
        print("Using provided fasta sequence to generate alignment.")
        pdb_sequence = fasta(code+'.seq')
        original_sequence = fasta(fastafile)
        aln_metadata = fasta(code+'.seq', include_metadata=True)[0][2]

        aligner = Align.PairwiseAligner()
        try:
            print("Aligning sequences...")
            alignments = aligner.align(
                pdb_sequence[0][1], original_sequence[0][1])
            print("Succesfully aligned.")
            Align.write(alignments[-1], alignmentout, "fasta")
            # the file doesn't have names/labels for whatever reason
            retitle_alignment(alignmentout, code, code +
                              "_fill", metadata1=aln_metadata)
        except OverflowError:
            print("Could not align sequences (normally this is because the "
                  "sequences in the PDB and the original protein are too "
                  "different)")
            fastafile = None
            print("Will try to use the sequence data in the input file...")

    if not fastafile:
        print("Getting missing residues from input file...")
        pdb_res = get_residues.get_residues_pdb(inmodel, code,
                                                get_hetatms = keep_hetatms)
        pdb_fullseq = get_residues.get_fullseq_pdb(inmodel, code,
                                                   get_hetatms = keep_hetatms)
        with open(code+".seq", "w") as file:
            file.write("".join(pdb_res))
        with open(code+"_fill.seq", "w") as file:
            file.write(pdb_fullseq)
        env = modeller.environ()
        env.io.hetatm = True
        print("Aligning sequences...")
        aln = modeller.Alignment(env)
        aln.append(file=code+".seq", align_codes=code)
        aln.append(file=code+"_fill.seq", align_codes=code+"_fill")
        aln.salign()
        print("Succesfully aligned.", end="")
        aln.write(file=alignmentout)
        print(" Wrote alignment to "+alignmentout)

    env.io.atom_files_directory = pdb_dirs
    print("Modelling missing loops...")
    if num_models > 4:
        print("(Creating "+str(num_models)+" models - might be slow!)")
    old_stdout = sys.stdout
    f = open(os.devnull, 'w')
    sys.stdout = f
    a = automodel(env, alnfile=alignmentout,
                  knowns=code, sequence=code+"_fill")
    a.auto_align()

    if ".mmCif" in inmodel or ".cif" in inmodel or "Cif" in inmodel:
        a.set_output_model_format("MMCIF")

    if num_models > 1:
        a.starting_model= 1
        a.ending_model  = num_models
        
    a.make()

    sys.stdout = old_stdout
    
    if em_map:
        print("Ranking PDBs by similarity to EM map...")
    best_pdb = get_best_pdb(wdir, em_map=em_map, em_contour_level=em_contour)
    
    print("Finished modelling missing loops.")
    residues, gaps, remain_res, remain_gaps, max_gap = get_alignment_info(alignmentout)
    print("Filled "+str(residues)+" residues in "+str(gaps)+" gaps.")
    if remain_res > 0:
        print("(still missing: "+str(remain_res) +
              " residues in "+str(remain_gaps)+" gaps)")
    if max_gap > 20:
        print("Warning: big missing loop ("+str(max_gap)+" residues)")

    shutil.copy2(best_pdb, os.path.basename(outmodel))