Source code for prepmd.download

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Download data from the PDB, PDB-REDO, the EMDB and UniProt
"""

import urllib.request
from os.path import sep
import requests


def get_em_map(emdb_id, directory):
    """
    Download a map from the EMDB.

    Args:
        emdb_id: id of the em map to download, a string (or anything that
            str() turns into one). An "EMD-" or "emd-" prefix is stripped.
        directory: directory to download the file into, a string
    Returns:
        path to the downloaded file (<directory>/<id>.map.gz), a string.
    Raises:
        IOError: if the server answers 404 for this id.
        urllib.error.HTTPError: for any other HTTP error status.
    """
    emdb_id = str(emdb_id).replace("EMD-", "").replace("emd-", "")
    url = ("https://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-" +
           emdb_id+"/map/emd_"+emdb_id+".map.gz")
    destination = directory+sep+emdb_id+".map.gz"
    try:
        urllib.request.urlretrieve(url, destination)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            msg = "EMDB entry with ID "+emdb_id+" not found."
            raise IOError(msg) from e
        # Any other HTTP failure also means nothing was downloaded, so let
        # it propagate rather than return a path to a file that is not there.
        raise
    return destination
def get_structure(pdb_id, directory, file_format="mmCif", redo=False):
    """
    Download a structure from the PDB, or from PDB-REDO.

    Args:
        pdb_id: id of the pdb to download, a string
        directory: directory to download the file into, a string
        file_format: "mmCif" (or "cif") or "pdb", a string
        redo: if True, download the "_final" structure from pdb-redo.eu
            instead of the file from files.rcsb.org, a bool
    Returns:
        path to the downloaded file (<directory>/<pdb_id>.<cif|pdb>), a
        string.
    Raises:
        ValueError: if file_format is not one of the supported values.
        IOError: if redo is set and PDB-REDO answers 404, or if a pdb file
            was requested, the RCSB answers 404, and an mmCIF file exists
            for the same id.
        urllib.error.HTTPError: for any other HTTP error status.
    """
    if file_format in ("mmCif", "cif"):
        format_str = "cif"
    elif file_format == "pdb":
        format_str = "pdb"
    else:
        # Fail before touching the network instead of tripping over an
        # unbound format_str further down.
        raise ValueError("Unsupported file_format "+repr(file_format) +
                         ": expected 'mmCif', 'cif' or 'pdb'.")

    if redo:
        url = ("https://pdb-redo.eu/db/"+pdb_id+"/"+pdb_id+"_final" +
               "."+format_str)
        print(url)
    else:
        url = "https://files.rcsb.org/download/"+pdb_id+"."+format_str
    destination = directory+sep+pdb_id+"."+format_str

    try:
        urllib.request.urlretrieve(url, destination)
    except urllib.error.HTTPError as e:
        if e.code == 404 and redo:
            msg = "PDB with ID "+pdb_id+" not found in PDB-REDO."
            raise IOError(msg) from e
        # Only a missing .pdb file can be explained by "this entry is
        # mmCIF-only"; for any other failure the probe would be misleading.
        if e.code == 404 and format_str == "pdb":
            cif_url = "https://files.rcsb.org/download/"+pdb_id+".cif"
            try:
                cif_exists = requests.get(
                    cif_url, timeout=30).status_code == 200
            except requests.RequestException:
                # The probe is only there to improve the error message; if
                # it fails, the original HTTP error below is what matters.
                cif_exists = False
            if cif_exists:
                msg = ("No PDB for "+pdb_id +
                       " exists (but an mmcif structure does). "
                       "Run with --fmt cif to use it.")
                raise IOError(msg) from e
        raise
    return destination
# Seconds to wait on a silent server before giving up on a request.
_REQUEST_TIMEOUT = 30


def _uniprot_accessions(pdb_id):
    """Return {entity number: UniProt accession or None} for a PDB entry."""
    query = ('{ entries(entry_ids:["'+pdb_id+'"]){ polymer_entities { '
             'rcsb_id rcsb_polymer_entity_container_identifiers { '
             'reference_sequence_identifiers { database_accession '
             'database_name } } } } }')
    # Passing the query through params lets requests URL-encode it.
    response = requests.get('https://data.rcsb.org/graphql',
                            params={"query": query},
                            timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    entries = (response.json().get("data") or {}).get("entries")
    if entries is None:
        raise IOError("RCSB returned no entry data for PDB ID "+pdb_id+".")

    accessions = {}
    for entry in entries:
        if not entry:
            continue
        for entity in entry.get("polymer_entities") or []:
            # The number is the part of rcsb_id after the first underscore.
            entity_number = int(entity["rcsb_id"].split("_")[1])
            identifiers = entity[
                "rcsb_polymer_entity_container_identifiers"][
                "reference_sequence_identifiers"]
            # Entities without a usable reference (rna?) map to None.
            accessions[entity_number] = None
            for identifier in identifiers or []:
                # Only UniProt accessions can be looked up in UniProt; as
                # before, the last matching identifier wins.
                database = identifier.get("database_name") or ""
                if database.lower() == "uniprot":
                    accessions[entity_number] = \
                        identifier["database_accession"]
    return accessions


def _fetch_uniprot_fasta(accession):
    """Return the FASTA text UniProt serves for one accession."""
    url = "https://rest.uniprot.org/uniprotkb/stream?compressed=false&format=fasta&query=(accession:" + \
        accession+")"
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def _merge_fasta(pdb_id, sequences):
    """Join the per-entity FASTA records into one ">P1;" record."""
    names = []
    combined_sequences = []
    for sequence in sequences.values():
        if not sequence:
            # No accession, or UniProt returned nothing for it.
            continue
        # splitlines() copes with both "\n" and "\r\n" line endings.
        header, *residue_lines = sequence.splitlines()
        names.append(header.replace("sp|", "").replace(">", ""))
        combined_sequences.append("".join(residue_lines))
    return (">P1;"+pdb_id+"\n" +
            ";".join(names)+"\n"+"/".join(combined_sequences))


def get_uniprot_sequence(pdb_id, merge_sequence=True, write_to_file=None,
                         verbose=False):
    """
    For a given pdb id, find the fasta sequence for all polymer entities
    from UniProt.

    Args:
        pdb_id: the id of the pdb, a string
        merge_sequence: whether to merge the sequences together into a
            single fasta string, a bool
        write_to_file: path of a file to write the merged fasta to, a
            string. Only used when merge_sequence is True.
        verbose: a bool, whether to print the accessions that were found
    Returns:
        if merge_sequence is False, a dictionary keyed by entity number
        (an int) whose values are the fasta text from UniProt, or None for
        entities with no UniProt accession. Otherwise a string: a
        ">P1;<pdb_id>" line, a line with the sequence headers joined by
        ";", and a line with the sequences joined by "/".
    Raises:
        requests.HTTPError: if the RCSB or UniProt answers with an HTTP
            error status.
        IOError: if the RCSB response contains no entry data.
    """
    accessions = _uniprot_accessions(pdb_id)
    if verbose:
        print("Accessions: "+str(accessions))

    sequences = {
        entity_number: _fetch_uniprot_fasta(accession) if accession else None
        for entity_number, accession in accessions.items()
    }
    if not merge_sequence:
        return sequences

    fasta = _merge_fasta(pdb_id, sequences)
    if write_to_file:
        with open(write_to_file, "w", encoding="utf-8") as handle:
            handle.write(fasta)
    return fasta