Source code for prepmd.download
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Download structures from the PDB and PDB-REDO, maps from the EMDB, and
sequences from UniProt.
"""
import urllib.request
from os.path import sep
import requests
[docs]
def get_em_map(emdb_id, directory):
    """
    Download a gzipped EM map from the EMDB.
    Args:
        emdb_id: id of the em map to download, a string (an int is also
            accepted). A leading "EMD-" or "emd-" prefix is stripped.
        directory: directory to download the file into, a string
    returns:
        path to the downloaded file (directory/<id>.map.gz).
    raises:
        IOError: if the EMDB server answers 404 for this id.
        urllib.error.HTTPError: for any other HTTP error status.
    """
    emdb_id = str(emdb_id).replace("EMD-", "").replace("emd-", "")
    url = ("https://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-"
           + emdb_id + "/map/emd_" + emdb_id + ".map.gz")
    destination = directory + sep + emdb_id + ".map.gz"
    try:
        urllib.request.urlretrieve(url, destination)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            msg = "EMDB entry with ID " + emdb_id + " not found."
            raise IOError(msg) from e
        # Any other HTTP failure means nothing was downloaded; propagate it
        # instead of returning a path to a file that does not exist.
        raise
    return destination
[docs]
def get_structure(pdb_id, directory, file_format="mmCif", redo=False):
    """
    Download a structure from the PDB (or from PDB-REDO).
    Args:
        pdb_id: id of the pdb to download, a string
        directory: directory to download the file into, a string
        file_format: "mmCif"/"cif" or "pdb", a string
        redo: if True, download the "_final" model from PDB-REDO instead of
            the RCSB entry, a bool
    returns:
        path to the downloaded file.
    raises:
        ValueError: if file_format is not one of the supported values.
        IOError: if the entry is missing from PDB-REDO, or if a PDB-format
            file was requested but only an mmCIF file exists.
        urllib.error.HTTPError: for any other HTTP error status.
    """
    if file_format in ("mmCif", "cif"):
        format_str = "cif"
    elif file_format == "pdb":
        format_str = "pdb"
    else:
        # Fail early with a clear message rather than hitting an unbound
        # local variable further down.
        raise ValueError("Unsupported file_format " + repr(file_format) +
                         "; expected 'mmCif', 'cif' or 'pdb'.")

    if redo:
        url = ("https://pdb-redo.eu/db/" + pdb_id + "/" + pdb_id +
               "_final" + "." + format_str)
        print(url)
    else:
        url = "https://files.rcsb.org/download/" + pdb_id + "." + format_str
    destination = directory + sep + pdb_id + "." + format_str

    try:
        urllib.request.urlretrieve(url, destination)
    except urllib.error.HTTPError as e:
        if e.code == 404 and redo:
            msg = "PDB with ID " + pdb_id + " not found in PDB-REDO."
            raise IOError(msg) from e
        if e.code == 404 and format_str == "pdb":
            # The entry may exist only as mmCIF. Probe for it by swapping
            # just the file extension, so the user gets an actionable hint.
            cif_url = url[:-len(".pdb")] + ".cif"
            r = requests.get(cif_url)
            if r.status_code == 200:
                msg = ("No PDB for " + pdb_id +
                       " exists (but an mmcif structure does). "
                       "Run with --fmt cif to use it.")
                raise IOError(msg) from e
        raise
    return destination
[docs]
def get_uniprot_sequence(pdb_id, merge_sequence=True, write_to_file=None,
                         verbose=False):
    """
    For a given pdb id, find the fasta sequence for all chains from UNIPROT.

    The RCSB GraphQL API is asked for the reference sequence identifiers of
    every polymer entity in the entry; each UniProt accession found is then
    fetched from the UniProt REST API as FASTA.
    Args:
        pdb_id: the id of the pdb, a string
        merge_sequence: whether to merge the sequences together into a single
            fasta file, a bool
        write_to_file: path of file to write the merged fasta to, a string.
            Only used when merge_sequence is True.
        verbose: a bool, whether to write debug info out
    Returns:
        if merge_sequence is False, a dictionary keyed by polymer entity
        number (the integer suffix of the entity's rcsb_id) whose values are
        the fasta text, or None for entities with no UniProt reference.
        Otherwise a string with the merged fasta sequence.
    Raises:
        requests.HTTPError: if RCSB or UniProt answer with an error status.
    """
    query = '''{
  entries(entry_ids:["'''+pdb_id+'''"]){
    polymer_entities {
      rcsb_id
      rcsb_polymer_entity_container_identifiers {
        reference_sequence_identifiers {
          database_accession
          database_name
        }
      }
    }
  }
}'''
    # Let requests URL-encode the query instead of concatenating it raw.
    response = requests.get('https://data.rcsb.org/graphql',
                            params={"query": query})
    response.raise_for_status()

    accessions = {}
    for entry in response.json()['data']['entries']:
        for entity in entry['polymer_entities']:
            entity_number = int(entity["rcsb_id"].split("_")[1])
            references = entity['rcsb_polymer_entity_container_identifiers'][
                'reference_sequence_identifiers'] or []
            # Only UniProt accessions can be resolved by the UniProt REST
            # API; references to other databases are ignored.
            uniprot_ids = [
                ref['database_accession'] for ref in references
                if str(ref.get('database_name')).lower() == "uniprot"
            ]
            # No UniProt reference at all (e.g. RNA) -> None. When several
            # exist the last one is used.
            accessions[entity_number] = (
                uniprot_ids[-1] if uniprot_ids else None)
    if verbose:
        print("Accessions: "+str(accessions))

    sequences = {}
    for entity_number, accession in accessions.items():
        if not accession:
            sequences[entity_number] = None
            continue
        url = ("https://rest.uniprot.org/uniprotkb/stream"
               "?compressed=false&format=fasta&query=(accession:"
               + accession + ")")
        reply = requests.get(url)
        # Do not let an error page end up inside the fasta output.
        reply.raise_for_status()
        sequences[entity_number] = reply.text
    if not merge_sequence:
        return sequences

    names = []
    combined_sequences = []
    for sequence in sequences.values():
        if sequence is None:
            continue
        # First line is the fasta header; the rest are sequence lines.
        header, _, body = sequence.partition("\n")
        names.append(header.replace("sp|", "").replace(">", ""))
        combined_sequences.append(body.replace("\n", ""))
    fasta = (">P1;" + pdb_id + "\n" + ";".join(names) + "\n" +
             "/".join(combined_sequences))
    if write_to_file:
        with open(write_to_file, "w") as file:
            file.write(fasta)
    return fasta