"""
Downloads and prepares benchmark complexes.
Datasets:
1. Marsh 2013 homomers/heteromers (Cell 2013) — ground-truth assembly orders
2. NPC nucleoporins (human) — target system
Usage:
python fetch_complexes.py --dataset marsh2013
python fetch_complexes.py --dataset npc
python fetch_complexes.py --dataset all
"""
import os
import sys
import json
import time
import argparse
import requests
from pathlib import Path
# Root folder for downloaded structures: two levels above the directory that
# contains this file, then data/complexes.
DATA_DIR = Path(__file__).parent.parent.parent / "data" / "complexes"
# NOTE: this runs at import time, so merely importing the module creates the
# directory (and any missing parents) on disk.
DATA_DIR.mkdir(parents=True, exist_ok=True)
# URL template filled with ``pdb_id`` by _fetch_pdb.
RCSB_PDB_URL = "https://files.rcsb.org/download/{pdb_id}.pdb"
# URL template filled with ``accession`` by _fetch_uniprot_fasta.
UNIPROT_FASTA = "https://rest.uniprot.org/uniprotkb/{accession}.fasta"
# Endpoint queried by fetch_string_scores; the response is parsed as JSON.
STRING_API = "https://string-db.org/api/json/network"
# ──────────────────────────────────────────────────────────────────────────────
# Marsh 2013 benchmark complexes
# Source: Table S1 from Marsh et al. Cell 2013 — ordered assembly pathways
# ──────────────────────────────────────────────────────────────────────────────
# Benchmark complexes keyed by PDB ID. Each value is a dict with:
#   "n"    - number of subunits
#   "type" - "homomer" or "heteromer"
#   "name" - human-readable label
# NOTE: despite the constant's name, most entries are heteromers; rely on the
# "type" field rather than on the name of this table.
MARSH_2013_HOMOMERS = {
    "1AON": {"n": 14, "type": "homomer", "name": "GroEL (homotetradecamer)"},
    "1GRU": {"n": 7, "type": "homomer", "name": "GroES (homoheptamer)"},
    # The Greek letters were previously stored as mojibake ("Ξ±2Ξ²2").
    "2HHB": {"n": 4, "type": "heteromer", "name": "Hemoglobin (α2β2)"},
    "1AY7": {"n": 2, "type": "heteromer", "name": "RNase SA / Barstar"},
    "1BRS": {"n": 2, "type": "heteromer", "name": "Barnase / Barstar"},
    "1SBB": {"n": 2, "type": "heteromer", "name": "Subtilisin / Eglin C"},
    "1TGS": {"n": 2, "type": "heteromer", "name": "Trypsin / PSTI"},
    "2PTC": {"n": 2, "type": "heteromer", "name": "Trypsin / BPTI"},
    "3GBN": {"n": 2, "type": "heteromer", "name": "Antigen / Antibody"},
    "1A2K": {"n": 2, "type": "heteromer", "name": "Ran-GTP / importin"},
}
# NPC nucleoporin PDB structures (major subcomplexes with known structures)
# Keyed by PDB ID; download_dataset passes each key to _fetch_pdb.
# Each value is a dict with:
#   "name" - human-readable label
#   "n"    - presumably the subunit count, as in MARSH_2013_HOMOMERS (TODO confirm)
NPC_COMPLEXES = {
    "5A9Q": {"name": "Nup107-Nup133 (Y-complex fragment)", "n": 2},
    "3PBP": {"name": "Nup98 APD crystal structure", "n": 1},
    "4I9B": {"name": "Nup358 RanBD1", "n": 1},
    "5C3L": {"name": "Nup214 / Nup88 (cytoplasmic ring)", "n": 2},
    "2QX5": {"name": "Nup153 LacZ fusion", "n": 1},
    "3F3F": {"name": "Nup62/58/54 (central channel)", "n": 3},
    "5HAX": {"name": "Y-complex (human, Nup107 subcomplex)","n": 7},
}
def _fetch_pdb(pdb_id: str, outdir: Path) -> Path:
out = outdir / f"{pdb_id}.pdb"
if out.exists():
print(f" {pdb_id}.pdb already present")
return out
url = RCSB_PDB_URL.format(pdb_id=pdb_id)
r = requests.get(url, timeout=30)
r.raise_for_status()
out.write_text(r.text)
print(f" downloaded {pdb_id}.pdb ({len(r.text)//1024} KB)")
time.sleep(0.5)
return out
def _fetch_uniprot_fasta(accession: str) -> str:
    """Fetch a UniProt FASTA record and return its residues as one string.

    Header lines (those starting with ``>``) are discarded and the remaining
    lines are concatenated without separators.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(UNIPROT_FASTA.format(accession=accession), timeout=30)
    response.raise_for_status()
    residue_rows = []
    for row in response.text.strip().splitlines():
        if row.startswith(">"):
            continue
        residue_rows.append(row)
    return "".join(residue_rows)
def _extract_sequences_from_pdb(pdb_path: Path) -> dict[str, str]:
"""Extract per-chain sequences from SEQRES records or ATOM records."""
chains: dict[str, list[str]] = {}
aa3to1 = {
"ALA":"A","ARG":"R","ASN":"N","ASP":"D","CYS":"C","GLN":"Q","GLU":"E",
"GLY":"G","HIS":"H","ILE":"I","LEU":"L","LYS":"K","MET":"M","PHE":"F",
"PRO":"P","SER":"S","THR":"T","TRP":"W","TYR":"Y","VAL":"V",
}
seen_residues: dict[str, set] = {}
with open(pdb_path) as f:
for line in f:
if line[:4] == "ATOM" and line[13:15].strip() == "CA":
chain = line[21]
resnum = line[22:26].strip()
resname = line[17:20].strip()
key = (resnum, resname)
if chain not in seen_residues:
seen_residues[chain] = set()
chains[chain] = []
if key not in seen_residues[chain]:
seen_residues[chain].add(key)
chains[chain].append(aa3to1.get(resname, "X"))
return {c: "".join(seq) for c, seq in chains.items()}
def fetch_string_scores(
    proteins: list[str],  # UniProt accessions or gene names
    species: int = 9606,  # Human
    min_score: int = 400,
) -> dict[tuple[str, str], float]:
    """Fetch STRING PPI scores for a set of proteins.

    Args:
        proteins: Identifiers to query. An empty list returns ``{}`` without
            contacting the service.
        species: NCBI taxonomy ID sent as ``species``.
        min_score: Threshold sent as ``required_score``.

    Returns:
        dict: (protein_a, protein_b) → combined_score in [0, 1]. Every pair is
        stored in both orientations. Empty if the API answers with a non-200
        status.
    """
    if not proteins:
        return {}
    # STRING separates identifiers with a carriage return. requests
    # percent-encodes parameter values itself, so pass the raw character:
    # a literal "%0d" would be re-encoded to "%250d" and never split.
    identifiers = "\r".join(proteins)
    params = {
        "identifiers": identifiers,
        "species": species,
        "required_score": min_score,
        "caller_identity": "protein_assembly_sim",
    }
    r = requests.get(STRING_API, params=params, timeout=60)
    if r.status_code != 200:
        print(f" STRING API error {r.status_code}; using zero scores")
        return {}
    scores: dict[tuple[str, str], float] = {}
    for item in r.json():
        a = item.get("preferredName_A", "")
        b = item.get("preferredName_B", "")
        raw = item.get("score", 0)
        # NOTE(review): the JSON endpoint appears to report scores already
        # scaled to [0, 1]; only rescale values that are clearly on the
        # 0-1000 scale so either convention ends up in [0, 1].
        s = raw / 1000.0 if raw > 1 else float(raw)
        scores[(a, b)] = s
        scores[(b, a)] = s
    return scores
def _download_group(subdir: str, complexes: dict, banner: str) -> None:
    """Download every complex of one dataset and write its manifest.json."""
    outdir = DATA_DIR / subdir
    outdir.mkdir(parents=True, exist_ok=True)
    print(banner)
    manifest = {}
    for pdb_id, meta in complexes.items():
        # Best effort: one failing entry is reported and skipped so the rest
        # of the dataset is still processed.
        try:
            pdb_path = _fetch_pdb(pdb_id, outdir)
            seqs = _extract_sequences_from_pdb(pdb_path)
        except Exception as e:
            print(f" WARN: {pdb_id} failed: {e}")
            continue
        # Build a new dict rather than writing into ``meta``: the module-level
        # tables must not accumulate per-run state.
        manifest[pdb_id] = {**meta, "sequences": seqs, "pdb_path": str(pdb_path)}
    manifest_path = outdir / "manifest.json"
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)
    print(f" manifest saved → {manifest_path}")


def download_dataset(dataset: str = "marsh2013"):
    """Download the requested benchmark structures and write their manifests.

    For each selected dataset the PDB files are stored under
    ``DATA_DIR/<dataset>`` together with a ``manifest.json`` that maps each
    PDB ID to its metadata, per-chain sequences and local file path.

    Args:
        dataset: ``"marsh2013"``, ``"npc"`` or ``"all"`` (both). Any other
            value selects nothing and the call does nothing.
    """
    if dataset in ("marsh2013", "all"):
        _download_group(
            "marsh2013",
            MARSH_2013_HOMOMERS,
            "Downloading Marsh 2013 benchmark complexes...",
        )
    if dataset in ("npc", "all"):
        _download_group(
            "npc",
            NPC_COMPLEXES,
            "Downloading NPC nucleoporin structures...",
        )
if __name__ == "__main__":
    # Command-line entry point: choose which dataset(s) to download.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--dataset",
        choices=["marsh2013", "npc", "all"],
        default="marsh2013",
    )
    selected = cli.parse_args().dataset
    download_dataset(selected)