PRISM / src /data_prep /fetch_complexes.py
Siddhant Bhat
Initial commit: PRISM protein assembly order prediction GNN
1430181
"""
Downloads and prepares benchmark complexes.
Datasets:
1. Marsh 2013 homomers/heteromers (Cell 2013) — ground-truth assembly orders
2. NPC nucleoporins (human) — target system
Usage:
python fetch_complexes.py --dataset marsh2013
python fetch_complexes.py --dataset npc
python fetch_complexes.py --dataset all
"""
import os
import sys
import json
import time
import argparse
import requests
from pathlib import Path
# Root folder for everything this script downloads: "data/complexes" under the
# directory reached by three `.parent` hops from this file.
DATA_DIR = Path(__file__).parent.parent.parent / "data" / "complexes"
# NOTE: the directory is created as a side effect of importing this module.
DATA_DIR.mkdir(parents=True, exist_ok=True)
# URL templates; the placeholder is filled in with str.format by the fetch helpers.
RCSB_PDB_URL = "https://files.rcsb.org/download/{pdb_id}.pdb"
UNIPROT_FASTA = "https://rest.uniprot.org/uniprotkb/{accession}.fasta"
# Endpoint queried by fetch_string_scores (JSON network output).
STRING_API = "https://string-db.org/api/json/network"
# ──────────────────────────────────────────────────────────────────────────────
# Marsh 2013 benchmark complexes
# Source: Table S1 from Marsh et al. Cell 2013 — ordered assembly pathways
# ──────────────────────────────────────────────────────────────────────────────
# Maps PDB ID -> metadata dict with the keys:
#   "n":    number of subunits (hand-entered; nothing in this file checks it
#           against the chains found in the downloaded structure)
#   "type": "homomer" or "heteromer"
#   "name": human-readable description of the complex
# NOTE: despite the constant's name, most entries below are heteromers.
# download_dataset() later adds "sequences" and "pdb_path" to each manifest entry.
MARSH_2013_HOMOMERS = {
    "1AON": {"n": 14, "type": "homomer", "name": "GroEL (homotetradecamer)"},
    "1GRU": {"n": 7, "type": "homomer", "name": "GroES (homoheptamer)"},
    "2HHB": {"n": 4, "type": "heteromer", "name": "Hemoglobin (α2β2)"},
    "1AY7": {"n": 2, "type": "heteromer", "name": "RNase SA / Barstar"},
    "1BRS": {"n": 2, "type": "heteromer", "name": "Barnase / Barstar"},
    "1SBB": {"n": 2, "type": "heteromer", "name": "Subtilisin / Eglin C"},
    "1TGS": {"n": 2, "type": "heteromer", "name": "Trypsin / PSTI"},
    "2PTC": {"n": 2, "type": "heteromer", "name": "Trypsin / BPTI"},
    "3GBN": {"n": 2, "type": "heteromer", "name": "Antigen / Antibody"},
    "1A2K": {"n": 2, "type": "heteromer", "name": "Ran-GTP / importin"},
}
# NPC nucleoporin PDB structures (major subcomplexes with known structures)
# Maps PDB ID -> metadata dict with the keys:
#   "name": human-readable description of the structure
#   "n":    hand-entered component count (presumably distinct proteins in the
#           entry — TODO confirm; it is not validated against the file)
# Unlike MARSH_2013_HOMOMERS, these entries carry no "type" key.
NPC_COMPLEXES = {
    "5A9Q": {"name": "Nup107-Nup133 (Y-complex fragment)", "n": 2},
    "3PBP": {"name": "Nup98 APD crystal structure", "n": 1},
    "4I9B": {"name": "Nup358 RanBD1", "n": 1},
    "5C3L": {"name": "Nup214 / Nup88 (cytoplasmic ring)", "n": 2},
    "2QX5": {"name": "Nup153 LacZ fusion", "n": 1},
    "3F3F": {"name": "Nup62/58/54 (central channel)", "n": 3},
    "5HAX": {"name": "Y-complex (human, Nup107 subcomplex)","n": 7},
}
def _fetch_pdb(pdb_id: str, outdir: Path) -> Path:
    """Download one PDB-format entry from RCSB into ``outdir``, reusing a cached copy.

    Args:
        pdb_id: Entry identifier substituted into ``RCSB_PDB_URL``.
        outdir: Existing directory that receives ``<pdb_id>.pdb``.

    Returns:
        Path of the local ``.pdb`` file (already present or freshly downloaded).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or the 30 s timeout.
        OSError: If the file cannot be written.
    """
    out = outdir / f"{pdb_id}.pdb"
    if out.exists():
        print(f" {pdb_id}.pdb already present")
        return out
    url = RCSB_PDB_URL.format(pdb_id=pdb_id)
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Write to a sibling ".part" file and rename it into place. An interrupted
    # write must never leave a truncated "<id>.pdb" behind, because the
    # exists() check above would treat it as a valid cached download forever.
    partial = out.with_name(out.name + ".part")
    try:
        partial.write_text(r.text)
        partial.replace(out)
    except BaseException:
        partial.unlink(missing_ok=True)
        raise
    print(f" downloaded {pdb_id}.pdb ({len(r.text)//1024} KB)")
    # Short pause after each real download to stay polite to the RCSB server.
    time.sleep(0.5)
    return out
def _fetch_uniprot_fasta(accession: str) -> str:
    """Fetch a UniProt FASTA record and return its sequence as one string.

    The ``>`` header line(s) are dropped and the remaining lines are
    concatenated without separators.

    Raises:
        requests.HTTPError: If UniProt answers with an error status.
    """
    response = requests.get(UNIPROT_FASTA.format(accession=accession), timeout=30)
    response.raise_for_status()
    residue_rows = []
    for row in response.text.strip().splitlines():
        if row.startswith(">"):
            continue  # FASTA header, not sequence data
        residue_rows.append(row)
    return "".join(residue_rows)
def _extract_sequences_from_pdb(pdb_path: Path) -> dict[str, str]:
    """Extract per-chain one-letter sequences from the ATOM records of a PDB file.

    Only ``ATOM`` lines whose atom name is ``CA`` are read, so the result
    contains the residues that have a modelled C-alpha atom; SEQRES records are
    not consulted and HETATM residues are ignored.

    A residue is identified by its chain plus residue number *and* insertion
    code (columns 23-27). The first CA seen for a position wins, so alternate
    locations and repeated models contribute a position only once, while
    insertion-coded residues (e.g. 52, 52A, 52B) are all kept.

    Args:
        pdb_path: Path to a PDB-format text file.

    Returns:
        Mapping of chain ID to sequence, in file order. Residue names outside
        the 20 standard amino acids are emitted as ``"X"``.
    """
    aa3to1 = {
        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
    }
    chains: dict[str, list[str]] = {}
    seen_residues: dict[str, set[str]] = {}
    with open(pdb_path) as f:
        for line in f:
            if line[:4] != "ATOM" or line[13:15].strip() != "CA":
                continue
            if len(line) < 22:
                # Truncated record: no chain ID column to read.
                continue
            chain = line[21]
            # resSeq (cols 23-26) + iCode (col 27) uniquely name a position.
            position = line[22:27].strip()
            resname = line[17:20].strip()
            seen = seen_residues.setdefault(chain, set())
            sequence = chains.setdefault(chain, [])
            if position in seen:
                continue
            seen.add(position)
            sequence.append(aa3to1.get(resname, "X"))
    return {c: "".join(seq) for c, seq in chains.items()}
def fetch_string_scores(
    proteins: list[str],  # UniProt accessions or gene names
    species: int = 9606,  # Human
    min_score: int = 400,
) -> dict[tuple[str, str], float]:
    """Fetch STRING protein-protein interaction scores for a set of proteins.

    Args:
        proteins: Identifiers to query (UniProt accessions or gene names).
        species: NCBI taxonomy ID sent as ``species`` (default 9606, human).
        min_score: Threshold sent as ``required_score``.

    Returns:
        Dict mapping ``(name_a, name_b)`` to the combined score in ``[0, 1]``.
        Keys are the ``preferredName_A`` / ``preferredName_B`` values returned
        by STRING, which may differ from the identifiers passed in. Every
        pair is stored in both orders. An empty dict is returned for an empty
        ``proteins`` list or a non-200 response.
    """
    if not proteins:
        # Nothing to look up; skip the network round-trip entirely.
        return {}
    # "%0d" is the separator shown in STRING's API documentation for
    # multi-identifier queries.
    identifiers = "%0d".join(proteins)
    params = {
        "identifiers": identifiers,
        "species": species,
        "required_score": min_score,
        "caller_identity": "protein_assembly_sim",
    }
    r = requests.get(STRING_API, params=params, timeout=60)
    if r.status_code != 200:
        print(f" STRING API error {r.status_code}; using zero scores")
        return {}
    scores: dict[tuple[str, str], float] = {}
    for item in r.json():
        a = item.get("preferredName_A", "")
        b = item.get("preferredName_B", "")
        raw = item.get("score", 0)
        # The API reports `score` as a 0-1 fraction, so dividing it by 1000
        # unconditionally would squash every score to <= 0.001. Only rescale
        # values that are clearly on the 0-1000 scale used by STRING's bulk
        # download files.
        s = raw / 1000.0 if raw > 1 else float(raw)
        scores[(a, b)] = s
        scores[(b, a)] = s
    return scores
def _download_complex_set(banner: str, subdir: str, complexes: dict[str, dict]) -> None:
    """Fetch every complex in ``complexes`` into ``DATA_DIR/subdir`` and write a manifest."""
    outdir = DATA_DIR / subdir
    outdir.mkdir(parents=True, exist_ok=True)
    print(banner)
    manifest = {}
    for pdb_id, meta in complexes.items():
        try:
            pdb_path = _fetch_pdb(pdb_id, outdir)
            seqs = _extract_sequences_from_pdb(pdb_path)
        except Exception as e:
            # Deliberately best-effort: one failed entry must not abort the rest.
            print(f" WARN: {pdb_id} failed: {e}")
            continue
        # Build a fresh dict so the module-level metadata tables stay
        # untouched (they previously accumulated "sequences"/"pdb_path").
        manifest[pdb_id] = {**meta, "sequences": seqs, "pdb_path": str(pdb_path)}
    manifest_path = outdir / "manifest.json"
    with open(manifest_path, "w") as f:
        json.dump(manifest, f, indent=2)
    print(f" manifest saved → {manifest_path}")


def download_dataset(dataset: str = "marsh2013"):
    """Download the requested benchmark set(s) and write a manifest for each.

    For every complex the PDB file is fetched (or reused if already on disk),
    per-chain sequences are extracted, and the metadata plus ``"sequences"``
    and ``"pdb_path"`` are stored in ``<DATA_DIR>/<set>/manifest.json``.
    Complexes that fail to download or parse are reported and skipped.

    Args:
        dataset: ``"marsh2013"``, ``"npc"`` or ``"all"``. Any other value does
            nothing.
    """
    if dataset in ("marsh2013", "all"):
        _download_complex_set(
            "Downloading Marsh 2013 benchmark complexes...",
            "marsh2013",
            MARSH_2013_HOMOMERS,
        )
    if dataset in ("npc", "all"):
        _download_complex_set(
            "Downloading NPC nucleoporin structures...",
            "npc",
            NPC_COMPLEXES,
        )
if __name__ == "__main__":
    # Script entry point: choose which benchmark set(s) to download.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--dataset",
        choices=["marsh2013", "npc", "all"],
        default="marsh2013",
    )
    selected = cli.parse_args().dataset
    download_dataset(selected)