| """ |
| Downloads and prepares benchmark complexes. |
| |
| Datasets: |
| 1. Marsh 2013 homomers/heteromers (Cell 2013) — ground-truth assembly orders |
| 2. NPC nucleoporins (human) — target system |
| |
| Usage: |
| python fetch_complexes.py --dataset marsh2013 |
| python fetch_complexes.py --dataset npc |
| python fetch_complexes.py --dataset all |
| """ |
|
|
| import os |
| import sys |
| import json |
| import time |
| import argparse |
| import requests |
| from pathlib import Path |
|
|
# Download root: "data/complexes" two levels above the directory holding this file.
# NOTE: the directory is created as a side effect of importing this module.
DATA_DIR = Path(__file__).parent.parent.parent.joinpath("data", "complexes")
DATA_DIR.mkdir(parents=True, exist_ok=True)


# Remote endpoints. The {placeholders} are filled in with str.format().
RCSB_PDB_URL = "https://files.rcsb.org/download/{pdb_id}.pdb"
UNIPROT_FASTA = "https://rest.uniprot.org/uniprotkb/{accession}.fasta"
STRING_API = "https://string-db.org/api/json/network"
|
|
|
|
| |
| |
| |
| |
|
|
# Benchmark complexes keyed by PDB ID. Each value carries:
#   "n"    - integer stored per entry (presumably the subunit count; TODO confirm)
#   "type" - "homomer" or "heteromer"
#   "name" - human-readable label
# NOTE(review): despite the _HOMOMERS suffix, most rows are tagged "heteromer".
# NOTE(review): the ID-to-name mapping is not verified here; confirm against RCSB.
MARSH_2013_HOMOMERS = {
    pdb_id: {"n": subunits, "type": kind, "name": label}
    for pdb_id, subunits, kind, label in (
        ("1AON", 14, "homomer", "GroEL (homotetradecamer)"),
        ("1GRU", 7, "homomer", "GroES (homoheptamer)"),
        ("2HHB", 4, "heteromer", "Hemoglobin (α2β2)"),
        ("1AY7", 2, "heteromer", "RNase SA / Barstar"),
        ("1BRS", 2, "heteromer", "Barnase / Barstar"),
        ("1SBB", 2, "heteromer", "Subtilisin / Eglin C"),
        ("1TGS", 2, "heteromer", "Trypsin / PSTI"),
        ("2PTC", 2, "heteromer", "Trypsin / BPTI"),
        ("3GBN", 2, "heteromer", "Antigen / Antibody"),
        ("1A2K", 2, "heteromer", "Ran-GTP / importin"),
    )
}
|
|
| |
# Nucleoporin structures keyed by PDB ID. Each value carries a display "name"
# and an integer "n" (presumably the number of distinct subunits; TODO confirm).
# NOTE(review): the ID-to-name mapping is not verified here; confirm against RCSB.
NPC_COMPLEXES = {
    pdb_id: {"name": label, "n": subunits}
    for pdb_id, label, subunits in (
        ("5A9Q", "Nup107-Nup133 (Y-complex fragment)", 2),
        ("3PBP", "Nup98 APD crystal structure", 1),
        ("4I9B", "Nup358 RanBD1", 1),
        ("5C3L", "Nup214 / Nup88 (cytoplasmic ring)", 2),
        ("2QX5", "Nup153 LacZ fusion", 1),
        ("3F3F", "Nup62/58/54 (central channel)", 3),
        ("5HAX", "Y-complex (human, Nup107 subcomplex)", 7),
    )
}
|
|
|
|
def _fetch_pdb(pdb_id: str, outdir: Path) -> Path:
    """Download one PDB entry into *outdir*, reusing an existing local copy.

    Args:
        pdb_id: Identifier substituted into ``RCSB_PDB_URL``.
        outdir: Existing directory that receives ``<pdb_id>.pdb``.

    Returns:
        Path of the local ``<pdb_id>.pdb`` file.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an error status (via ``raise_for_status``).
    """
    out = outdir / f"{pdb_id}.pdb"
    if out.exists():
        print(f" {pdb_id}.pdb already present")
        return out
    url = RCSB_PDB_URL.format(pdb_id=pdb_id)
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Store the raw bytes: going through r.text would decode with a guessed
    # charset and re-encode with the locale encoding (plus newline translation
    # on some platforms), which can alter the file.
    payload = r.content
    # Write to a sibling temp file and rename, so an interrupted run never
    # leaves a truncated .pdb that later calls would treat as "already present".
    partial = out.with_name(out.name + ".part")
    partial.write_bytes(payload)
    partial.replace(out)
    print(f" downloaded {pdb_id}.pdb ({len(payload)//1024} KB)")
    # Short pause after each real download to stay polite to the server.
    time.sleep(0.5)
    return out
|
|
|
|
def _fetch_uniprot_fasta(accession: str) -> str:
    """Download a UniProt FASTA entry and return only its sequence letters.

    Header lines (those starting with ``>``) are dropped and the remaining
    lines are concatenated without separators.

    Raises:
        requests.RequestException: If the request fails or returns an error
            status (via ``raise_for_status``).
    """
    response = requests.get(UNIPROT_FASTA.format(accession=accession), timeout=30)
    response.raise_for_status()
    sequence_rows = []
    for row in response.text.strip().splitlines():
        if row.startswith(">"):
            continue
        sequence_rows.append(row)
    return "".join(sequence_rows)
|
|
|
|
def _extract_sequences_from_pdb(pdb_path: Path) -> dict[str, str]:
    """Extract per-chain one-letter sequences from the CA ATOM records of a PDB file.

    Only ``ATOM`` lines whose atom-name field reads ``CA`` are used; SEQRES and
    HETATM records are ignored, so the result covers only residues that have
    coordinates. Residue names outside the 20 standard amino acids map to "X".

    Args:
        pdb_path: Path to a PDB-format text file.

    Returns:
        Mapping of chain ID to sequence, with chains in order of first
        appearance. Chains without any CA ATOM record are absent.
    """
    aa3to1 = {
        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C", "GLN": "Q", "GLU": "E",
        "GLY": "G", "HIS": "H", "ILE": "I", "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F",
        "PRO": "P", "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
    }

    chains: dict[str, list[str]] = {}
    seen_residues: dict[str, set[tuple[str, str, str]]] = {}
    with open(pdb_path) as f:
        for line in f:
            if line[:4] != "ATOM" or line[13:15].strip() != "CA":
                continue
            chain = line[21]
            resnum = line[22:26].strip()
            # The insertion code (column 27) distinguishes e.g. 100 from 100A.
            # Without it, inserted residues sharing a name are dropped as duplicates.
            icode = line[26:27].strip()
            resname = line[17:20].strip()
            key = (resnum, icode, resname)
            seen = seen_residues.setdefault(chain, set())
            if key in seen:
                # Same residue again (alternate location or a later MODEL).
                continue
            seen.add(key)
            chains.setdefault(chain, []).append(aa3to1.get(resname, "X"))

    return {c: "".join(seq) for c, seq in chains.items()}
|
|
|
|
def fetch_string_scores(
    proteins: list[str],
    species: int = 9606,
    min_score: int = 400,
) -> dict[tuple[str, str], float]:
    """
    Fetch STRING PPI scores for a set of proteins.

    Args:
        proteins: Protein identifiers to query. An empty list returns ``{}``
            without contacting STRING.
        species: Species identifier forwarded as the ``species`` parameter.
        min_score: Threshold forwarded as the ``required_score`` parameter.

    Returns:
        dict: (protein_a, protein_b) → combined_score in [0, 1]. Each pair is
        stored in both orientations. Empty when STRING does not answer with
        HTTP 200.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout).
    """
    # Nothing to look up: avoid sending a request with an empty identifier list.
    if not proteins:
        return {}

    # "%0d" is the separator shown in STRING's own API examples.
    identifiers = "%0d".join(proteins)
    params = {
        "identifiers": identifiers,
        "species": species,
        "required_score": min_score,
        "caller_identity": "protein_assembly_sim",
    }
    r = requests.get(STRING_API, params=params, timeout=60)
    if r.status_code != 200:
        print(f" STRING API error {r.status_code}; using zero scores")
        return {}

    scores: dict[tuple[str, str], float] = {}
    for item in r.json():
        a = item.get("preferredName_A", "")
        b = item.get("preferredName_B", "")
        raw = float(item.get("score", 0))
        # STRING's JSON output already reports scores as fractions in [0, 1].
        # Only rescale values that are clearly on the 0-1000 scale, so the
        # result honours the documented [0, 1] range either way.
        s = raw / 1000.0 if raw > 1.0 else raw
        scores[(a, b)] = s
        scores[(b, a)] = s
    return scores
|
|
|
|
def _download_group(subdir: str, banner: str, complexes: dict) -> None:
    """Fetch every complex in *complexes* into DATA_DIR/<subdir> and write manifest.json."""
    outdir = DATA_DIR / subdir
    outdir.mkdir(parents=True, exist_ok=True)
    print(banner)
    manifest = {}
    for pdb_id, meta in complexes.items():
        try:
            pdb_path = _fetch_pdb(pdb_id, outdir)
            seqs = _extract_sequences_from_pdb(pdb_path)
        except Exception as e:
            # Best effort: one failing entry must not abort the whole batch.
            print(f" WARN: {pdb_id} failed: {e}")
            continue
        # Build a fresh record instead of writing into *meta*, so the
        # module-level tables are not mutated as a side effect.
        manifest[pdb_id] = {**meta, "sequences": seqs, "pdb_path": str(pdb_path)}
    manifest_path = outdir / "manifest.json"
    with open(manifest_path, "w") as f:
        json.dump(manifest, f, indent=2)
    print(f" manifest saved → {manifest_path}")


def download_dataset(dataset: str = "marsh2013") -> None:
    """Download the selected benchmark set(s) and write one manifest per set.

    Args:
        dataset: "marsh2013", "npc", or "all" (both). Any other value
            downloads nothing.

    For each selected set, the structures are fetched into a sub-directory of
    ``DATA_DIR``. A ``manifest.json`` is written there holding each entry's
    metadata plus its extracted chain sequences and local file path. Entries
    that fail are reported with a warning and left out of the manifest.
    """
    if dataset in ("marsh2013", "all"):
        _download_group(
            "marsh2013",
            "Downloading Marsh 2013 benchmark complexes...",
            MARSH_2013_HOMOMERS,
        )

    if dataset in ("npc", "all"):
        _download_group(
            "npc",
            "Downloading NPC nucleoporin structures...",
            NPC_COMPLEXES,
        )
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: pick which dataset(s) to download.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--dataset",
        choices=["marsh2013", "npc", "all"],
        default="marsh2013",
    )
    selected = cli.parse_args().dataset
    download_dataset(selected)
|
|