Spaces:

Sbhat2026
/

PRISM

Configuration error

File size: 7,386 Bytes
"""
Downloads and prepares benchmark complexes.

Datasets:
1. Marsh 2013 homomers/heteromers (Cell 2013) — ground-truth assembly orders
2. NPC nucleoporins (human) — target system

Usage:
    python fetch_complexes.py --dataset marsh2013
    python fetch_complexes.py --dataset npc
    python fetch_complexes.py --dataset all
"""

import os
import sys
import json
import time
import argparse
import requests
from pathlib import Path

# Local cache root: "data/complexes" under the directory two levels above the
# directory that contains this file. NOTE: the directory is created as a side
# effect of importing this module.
DATA_DIR = Path(__file__).parent.parent.parent / "data" / "complexes"
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Remote endpoints. The first two are str.format() templates whose
# {pdb_id} / {accession} placeholder is filled in per request; STRING_API is
# used as-is with query parameters.
RCSB_PDB_URL = "https://files.rcsb.org/download/{pdb_id}.pdb"
UNIPROT_FASTA = "https://rest.uniprot.org/uniprotkb/{accession}.fasta"
STRING_API = "https://string-db.org/api/json/network"


# ──────────────────────────────────────────────────────────────────────────────
# Marsh 2013 benchmark complexes
# Source: Table S1 from Marsh et al. Cell 2013 — ordered assembly pathways
# ──────────────────────────────────────────────────────────────────────────────

# Benchmark table keyed by PDB ID. Each value is a dict with:
#   "n"    - subunit count for the complex
#   "type" - "homomer" or "heteromer"
#   "name" - human-readable label
# NOTE(review): despite the variable name, only the first two entries are
# typed "homomer"; the remaining entries are typed "heteromer". The name is
# kept because other modules may import it - confirm before renaming.
MARSH_2013_HOMOMERS = {
    # PDB_ID: {"n": n_subunits, "type": homomer|heteromer, "name": label}
    "1AON": {"n": 14, "type": "homomer", "name": "GroEL (homotetradecamer)"},
    "1GRU": {"n": 7,  "type": "homomer", "name": "GroES (homoheptamer)"},
    "2HHB": {"n": 4,  "type": "heteromer", "name": "Hemoglobin (α2β2)"},
    "1AY7": {"n": 2,  "type": "heteromer", "name": "RNase SA / Barstar"},
    "1BRS": {"n": 2,  "type": "heteromer", "name": "Barnase / Barstar"},
    "1SBB": {"n": 2,  "type": "heteromer", "name": "Subtilisin / Eglin C"},
    "1TGS": {"n": 2,  "type": "heteromer", "name": "Trypsin / PSTI"},
    "2PTC": {"n": 2,  "type": "heteromer", "name": "Trypsin / BPTI"},
    "3GBN": {"n": 2,  "type": "heteromer", "name": "Antigen / Antibody"},
    "1A2K": {"n": 2,  "type": "heteromer", "name": "Ran-GTP / importin"},
}

# NPC nucleoporin PDB structures (major subcomplexes with known structures).
# Keyed by PDB ID; each value is a dict with:
#   "name" - human-readable label for the structure
#   "n"    - subunit count (presumably distinct chains in the entry; verify
#            against the deposited structure before relying on it)
# Unlike the Marsh 2013 table, these entries carry no "type" key.
NPC_COMPLEXES = {
    "5A9Q": {"name": "Nup107-Nup133 (Y-complex fragment)", "n": 2},
    "3PBP": {"name": "Nup98 APD crystal structure",        "n": 1},
    "4I9B": {"name": "Nup358 RanBD1",                     "n": 1},
    "5C3L": {"name": "Nup214 / Nup88 (cytoplasmic ring)", "n": 2},
    "2QX5": {"name": "Nup153 LacZ fusion",                "n": 1},
    "3F3F": {"name": "Nup62/58/54 (central channel)",     "n": 3},
    "5HAX": {"name": "Y-complex (human, Nup107 subcomplex)","n": 7},
}


def _fetch_pdb(pdb_id: str, outdir: Path) -> Path:
    """Download one PDB-format entry into *outdir*, reusing a cached copy.

    Args:
        pdb_id: Identifier substituted into ``RCSB_PDB_URL``.
        outdir: Existing directory that receives ``<pdb_id>.pdb``.

    Returns:
        Path of the local ``.pdb`` file (already present or freshly written).

    Raises:
        requests.RequestException: On a network failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
        OSError: If the file cannot be written.
    """
    out = outdir / f"{pdb_id}.pdb"
    if out.exists():
        print(f"  {pdb_id}.pdb already present")
        return out
    url = RCSB_PDB_URL.format(pdb_id=pdb_id)
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Write to a sibling temp file and rename it into place. An interrupted
    # write must never leave a truncated "<id>.pdb" behind, because the
    # existence check above would treat it as a valid cached download forever.
    # The raw response bytes are stored unchanged, so the result does not
    # depend on the platform's default text encoding or newline translation.
    partial = out.with_name(out.name + ".part")
    partial.write_bytes(r.content)
    partial.replace(out)
    print(f"  downloaded {pdb_id}.pdb ({len(r.content)//1024} KB)")
    # Brief pause between downloads to stay polite to the file server.
    time.sleep(0.5)
    return out


def _fetch_uniprot_fasta(accession: str) -> str:
    """Fetch the FASTA record for *accession* and return only its residues.

    Header lines (those starting with ``>``) are dropped and the remaining
    lines are concatenated into a single string. Raises whatever
    ``requests`` raises for transport errors or a non-success status.
    """
    response = requests.get(UNIPROT_FASTA.format(accession=accession), timeout=30)
    response.raise_for_status()

    residue_rows = []
    for row in response.text.strip().splitlines():
        if row.startswith(">"):
            continue  # FASTA header, not sequence
        residue_rows.append(row)
    return "".join(residue_rows)


def _extract_sequences_from_pdb(pdb_path: Path) -> dict[str, str]:
    """Extract per-chain one-letter sequences from a PDB file's ATOM records.

    Only ``ATOM`` lines for the alpha carbon (atom name ``" CA "``) are read;
    SEQRES and HETATM records are ignored, so the result reflects the residues
    actually modelled in the coordinates. Residues appear in file order.

    Args:
        pdb_path: Path to a PDB-format text file.

    Returns:
        Mapping of chain ID (the single character in column 22) to its
        sequence. Residue names outside the 20 standard amino acids are
        emitted as ``"X"``.
    """
    aa3to1 = {
        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
    }

    chains: dict[str, list[str]] = {}
    seen_positions: dict[str, set[tuple[str, str]]] = {}
    # Only the fixed-column ASCII fields of ATOM lines matter, so undecodable
    # bytes elsewhere in the file are replaced instead of aborting the parse.
    with open(pdb_path, encoding="utf-8", errors="replace") as f:
        for line in f:
            # Atom name occupies columns 13-16; the exact " CA " match keeps
            # the alpha carbon and rejects look-alikes such as " CA1".
            if not line.startswith("ATOM") or line[12:16] != " CA ":
                continue
            if len(line) <= 21:
                continue  # truncated record: no chain ID column to read
            chain = line[21]
            resname = line[17:20].strip()
            # A residue position is (sequence number, insertion code). The
            # insertion code (column 27) distinguishes e.g. 11 from 11A, and
            # leaving the residue name out of the key means alternate
            # locations at one position are counted once (first one wins).
            position = (line[22:26].strip(), line[26:27].strip())

            chain_seen = seen_positions.setdefault(chain, set())
            chain_seq = chains.setdefault(chain, [])
            if position in chain_seen:
                continue
            chain_seen.add(position)
            chain_seq.append(aa3to1.get(resname, "X"))

    return {c: "".join(seq) for c, seq in chains.items()}


def fetch_string_scores(
    proteins: list[str],  # UniProt accessions or gene names
    species: int = 9606,  # Human
    min_score: int = 400,
) -> dict[tuple[str, str], float]:
    """Fetch STRING protein-protein interaction scores for a set of proteins.

    Args:
        proteins: Identifiers to query. An empty list returns ``{}`` without
            contacting the service.
        species: NCBI taxonomy ID sent as ``species``.
        min_score: Threshold sent as ``required_score``.

    Returns:
        Dict mapping ``(name_a, name_b)`` to a combined score in [0, 1]. Each
        pair is stored in both orientations. Keys are the ``preferredName_A``
        / ``preferredName_B`` values from the response, which need not be the
        identifiers passed in. A non-200 response yields ``{}``.

    Raises:
        requests.RequestException: On transport failure or timeout (only HTTP
            status errors are downgraded to an empty result).
    """
    if not proteins:
        return {}

    # NOTE(review): separator kept as in the original. requests percent-encodes
    # query values, so the "%" itself is escaped on the wire - confirm that
    # multi-protein queries are still split correctly by the server.
    identifiers = "%0d".join(proteins)
    params = {
        "identifiers": identifiers,
        "species": species,
        "required_score": min_score,
        "caller_identity": "protein_assembly_sim",
    }
    r = requests.get(STRING_API, params=params, timeout=60)
    if r.status_code != 200:
        print(f"  STRING API error {r.status_code}; using zero scores")
        return {}

    scores: dict[tuple[str, str], float] = {}
    for item in r.json():
        a = item.get("preferredName_A", "")
        b = item.get("preferredName_B", "")
        raw = float(item.get("score", 0) or 0)
        # STRING's JSON output appears to report `score` as a 0-1 float
        # already (TODO confirm against the API docs); dividing such a value
        # by 1000 would shrink every score to <= 0.001. Only values above 1
        # are treated as being on the 0-1000 integer scale and rescaled.
        s = raw / 1000.0 if raw > 1.0 else raw
        scores[(a, b)] = s
        scores[(b, a)] = s
    return scores


def _download_group(subdir: str, banner: str, complexes: dict) -> None:
    """Fetch every PDB entry in *complexes* into DATA_DIR/<subdir> and write manifest.json."""
    outdir = DATA_DIR / subdir
    outdir.mkdir(parents=True, exist_ok=True)
    print(banner)
    manifest = {}
    for pdb_id, meta in complexes.items():
        # Best-effort per entry: one failed download or parse is reported and
        # skipped so the remaining entries still make it into the manifest.
        try:
            pdb_path = _fetch_pdb(pdb_id, outdir)
            seqs = _extract_sequences_from_pdb(pdb_path)
        except Exception as e:
            print(f"  WARN: {pdb_id} failed: {e}")
            continue
        # Build a fresh entry instead of writing into `meta`: the metadata
        # tables are module-level globals and must not accumulate per-run
        # state such as local paths and sequences.
        manifest[pdb_id] = {**meta, "sequences": seqs, "pdb_path": str(pdb_path)}
    manifest_path = outdir / "manifest.json"
    with open(manifest_path, "w") as f:
        json.dump(manifest, f, indent=2)
    print(f"  manifest saved → {manifest_path}")


def download_dataset(dataset: str = "marsh2013") -> None:
    """Download the requested benchmark structures and write their manifests.

    Args:
        dataset: ``"marsh2013"``, ``"npc"``, or ``"all"`` (both). Any other
            value selects nothing and the call is a no-op.

    For each selected dataset the PDB files are stored in a sub-directory of
    ``DATA_DIR`` together with a ``manifest.json`` that maps each successfully
    processed PDB ID to its metadata plus ``"sequences"`` (per-chain
    sequences) and ``"pdb_path"`` (local file path). Entries that fail are
    reported on stdout and left out of the manifest.
    """
    if dataset in ("marsh2013", "all"):
        _download_group(
            "marsh2013",
            "Downloading Marsh 2013 benchmark complexes...",
            MARSH_2013_HOMOMERS,
        )

    if dataset in ("npc", "all"):
        _download_group(
            "npc",
            "Downloading NPC nucleoporin structures...",
            NPC_COMPLEXES,
        )


if __name__ == "__main__":
    # Command-line entry point: pick which dataset(s) to download.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--dataset",
        choices=["marsh2013", "npc", "all"],
        default="marsh2013",
    )
    download_dataset(cli.parse_args().dataset)