Spaces:

Sbhat2026
/

PRISM

Configuration error

PRISM / src /data_prep /fetch_complexes.py

Siddhant Bhat

Initial commit: PRISM protein assembly order prediction GNN

1430181 about 1 month ago

7.39 kB

	"""
	Downloads and prepares benchmark complexes.

	Datasets:
	1. Marsh 2013 homomers/heteromers (Cell 2013) — ground-truth assembly orders
	2. NPC nucleoporins (human) — target system

	Usage:
	python fetch_complexes.py --dataset marsh2013
	python fetch_complexes.py --dataset npc
	python fetch_complexes.py --dataset all
	"""

	import os
	import sys
	import json
	import time
	import argparse
	import requests
	from pathlib import Path

	# Download root: "data/complexes" under the directory three levels above this
	# file's path. NOTE: the directory is created as a side effect of importing
	# this module, not only when the script is run.
	DATA_DIR = Path(__file__).parent.parent.parent / "data" / "complexes"
	DATA_DIR.mkdir(parents=True, exist_ok=True)

	# Endpoint templates; the {placeholder} fields are filled in with str.format().
	RCSB_PDB_URL = "https://files.rcsb.org/download/{pdb_id}.pdb"
	UNIPROT_FASTA = "https://rest.uniprot.org/uniprotkb/{accession}.fasta"
	# STRING network endpoint (JSON output); queried with URL parameters.
	STRING_API = "https://string-db.org/api/json/network"


	# ──────────────────────────────────────────────────────────────────────────────
	# Marsh 2013 benchmark complexes
	# Source: Table S1 from Marsh et al. Cell 2013 — ordered assembly pathways
	# ──────────────────────────────────────────────────────────────────────────────

	# Benchmark complexes keyed by PDB ID. Despite the constant's name, the table
	# holds both homomers and heteromers; the "type" field tells them apart.
	MARSH_2013_HOMOMERS = {
	# PDB_ID: {"n": number of subunits, "type": "homomer" or "heteromer", "name": label}
	"1AON": {"n": 14, "type": "homomer", "name": "GroEL (homotetradecamer)"},
	"1GRU": {"n": 7, "type": "homomer", "name": "GroES (homoheptamer)"},
	"2HHB": {"n": 4, "type": "heteromer", "name": "Hemoglobin (α2β2)"},
	"1AY7": {"n": 2, "type": "heteromer", "name": "RNase SA / Barstar"},
	"1BRS": {"n": 2, "type": "heteromer", "name": "Barnase / Barstar"},
	"1SBB": {"n": 2, "type": "heteromer", "name": "Subtilisin / Eglin C"},
	"1TGS": {"n": 2, "type": "heteromer", "name": "Trypsin / PSTI"},
	"2PTC": {"n": 2, "type": "heteromer", "name": "Trypsin / BPTI"},
	"3GBN": {"n": 2, "type": "heteromer", "name": "Antigen / Antibody"},
	"1A2K": {"n": 2, "type": "heteromer", "name": "Ran-GTP / importin"},
	}

	# NPC nucleoporin PDB structures (major subcomplexes with known structures).
	# PDB_ID: {"name": label, "n": count}. There is no "type" field here.
	# NOTE(review): "n" is presumably the number of subunits in the structure —
	# confirm against the deposited entries.
	NPC_COMPLEXES = {
	"5A9Q": {"name": "Nup107-Nup133 (Y-complex fragment)", "n": 2},
	"3PBP": {"name": "Nup98 APD crystal structure", "n": 1},
	"4I9B": {"name": "Nup358 RanBD1", "n": 1},
	"5C3L": {"name": "Nup214 / Nup88 (cytoplasmic ring)", "n": 2},
	"2QX5": {"name": "Nup153 LacZ fusion", "n": 1},
	"3F3F": {"name": "Nup62/58/54 (central channel)", "n": 3},
	"5HAX": {"name": "Y-complex (human, Nup107 subcomplex)","n": 7},
	}


	def _fetch_pdb(pdb_id: str, outdir: Path) -> Path:
	    """Download ``<pdb_id>.pdb`` from RCSB into *outdir*, reusing a cached copy.

	    Args:
	        pdb_id: PDB identifier substituted into ``RCSB_PDB_URL``.
	        outdir: Existing directory that receives the file.

	    Returns:
	        Path of the local ``.pdb`` file (freshly downloaded or already present).

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	    """
	    out = outdir / f"{pdb_id}.pdb"
	    if out.exists():
	        print(f" {pdb_id}.pdb already present")
	        return out
	    url = RCSB_PDB_URL.format(pdb_id=pdb_id)
	    r = requests.get(url, timeout=30)
	    r.raise_for_status()
	    # Store the server's bytes untouched: a text-mode round trip would
	    # re-encode with the locale codec and translate newlines on Windows.
	    payload = r.content
	    # Write to a sibling temp file and rename, so an interrupted write can
	    # never leave a truncated file that later counts as "already present".
	    partial = out.with_name(out.name + ".part")
	    partial.write_bytes(payload)
	    partial.replace(out)
	    print(f" downloaded {pdb_id}.pdb ({len(payload)//1024} KB)")
	    # Short pause after each real download (presumably to go easy on the
	    # RCSB server); cached hits return above without sleeping.
	    time.sleep(0.5)
	    return out


	def _fetch_uniprot_fasta(accession: str) -> str:
	    """Fetch a UniProt FASTA entry and return its residues as one string.

	    Header lines (those starting with ">") are dropped and the remaining
	    lines are concatenated without separators. An HTTP error status is
	    raised through ``raise_for_status``.
	    """
	    url = UNIPROT_FASTA.format(accession=accession)
	    response = requests.get(url, timeout=30)
	    response.raise_for_status()
	    residue_rows = []
	    for row in response.text.strip().splitlines():
	        if row.startswith(">"):
	            continue
	        residue_rows.append(row)
	    return "".join(residue_rows)


	def _extract_sequences_from_pdb(pdb_path: Path) -> dict[str, str]:
	    """Extract per-chain one-letter sequences from the ATOM records of a PDB file.

	    Only alpha-carbon (CA) atoms of ATOM records are read, so each sequence
	    contains just the residues that have a CA atom in the file; SEQRES and
	    HETATM records are not consulted. Residue names outside the 20 standard
	    amino acids are reported as "X".

	    Args:
	        pdb_path: Path to a PDB-format text file.

	    Returns:
	        Mapping of chain ID to sequence, in order of first appearance.
	        Empty if the file has no usable CA records.
	    """
	    aa3to1 = {
	        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
	        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
	        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
	        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
	    }
	    chains: dict[str, list[str]] = {}
	    seen_residues: dict[str, set[tuple[str, str, str]]] = {}

	    with open(pdb_path) as f:
	        for line in f:
	            if line[:4] != "ATOM" or line[13:15].strip() != "CA":
	                continue
	            if len(line) < 22:
	                # Truncated record without a chain column: skip instead of
	                # failing with IndexError on line[21].
	                continue
	            chain = line[21]
	            resnum = line[22:26].strip()
	            # The insertion code is part of a residue's identity: 100 and
	            # 100A are different residues even when both are e.g. SER.
	            icode = line[26:27].strip()
	            resname = line[17:20].strip()
	            key = (resnum, icode, resname)
	            seen = seen_residues.setdefault(chain, set())
	            residues = chains.setdefault(chain, [])
	            # A repeated key (alternate locations, later models) is counted
	            # once only.
	            if key not in seen:
	                seen.add(key)
	                residues.append(aa3to1.get(resname, "X"))

	    return {c: "".join(seq) for c, seq in chains.items()}


	def fetch_string_scores(
	    proteins: list[str],  # UniProt accessions or gene names
	    species: int = 9606,  # Human
	    min_score: int = 400,
	) -> dict[tuple[str, str], float]:
	    """
	    Fetch STRING PPI scores for a set of proteins.

	    Args:
	        proteins: Identifiers to query; an empty list returns {} without
	            contacting the server.
	        species: NCBI taxonomy ID sent as the "species" parameter.
	        min_score: Threshold sent as "required_score".

	    Returns:
	        dict: (protein_a, protein_b) -> combined score in [0, 1], stored in
	        both orientations. Empty if the API does not answer with HTTP 200.
	    """
	    if not proteins:
	        return {}

	    # requests percent-encodes parameter values itself, so join with a real
	    # carriage return (sent as %0D). A literal "%0d" would be escaped again
	    # and reach the server as "%250d", i.e. as one bogus identifier.
	    identifiers = "\r".join(proteins)
	    params = {
	        "identifiers": identifiers,
	        "species": species,
	        "required_score": min_score,
	        "caller_identity": "protein_assembly_sim",
	    }
	    r = requests.get(STRING_API, params=params, timeout=60)
	    if r.status_code != 200:
	        print(f" STRING API error {r.status_code}; using zero scores")
	        return {}

	    scores: dict[tuple[str, str], float] = {}
	    for item in r.json():
	        a = item.get("preferredName_A", "")
	        b = item.get("preferredName_B", "")
	        raw = float(item.get("score", 0))
	        # The API reports "score" already scaled to 0-1; only rescale values
	        # that are evidently on the 0-1000 scale.
	        s = raw / 1000.0 if raw > 1.0 else raw
	        scores[(a, b)] = s
	        scores[(b, a)] = s
	    return scores


	def _download_group(subdir: str, banner: str, complexes: dict) -> None:
	    """Fetch every structure in *complexes* into DATA_DIR/<subdir> and write manifest.json."""
	    outdir = DATA_DIR / subdir
	    outdir.mkdir(parents=True, exist_ok=True)
	    print(banner)
	    manifest = {}
	    for pdb_id, meta in complexes.items():
	        # Best effort: one failing entry is reported and skipped so the
	        # remaining structures are still downloaded.
	        try:
	            pdb_path = _fetch_pdb(pdb_id, outdir)
	            seqs = _extract_sequences_from_pdb(pdb_path)
	        except Exception as e:
	            print(f" WARN: {pdb_id} failed: {e}")
	            continue
	        # Build a fresh record rather than writing into *meta*, which is an
	        # entry of a module-level table shared by every caller.
	        manifest[pdb_id] = {**meta, "sequences": seqs, "pdb_path": str(pdb_path)}
	    manifest_path = outdir / "manifest.json"
	    with open(manifest_path, "w", encoding="utf-8") as f:
	        json.dump(manifest, f, indent=2)
	    print(f" manifest saved → {manifest_path}")


	def download_dataset(dataset: str = "marsh2013"):
	    """Download the selected benchmark structures and write a manifest per dataset.

	    Args:
	        dataset: "marsh2013", "npc", or "all" (both). Any other value
	            selects nothing and the call returns without doing any work.

	    For each selected dataset the PDB files are stored under
	    DATA_DIR/<dataset>/ next to a manifest.json that maps each PDB ID to
	    its table metadata plus "sequences" and "pdb_path". Entries that fail
	    to download or parse are reported and left out of the manifest.
	    """
	    if dataset in ("marsh2013", "all"):
	        _download_group(
	            "marsh2013",
	            "Downloading Marsh 2013 benchmark complexes...",
	            MARSH_2013_HOMOMERS,
	        )

	    if dataset in ("npc", "all"):
	        _download_group(
	            "npc",
	            "Downloading NPC nucleoporin structures...",
	            NPC_COMPLEXES,
	        )


	# Script entry point: pick the dataset on the command line (default marsh2013).
	if __name__ == "__main__":
	    cli = argparse.ArgumentParser()
	    cli.add_argument(
	        "--dataset",
	        choices=["marsh2013", "npc", "all"],
	        default="marsh2013",
	    )
	    selected = cli.parse_args().dataset
	    download_dataset(selected)