| """Compute conformers and symmetries for all the CCD molecules.""" |
|
|
| import argparse |
| import multiprocessing |
| import pickle |
| import sys |
| from functools import partial |
| from pathlib import Path |
|
|
| import pandas as pd |
| import rdkit |
| from p_tqdm import p_uimap |
| from pdbeccdutils.core import ccd_reader |
| from pdbeccdutils.core.component import ConformerType |
| from rdkit import rdBase |
| from rdkit.Chem import AllChem |
| from rdkit.Chem.rdchem import Conformer, Mol |
| from tqdm import tqdm |
|
|
|
|
| def load_molecules(components: str) -> list[Mol]: |
| """Load the CCD components file. |
| |
| Parameters |
| ---------- |
| components : str |
| Path to the CCD components file. |
| |
| Returns |
| ------- |
| list[Mol] |
| |
| """ |
| components: dict[str, ccd_reader.CCDReaderResult] |
| components = ccd_reader.read_pdb_components_file(components) |
|
|
| mols = [] |
| for name, component in components.items(): |
| mol = component.component.mol |
| mol.SetProp("PDB_NAME", name) |
| mols.append(mol) |
|
|
| return mols |
|
|
|
|
| def compute_3d(mol: Mol, version: str = "v3") -> bool: |
| """Generate 3D coordinates using EKTDG method. |
| |
| Taken from `pdbeccdutils.core.component.Component`. |
| |
| Parameters |
| ---------- |
| mol: Mol |
| The RDKit molecule to process |
| version: str, optional |
| The ETKDG version, defaults ot v3 |
| |
| Returns |
| ------- |
| bool |
| Whether computation was successful. |
| |
| """ |
| if version == "v3": |
| options = rdkit.Chem.AllChem.ETKDGv3() |
| elif version == "v2": |
| options = rdkit.Chem.AllChem.ETKDGv2() |
| else: |
| options = rdkit.Chem.AllChem.ETKDGv2() |
|
|
| options.clearConfs = False |
| conf_id = -1 |
|
|
| try: |
| conf_id = rdkit.Chem.AllChem.EmbedMolecule(mol, options) |
| rdkit.Chem.AllChem.UFFOptimizeMolecule(mol, confId=conf_id, maxIters=1000) |
|
|
| except RuntimeError: |
| pass |
| except ValueError: |
| pass |
|
|
| if conf_id != -1: |
| conformer = mol.GetConformer(conf_id) |
| conformer.SetProp("name", ConformerType.Computed.name) |
| conformer.SetProp("coord_generation", f"ETKDG{version}") |
|
|
| return True |
|
|
| return False |
|
|
|
|
| def get_conformer(mol: Mol, c_type: ConformerType) -> Conformer: |
| """Retrieve an rdkit object for a deemed conformer. |
| |
| Taken from `pdbeccdutils.core.component.Component`. |
| |
| Parameters |
| ---------- |
| mol: Mol |
| The molecule to process. |
| c_type: ConformerType |
| The conformer type to extract. |
| |
| Returns |
| ------- |
| Conformer |
| The desired conformer, if any. |
| |
| Raises |
| ------ |
| ValueError |
| If there are no conformers of the given tyoe. |
| |
| """ |
| for c in mol.GetConformers(): |
| try: |
| if c.GetProp("name") == c_type.name: |
| return c |
| except KeyError: |
| pass |
|
|
| msg = f"Conformer {c_type.name} does not exist." |
| raise ValueError(msg) |
|
|
|
|
| def compute_symmetries(mol: Mol) -> list[list[int]]: |
| """Compute the symmetries of a molecule. |
| |
| Parameters |
| ---------- |
| mol : Mol |
| The molecule to process |
| |
| Returns |
| ------- |
| list[list[int]] |
| The symmetries as a list of index permutations |
| |
| """ |
| mol = AllChem.RemoveHs(mol) |
| idx_map = {} |
| atom_idx = 0 |
| for i, atom in enumerate(mol.GetAtoms()): |
| |
| if int(atom.GetProp("leaving_atom")): |
| continue |
| idx_map[i] = atom_idx |
| atom_idx += 1 |
|
|
| |
| permutations = [] |
| raw_permutations = mol.GetSubstructMatches(mol, uniquify=False) |
| for raw_permutation in raw_permutations: |
| |
| try: |
| if {raw_permutation[idx] for idx in idx_map} == set(idx_map.keys()): |
| permutation = [ |
| idx_map[idx] for idx in raw_permutation if idx in idx_map |
| ] |
| permutations.append(permutation) |
| except Exception: |
| pass |
| serialized_permutations = pickle.dumps(permutations) |
| mol.SetProp("symmetries", serialized_permutations.hex()) |
| return permutations |
|
|
|
|
| def process(mol: Mol, output: str) -> tuple[str, str]: |
| """Process a CCD component. |
| |
| Parameters |
| ---------- |
| mol : Mol |
| The molecule to process |
| output : str |
| The directory to save the molecules |
| |
| Returns |
| ------- |
| str |
| The name of the component |
| str |
| The result of the conformer generation |
| |
| """ |
| |
| name = mol.GetProp("PDB_NAME") |
|
|
| |
| if mol.GetNumAtoms() == 1: |
| result = "single" |
| else: |
| |
| try: |
| |
| success = compute_3d(mol, version="v3") |
| if success: |
| _ = get_conformer(mol, ConformerType.Computed) |
| result = "computed" |
|
|
| |
| else: |
| _ = get_conformer(mol, ConformerType.Ideal) |
| result = "ideal" |
| except ValueError: |
| result = "failed" |
|
|
| |
| path = Path(output) / f"{name}.pkl" |
| with path.open("wb") as f: |
| pickle.dump(mol, f) |
|
|
| |
| return name, result |
|
|
|
|
| def main(args: argparse.Namespace) -> None: |
| """Process conformers.""" |
| |
| rdkit.Chem.SetDefaultPickleProperties(rdkit.Chem.PropertyPickleOptions.AllProps) |
|
|
| |
| print("Loading components") |
| molecules = load_molecules(args.components) |
|
|
| |
| sys.stdout = sys.__stdout__ |
| sys.stderr = sys.__stderr__ |
|
|
| |
| blocker = rdBase.BlockLogs() |
|
|
| |
| outdir = Path(args.outdir) |
| outdir.mkdir(parents=True, exist_ok=True) |
| mol_output = outdir / "mols" |
| mol_output.mkdir(parents=True, exist_ok=True) |
| process_fn = partial(process, output=str(mol_output)) |
|
|
| |
| print("Processing components") |
| metadata = [] |
|
|
| |
| max_processes = multiprocessing.cpu_count() |
| num_processes = max(1, min(args.num_processes, max_processes, len(molecules))) |
| parallel = num_processes > 1 |
|
|
| if parallel: |
| for name, result in p_uimap( |
| process_fn, |
| molecules, |
| num_cpus=num_processes, |
| ): |
| metadata.append({"name": name, "result": result}) |
| else: |
| for mol in tqdm(molecules): |
| name, result = process_fn(mol) |
| metadata.append({"name": name, "result": result}) |
|
|
| |
| molecules = {} |
| for item in metadata: |
| if item["result"] == "failed": |
| continue |
|
|
| |
| path = mol_output / f"{item['name']}.pkl" |
| with path.open("rb") as f: |
| mol = pickle.load(f) |
| molecules[item["name"]] = mol |
|
|
| |
| path = outdir / "results.csv" |
| metadata = pd.DataFrame(metadata) |
| metadata.to_csv(path) |
|
|
| |
| path = outdir / "ccd.pkl" |
| with path.open("wb") as f: |
| pickle.dump(molecules, f) |
|
|
|
|
| if __name__ == "__main__": |
| parser = argparse.ArgumentParser() |
| parser.add_argument("--components", type=str) |
| parser.add_argument("--outdir", type=str) |
| parser.add_argument( |
| "--num_processes", |
| type=int, |
| default=multiprocessing.cpu_count(), |
| ) |
| args = parser.parse_args() |
| main(args) |
|
|