Source code for rush.prepare._protein

"""
Protein preparation module for the Rush Python client.

This module supports system preparation workflows such as converting PDB inputs
to TRC, protonating and optimizing hydrogen positions, and augmenting
structures with connectivity and formal charge information before downstream
calculations.

Usage::

    from rush import prepare

    result = prepare.protein("protein.pdb").fetch()
    print(result.topology.symbols)
"""

import json
import sys
from collections.abc import Iterator
from dataclasses import dataclass
from pathlib import Path
from string import Template
from typing import Any, Literal, Self

from gql.transport.exceptions import TransportQueryError

from .._rex import optional_str
from ..convert import _single_trc, from_json, from_pdb
from ..mol import TRC, Chains, Residues, Topology
from ..objects import (
    RushObject,
    TRCPaths,
    TRCRef,
    _to_chains_vobj,
    _to_residues_vobj,
    _to_topology_vobj,
)
from ..runs import Run, RunOpts, RunSpec
from ..session import _submit_rex

# ---------------------------------------------------------------------------
# Result types
# ---------------------------------------------------------------------------


[docs] @dataclass(frozen=True) class ResultRef: """Lightweight reference to prepare-protein output in the Rush object store. May contain multiple TRC triplets if the input PDB has multiple models. """ models: list[TRCRef] def __getitem__(self, index: int) -> TRCRef: return self.models[index] def __len__(self) -> int: return len(self.models) def __iter__(self) -> Iterator[TRCRef]: return iter(self.models)
[docs] @classmethod def from_raw_output(cls, res: list[Any]) -> Self: """Parse raw ``collect_run`` output into a ``ResultRef``. The raw output is a list of groups, where each group is a list of 3 dicts (topology, residues, chains objects). Multi-model PDBs produce multiple groups. """ if not isinstance(res, list) or len(res) == 0: raise ValueError( f"prepare_protein should return a non-empty list, " f"got {type(res).__name__}" f"{f' with {len(res)} items' if hasattr(res, '__len__') else ''}." ) models: list[TRCRef] = [] for i, group in enumerate(res): if not isinstance(group, list) or len(group) != 3: raise ValueError( f"prepare_protein output group {i} expected a list of 3 elements, " f"got {type(group).__name__}" f"{f' with {len(group)} items' if isinstance(group, list) else ''}." ) topo, resid, chain = group[0], group[1], group[2] if ( not isinstance(topo, dict) or not isinstance(resid, dict) or not isinstance(chain, dict) ): raise ValueError( f"prepare_protein output group {i} elements must be dicts." ) models.append( TRCRef( topology=RushObject.from_dict(topo), residues=RushObject.from_dict(resid), chains=RushObject.from_dict(chain), ) ) return cls(models=models)
[docs] def fetch(self) -> list[TRC]: """Download prepare-protein output and parse into TRCs. Returns one TRC per model in the input PDB. Most PDBs contain a single model, so ``result[0]`` is the common pattern. """ return [model.fetch() for model in self.models]
[docs] def save(self) -> list[TRCPaths]: """Download prepare-protein output and save to the workspace. Returns one TRCPaths per model in the input PDB. """ return [model.save() for model in self.models]
# ---------------------------------------------------------------------------
# Submission
# ---------------------------------------------------------------------------
[docs] def protein( mol: TRC | TRCRef | tuple[ Path | str | RushObject | Topology, Path | str | RushObject | Residues, Path | str | RushObject | Chains, ] | Path | str, ph: float | None = None, naming_scheme: Literal["AMBER", "CHARMM"] | None = None, capping_style: Literal["never", "truncated", "always"] | None = None, truncation_threshold: int | None = None, opt: bool | None = None, debump: bool | None = None, run_spec: RunSpec = RunSpec(gpus=1), run_opts: RunOpts = RunOpts(), ) -> Run[ResultRef]: """ Submit a prepare-protein job for a PDB or TRC file. Returns a :class:`~rush.runs.Run` handle. Call ``.fetch()`` to get the parsed TRC, or ``.save()`` to write the output files to disk. """ # Upload inputs match mol: case TRC(): trc_ref = TRCRef.upload(mol) case TRCRef(): trc_ref = mol case (t, r, c): trc_ref = TRCRef( RushObject.from_dict(_to_topology_vobj(t)), RushObject.from_dict(_to_residues_vobj(r)), RushObject.from_dict(_to_chains_vobj(c)), ) case Path() | str(): input_path = mol if isinstance(input_path, str): input_path = Path(input_path) with open(input_path) as f: if input_path.suffix == ".pdb": trc = from_pdb(f.read()) else: trc = from_json(json.load(f)) trc = _single_trc(trc, input_path) trc_ref = TRCRef.upload(trc) # Run rex rex = Template("""let obj_j = λ j → VirtualObject { path = j, format = ObjectFormat::json, size = 0 }, prepare_protein = λ topology residues chains → try_prepare_protein_rex ($run_spec) (prepare_protein_rex::PrepareProteinOptions { ph = $ph, naming_scheme = $naming_scheme, capping_style = $capping_style, truncation_threshold = $truncation_threshold, opt = $opt, debump = $debump, }) [( (obj_j topology), (obj_j residues), (obj_j chains) )] in prepare_protein "$topology_vobj_path" "$residues_vobj_path" "$chains_vobj_path" """).substitute( run_spec=run_spec._to_rex(), ph=optional_str(ph), naming_scheme=optional_str( naming_scheme.title() if naming_scheme is not None else None, prefix="prepare_protein_rex::NamingScheme::", ), 
capping_style=optional_str( capping_style.title() if capping_style is not None else None, prefix="prepare_protein_rex::CappingStyle::", ), truncation_threshold=optional_str(truncation_threshold), opt=optional_str(opt), debump=optional_str(debump), topology_vobj_path=trc_ref.topology.path, residues_vobj_path=trc_ref.residues.path, chains_vobj_path=trc_ref.chains.path, ) try: return Run(_submit_rex(rex, run_opts), ResultRef) except TransportQueryError as e: if e.errors: for error in e.errors: print(f"Error: {error['message']}", file=sys.stderr) raise