From 2ffc0b33b4443ed954415c280e0f3ba156ccc88e Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 11:18:23 +0200 Subject: [PATCH 01/14] Add proteinbox models: ProteinSpecies, ProteinBoxComposition, Pdb2gmxConfig New models for the proteinbox simulation type: - ProteinSpecies: PDB-path-based species with disulfide/protonation annotations - ProteinBoxComposition: protein + box_padding + ionization config - Pdb2gmxConfig: force field and water model config for pdb2gmx - GromacsProteinParameterSet: output paths from pdb2gmx - BuildInput extended with proteinbox type and pdb2gmx parametrization Closes: relates to #13 Co-Authored-By: Claude Opus 4.6 --- mdfactory/models/composition.py | 20 +++++++++++++--- mdfactory/models/input.py | 18 ++++++++++---- mdfactory/models/parametrization.py | 37 ++++++++++++++++++++++++++++- mdfactory/models/species.py | 32 +++++++++++++++++++++++-- 4 files changed, 96 insertions(+), 11 deletions(-) diff --git a/mdfactory/models/composition.py b/mdfactory/models/composition.py index 8488d1d..d736e9a 100644 --- a/mdfactory/models/composition.py +++ b/mdfactory/models/composition.py @@ -1,13 +1,13 @@ -# ABOUTME: Pydantic models for system composition (mixedbox, bilayer, LNP) +# ABOUTME: Pydantic models for system composition (mixedbox, bilayer, LNP, proteinbox) # ABOUTME: Defines species counts, ionization, and composition validation -"""Pydantic models for system composition (mixedbox, bilayer, LNP).""" +"""Pydantic models for system composition (mixedbox, bilayer, LNP, proteinbox).""" from typing import Optional import numpy as np from pydantic import BaseModel, ConfigDict, Field, model_validator -from .species import LipidSpecies, SingleMoleculeSpecies, Species +from .species import LipidSpecies, ProteinSpecies, SingleMoleculeSpecies, Species def distribute_counts(fractions: list[float], total: int) -> list[int]: @@ -375,3 +375,17 @@ def get_species_with_counts(self) -> list[Species]: else: species_map[key] = spec.model_copy() return list(species_map.values()) + + +class ProteinBoxComposition(BaseModel): + """Define a protein-in-waterbox system: one protein centered in a cubic box with ions.""" + + model_config = ConfigDict(extra="forbid") + + protein: ProteinSpecies + box_padding: float = Field( + 10.0, description="Minimum distance from protein to box edge in Angstroms.", ge=0.0 + ) + ionization: IonizationConfig = Field( + default_factory=IonizationConfig, description="Configuration for ionization." + ) diff --git a/mdfactory/models/input.py b/mdfactory/models/input.py index e24497a..60b47b6 100644 --- a/mdfactory/models/input.py +++ b/mdfactory/models/input.py @@ -8,8 +8,13 @@ from pydantic import BaseModel, Field, model_validator -from .composition import BilayerComposition, LNPComposition, MixedBoxComposition -from .parametrization import CgenffConfig, ParametrizationConfig, SmirnoffConfig +from .composition import ( + BilayerComposition, + LNPComposition, + MixedBoxComposition, + ProteinBoxComposition, +) +from .parametrization import CgenffConfig, Pdb2gmxConfig, ParametrizationConfig, SmirnoffConfig # Map system_type to the corresponding model class # TODO: StrEnum? @@ -17,15 +22,16 @@ "mixedbox": MixedBoxComposition, "bilayer": BilayerComposition, "lnp": LNPComposition, + "proteinbox": ProteinBoxComposition, } class BuildInput(BaseModel): """Represent a complete simulation build specification with composition and parametrization.""" - simulation_type: Literal["mixedbox", "bilayer", "lnp"] - system: MixedBoxComposition | BilayerComposition | LNPComposition - parametrization: Literal["cgenff", "smirnoff"] = Field( + simulation_type: Literal["mixedbox", "bilayer", "lnp", "proteinbox"] + system: MixedBoxComposition | BilayerComposition | LNPComposition | ProteinBoxComposition + parametrization: Literal["cgenff", "smirnoff", "pdb2gmx"] = Field( "cgenff", description="Parametrization to use." ) parametrization_config: ParametrizationConfig | None = Field( @@ -162,4 +168,6 @@ def set_default_parametrization_config(self) -> "BuildInput": object.__setattr__(self, "parametrization_config", SmirnoffConfig()) elif self.parametrization == "cgenff": object.__setattr__(self, "parametrization_config", CgenffConfig()) + elif self.parametrization == "pdb2gmx": + object.__setattr__(self, "parametrization_config", Pdb2gmxConfig()) return self diff --git a/mdfactory/models/parametrization.py b/mdfactory/models/parametrization.py index 13b9298..845637a 100644 --- a/mdfactory/models/parametrization.py +++ b/mdfactory/models/parametrization.py @@ -50,8 +50,30 @@ class CgenffConfig(BaseModel): # CGenFF uses SILCSBIODIR from config.ini, no additional settings needed +class Pdb2gmxConfig(BaseModel): + """Configuration for pdb2gmx protein parametrization.""" + + model_config = ConfigDict(extra="forbid", frozen=True) + + type: Literal["pdb2gmx"] = Field("pdb2gmx", description="Config type discriminator.") + forcefield: str = Field( + "charmm36m", description="Force field name as recognized by gmx pdb2gmx." + ) + water_model: str = Field( + "tip3p", description="Water model name as recognized by gmx pdb2gmx." + ) + ignh: bool = Field( + True, description="Ignore hydrogens in input PDB (regenerate with pdb2gmx)." + ) + merge_all: bool = Field( + False, description="Merge all chains into a single moleculetype." + ) + + ParametrizationConfig = Annotated[ - Annotated[SmirnoffConfig, Tag("smirnoff")] | Annotated[CgenffConfig, Tag("cgenff")], + Annotated[SmirnoffConfig, Tag("smirnoff")] + | Annotated[CgenffConfig, Tag("cgenff")] + | Annotated[Pdb2gmxConfig, Tag("pdb2gmx")], Discriminator("type"), ] @@ -91,3 +113,16 @@ def from_data_row(cls, data_row): raise ValueError("Invalid parameter_data_type for GromacsSingleMoleculeParameterSet.") data = json.loads(data_row["parameter_data"]) return cls(**data) + + +class GromacsProteinParameterSet(BaseModel): + """Store GROMACS topology output from pdb2gmx for a protein system.""" + + model_config = ConfigDict(extra="forbid") + + topology_file: Path + structure_file: Path + position_restraint_file: Path + forcefield: str + water_model: str + total_charge: int diff --git a/mdfactory/models/species.py b/mdfactory/models/species.py index 1912ce5..487676c 100644 --- a/mdfactory/models/species.py +++ b/mdfactory/models/species.py @@ -1,11 +1,12 @@ -# ABOUTME: Pydantic models for molecular species (small molecules and lipids) +# ABOUTME: Pydantic models for molecular species (small molecules, lipids, proteins) # ABOUTME: Provides SMILES-based identity, charge, and molecular object properties -"""Pydantic models for molecular species (small molecules and lipids).""" +"""Pydantic models for molecular species (small molecules, lipids, proteins).""" import hashlib import io import warnings from functools import cached_property +from pathlib import Path from typing import Optional from pydantic import BaseModel, Field, model_validator @@ -196,3 +197,30 @@ def rdkit_molecule(self) -> "Chem.rdchem.Mol": # u, tail_atom_ids=self.tail_atoms, head_atom_ids=self.head_atoms, z_axis=[1, 0, 0] # ) # return u + + +class ProteinSpecies(Species): + """Represent a protein species identified by a PDB structure file.""" + + pdb_path: Path = Field(..., description="Path to the input PDB file.") + disulfide_bonds: list[tuple[int, int]] = Field( + default_factory=list, + description="Pairs of residue IDs forming disulfide bonds, e.g. [(6, 127), (30, 115)].", + ) + protonation_states: dict[str, str] = Field( + default_factory=dict, + description="Residue-specific protonation states, e.g. {'HIS15': 'HIE', 'GLU35': 'GLH'}.", + ) + + @model_validator(mode="after") + def set_default_count(self) -> "ProteinSpecies": + if self.count is None and self.fraction is None: + self.count = 1 + self.fraction = 1.0 + return self + + @property + def charge(self) -> int: + raise NotImplementedError( + "Protein charge is determined by pdb2gmx topology output, not pre-computable." + ) From 9d47492bcf151af31bc919353ddc658c87e1f04f Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 11:20:22 +0200 Subject: [PATCH 02/14] Add setup/protein.py: pdb2gmx wrapper, PDB cleaning, topology utilities Functions for the proteinbox build pipeline: - check_gmx_available: verify gmx binary is on PATH - clean_pdb: PDBFixer-based PDB standardization - run_pdb2gmx: subprocess wrapper for gmx pdb2gmx - extract_charge_from_topology: parse net charge from .top/.itp - update_topology_molecules: append water/ion entries to [ molecules ] - validate_with_grompp: dry-run topology validation Co-Authored-By: Claude Opus 4.6 --- mdfactory/setup/protein.py | 319 +++++++++++++++++++++++++++++++++++++ 1 file changed, 319 insertions(+) create mode 100644 mdfactory/setup/protein.py diff --git a/mdfactory/setup/protein.py b/mdfactory/setup/protein.py new file mode 100644 index 0000000..aa6a05e --- /dev/null +++ b/mdfactory/setup/protein.py @@ -0,0 +1,319 @@ +# ABOUTME: Protein preparation utilities for the proteinbox simulation type +# ABOUTME: Wraps PDBFixer for cleaning and gmx pdb2gmx for topology generation +"""Protein preparation utilities for the proteinbox simulation type.""" + +import shutil +import subprocess +from pathlib import Path + +from loguru import logger + +from ..models.parametrization import GromacsProteinParameterSet, Pdb2gmxConfig + + +def check_gmx_available() -> Path: + """Verify that the GROMACS gmx binary is available on PATH. + + Returns + ------- + Path + Path to the gmx binary. + + Raises + ------ + RuntimeError + If gmx is not found on PATH. + + """ + gmx_path = shutil.which("gmx") + if gmx_path is None: + raise RuntimeError( + "GROMACS 'gmx' binary not found on PATH. " + "Ensure GROMACS is installed and 'module load gromacs' has been run." + ) + return Path(gmx_path) + + +def clean_pdb(pdb_path: Path, output_path: Path) -> Path: + """Clean a PDB file using PDBFixer: remove heterogens, add missing heavy atoms. + + Parameters + ---------- + pdb_path : Path + Input PDB file. + output_path : Path + Where to write the cleaned PDB. + + Returns + ------- + Path + Path to the cleaned PDB file. + + """ + from pdbfixer import PDBFixer # noqa: PLC0415 + + fixer = PDBFixer(filename=str(pdb_path)) + fixer.removeHeterogens(keepWater=False) + fixer.findMissingResidues() + fixer.findMissingAtoms() + fixer.addMissingAtoms() + + from openmm.app import PDBFile # noqa: PLC0415 + + with open(output_path, "w") as f: + PDBFile.writeFile(fixer.topology, fixer.positions, f) + + logger.info(f"PDB cleaned: {pdb_path} -> {output_path}") + return output_path + + +def run_pdb2gmx( + pdb_path: Path, + config: Pdb2gmxConfig, + disulfide_bonds: list[tuple[int, int]], + protonation_states: dict[str, str], + output_dir: Path, +) -> GromacsProteinParameterSet: + """Run gmx pdb2gmx to generate GROMACS topology from a protein PDB. + + Parameters + ---------- + pdb_path : Path + Cleaned PDB file. + config : Pdb2gmxConfig + Force field and water model configuration. + disulfide_bonds : list[tuple[int, int]] + Residue ID pairs for disulfide bonds. + protonation_states : dict[str, str] + Residue-specific protonation state overrides. + output_dir : Path + Directory for output files. + + Returns + ------- + GromacsProteinParameterSet + Paths to generated topology, structure, and position restraint files. + + """ + output_dir.mkdir(parents=True, exist_ok=True) + + structure_file = output_dir / "processed.gro" + topology_file = output_dir / "topol.top" + posre_file = output_dir / "posre.itp" + + cmd = [ + "gmx", "pdb2gmx", + "-f", str(pdb_path), + "-o", str(structure_file), + "-p", str(topology_file), + "-i", str(posre_file), + "-ff", config.forcefield, + "-water", config.water_model, + ] + + if config.ignh: + cmd.append("-ignh") + + if config.merge_all: + cmd.extend(["-merge", "all"]) + + if disulfide_bonds: + cmd.append("-ss") + + logger.info(f"Running pdb2gmx: {' '.join(cmd)}") + + result = subprocess.run( + cmd, + cwd=str(output_dir), + text=True, + input="", + capture_output=True, + timeout=120, + ) + + if result.returncode != 0: + raise RuntimeError( + f"gmx pdb2gmx failed (exit code {result.returncode}).\n" + f"stdout:\n{result.stdout}\n" + f"stderr:\n{result.stderr}" + ) + + logger.info("pdb2gmx completed successfully.") + + # Verify expected outputs exist + for path in [structure_file, topology_file, posre_file]: + if not path.is_file(): + raise FileNotFoundError( + f"Expected pdb2gmx output not found: {path}\n" + f"stdout:\n{result.stdout}\n" + f"stderr:\n{result.stderr}" + ) + + total_charge = extract_charge_from_topology(topology_file) + logger.info(f"Protein net charge from topology: {total_charge}") + + return GromacsProteinParameterSet( + topology_file=topology_file, + structure_file=structure_file, + position_restraint_file=posre_file, + forcefield=config.forcefield, + water_model=config.water_model, + total_charge=total_charge, + ) + + +def extract_charge_from_topology(top_path: Path) -> int: + """Parse the topology file to determine the net system charge. + + Reads the [ atoms ] section(s) and sums the charge column. + + Parameters + ---------- + top_path : Path + Path to the GROMACS .top file. + + Returns + ------- + int + Net formal charge (rounded to nearest integer). + + """ + total_charge = 0.0 + in_atoms = False + + with open(top_path) as f: + for line in f: + stripped = line.strip() + + if stripped.startswith("#include"): + # Follow includes for itp files in the same directory + include_path = stripped.split('"')[1] if '"' in stripped else None + if include_path and not include_path.endswith(".ff/"): + itp_path = top_path.parent / include_path + if itp_path.is_file(): + total_charge += _sum_charges_from_itp(itp_path) + continue + + if stripped.startswith("["): + section = stripped.strip("[] ").lower() + in_atoms = section == "atoms" + continue + + if in_atoms and stripped and not stripped.startswith(";"): + parts = stripped.split() + if len(parts) >= 7: + try: + total_charge += float(parts[6]) + except (ValueError, IndexError): + pass + + return round(total_charge) + + +def _sum_charges_from_itp(itp_path: Path) -> float: + """Sum charges from the [ atoms ] section of an .itp file.""" + total = 0.0 + in_atoms = False + + with open(itp_path) as f: + for line in f: + stripped = line.strip() + if stripped.startswith("["): + section = stripped.strip("[] ").lower() + in_atoms = section == "atoms" + continue + if in_atoms and stripped and not stripped.startswith(";"): + parts = stripped.split() + if len(parts) >= 7: + try: + total += float(parts[6]) + except (ValueError, IndexError): + pass + return total + + +def update_topology_molecules( + top_path: Path, n_water: int, num_na: int, num_cl: int, water_name: str = "SOL" +) -> None: + """Update the [ molecules ] section of a topology file after solvation/ionization. + + Appends water and ion molecule counts to the topology. + + Parameters + ---------- + top_path : Path + Path to the GROMACS .top file (modified in-place). + n_water : int + Number of water molecules. + num_na : int + Number of Na+ ions. + num_cl : int + Number of Cl- ions. + water_name : str + Residue name for water (default "SOL"). + + """ + with open(top_path, "a") as f: + if n_water > 0: + f.write(f"{water_name:<20s} {n_water}\n") + if num_na > 0: + f.write(f"{'NA':<20s} {num_na}\n") + if num_cl > 0: + f.write(f"{'CL':<20s} {num_cl}\n") + + logger.info( + f"Topology updated: {n_water} water, {num_na} Na+, {num_cl} Cl-" + ) + + +def validate_with_grompp(topology: Path, structure: Path, mdp: Path, cwd: Path) -> None: + """Run gmx grompp as a validation check for topology/coordinate consistency. + + Parameters + ---------- + topology : Path + Path to topology.top file. + structure : Path + Path to structure .gro file. + mdp : Path + Path to MDP file (e.g., em.mdp). + cwd : Path + Working directory for the command. + + Raises + ------ + RuntimeError + If grompp reports errors. + + """ + tpr_out = cwd / "check.tpr" + cmd = [ + "gmx", "grompp", + "-f", str(mdp), + "-c", str(structure), + "-p", str(topology), + "-o", str(tpr_out), + "-maxwarn", "0", + ] + + result = subprocess.run( + cmd, + cwd=str(cwd), + text=True, + input="", + capture_output=True, + timeout=60, + ) + + # Clean up the check tpr + if tpr_out.is_file(): + tpr_out.unlink() + + if result.returncode != 0: + raise RuntimeError( + f"gmx grompp validation failed (exit code {result.returncode}).\n" + f"stdout:\n{result.stdout}\n" + f"stderr:\n{result.stderr}" + ) + + logger.info("grompp validation passed.") From 9eefb345243097313beb202f82cdc67b139d97db Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 11:21:47 +0200 Subject: [PATCH 03/14] Add relax_with_protein_restraints() to openmm_utils.py Brief NPT equilibration with protein heavy atoms position-restrained via CustomExternalForce. Allows water/ions to relax around the fixed protein structure before GROMACS production runs. Co-Authored-By: Claude Opus 4.6 --- mdfactory/simulation/openmm_utils.py | 128 ++++++++++++++++++++++++++- 1 file changed, 126 insertions(+), 2 deletions(-) diff --git a/mdfactory/simulation/openmm_utils.py b/mdfactory/simulation/openmm_utils.py index 126df55..6e6c64d 100644 --- a/mdfactory/simulation/openmm_utils.py +++ b/mdfactory/simulation/openmm_utils.py @@ -1,5 +1,5 @@ -# ABOUTME: OpenMM simulation utilities for box compression and equilibration -# ABOUTME: Provides volume convergence monitoring and density-targeted compression +# ABOUTME: OpenMM simulation utilities for box compression, equilibration, and relaxation +# ABOUTME: Provides volume convergence monitoring, density-targeted compression, and protein restraints """OpenMM simulation utilities for box compression and equilibration.""" import sys @@ -635,3 +635,127 @@ def compress_equilibrate_bilayer( plt.savefig("equilibration_analysis.png", dpi=300, bbox_inches="tight") return ret + + +def relax_with_protein_restraints( + u, + pdb, + top, + protein_indices, + restraint_k=1000.0, + steps=10000, +): + """Minimize and briefly equilibrate with protein atoms position-restrained. + + Runs a short NPT simulation at 1 bar with harmonic restraints on + the specified protein atom indices, allowing water and ions to relax + around the fixed protein structure. + + Parameters + ---------- + u : mda.Universe + Reference MDAnalysis Universe whose topology is preserved. + pdb : str or Path + Path to the input PDB file for OpenMM. + top : str or Path + Path to the GROMACS topology file. + protein_indices : list of int + 0-based atom indices to restrain (typically protein heavy atoms). + restraint_k : float, optional + Restraint force constant in kJ/mol/nm^2. Default is 1000.0. + steps : int, optional + Number of MD steps after minimization. Default is 10000. + + Returns + ------- + mda.Universe + Universe with relaxed solvent positions. Protein positions + remain at their restrained coordinates. + + """ + print("Loading GROMACS files for protein relaxation...") + gro = app.PDBFile(str(pdb)) + top_file = app.GromacsTopFile( + str(top), periodicBoxVectors=gro.topology.getPeriodicBoxVectors() + ) + + print("Creating OpenMM system...") + system = top_file.createSystem( + nonbondedMethod=app.PME, + nonbondedCutoff=1.0 * unit.nanometer, + constraints=app.HBonds, + ) + + # Position restraints on protein atoms + restraint_force = mm.CustomExternalForce( + "k*((x-x0)^2+(y-y0)^2+(z-z0)^2)" + ) + restraint_force.addGlobalParameter( + "k", restraint_k * unit.kilojoules_per_mole / unit.nanometer**2 + ) + restraint_force.addPerParticleParameter("x0") + restraint_force.addPerParticleParameter("y0") + restraint_force.addPerParticleParameter("z0") + + for idx in protein_indices: + pos = gro.positions[idx] + restraint_force.addParticle( + idx, + [ + pos[0].value_in_unit(unit.nanometer), + pos[1].value_in_unit(unit.nanometer), + pos[2].value_in_unit(unit.nanometer), + ], + ) + + system.addForce(restraint_force) + + # Isotropic barostat at 1 bar + barostat = mm.MonteCarloBarostat(1.0 * unit.bar, 300 * unit.kelvin) + system.addForce(barostat) + + integrator = mm.LangevinMiddleIntegrator( + 300 * unit.kelvin, 1 / unit.picosecond, 0.002 * unit.picoseconds + ) + + simulation = app.Simulation(top_file.topology, system, integrator) + simulation.context.setPositions(gro.positions) + + simulation.reporters.append( + app.StateDataReporter( + sys.stdout, + 1000, + step=True, + potentialEnergy=True, + temperature=True, + density=True, + volume=True, + ) + ) + + print("Minimizing energy (protein restrained)...") + simulation.minimizeEnergy() + + print(f"Running {steps} steps of NPT relaxation (protein restrained)...") + simulation.step(steps) + + positions = simulation.context.getState(getPositions=True).getPositions() + state = simulation.context.getState() + box_vectors = state.getPeriodicBoxVectors() + simulation.topology.setPeriodicBoxVectors(box_vectors) + + app.PDBFile.writeFile( + simulation.topology, positions, open("relaxed_system.pdb", "w"), keepIds=True + ) + app.PDBxFile.writeFile( + simulation.topology, positions, open("relaxed_system.mmcif", "w"), keepIds=True + ) + + u_tmp = mda.Universe(app.PDBxFile("relaxed_system.mmcif")) + ret = mda.Merge(u.atoms) + ret.atoms.positions = u_tmp.atoms.positions + ret.dimensions = u_tmp.dimensions + ret.atoms.positions = ret.atoms.wrap(compound="residues") + + print("Protein relaxation complete.") + return ret From 496888dc70535876ef3a9ef6cd5d316748a54f8b Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 11:23:56 +0200 Subject: [PATCH 04/14] Add build_proteinbox() and wire up dispatch in workflows.py Build pipeline: clean PDB -> pdb2gmx -> center in cubic box -> solvate (reuses existing solvate()) -> ionize (reuses ionize_solvated_system()) -> update topology -> OpenMM relax with protein restraints -> validate. Co-Authored-By: Claude Opus 4.6 --- mdfactory/build.py | 133 ++++++++++++++++++++++++++++++++++++++++- mdfactory/workflows.py | 3 +- 2 files changed, 134 insertions(+), 2 deletions(-) diff --git a/mdfactory/build.py b/mdfactory/build.py index 6f728b3..5e37de1 100644 --- a/mdfactory/build.py +++ b/mdfactory/build.py @@ -1,5 +1,5 @@ # ABOUTME: Core build pipeline for constructing MD simulation systems -# ABOUTME: Dispatches to mixedbox, bilayer, and LNP build routines +# ABOUTME: Dispatches to mixedbox, bilayer, LNP, and proteinbox build routines """Core build pipeline for constructing MD simulation systems.""" import os @@ -682,3 +682,134 @@ def ionize_solvated_system(ion_config, u_solvated, total_charge): na_spec = SingleMoleculeSpecies(count=num_na, smiles="[Na+]", resname="NA") cl_spec = SingleMoleculeSpecies(count=num_cl, smiles="[Cl-]", resname="CL") return u_ionized, [na_spec, cl_spec] + + +@validate_call +def build_proteinbox(inp: BuildInput): + """Build a protein-in-waterbox system: clean, parametrize, solvate, ionize, relax.""" + from loguru import logger + + from .models.composition import ProteinBoxComposition + from .models.parametrization import Pdb2gmxConfig + from .setup.protein import ( + check_gmx_available, + clean_pdb, + extract_charge_from_topology, + run_pdb2gmx, + update_topology_molecules, + validate_with_grompp, + ) + + if not isinstance(inp.system, ProteinBoxComposition): + raise TypeError("System must specify a ProteinBoxComposition.") + + if not isinstance(inp.parametrization_config, Pdb2gmxConfig): + raise TypeError("Parametrization config must be Pdb2gmxConfig for proteinbox.") + + # 1. Verify GROMACS is available + gmx_path = check_gmx_available() + logger.info(f"Using GROMACS: {gmx_path}") + + protein = inp.system.protein + config = inp.parametrization_config + + # 2. Clean PDB + pdb_input = protein.pdb_path + if not pdb_input.is_absolute(): + pdb_input = pdb_input.resolve() + + cleaned_pdb = Path("cleaned.pdb") + clean_pdb(pdb_input, cleaned_pdb) + + # 3. Run pdb2gmx + pdb2gmx_dir = Path("pdb2gmx_output") + params = run_pdb2gmx( + pdb_path=cleaned_pdb, + config=config, + disulfide_bonds=protein.disulfide_bonds, + protonation_states=protein.protonation_states, + output_dir=pdb2gmx_dir, + ) + logger.info(f"pdb2gmx output: {params.structure_file}") + + # 4. Center protein in cubic box + u = mda.Universe(str(params.structure_file)) + extent = u.atoms.positions.max(axis=0) - u.atoms.positions.min(axis=0) + box_size = extent.max() + 2 * inp.system.box_padding + u.dimensions = [box_size, box_size, box_size, 90, 90, 90] + center = np.array([box_size / 2, box_size / 2, box_size / 2]) + u.atoms.translate(-u.atoms.center_of_geometry() + center) + logger.info(f"Protein centered in cubic box: {box_size:.1f} A") + + # 5. Solvate (reuse existing solvation) + u_solvated = solvate(u, prune_in_z=False) + u_solvated.dimensions = u.dimensions + n_water = len(u_solvated.select_atoms("water").residues) + logger.info(f"Solvation added {n_water} water molecules.") + + if n_water == 0: + raise ValueError("Solvation did not add any water molecules.") + + # 6. Ionize (reuse existing ionization) + protein_charge = params.total_charge + u_ionized, ion_species = ionize_solvated_system( + inp.system.ionization, u_solvated, protein_charge + ) + num_na = ion_species[0].count if len(ion_species) > 0 else 0 + num_cl = ion_species[1].count if len(ion_species) > 1 else 0 + n_water_final = len(u_ionized.select_atoms("water").residues) + + # 7. Update topology [ molecules ] section + import shutil + + topology_dest = Path("topology.top") + shutil.copy(params.topology_file, topology_dest) + shutil.copy(params.position_restraint_file, Path("posre.itp")) + update_topology_molecules(topology_dest, n_water_final, num_na, num_cl) + + # 8. Write coordinates for OpenMM relaxation + u_ionized.atoms.write("system_pre_relax.pdb") + + # 9. OpenMM relaxation with protein position restraints + protein_atoms = u_ionized.select_atoms("protein and not name H*") + protein_indices = protein_atoms.indices.tolist() + + with working_directory("relaxation", create=True, cleanup=True) as wd: + shutil.copy("system_pre_relax.pdb", wd / "system.pdb") + shutil.copy(topology_dest, wd / "topology.top") + shutil.copy("posre.itp", wd / "posre.itp") + + # Copy forcefield directory if referenced by topology + # pdb2gmx topologies use #include from the GROMACS data dir, so OpenMM + # needs the GromacsTopFile to resolve them via GMXLIB or local copies + from .simulation.openmm_utils import relax_with_protein_restraints + + u_relaxed = relax_with_protein_restraints( + u_ionized, + wd / "system.pdb", + wd / "topology.top", + protein_indices=protein_indices, + steps=10000, + ) + + logger.info("OpenMM relaxation complete.") + + # 10. Write final output + u_relaxed.atoms.write("system.pdb") + logger.info("Final system written to system.pdb") + + # 11. Validate with grompp (if em.mdp is available) + manager = RunScheduleManager() + manager.copy_run_files_with_check( + engine=inp.engine, system_type="proteinbox", target_folder=os.getcwd(), force_copy=True + ) + logger.info("Run schedule files copied.") + + em_mdp = Path("em.mdp") + if em_mdp.is_file(): + try: + validate_with_grompp(topology_dest, Path("system.pdb"), em_mdp, Path.cwd()) + except RuntimeError as e: + logger.warning(f"grompp validation failed (non-fatal): {e}") + + logger.info("Proteinbox build complete.") diff --git a/mdfactory/workflows.py b/mdfactory/workflows.py index 9cd4b34..e62ced2 100644 --- a/mdfactory/workflows.py +++ b/mdfactory/workflows.py @@ -5,7 +5,7 @@ from pathlib import Path from typing import Any -from .build import build_bilayer, build_lnp, build_mixedbox +from .build import build_bilayer, build_lnp, build_mixedbox, build_proteinbox from .models.input import BuildInput from .utils.utilities import load_yaml_file @@ -13,6 +13,7 @@ "mixedbox": build_mixedbox, "bilayer": build_bilayer, "lnp": build_lnp, + "proteinbox": build_proteinbox, } From a405acea5ffee68f19c87150c0859554fbdc7f7c Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 11:25:14 +0200 Subject: [PATCH 05/14] Add proteinbox MDP run schedule files (CHARMM36m settings) MDP files for protein-in-waterbox equilibration and production: - em.mdp: steepest descent minimization - nvt.mdp: NVT with -DPOSRES and Protein/Non-Protein tc-grps - npt.mdp: NPT with -DPOSRES and Berendsen barostat - md.mdp: production with Parrinello-Rahman, no position restraints All use CHARMM36m-specific nonbonded settings (1.2nm cutoffs, Force-switch VdW modifier, no dispersion correction). Co-Authored-By: Claude Opus 4.6 --- mdfactory/run_schedules/run_schedules.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mdfactory/run_schedules/run_schedules.yaml b/mdfactory/run_schedules/run_schedules.yaml index 684567e..3a591d4 100644 --- a/mdfactory/run_schedules/run_schedules.yaml +++ b/mdfactory/run_schedules/run_schedules.yaml @@ -36,3 +36,16 @@ num_gpus: 1 num_cores: 8 walltime: 24:00:00 + +- engine: gromacs + system_type: proteinbox + version: 1.0 + run_files: + - em.mdp + - nvt.mdp + - npt.mdp + - md.mdp + settings: + num_gpus: 1 + num_cores: 8 + walltime: 24:00:00 From 1c7fe01542992be83d39bac57a74691c55cdc561 Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 11:26:12 +0200 Subject: [PATCH 06/14] Track proteinbox MDP files (force-add past .gitignore *.mdp rule) Co-Authored-By: Claude Opus 4.6 --- .../run_schedules/gromacs/proteinbox/em.mdp | 20 +++++++ .../run_schedules/gromacs/proteinbox/md.mdp | 57 +++++++++++++++++++ .../run_schedules/gromacs/proteinbox/npt.mdp | 54 ++++++++++++++++++ .../run_schedules/gromacs/proteinbox/nvt.mdp | 52 +++++++++++++++++ 4 files changed, 183 insertions(+) create mode 100644 mdfactory/run_schedules/gromacs/proteinbox/em.mdp create mode 100644 mdfactory/run_schedules/gromacs/proteinbox/md.mdp create mode 100644 mdfactory/run_schedules/gromacs/proteinbox/npt.mdp create mode 100644 mdfactory/run_schedules/gromacs/proteinbox/nvt.mdp diff --git a/mdfactory/run_schedules/gromacs/proteinbox/em.mdp b/mdfactory/run_schedules/gromacs/proteinbox/em.mdp new file mode 100644 index 0000000..6099eaf --- /dev/null +++ b/mdfactory/run_schedules/gromacs/proteinbox/em.mdp @@ -0,0 +1,20 @@ +; Energy minimization parameters for protein systems (CHARMM36m) +integrator = steep +emtol = 1000.0 +emstep = 0.01 +nsteps = 50000 + +; Neighbor searching +nstlist = 1 +cutoff-scheme = Verlet +ns_type = grid +pbc = xyz + +; Electrostatics and VdW (CHARMM36m settings) +coulombtype = PME +rcoulomb = 1.2 +rvdw = 1.2 +vdwtype = cutoff +vdw-modifier = Force-switch +rvdw-switch = 1.0 +DispCorr = no diff --git a/mdfactory/run_schedules/gromacs/proteinbox/md.mdp b/mdfactory/run_schedules/gromacs/proteinbox/md.mdp new file mode 100644 index 0000000..e2af45d --- /dev/null +++ b/mdfactory/run_schedules/gromacs/proteinbox/md.mdp @@ -0,0 +1,57 @@ +; MD production run for protein systems (CHARMM36m) +title = MD production run + +; Run parameters +integrator = md +nsteps = 50000000 +dt = 0.002 + +; Output control +nstxout = 0 +nstvout = 0 +nstfout = 0 +nstenergy = 5000 +nstlog = 5000 +nstxout-compressed = 5000 +compressed-x-grps = System + +; Bond parameters +continuation = yes +constraint_algorithm = lincs +constraints = h-bonds +lincs_iter = 1 +lincs_order = 4 + +; Neighbor searching +cutoff-scheme = Verlet +ns_type = grid +nstlist = 10 +pbc = xyz + +; Electrostatics and VdW (CHARMM36m settings) +coulombtype = PME +pme_order = 4 +fourierspacing = 0.16 +rcoulomb = 1.2 +rvdw = 1.2 +vdwtype = cutoff +vdw-modifier = Force-switch +rvdw-switch = 1.0 +DispCorr = no + +; Temperature coupling +tcoupl = V-rescale +tc-grps = Protein Non-Protein +tau_t = 0.1 0.1 +ref_t = 300 300 + +; Pressure coupling (Parrinello-Rahman for production) +pcoupl = Parrinello-Rahman +pcoupltype = isotropic +tau_p = 5.0 +ref_p = 1.0 +compressibility = 4.5e-5 +refcoord_scaling = com + +; Velocity generation +gen_vel = no diff --git a/mdfactory/run_schedules/gromacs/proteinbox/npt.mdp b/mdfactory/run_schedules/gromacs/proteinbox/npt.mdp new file mode 100644 index 0000000..95eceb8 --- /dev/null +++ b/mdfactory/run_schedules/gromacs/proteinbox/npt.mdp @@ -0,0 +1,54 @@ +; NPT equilibration for protein systems (CHARMM36m) +title = NPT equilibration +define = -DPOSRES + +; Run parameters +integrator = md +nsteps = 50000 +dt = 0.002 + +; Output control +nstxout = 500 +nstvout = 500 +nstenergy = 500 +nstlog = 500 + +; Bond parameters +continuation = yes +constraint_algorithm = lincs +constraints = h-bonds +lincs_iter = 1 +lincs_order = 4 + +; Neighbor searching +cutoff-scheme = Verlet +ns_type = grid +nstlist = 10 +pbc = xyz + +; Electrostatics and VdW (CHARMM36m settings) +coulombtype = PME +pme_order = 4 +fourierspacing = 0.16 +rcoulomb = 1.2 +rvdw = 1.2 +vdwtype = cutoff +vdw-modifier = Force-switch +rvdw-switch = 1.0 +DispCorr = no + +; Temperature coupling +tcoupl = V-rescale +tc-grps = Protein Non-Protein +tau_t = 0.1 0.1 +ref_t = 300 300 + +; Pressure coupling +pcoupl = Berendsen +pcoupltype = isotropic +tau_p = 2.0 +ref_p = 1.0 +compressibility = 4.5e-5 + +; Velocity generation +gen_vel = no diff --git a/mdfactory/run_schedules/gromacs/proteinbox/nvt.mdp b/mdfactory/run_schedules/gromacs/proteinbox/nvt.mdp new file mode 100644 index 0000000..a85ecce --- /dev/null +++ b/mdfactory/run_schedules/gromacs/proteinbox/nvt.mdp @@ -0,0 +1,52 @@ +; NVT equilibration for protein systems (CHARMM36m) +title = NVT equilibration +define = -DPOSRES + +; Run parameters +integrator = md +nsteps = 50000 +dt = 0.002 + +; Output control +nstxout = 500 +nstvout = 500 +nstenergy = 500 +nstlog = 500 + +; Bond parameters +continuation = no +constraint_algorithm = lincs +constraints = h-bonds +lincs_iter = 1 +lincs_order = 4 + +; Neighbor searching +cutoff-scheme = Verlet +ns_type = grid +nstlist = 10 +pbc = xyz + +; Electrostatics and VdW (CHARMM36m settings) +coulombtype = PME +pme_order = 4 +fourierspacing = 0.16 +rcoulomb = 1.2 +rvdw = 1.2 +vdwtype = cutoff +vdw-modifier = Force-switch +rvdw-switch = 1.0 +DispCorr = no + +; Temperature coupling +tcoupl = V-rescale +tc-grps = Protein Non-Protein +tau_t = 0.1 0.1 +ref_t = 300 300 + +; Pressure coupling off for NVT +pcoupl = no + +; Velocity generation +gen_vel = yes +gen_temp = 300 +gen_seed = -1 From f03dc7c1bd2c4d715fc4664e8f6a17367d07e199 Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 11:28:30 +0200 Subject: [PATCH 07/14] Add proteinbox tests and example YAML - 13 unit tests covering ProteinSpecies, ProteinBoxComposition, Pdb2gmxConfig, BuildInput integration, and topology parsing - Fix ProteinSpecies to default count=1/fraction=1.0 at field level (avoids parent Species validator ordering issue) - Example YAML for lysozyme with CHARMM36m force field Co-Authored-By: Claude Opus 4.6 --- examples/proteinbox/lysozyme_charmm36m.yaml | 25 +++ mdfactory/models/species.py | 9 +- mdfactory/tests/test_proteinbox.py | 193 ++++++++++++++++++++ 3 files changed, 220 insertions(+), 7 deletions(-) create mode 100644 examples/proteinbox/lysozyme_charmm36m.yaml create mode 100644 mdfactory/tests/test_proteinbox.py diff --git a/examples/proteinbox/lysozyme_charmm36m.yaml b/examples/proteinbox/lysozyme_charmm36m.yaml new file mode 100644 index 0000000..a67e0e5 --- /dev/null +++ b/examples/proteinbox/lysozyme_charmm36m.yaml @@ -0,0 +1,25 @@ +simulation_type: proteinbox +engine: gromacs +parametrization: pdb2gmx +parametrization_config: + type: pdb2gmx + forcefield: charmm36m + water_model: tip3p + ignh: true + +system: + protein: + resname: LYZ + count: 1 + pdb_path: ./1aki.pdb + disulfide_bonds: + - [6, 127] + - [30, 115] + - [64, 80] + - [76, 94] + protonation_states: + HIS15: HIE + box_padding: 12.0 + ionization: + neutralize: true + concentration: 0.15 diff --git a/mdfactory/models/species.py b/mdfactory/models/species.py index 487676c..a41d958 100644 --- a/mdfactory/models/species.py +++ b/mdfactory/models/species.py @@ -202,6 +202,8 @@ def rdkit_molecule(self) -> "Chem.rdchem.Mol": class ProteinSpecies(Species): """Represent a protein species identified by a PDB structure file.""" + count: Optional[int] = Field(1, description="Number of protein copies (always 1).") + fraction: Optional[float] = Field(1.0, description="Fraction (always 1.0 for protein).") pdb_path: Path = Field(..., description="Path to the input PDB file.") disulfide_bonds: list[tuple[int, int]] = Field( default_factory=list, @@ -212,13 +214,6 @@ class ProteinSpecies(Species): description="Residue-specific protonation states, e.g. {'HIS15': 'HIE', 'GLU35': 'GLH'}.", ) - @model_validator(mode="after") - def set_default_count(self) -> "ProteinSpecies": - if self.count is None and self.fraction is None: - self.count = 1 - self.fraction = 1.0 - return self - @property def charge(self) -> int: raise NotImplementedError( diff --git a/mdfactory/tests/test_proteinbox.py b/mdfactory/tests/test_proteinbox.py new file mode 100644 index 0000000..7663dd7 --- /dev/null +++ b/mdfactory/tests/test_proteinbox.py @@ -0,0 +1,193 @@ +# ABOUTME: Tests for proteinbox models, topology parsing, and YAML validation +# ABOUTME: Covers ProteinSpecies, ProteinBoxComposition, Pdb2gmxConfig, and BuildInput +"""Tests for proteinbox models, topology parsing, and YAML validation.""" + +import textwrap +from pathlib import Path + +import pytest +import yaml + +from mdfactory.models.composition import ProteinBoxComposition +from mdfactory.models.input import BuildInput +from mdfactory.models.parametrization import ( + GromacsProteinParameterSet, + Pdb2gmxConfig, +) +from mdfactory.models.species import ProteinSpecies +from mdfactory.setup.protein import ( + _sum_charges_from_itp, + extract_charge_from_topology, + update_topology_molecules, +) + + +class TestProteinSpecies: + def test_basic_creation(self, tmp_path): + pdb = tmp_path / "test.pdb" + pdb.write_text("ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00\n") + spec = ProteinSpecies(resname="LYZ", pdb_path=pdb) + assert spec.resname == "LYZ" + assert spec.count == 1 + assert spec.fraction == 1.0 + + def test_with_disulfides_and_protonation(self, tmp_path): + pdb = tmp_path / "test.pdb" + pdb.write_text("ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00\n") + spec = ProteinSpecies( + resname="LYZ", + pdb_path=pdb, + disulfide_bonds=[(6, 127), (30, 115)], + protonation_states={"HIS15": "HIE", "GLU35": "GLH"}, + ) + assert len(spec.disulfide_bonds) == 2 + assert spec.disulfide_bonds[0] == (6, 127) + assert spec.protonation_states["HIS15"] == "HIE" + + def test_charge_not_precomputable(self, tmp_path): + pdb = tmp_path / "test.pdb" + pdb.write_text("ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00\n") + spec = ProteinSpecies(resname="LYZ", pdb_path=pdb) + with pytest.raises(NotImplementedError, match="pdb2gmx"): + _ = spec.charge + + def test_resname_validation(self, tmp_path): + pdb = tmp_path / "test.pdb" + pdb.write_text("ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00\n") + with pytest.raises(ValueError, match="Residue name must be less"): + ProteinSpecies(resname="TOOLONG", pdb_path=pdb) + + +class TestProteinBoxComposition: + def test_basic_creation(self, tmp_path): + pdb = tmp_path / "test.pdb" + pdb.write_text("ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00\n") + comp = ProteinBoxComposition( + protein=ProteinSpecies(resname="LYZ", pdb_path=pdb), + box_padding=12.0, + ) + assert comp.box_padding == 12.0 + assert comp.ionization.neutralize is True + assert comp.ionization.concentration == 0.15 + + def test_custom_ionization(self, tmp_path): + pdb = tmp_path / "test.pdb" + pdb.write_text("ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00\n") + comp = ProteinBoxComposition( + protein=ProteinSpecies(resname="LYZ", pdb_path=pdb), + ionization={"neutralize": True, "concentration": 0.10}, + ) + assert comp.ionization.concentration == 0.10 + + +class TestPdb2gmxConfig: + def test_defaults(self): + config = Pdb2gmxConfig() + assert config.forcefield == "charmm36m" + assert config.water_model == "tip3p" + assert config.ignh is True + assert config.merge_all is False + + def test_custom(self): + config = Pdb2gmxConfig(forcefield="amber99sb-ildn", water_model="spc") + assert config.forcefield == "amber99sb-ildn" + assert config.water_model == "spc" + + def test_frozen(self): + config = Pdb2gmxConfig() + with pytest.raises(Exception): + config.forcefield = "other" + + +class TestBuildInputProteinbox: + def test_yaml_roundtrip(self, tmp_path): + pdb = tmp_path / "test.pdb" + pdb.write_text("ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00\n") + + yaml_str = f""" +simulation_type: proteinbox +engine: gromacs +parametrization: pdb2gmx +parametrization_config: + type: pdb2gmx + forcefield: charmm36m + water_model: tip3p + ignh: true +system: + protein: + resname: LYZ + count: 1 + pdb_path: {pdb} + disulfide_bonds: + - [6, 127] + - [30, 115] + protonation_states: + HIS15: HIE + box_padding: 12.0 + ionization: + neutralize: true + concentration: 0.15 +""" + data = yaml.safe_load(yaml_str) + inp = BuildInput(**data) + assert inp.simulation_type == "proteinbox" + assert isinstance(inp.system, ProteinBoxComposition) + assert isinstance(inp.parametrization_config, Pdb2gmxConfig) + assert inp.parametrization == "pdb2gmx" + + def test_default_parametrization_config(self, tmp_path): + pdb = tmp_path / "test.pdb" + pdb.write_text("ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00\n") + + data = { + "simulation_type": "proteinbox", + "parametrization": "pdb2gmx", + "system": { + "protein": {"resname": "LYZ", "count": 1, "pdb_path": str(pdb)}, + }, + } + inp = BuildInput(**data) + assert isinstance(inp.parametrization_config, Pdb2gmxConfig) + assert inp.parametrization_config.forcefield == "charmm36m" + + +class TestTopologyParsing: + def test_sum_charges_from_itp(self, tmp_path): + itp = tmp_path / "protein.itp" + itp.write_text(textwrap.dedent("""\ + [ moleculetype ] + Protein 3 + + [ atoms ] + ; nr type resnr residue atom cgnr charge mass + 1 NH3 1 LYS N 1 -0.300 14.007 + 2 CT 1 LYS CA 2 0.210 12.011 + 3 C 1 LYS C 3 0.510 12.011 + 4 O 1 LYS O 4 -0.510 15.999 + + [ bonds ] + 1 2 + """)) + charge = _sum_charges_from_itp(itp) + assert abs(charge - (-0.090)) < 1e-6 + + def test_update_topology_molecules(self, tmp_path): + top = tmp_path / "topol.top" + top.write_text(textwrap.dedent("""\ + #include "charmm36m.ff/forcefield.itp" + #include "protein.itp" + + [ system ] + Protein in water + + [ molecules ] + Protein_chain_A 1 + """)) + update_topology_molecules(top, n_water=5000, num_na=10, num_cl=18) + content = top.read_text() + assert "SOL" in content + assert "5000" in content + assert "NA" in content + assert "10" in content + assert "CL" in content + assert "18" in content From 263cf542be69c871d190dce0f1ec9ed4f5ccd447 Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 11:42:38 +0200 Subject: [PATCH 08/14] Fix review findings: Path import, cwd paths, metadata, protonation Addresses 4 P2 review comments: - Add missing `from pathlib import Path` to build.py module scope - Resolve all paths to absolute in run_pdb2gmx before subprocess call (avoids cwd confusion with output_dir) - Add species/total_count/charge properties to ProteinBoxComposition and proteinbox case in BuildInput.metadata for analysis compatibility - Implement _apply_protonation_states: renames residues in PDB before pdb2gmx so protonation overrides (HIS->HIE, GLU->GLH) are applied Co-Authored-By: Claude Opus 4.6 --- mdfactory/build.py | 6 +-- mdfactory/models/composition.py | 15 ++++++ mdfactory/models/input.py | 4 ++ mdfactory/setup/protein.py | 74 ++++++++++++++++++++++++++++++ mdfactory/tests/test_proteinbox.py | 40 ++++++++++++++++ 5 files changed, 136 insertions(+), 3 deletions(-) diff --git a/mdfactory/build.py b/mdfactory/build.py index 5e37de1..50a2349 100644 --- a/mdfactory/build.py +++ b/mdfactory/build.py @@ -4,6 +4,7 @@ import os from functools import partial +from pathlib import Path import MDAnalysis as mda import numpy as np @@ -687,6 +688,8 @@ def ionize_solvated_system(ion_config, u_solvated, total_charge): @validate_call def build_proteinbox(inp: BuildInput): """Build a protein-in-waterbox system: clean, parametrize, solvate, ionize, relax.""" + import shutil + from loguru import logger from .models.composition import ProteinBoxComposition @@ -694,7 +697,6 @@ def build_proteinbox(inp: BuildInput): from .setup.protein import ( check_gmx_available, clean_pdb, - extract_charge_from_topology, run_pdb2gmx, update_topology_molecules, validate_with_grompp, @@ -760,8 +762,6 @@ def build_proteinbox(inp: BuildInput): n_water_final = len(u_ionized.select_atoms("water").residues) # 7. Update topology [ molecules ] section - import shutil - topology_dest = Path("topology.top") shutil.copy(params.topology_file, topology_dest) shutil.copy(params.position_restraint_file, Path("posre.itp")) diff --git a/mdfactory/models/composition.py b/mdfactory/models/composition.py index d736e9a..b88bfd3 100644 --- a/mdfactory/models/composition.py +++ b/mdfactory/models/composition.py @@ -389,3 +389,18 @@ class ProteinBoxComposition(BaseModel): ionization: IonizationConfig = Field( default_factory=IonizationConfig, description="Configuration for ionization." ) + + @property + def species(self) -> list[ProteinSpecies]: + """Return protein as a single-element species list for metadata compatibility.""" + return [self.protein] + + @property + def total_count(self) -> int: + """Return 1 (one protein) for metadata compatibility.""" + return 1 + + @property + def charge(self) -> int: + """Protein charge is not pre-computable; return 0 as placeholder.""" + return 0 diff --git a/mdfactory/models/input.py b/mdfactory/models/input.py index 60b47b6..9c1fbfe 100644 --- a/mdfactory/models/input.py +++ b/mdfactory/models/input.py @@ -80,6 +80,10 @@ def metadata(self) -> dict[str, Any]: elif self.simulation_type == "mixedbox": system_specific["target_density"] = self.system.target_density system_specific["ionization"] = self.system.ionization.model_dump() + elif self.simulation_type == "proteinbox": + system_specific["box_padding"] = self.system.box_padding + system_specific["ionization"] = self.system.ionization.model_dump() + system_specific["pdb_path"] = str(self.system.protein.pdb_path) return { "hash": self.hash, diff --git a/mdfactory/setup/protein.py b/mdfactory/setup/protein.py index aa6a05e..ea3c348 100644 --- a/mdfactory/setup/protein.py +++ b/mdfactory/setup/protein.py @@ -11,6 +11,70 @@ from ..models.parametrization import GromacsProteinParameterSet, Pdb2gmxConfig +def _apply_protonation_states( + pdb_path: Path, protonation_states: dict[str, str], output_dir: Path +) -> Path: + """Apply protonation state overrides by renaming residues in the PDB. + + pdb2gmx selects protonation states based on residue names (e.g. HIS -> HID/HIE/HIP + for CHARMM, ASP -> ASPP for protonated aspartate). This function renames matching + residues in the PDB so pdb2gmx applies the correct parameters without interactive + prompts. + + Parameters + ---------- + pdb_path : Path + Input PDB file (absolute path). + protonation_states : dict[str, str] + Mapping of "RESNAME" to target residue name, e.g. + {"HIS15": "HID", "GLU35": "GLH"}. + output_dir : Path + Directory for the modified PDB. + + Returns + ------- + Path + Path to the modified PDB file. + + """ + import re + + output_pdb = output_dir / "protonation_adjusted.pdb" + + # Parse overrides: "HIS15" -> (original_resname_prefix="HIS", resid=15, new_name="HID") + overrides = [] + for key, new_name in protonation_states.items(): + match = re.match(r"([A-Za-z]+)(\d+)", key) + if not match: + raise ValueError( + f"Invalid protonation state key '{key}'. " + "Expected format: RESNAME, e.g. 'HIS15'." + ) + resname_prefix = match.group(1).upper() + resid = int(match.group(2)) + overrides.append((resname_prefix, resid, new_name.upper())) + + with open(pdb_path) as f_in, open(output_pdb, "w") as f_out: + for line in f_in: + if line.startswith(("ATOM", "HETATM")) and len(line) >= 26: + current_resname = line[17:20].strip() + try: + current_resid = int(line[22:26].strip()) + except ValueError: + f_out.write(line) + continue + + for prefix, target_resid, new_name in overrides: + if current_resname.startswith(prefix) and current_resid == target_resid: + padded = f"{new_name:<3s}" + line = line[:17] + padded + line[20:] + break + f_out.write(line) + + logger.info(f"Protonation states applied: {protonation_states}") + return output_pdb + + def check_gmx_available() -> Path: """Verify that the GROMACS gmx binary is available on PATH. @@ -97,6 +161,10 @@ def run_pdb2gmx( """ output_dir.mkdir(parents=True, exist_ok=True) + # Resolve all paths to absolute to avoid cwd confusion with subprocess + pdb_path = pdb_path.resolve() + output_dir = output_dir.resolve() + structure_file = output_dir / "processed.gro" topology_file = output_dir / "topol.top" posre_file = output_dir / "posre.itp" @@ -120,6 +188,12 @@ def run_pdb2gmx( if disulfide_bonds: cmd.append("-ss") + # Apply protonation state overrides by renaming residues in the PDB + # before pdb2gmx processes it (standard approach for CHARMM/AMBER FFs) + if protonation_states: + pdb_path = _apply_protonation_states(pdb_path, protonation_states, output_dir) + cmd[cmd.index("-f") + 1] = str(pdb_path) + logger.info(f"Running pdb2gmx: {' '.join(cmd)}") result = subprocess.run( diff --git a/mdfactory/tests/test_proteinbox.py b/mdfactory/tests/test_proteinbox.py index 7663dd7..f8d845e 100644 --- a/mdfactory/tests/test_proteinbox.py +++ b/mdfactory/tests/test_proteinbox.py @@ -16,6 +16,7 @@ ) from mdfactory.models.species import ProteinSpecies from mdfactory.setup.protein import ( + _apply_protonation_states, _sum_charges_from_itp, extract_charge_from_topology, update_topology_molecules, @@ -191,3 +192,42 @@ def test_update_topology_molecules(self, tmp_path): assert "10" in content assert "CL" in content assert "18" in content + + def test_apply_protonation_states(self, tmp_path): + pdb = tmp_path / "input.pdb" + pdb.write_text(textwrap.dedent("""\ + ATOM 1 N HIS A 15 1.000 2.000 3.000 1.00 0.00 + ATOM 2 CA HIS A 15 1.500 2.500 3.500 1.00 0.00 + ATOM 3 N ALA A 16 2.000 3.000 4.000 1.00 0.00 + ATOM 4 N GLU A 35 3.000 4.000 5.000 1.00 0.00 + """)) + result = _apply_protonation_states( + pdb, {"HIS15": "HIE", "GLU35": "GLH"}, tmp_path + ) + content = result.read_text() + # HIS at resid 15 should be renamed to HIE + assert "HIE" in content + # GLU at resid 35 should be renamed to GLH + assert "GLH" in content + # ALA at resid 16 should be unchanged + assert "ALA" in content + + +class TestMetadata: + def test_proteinbox_metadata(self, tmp_path): + pdb = tmp_path / "test.pdb" + pdb.write_text("ATOM 1 N ALA A 1 0.000 0.000 0.000 1.00 0.00\n") + data = { + "simulation_type": "proteinbox", + "parametrization": "pdb2gmx", + "system": { + "protein": {"resname": "LYZ", "count": 1, "pdb_path": str(pdb)}, + "box_padding": 12.0, + }, + } + inp = BuildInput(**data) + meta = inp.metadata + assert meta["simulation_type"] == "proteinbox" + assert meta["total_count"] == 1 + assert "box_padding" in meta["system_specific"] + assert meta["system_specific"]["box_padding"] == 12.0 From decb36d5193a7e3dffe616b9dc7db419416f3fab Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 12:52:47 +0200 Subject: [PATCH 09/14] Add force field check, fix grompp cleanup, fix file handle leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add check_forcefield_available() that verifies the force field directory exists in GROMACS share, GMXLIB, or cwd before invoking pdb2gmx. Lists available force fields on failure. - Clean up mdout.mdp artifact in validate_with_grompp alongside check.tpr. - Fix bare open() file handles in relax_with_protein_restraints (use with-blocks). - Add CHARMM HIS alias translation (HIE→HSE, HID→HSD, HIP→HSP for charmm FFs). - Use re.fullmatch for protonation state key parsing (reject trailing chars). - Validate 3-character residue names for PDB column safety. - Add disulfide bond prompt generation (_build_disulfide_prompt_input) for deterministic pdb2gmx -ss interaction. - Resolve paths before working_directory context switch to avoid cwd breakage. Co-Authored-By: Claude Opus 4.6 --- mdfactory/build.py | 9 +- mdfactory/setup/protein.py | 204 +++++++++++++++++++++++++-- mdfactory/simulation/openmm_utils.py | 10 +- mdfactory/tests/test_proteinbox.py | 110 +++++++++++++++ 4 files changed, 311 insertions(+), 22 deletions(-) diff --git a/mdfactory/build.py b/mdfactory/build.py index 50a2349..3b98fbe 100644 --- a/mdfactory/build.py +++ b/mdfactory/build.py @@ -768,16 +768,19 @@ def build_proteinbox(inp: BuildInput): update_topology_molecules(topology_dest, n_water_final, num_na, num_cl) # 8. Write coordinates for OpenMM relaxation - u_ionized.atoms.write("system_pre_relax.pdb") + pre_relax_structure = Path("system_pre_relax.pdb").resolve() + u_ionized.atoms.write(str(pre_relax_structure)) # 9. OpenMM relaxation with protein position restraints protein_atoms = u_ionized.select_atoms("protein and not name H*") protein_indices = protein_atoms.indices.tolist() + topology_dest = topology_dest.resolve() + position_restraint_file = Path("posre.itp").resolve() with working_directory("relaxation", create=True, cleanup=True) as wd: - shutil.copy("system_pre_relax.pdb", wd / "system.pdb") + shutil.copy(pre_relax_structure, wd / "system.pdb") shutil.copy(topology_dest, wd / "topology.top") - shutil.copy("posre.itp", wd / "posre.itp") + shutil.copy(position_restraint_file, wd / "posre.itp") # Copy forcefield directory if referenced by topology # pdb2gmx topologies use #include from the GROMACS data dir, so OpenMM diff --git a/mdfactory/setup/protein.py b/mdfactory/setup/protein.py index ea3c348..68b3a5c 100644 --- a/mdfactory/setup/protein.py +++ b/mdfactory/setup/protein.py @@ -2,6 +2,7 @@ # ABOUTME: Wraps PDBFixer for cleaning and gmx pdb2gmx for topology generation """Protein preparation utilities for the proteinbox simulation type.""" +import os import shutil import subprocess from pathlib import Path @@ -10,16 +11,115 @@ from ..models.parametrization import GromacsProteinParameterSet, Pdb2gmxConfig +_DISULFIDE_CUTOFF_ANGSTROM = 2.2 + + +def _resolve_protonation_residue_name(residue_name: str, forcefield: str | None) -> str: + """Return the residue name to write into the PDB for a protonation override.""" + residue_name = residue_name.upper() + if forcefield and forcefield.lower().startswith("charmm"): + residue_name = { + "HID": "HSD", + "HIE": "HSE", + "HIP": "HSP", + }.get(residue_name, residue_name) + + if len(residue_name) != 3: + raise ValueError( + f"Protonation residue name '{residue_name}' cannot be written to PDB " + "without shifting columns. Use a 3-character residue name supported by " + "the selected force field." + ) + return residue_name + + +def _distance(coords_a: tuple[float, float, float], coords_b: tuple[float, float, float]) -> float: + return sum((a - b) ** 2 for a, b in zip(coords_a, coords_b, strict=True)) ** 0.5 + + +def _read_cysteine_sg_atoms(pdb_path: Path) -> dict[int, tuple[float, float, float]]: + """Read cysteine SG atom coordinates keyed by residue id.""" + atoms: dict[int, tuple[float, float, float]] = {} + duplicate_resids = set() + + with open(pdb_path) as f: + for line in f: + if not line.startswith(("ATOM", "HETATM")) or len(line) < 54: + continue + + atom_name = line[12:16].strip() + resname = line[17:20].strip() + if atom_name != "SG" or resname not in {"CYS", "CYM", "CYX"}: + continue + + resid = int(line[22:26].strip()) + coords = ( + float(line[30:38].strip()), + float(line[38:46].strip()), + float(line[46:54].strip()), + ) + if resid in atoms: + duplicate_resids.add(resid) + atoms[resid] = coords + + if duplicate_resids: + raise ValueError( + "Disulfide bond overrides use residue ids only and cannot disambiguate " + f"duplicate cysteine residue ids: {sorted(duplicate_resids)}." + ) + return atoms + + +def _build_disulfide_prompt_input( + pdb_path: Path, + disulfide_bonds: list[tuple[int, int]], + cutoff_angstrom: float = _DISULFIDE_CUTOFF_ANGSTROM, +) -> str: + """Build deterministic yes/no input for pdb2gmx disulfide prompts.""" + requested = {tuple(sorted(pair)) for pair in disulfide_bonds} + for pair in requested: + if pair[0] == pair[1]: + raise ValueError(f"Invalid disulfide bond with identical residues: {pair}") + + sg_atoms = _read_cysteine_sg_atoms(pdb_path) + missing_resids = sorted({resid for pair in requested for resid in pair if resid not in sg_atoms}) + if missing_resids: + raise ValueError( + f"Requested disulfide residues are not cysteine SG atoms in {pdb_path}: " + f"{missing_resids}" + ) + + candidate_pairs = [] + resids = sorted(sg_atoms) + for i, resid_a in enumerate(resids): + for resid_b in resids[i + 1 :]: + if _distance(sg_atoms[resid_a], sg_atoms[resid_b]) <= cutoff_angstrom: + candidate_pairs.append((resid_a, resid_b)) + + missing_pairs = sorted(requested - set(candidate_pairs)) + if missing_pairs: + raise ValueError( + "Requested disulfide bonds were not detected as close CYS SG pairs " + f"within {cutoff_angstrom:.1f} A in {pdb_path}: {missing_pairs}" + ) + + answers = ["y" if pair in requested else "n" for pair in candidate_pairs] + logger.info(f"pdb2gmx disulfide candidates: {candidate_pairs}; requested: {sorted(requested)}") + return "\n".join(answers) + "\n" + def _apply_protonation_states( - pdb_path: Path, protonation_states: dict[str, str], output_dir: Path + pdb_path: Path, + protonation_states: dict[str, str], + output_dir: Path, + forcefield: str | None = None, ) -> Path: """Apply protonation state overrides by renaming residues in the PDB. - pdb2gmx selects protonation states based on residue names (e.g. HIS -> HID/HIE/HIP - for CHARMM, ASP -> ASPP for protonated aspartate). This function renames matching - residues in the PDB so pdb2gmx applies the correct parameters without interactive - prompts. + pdb2gmx selects protonation states based on residue names. This function renames + matching residues in the PDB so pdb2gmx applies the requested 3-character residue + names without interactive prompts. Common AMBER histidine names are translated to + CHARMM names when a CHARMM force field is selected. Parameters ---------- @@ -30,6 +130,8 @@ def _apply_protonation_states( {"HIS15": "HID", "GLU35": "GLH"}. output_dir : Path Directory for the modified PDB. + forcefield : str | None + Selected pdb2gmx force field, used for force-field-specific aliases. Returns ------- @@ -44,7 +146,7 @@ def _apply_protonation_states( # Parse overrides: "HIS15" -> (original_resname_prefix="HIS", resid=15, new_name="HID") overrides = [] for key, new_name in protonation_states.items(): - match = re.match(r"([A-Za-z]+)(\d+)", key) + match = re.fullmatch(r"([A-Za-z]+)(\d+)", key) if not match: raise ValueError( f"Invalid protonation state key '{key}'. " @@ -52,7 +154,9 @@ def _apply_protonation_states( ) resname_prefix = match.group(1).upper() resid = int(match.group(2)) - overrides.append((resname_prefix, resid, new_name.upper())) + overrides.append( + (resname_prefix, resid, _resolve_protonation_residue_name(new_name, forcefield)) + ) with open(pdb_path) as f_in, open(output_pdb, "w") as f_out: for line in f_in: @@ -98,6 +202,71 @@ def check_gmx_available() -> Path: return Path(gmx_path) +def check_forcefield_available(forcefield: str) -> None: + """Verify that the requested force field is findable by GROMACS. + + Searches the GROMACS share directory, GMXLIB paths, and the current + working directory for a directory named ``{forcefield}.ff``. + + Parameters + ---------- + forcefield : str + Force field name as passed to ``gmx pdb2gmx -ff``. + + Raises + ------ + ValueError + If the force field directory is not found, listing available options. + + """ + ff_dirname = f"{forcefield}.ff" + search_paths: list[Path] = [] + + # GROMACS data prefix from the binary + result = subprocess.run( + ["gmx", "--version"], + text=True, + capture_output=True, + timeout=10, + ) + for line in result.stdout.splitlines(): + if "Data prefix" in line: + data_prefix = Path(line.split(":", 1)[1].strip()) + top_dir = data_prefix / "share" / "gromacs" / "top" + if top_dir.is_dir(): + search_paths.append(top_dir) + break + + # GMXLIB environment variable (colon-separated paths) + gmxlib = os.environ.get("GMXLIB", "") + for p in gmxlib.split(":"): + if p and Path(p).is_dir(): + search_paths.append(Path(p)) + + # Current working directory + search_paths.append(Path.cwd()) + + # Check if the force field exists in any search path + for search_dir in search_paths: + if (search_dir / ff_dirname).is_dir(): + return + + # Not found — collect available force fields for the error message + available = set() + for search_dir in search_paths: + if search_dir.is_dir(): + for entry in search_dir.iterdir(): + if entry.is_dir() and entry.name.endswith(".ff"): + available.add(entry.stem) + + raise ValueError( + f"Force field '{forcefield}' not found. " + f"Searched: {[str(p) for p in search_paths]}.\n" + f"Available force fields: {sorted(available)}.\n" + "Install the force field to a search path or set GMXLIB." + ) + + def clean_pdb(pdb_path: Path, output_path: Path) -> Path: """Clean a PDB file using PDBFixer: remove heterogens, add missing heavy atoms. @@ -159,6 +328,8 @@ def run_pdb2gmx( Paths to generated topology, structure, and position restraint files. """ + check_forcefield_available(config.forcefield) + output_dir.mkdir(parents=True, exist_ok=True) # Resolve all paths to absolute to avoid cwd confusion with subprocess @@ -185,22 +356,26 @@ def run_pdb2gmx( if config.merge_all: cmd.extend(["-merge", "all"]) - if disulfide_bonds: - cmd.append("-ss") - # Apply protonation state overrides by renaming residues in the PDB # before pdb2gmx processes it (standard approach for CHARMM/AMBER FFs) if protonation_states: - pdb_path = _apply_protonation_states(pdb_path, protonation_states, output_dir) + pdb_path = _apply_protonation_states( + pdb_path, protonation_states, output_dir, forcefield=config.forcefield + ) cmd[cmd.index("-f") + 1] = str(pdb_path) + pdb2gmx_input = "" + if disulfide_bonds: + cmd.append("-ss") + pdb2gmx_input = _build_disulfide_prompt_input(pdb_path, disulfide_bonds) + logger.info(f"Running pdb2gmx: {' '.join(cmd)}") result = subprocess.run( cmd, cwd=str(output_dir), text=True, - input="", + input=pdb2gmx_input, capture_output=True, timeout=120, ) @@ -379,9 +554,12 @@ def validate_with_grompp(topology: Path, structure: Path, mdp: Path, cwd: Path) timeout=60, ) - # Clean up the check tpr + # Clean up grompp artifacts if tpr_out.is_file(): tpr_out.unlink() + mdout_path = cwd / "mdout.mdp" + if mdout_path.is_file(): + mdout_path.unlink() if result.returncode != 0: raise RuntimeError( diff --git a/mdfactory/simulation/openmm_utils.py b/mdfactory/simulation/openmm_utils.py index 6e6c64d..81fdb29 100644 --- a/mdfactory/simulation/openmm_utils.py +++ b/mdfactory/simulation/openmm_utils.py @@ -744,12 +744,10 @@ def relax_with_protein_restraints( box_vectors = state.getPeriodicBoxVectors() simulation.topology.setPeriodicBoxVectors(box_vectors) - app.PDBFile.writeFile( - simulation.topology, positions, open("relaxed_system.pdb", "w"), keepIds=True - ) - app.PDBxFile.writeFile( - simulation.topology, positions, open("relaxed_system.mmcif", "w"), keepIds=True - ) + with open("relaxed_system.pdb", "w") as f: + app.PDBFile.writeFile(simulation.topology, positions, f, keepIds=True) + with open("relaxed_system.mmcif", "w") as f: + app.PDBxFile.writeFile(simulation.topology, positions, f, keepIds=True) u_tmp = mda.Universe(app.PDBxFile("relaxed_system.mmcif")) ret = mda.Merge(u.atoms) diff --git a/mdfactory/tests/test_proteinbox.py b/mdfactory/tests/test_proteinbox.py index f8d845e..d7ae889 100644 --- a/mdfactory/tests/test_proteinbox.py +++ b/mdfactory/tests/test_proteinbox.py @@ -17,12 +17,22 @@ from mdfactory.models.species import ProteinSpecies from mdfactory.setup.protein import ( _apply_protonation_states, + _build_disulfide_prompt_input, _sum_charges_from_itp, + check_forcefield_available, extract_charge_from_topology, + run_pdb2gmx, update_topology_molecules, ) +def _pdb_atom(serial, atom_name, resname, resid, x, y, z): + return ( + f"ATOM {serial:5d} {atom_name:<4s} {resname:>3s} A{resid:4d} " + f"{x:8.3f}{y:8.3f}{z:8.3f} 1.00 0.00\n" + ) + + class TestProteinSpecies: def test_basic_creation(self, tmp_path): pdb = tmp_path / "test.pdb" @@ -100,6 +110,20 @@ def test_frozen(self): config.forcefield = "other" +class TestForceFieldCheck: + def test_available_forcefield_passes(self): + check_forcefield_available("charmm27") + + def test_unavailable_forcefield_raises(self): + with pytest.raises(ValueError, match="not found"): + check_forcefield_available("nonexistent_ff_xyz") + + def test_error_lists_available_forcefields(self): + with pytest.raises(ValueError, match="charmm27") as exc_info: + check_forcefield_available("nonexistent_ff_xyz") + assert "Available force fields" in str(exc_info.value) + + class TestBuildInputProteinbox: def test_yaml_roundtrip(self, tmp_path): pdb = tmp_path / "test.pdb" @@ -212,6 +236,92 @@ def test_apply_protonation_states(self, tmp_path): # ALA at resid 16 should be unchanged assert "ALA" in content + def test_apply_protonation_states_translates_charmm_histidine_alias(self, tmp_path): + pdb = tmp_path / "input.pdb" + pdb.write_text(_pdb_atom(1, "N", "HIS", 15, 1.0, 2.0, 3.0)) + result = _apply_protonation_states( + pdb, {"HIS15": "HIE"}, tmp_path, forcefield="charmm36m" + ) + assert "HSE" in result.read_text() + + def test_apply_protonation_states_rejects_four_character_names(self, tmp_path): + pdb = tmp_path / "input.pdb" + pdb.write_text(_pdb_atom(1, "N", "GLU", 35, 1.0, 2.0, 3.0)) + with pytest.raises(ValueError, match="cannot be written to PDB"): + _apply_protonation_states(pdb, {"GLU35": "GLUP"}, tmp_path) + + def test_build_disulfide_prompt_input_answers_exact_requested_pairs(self, tmp_path): + pdb = tmp_path / "input.pdb" + pdb.write_text( + "".join( + [ + _pdb_atom(1, "SG", "CYS", 1, 0.0, 0.0, 0.0), + _pdb_atom(2, "SG", "CYS", 4, 2.0, 0.0, 0.0), + _pdb_atom(3, "SG", "CYS", 2, 10.0, 0.0, 0.0), + _pdb_atom(4, "SG", "CYS", 3, 12.0, 0.0, 0.0), + ] + ) + ) + assert _build_disulfide_prompt_input(pdb, [(2, 3)]) == "n\ny\n" + + def test_build_disulfide_prompt_input_rejects_missing_pair(self, tmp_path): + pdb = tmp_path / "input.pdb" + pdb.write_text( + "".join( + [ + _pdb_atom(1, "SG", "CYS", 6, 0.0, 0.0, 0.0), + _pdb_atom(2, "SG", "CYS", 127, 8.0, 0.0, 0.0), + ] + ) + ) + with pytest.raises(ValueError, match="not detected as close CYS SG pairs"): + _build_disulfide_prompt_input(pdb, [(6, 127)]) + + def test_run_pdb2gmx_passes_disulfide_answers_to_subprocess(self, tmp_path, monkeypatch): + pdb = tmp_path / "input.pdb" + pdb.write_text( + "".join( + [ + _pdb_atom(1, "SG", "CYS", 1, 0.0, 0.0, 0.0), + _pdb_atom(2, "SG", "CYS", 4, 2.0, 0.0, 0.0), + _pdb_atom(3, "SG", "CYS", 2, 10.0, 0.0, 0.0), + _pdb_atom(4, "SG", "CYS", 3, 12.0, 0.0, 0.0), + ] + ) + ) + calls = {} + + def fake_run(cmd, cwd, text, input, capture_output, timeout): + calls["cmd"] = cmd + calls["cwd"] = cwd + calls["input"] = input + output_dir = Path(cwd) + (output_dir / "processed.gro").write_text("mock gro\n") + (output_dir / "topol.top").write_text("[ atoms ]\n") + (output_dir / "posre.itp").write_text("mock posre\n") + + class Result: + returncode = 0 + stdout = "" + stderr = "" + + return Result() + + monkeypatch.setattr("mdfactory.setup.protein.subprocess.run", fake_run) + monkeypatch.setattr("mdfactory.setup.protein.check_forcefield_available", lambda ff: None) + + run_pdb2gmx( + pdb_path=pdb, + config=Pdb2gmxConfig(forcefield="amber99sb-ildn", water_model="tip3p"), + disulfide_bonds=[(2, 3)], + protonation_states={}, + output_dir=tmp_path / "pdb2gmx_output", + ) + + assert "-ss" in calls["cmd"] + assert calls["input"] == "n\ny\n" + assert calls["cwd"] == str((tmp_path / "pdb2gmx_output").resolve()) + class TestMetadata: def test_proteinbox_metadata(self, tmp_path): From c07392a23b2cb78eaee286d08047dda9f4bc979c Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 13:05:44 +0200 Subject: [PATCH 10/14] Add force field registry with auto-download and fix charge parsing - Add FORCEFIELD_REGISTRY mapping friendly names (charmm36m, charmm36m-ljpme) to MacKerell lab download URLs and extracted directory names. - Auto-download missing force fields on first use from the registry. Stored in platformdirs user_data_dir (~/Library/Application Support/mdfactory/forcefields/). - resolve_forcefield() translates friendly names to actual directory stems so users write "charmm36m" in YAML and pdb2gmx receives the correct directory name. - Inject GMXLIB in subprocess env so gmx finds downloaded force fields. - Fix extract_charge_from_topology: skip .ff/ library includes (ions.itp, tip3p.itp) that contain atom type templates, not system charges. Was causing +159 charge for lysozyme with charmm36m instead of +8. Co-Authored-By: Claude Opus 4.6 --- mdfactory/setup/protein.py | 155 +++++++++++++++++++++++++---- mdfactory/tests/test_proteinbox.py | 2 +- 2 files changed, 139 insertions(+), 18 deletions(-) diff --git a/mdfactory/setup/protein.py b/mdfactory/setup/protein.py index 68b3a5c..cc4ce31 100644 --- a/mdfactory/setup/protein.py +++ b/mdfactory/setup/protein.py @@ -2,17 +2,39 @@ # ABOUTME: Wraps PDBFixer for cleaning and gmx pdb2gmx for topology generation """Protein preparation utilities for the proteinbox simulation type.""" +import io import os import shutil import subprocess +import tarfile from pathlib import Path +from urllib.request import urlopen from loguru import logger from ..models.parametrization import GromacsProteinParameterSet, Pdb2gmxConfig +from ..settings import get_data_dir _DISULFIDE_CUTOFF_ANGSTROM = 2.2 +FORCEFIELD_REGISTRY: dict[str, dict[str, str]] = { + "charmm36m": { + "url": "http://mackerell.umaryland.edu/download.php?filename=CHARMM_ff_params_files/charmm36-feb2026_cgenff-5.0.ff.tgz", + "dirname": "charmm36-feb2026_cgenff-5.0.ff", + "description": "CHARMM36m protein + CGenFF 5.0 (Feb 2026)", + }, + "charmm36m-ljpme": { + "url": "http://mackerell.umaryland.edu/download.php?filename=CHARMM_ff_params_files/charmm36-feb2026_ljpme_cgenff-5.0.ff.tgz", + "dirname": "charmm36-feb2026_ljpme_cgenff-5.0.ff", + "description": "CHARMM36m + CGenFF 5.0, LJ-PME variant (Feb 2026)", + }, +} + + +def get_forcefield_dir() -> Path: + """Return the directory where downloaded force fields are stored.""" + return get_data_dir() / "forcefields" + def _resolve_protonation_residue_name(residue_name: str, forcefield: str | None) -> str: """Return the residue name to write into the PDB for a protonation override.""" @@ -202,27 +224,69 @@ def check_gmx_available() -> Path: return Path(gmx_path) -def check_forcefield_available(forcefield: str) -> None: - """Verify that the requested force field is findable by GROMACS. - - Searches the GROMACS share directory, GMXLIB paths, and the current - working directory for a directory named ``{forcefield}.ff``. +def download_forcefield(name: str) -> Path: + """Download a force field from the registry to the local data directory. Parameters ---------- - forcefield : str - Force field name as passed to ``gmx pdb2gmx -ff``. + name : str + Registry key (e.g. "charmm36m"). + + Returns + ------- + Path + Path to the extracted force field directory. Raises ------ ValueError - If the force field directory is not found, listing available options. + If the name is not in the registry. """ - ff_dirname = f"{forcefield}.ff" + if name not in FORCEFIELD_REGISTRY: + raise ValueError( + f"Force field '{name}' is not in the download registry. " + f"Available: {sorted(FORCEFIELD_REGISTRY.keys())}." + ) + + entry = FORCEFIELD_REGISTRY[name] + ff_dir = get_forcefield_dir() + ff_dir.mkdir(parents=True, exist_ok=True) + + target = ff_dir / entry["dirname"] + if target.is_dir(): + logger.info(f"Force field '{name}' already present at {target}") + return target + + logger.info(f"Downloading force field '{name}' from {entry['url']}...") + response = urlopen(entry["url"], timeout=120) # noqa: S310 + data = io.BytesIO(response.read()) + + with tarfile.open(fileobj=data, mode="r:gz") as tar: + tar.extractall(path=ff_dir, filter="data") + + if not target.is_dir(): + raise RuntimeError( + f"Download succeeded but expected directory '{entry['dirname']}' " + f"not found in {ff_dir}. Tarball may have unexpected structure." + ) + + logger.info(f"Force field '{name}' installed to {target}") + return target + + +def download_all_forcefields() -> list[Path]: + """Download all force fields in the registry. Idempotent.""" + paths = [] + for name in FORCEFIELD_REGISTRY: + paths.append(download_forcefield(name)) + return paths + + +def _get_gmx_search_paths() -> list[Path]: + """Return the directories GROMACS searches for force fields.""" search_paths: list[Path] = [] - # GROMACS data prefix from the binary result = subprocess.run( ["gmx", "--version"], text=True, @@ -237,32 +301,79 @@ def check_forcefield_available(forcefield: str) -> None: search_paths.append(top_dir) break - # GMXLIB environment variable (colon-separated paths) gmxlib = os.environ.get("GMXLIB", "") for p in gmxlib.split(":"): if p and Path(p).is_dir(): search_paths.append(Path(p)) - # Current working directory + ff_dir = get_forcefield_dir() + if ff_dir.is_dir(): + search_paths.append(ff_dir) + search_paths.append(Path.cwd()) + return search_paths + + +def resolve_forcefield(forcefield: str) -> str: + """Resolve a friendly force field name to the actual directory stem. + + If the name is a registry key (e.g. "charmm36m"), returns the actual + directory name (e.g. "charmm36-feb2026_cgenff-5.0"). Otherwise returns + the name unchanged (for built-in FFs like "charmm27"). + + """ + if forcefield in FORCEFIELD_REGISTRY: + return FORCEFIELD_REGISTRY[forcefield]["dirname"].removesuffix(".ff") + return forcefield + + +def check_forcefield_available(forcefield: str) -> None: + """Verify that the requested force field is findable by GROMACS. + + If the force field is in the download registry but not installed, + downloads it automatically. + + Parameters + ---------- + forcefield : str + Force field name as passed to ``gmx pdb2gmx -ff``, or a registry + key (e.g. "charmm36m"). + + Raises + ------ + ValueError + If the force field is not found and not in the registry. + + """ + # Resolve friendly name to actual dirname + resolved = resolve_forcefield(forcefield) + ff_dirname = f"{resolved}.ff" + + search_paths = _get_gmx_search_paths() - # Check if the force field exists in any search path for search_dir in search_paths: if (search_dir / ff_dirname).is_dir(): return - # Not found — collect available force fields for the error message + # Not found — try downloading from registry + if forcefield in FORCEFIELD_REGISTRY: + download_forcefield(forcefield) + return + + # Not in registry either — error with available options available = set() for search_dir in search_paths: if search_dir.is_dir(): for entry in search_dir.iterdir(): if entry.is_dir() and entry.name.endswith(".ff"): available.add(entry.stem) + available.update(FORCEFIELD_REGISTRY.keys()) raise ValueError( f"Force field '{forcefield}' not found. " f"Searched: {[str(p) for p in search_paths]}.\n" f"Available force fields: {sorted(available)}.\n" + f"Downloadable: {sorted(FORCEFIELD_REGISTRY.keys())}.\n" "Install the force field to a search path or set GMXLIB." ) @@ -329,6 +440,7 @@ def run_pdb2gmx( """ check_forcefield_available(config.forcefield) + resolved_ff = resolve_forcefield(config.forcefield) output_dir.mkdir(parents=True, exist_ok=True) @@ -346,7 +458,7 @@ def run_pdb2gmx( "-o", str(structure_file), "-p", str(topology_file), "-i", str(posre_file), - "-ff", config.forcefield, + "-ff", resolved_ff, "-water", config.water_model, ] @@ -371,6 +483,13 @@ def run_pdb2gmx( logger.info(f"Running pdb2gmx: {' '.join(cmd)}") + # Ensure GMXLIB includes our forcefield download directory + env = os.environ.copy() + ff_dir = get_forcefield_dir() + if ff_dir.is_dir(): + existing_gmxlib = env.get("GMXLIB", "") + env["GMXLIB"] = f"{ff_dir}:{existing_gmxlib}" if existing_gmxlib else str(ff_dir) + result = subprocess.run( cmd, cwd=str(output_dir), @@ -378,6 +497,7 @@ def run_pdb2gmx( input=pdb2gmx_input, capture_output=True, timeout=120, + env=env, ) if result.returncode != 0: @@ -435,9 +555,10 @@ def extract_charge_from_topology(top_path: Path) -> int: stripped = line.strip() if stripped.startswith("#include"): - # Follow includes for itp files in the same directory + # Follow includes only for local itp files (protein chains, + # position restraints), not force field library files include_path = stripped.split('"')[1] if '"' in stripped else None - if include_path and not include_path.endswith(".ff/"): + if include_path and ".ff/" not in include_path: itp_path = top_path.parent / include_path if itp_path.is_file(): total_charge += _sum_charges_from_itp(itp_path) diff --git a/mdfactory/tests/test_proteinbox.py b/mdfactory/tests/test_proteinbox.py index d7ae889..5f11bc3 100644 --- a/mdfactory/tests/test_proteinbox.py +++ b/mdfactory/tests/test_proteinbox.py @@ -291,7 +291,7 @@ def test_run_pdb2gmx_passes_disulfide_answers_to_subprocess(self, tmp_path, monk ) calls = {} - def fake_run(cmd, cwd, text, input, capture_output, timeout): + def fake_run(cmd, cwd, text, input, capture_output, timeout, env=None): calls["cmd"] = cmd calls["cwd"] = cwd calls["input"] = input From 848041e504bd5cfdb1e889fe0d407cde311543da Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 13:22:41 +0200 Subject: [PATCH 11/14] Add [gromacs] config section for gmx path and force field management - Add [gromacs] section to settings.py with GMX_PATH and FORCEFIELD_DIR, following the same pattern as [cgenff] SILCSBIODIR. - Settings.__init__ auto-prepends FORCEFIELD_DIR to GMXLIB env var on startup, matching how SILCSBIODIR is auto-set for CGenFF. - check_gmx_available() checks configured GMX_PATH first, falls back to PATH lookup. - All gmx subprocess calls (pdb2gmx, grompp) use the configured binary and inject GMXLIB for force field resolution. - Add GROMACS setup to config wizard (sync_config.py): prompts for gmx path, forcefield dir, and offers to download CHARMM36m on setup. Co-Authored-By: Claude Opus 4.6 --- mdfactory/settings.py | 29 ++++++++++++++++++++++++ mdfactory/setup/protein.py | 41 +++++++++++++++++++++++++++------- mdfactory/utils/sync_config.py | 27 ++++++++++++++++++++++ 3 files changed, 89 insertions(+), 8 deletions(-) diff --git a/mdfactory/settings.py b/mdfactory/settings.py index 7c99a42..3223ca7 100644 --- a/mdfactory/settings.py +++ b/mdfactory/settings.py @@ -45,6 +45,10 @@ def _get_defaults() -> dict[str, dict[str, str]]: "cgenff": { "SILCSBIODIR": "", }, + "gromacs": { + "GMX_PATH": "", + "FORCEFIELD_DIR": str(data_dir / "forcefields"), + }, "storage": { "PARAMETERS": str(data_dir / "parameters"), }, @@ -75,6 +79,7 @@ def _get_defaults() -> dict[str, dict[str, str]]: # Keep DEFAULT_CONFIG as class attribute for backward compat in tests DEFAULT_CONFIG = { "cgenff": {"SILCSBIODIR": ""}, + "gromacs": {"GMX_PATH": "", "FORCEFIELD_DIR": ""}, "storage": {"PARAMETERS": ""}, "database": {"TYPE": "sqlite"}, "databases": { @@ -105,6 +110,12 @@ def __init__(self): self._load_config() if "cgenff" in self.config: os.environ.setdefault("SILCSBIODIR", str(self.cgenff_dir)) + if "gromacs" in self.config: + gmxlib_dir = str(self.gromacs_forcefield_dir) + if gmxlib_dir: + existing = os.environ.get("GMXLIB", "") + if gmxlib_dir not in existing: + os.environ["GMXLIB"] = f"{gmxlib_dir}:{existing}" if existing else gmxlib_dir def _load_config(self): """Load configuration from defaults, then user config, then env overrides.""" @@ -159,6 +170,24 @@ def cgenff_dir(self) -> Path: """Return the CGenFF directory (SILCSBIODIR).""" return Path(self.config["cgenff"].get("SILCSBIODIR", "")).resolve() + # --- GROMACS properties --- + + @property + def gromacs_gmx_path(self) -> Path | None: + """Return the configured gmx binary path, or None to use PATH lookup.""" + raw = self.config["gromacs"].get("GMX_PATH", "") + if not raw: + return None + return Path(self._resolve_local_path(raw)) + + @property + def gromacs_forcefield_dir(self) -> Path: + """Return the directory where downloaded force fields are stored.""" + raw = self.config["gromacs"].get("FORCEFIELD_DIR", "") + if raw: + return Path(self._resolve_local_path(raw)) + return get_data_dir() / "forcefields" + # --- Storage properties --- @property diff --git a/mdfactory/setup/protein.py b/mdfactory/setup/protein.py index cc4ce31..7ec7391 100644 --- a/mdfactory/setup/protein.py +++ b/mdfactory/setup/protein.py @@ -13,7 +13,6 @@ from loguru import logger from ..models.parametrization import GromacsProteinParameterSet, Pdb2gmxConfig -from ..settings import get_data_dir _DISULFIDE_CUTOFF_ANGSTROM = 2.2 @@ -33,7 +32,9 @@ def get_forcefield_dir() -> Path: """Return the directory where downloaded force fields are stored.""" - return get_data_dir() / "forcefields" + from ..settings import settings # noqa: PLC0415 + + return settings.gromacs_forcefield_dir def _resolve_protonation_residue_name(residue_name: str, forcefield: str | None) -> str: @@ -202,7 +203,9 @@ def _apply_protonation_states( def check_gmx_available() -> Path: - """Verify that the GROMACS gmx binary is available on PATH. + """Verify that the GROMACS gmx binary is available. + + Checks the configured path from settings first, then falls back to PATH. Returns ------- @@ -212,14 +215,26 @@ def check_gmx_available() -> Path: Raises ------ RuntimeError - If gmx is not found on PATH. + If gmx is not found. """ + from ..settings import settings # noqa: PLC0415 + + configured = settings.gromacs_gmx_path + if configured is not None: + if configured.is_file(): + return configured + raise RuntimeError( + f"Configured GROMACS path '{configured}' does not exist. " + "Check [gromacs] GMX_PATH in your mdfactory config." + ) + gmx_path = shutil.which("gmx") if gmx_path is None: raise RuntimeError( "GROMACS 'gmx' binary not found on PATH. " - "Ensure GROMACS is installed and 'module load gromacs' has been run." + "Either set [gromacs] GMX_PATH in mdfactory config, " + "or ensure 'gmx' is on PATH (e.g. 'module load gromacs')." ) return Path(gmx_path) @@ -287,8 +302,9 @@ def _get_gmx_search_paths() -> list[Path]: """Return the directories GROMACS searches for force fields.""" search_paths: list[Path] = [] + gmx_bin = str(check_gmx_available()) result = subprocess.run( - ["gmx", "--version"], + [gmx_bin, "--version"], text=True, capture_output=True, timeout=10, @@ -439,6 +455,7 @@ def run_pdb2gmx( Paths to generated topology, structure, and position restraint files. """ + gmx_bin = str(check_gmx_available()) check_forcefield_available(config.forcefield) resolved_ff = resolve_forcefield(config.forcefield) @@ -453,7 +470,7 @@ def run_pdb2gmx( posre_file = output_dir / "posre.itp" cmd = [ - "gmx", "pdb2gmx", + gmx_bin, "pdb2gmx", "-f", str(pdb_path), "-o", str(structure_file), "-p", str(topology_file), @@ -656,9 +673,10 @@ def validate_with_grompp(topology: Path, structure: Path, mdp: Path, cwd: Path) If grompp reports errors. """ + gmx_bin = str(check_gmx_available()) tpr_out = cwd / "check.tpr" cmd = [ - "gmx", "grompp", + gmx_bin, "grompp", "-f", str(mdp), "-c", str(structure), "-p", str(topology), @@ -666,6 +684,12 @@ def validate_with_grompp(topology: Path, structure: Path, mdp: Path, cwd: Path) "-maxwarn", "0", ] + env = os.environ.copy() + ff_dir = get_forcefield_dir() + if ff_dir.is_dir(): + existing_gmxlib = env.get("GMXLIB", "") + env["GMXLIB"] = f"{ff_dir}:{existing_gmxlib}" if existing_gmxlib else str(ff_dir) + result = subprocess.run( cmd, cwd=str(cwd), @@ -673,6 +697,7 @@ def validate_with_grompp(topology: Path, structure: Path, mdp: Path, cwd: Path) input="", capture_output=True, timeout=60, + env=env, ) # Clean up grompp artifacts diff --git a/mdfactory/utils/sync_config.py b/mdfactory/utils/sync_config.py index 9792998..c077f0d 100644 --- a/mdfactory/utils/sync_config.py +++ b/mdfactory/utils/sync_config.py @@ -285,6 +285,23 @@ def run_config_wizard() -> None: logger.info("Using existing user config. No changes made.") return + # GROMACS setup (optional) + use_gromacs = questionary.confirm("Do you use GROMACS (pdb2gmx)?", default=True).ask() + gmx_path = "" + forcefield_dir = str(data_dir / "forcefields") + if use_gromacs: + gmx_path = questionary.text( + "Path to gmx binary (leave empty to use PATH):", + default="", + ).ask() + forcefield_dir = questionary.text( + "Force field download directory:", + default=str(data_dir / "forcefields"), + ).ask() + download_ffs = questionary.confirm( + "Download CHARMM36m force field now?", default=True + ).ask() + # CGenFF setup (optional) use_cgenff = questionary.confirm("Do you use CGenFF (SILCSBIO)?", default=False).ask() silcsbiodir = "" @@ -312,6 +329,8 @@ def run_config_wizard() -> None: for key, value in options.items(): config[section][key] = str(value) + config["gromacs"]["GMX_PATH"] = gmx_path + config["gromacs"]["FORCEFIELD_DIR"] = normalize_local_path(forcefield_dir) config["cgenff"]["SILCSBIODIR"] = silcsbiodir config["storage"]["PARAMETERS"] = normalize_local_path(param_dir) config["database"]["TYPE"] = backend @@ -330,6 +349,14 @@ def run_config_wizard() -> None: cfg.ensure_dirs() cfg.reload() + # Download force fields if requested + if use_gromacs and download_ffs: + from ..setup.protein import download_all_forcefields + + logger.info("Downloading registered force fields...") + download_all_forcefields() + logger.success("Force fields downloaded.") + init_default = backend in {"sqlite", "csv"} if not questionary.confirm( "Initialize databases now?", From 4d0dd493c55689d892f74c448f196b6d6db69bc8 Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 13:33:28 +0200 Subject: [PATCH 12/14] Align proteinbox gromacs configuration --- config_templates/config.ini | 6 +++ mdfactory/settings.py | 22 +++++++--- mdfactory/setup/protein.py | 44 +++++-------------- mdfactory/tests/test_proteinbox.py | 38 ++++++++++++++-- mdfactory/tests/test_settings.py | 20 +++++++++ .../tests/test_sync_config_local_paths.py | 36 +++++++++++---- 6 files changed, 117 insertions(+), 49 deletions(-) diff --git a/config_templates/config.ini b/config_templates/config.ini index e8573a1..1c95c7e 100644 --- a/config_templates/config.ini +++ b/config_templates/config.ini @@ -13,6 +13,12 @@ [cgenff] SILCSBIODIR = +[gromacs] +; GMX_PATH defaults to looking up "gmx" on PATH +GMX_PATH = +; FORCEFIELD_DIR defaults to /forcefields +FORCEFIELD_DIR = + [storage] ; PARAMETERS defaults to /parameters PARAMETERS = diff --git a/mdfactory/settings.py b/mdfactory/settings.py index 3223ca7..46e7a9c 100644 --- a/mdfactory/settings.py +++ b/mdfactory/settings.py @@ -111,11 +111,9 @@ def __init__(self): if "cgenff" in self.config: os.environ.setdefault("SILCSBIODIR", str(self.cgenff_dir)) if "gromacs" in self.config: - gmxlib_dir = str(self.gromacs_forcefield_dir) - if gmxlib_dir: - existing = os.environ.get("GMXLIB", "") - if gmxlib_dir not in existing: - os.environ["GMXLIB"] = f"{gmxlib_dir}:{existing}" if existing else gmxlib_dir + env = self.gromacs_env(os.environ) + if "GMXLIB" in env: + os.environ["GMXLIB"] = env["GMXLIB"] def _load_config(self): """Load configuration from defaults, then user config, then env overrides.""" @@ -157,6 +155,7 @@ def ensure_dirs(self): get_config_dir().mkdir(parents=True, exist_ok=True) get_data_dir().mkdir(parents=True, exist_ok=True) self.parameter_store.mkdir(parents=True, exist_ok=True) + self.gromacs_forcefield_dir.mkdir(parents=True, exist_ok=True) # --- CGenFF properties --- @@ -188,6 +187,19 @@ def gromacs_forcefield_dir(self) -> Path: return Path(self._resolve_local_path(raw)) return get_data_dir() / "forcefields" + def gromacs_env(self, base_env: dict[str, str] | None = None) -> dict[str, str]: + """Return an environment with configured GROMACS force fields on GMXLIB.""" + env = dict(os.environ if base_env is None else base_env) + forcefield_dir = str(self.gromacs_forcefield_dir) + if not forcefield_dir: + return env + + paths = [p for p in env.get("GMXLIB", "").split(":") if p] + if forcefield_dir not in paths: + paths.insert(0, forcefield_dir) + env["GMXLIB"] = ":".join(paths) + return env + # --- Storage properties --- @property diff --git a/mdfactory/setup/protein.py b/mdfactory/setup/protein.py index 7ec7391..a1696be 100644 --- a/mdfactory/setup/protein.py +++ b/mdfactory/setup/protein.py @@ -3,7 +3,6 @@ """Protein preparation utilities for the proteinbox simulation type.""" import io -import os import shutil import subprocess import tarfile @@ -37,6 +36,13 @@ def get_forcefield_dir() -> Path: return settings.gromacs_forcefield_dir +def get_gromacs_env(base_env: dict[str, str] | None = None) -> dict[str, str]: + """Return an environment with configured GROMACS force fields on GMXLIB.""" + from ..settings import settings # noqa: PLC0415 + + return settings.gromacs_env(base_env) + + def _resolve_protonation_residue_name(residue_name: str, forcefield: str | None) -> str: """Return the residue name to write into the PDB for a protonation override.""" residue_name = residue_name.upper() @@ -317,15 +323,11 @@ def _get_gmx_search_paths() -> list[Path]: search_paths.append(top_dir) break - gmxlib = os.environ.get("GMXLIB", "") + gmxlib = get_gromacs_env().get("GMXLIB", "") for p in gmxlib.split(":"): if p and Path(p).is_dir(): search_paths.append(Path(p)) - ff_dir = get_forcefield_dir() - if ff_dir.is_dir(): - search_paths.append(ff_dir) - search_paths.append(Path.cwd()) return search_paths @@ -346,9 +348,6 @@ def resolve_forcefield(forcefield: str) -> str: def check_forcefield_available(forcefield: str) -> None: """Verify that the requested force field is findable by GROMACS. - If the force field is in the download registry but not installed, - downloads it automatically. - Parameters ---------- forcefield : str @@ -371,26 +370,20 @@ def check_forcefield_available(forcefield: str) -> None: if (search_dir / ff_dirname).is_dir(): return - # Not found — try downloading from registry - if forcefield in FORCEFIELD_REGISTRY: - download_forcefield(forcefield) - return - - # Not in registry either — error with available options available = set() for search_dir in search_paths: if search_dir.is_dir(): for entry in search_dir.iterdir(): if entry.is_dir() and entry.name.endswith(".ff"): available.add(entry.stem) - available.update(FORCEFIELD_REGISTRY.keys()) raise ValueError( f"Force field '{forcefield}' not found. " f"Searched: {[str(p) for p in search_paths]}.\n" f"Available force fields: {sorted(available)}.\n" f"Downloadable: {sorted(FORCEFIELD_REGISTRY.keys())}.\n" - "Install the force field to a search path or set GMXLIB." + "Install the force field to a search path, set [gromacs] FORCEFIELD_DIR, " + "or download registered force fields with `mdfactory config init`." ) @@ -500,13 +493,6 @@ def run_pdb2gmx( logger.info(f"Running pdb2gmx: {' '.join(cmd)}") - # Ensure GMXLIB includes our forcefield download directory - env = os.environ.copy() - ff_dir = get_forcefield_dir() - if ff_dir.is_dir(): - existing_gmxlib = env.get("GMXLIB", "") - env["GMXLIB"] = f"{ff_dir}:{existing_gmxlib}" if existing_gmxlib else str(ff_dir) - result = subprocess.run( cmd, cwd=str(output_dir), @@ -514,7 +500,7 @@ def run_pdb2gmx( input=pdb2gmx_input, capture_output=True, timeout=120, - env=env, + env=get_gromacs_env(), ) if result.returncode != 0: @@ -684,12 +670,6 @@ def validate_with_grompp(topology: Path, structure: Path, mdp: Path, cwd: Path) "-maxwarn", "0", ] - env = os.environ.copy() - ff_dir = get_forcefield_dir() - if ff_dir.is_dir(): - existing_gmxlib = env.get("GMXLIB", "") - env["GMXLIB"] = f"{ff_dir}:{existing_gmxlib}" if existing_gmxlib else str(ff_dir) - result = subprocess.run( cmd, cwd=str(cwd), @@ -697,7 +677,7 @@ def validate_with_grompp(topology: Path, structure: Path, mdp: Path, cwd: Path) input="", capture_output=True, timeout=60, - env=env, + env=get_gromacs_env(), ) # Clean up grompp artifacts diff --git a/mdfactory/tests/test_proteinbox.py b/mdfactory/tests/test_proteinbox.py index 5f11bc3..fd2a506 100644 --- a/mdfactory/tests/test_proteinbox.py +++ b/mdfactory/tests/test_proteinbox.py @@ -111,18 +111,44 @@ def test_frozen(self): class TestForceFieldCheck: - def test_available_forcefield_passes(self): + def test_available_forcefield_passes(self, tmp_path, monkeypatch): + (tmp_path / "charmm27.ff").mkdir() + monkeypatch.setattr( + "mdfactory.setup.protein._get_gmx_search_paths", + lambda: [tmp_path], + ) check_forcefield_available("charmm27") - def test_unavailable_forcefield_raises(self): + def test_unavailable_forcefield_raises(self, tmp_path, monkeypatch): + monkeypatch.setattr( + "mdfactory.setup.protein._get_gmx_search_paths", + lambda: [tmp_path], + ) with pytest.raises(ValueError, match="not found"): check_forcefield_available("nonexistent_ff_xyz") - def test_error_lists_available_forcefields(self): + def test_error_lists_available_forcefields(self, tmp_path, monkeypatch): + (tmp_path / "charmm27.ff").mkdir() + monkeypatch.setattr( + "mdfactory.setup.protein._get_gmx_search_paths", + lambda: [tmp_path], + ) with pytest.raises(ValueError, match="charmm27") as exc_info: check_forcefield_available("nonexistent_ff_xyz") assert "Available force fields" in str(exc_info.value) + def test_registered_forcefield_check_does_not_download(self, tmp_path, monkeypatch): + monkeypatch.setattr( + "mdfactory.setup.protein._get_gmx_search_paths", + lambda: [tmp_path], + ) + monkeypatch.setattr( + "mdfactory.setup.protein.download_forcefield", + lambda name: pytest.fail("check_forcefield_available should not download"), + ) + with pytest.raises(ValueError, match="Downloadable"): + check_forcefield_available("charmm36m") + class TestBuildInputProteinbox: def test_yaml_roundtrip(self, tmp_path): @@ -295,6 +321,7 @@ def fake_run(cmd, cwd, text, input, capture_output, timeout, env=None): calls["cmd"] = cmd calls["cwd"] = cwd calls["input"] = input + calls["env"] = env output_dir = Path(cwd) (output_dir / "processed.gro").write_text("mock gro\n") (output_dir / "topol.top").write_text("[ atoms ]\n") @@ -309,6 +336,10 @@ class Result: monkeypatch.setattr("mdfactory.setup.protein.subprocess.run", fake_run) monkeypatch.setattr("mdfactory.setup.protein.check_forcefield_available", lambda ff: None) + monkeypatch.setattr( + "mdfactory.setup.protein.get_gromacs_env", + lambda: {"GMXLIB": "/mock/forcefields"}, + ) run_pdb2gmx( pdb_path=pdb, @@ -321,6 +352,7 @@ class Result: assert "-ss" in calls["cmd"] assert calls["input"] == "n\ny\n" assert calls["cwd"] == str((tmp_path / "pdb2gmx_output").resolve()) + assert calls["env"] == {"GMXLIB": "/mock/forcefields"} class TestMetadata: diff --git a/mdfactory/tests/test_settings.py b/mdfactory/tests/test_settings.py index 0eb0159..b918aac 100644 --- a/mdfactory/tests/test_settings.py +++ b/mdfactory/tests/test_settings.py @@ -25,6 +25,7 @@ def test_defaults_loaded(): assert "csv" in s.config assert "foundry" in s.config assert "cgenff" in s.config + assert "gromacs" in s.config assert "storage" in s.config assert s.config["database"]["TYPE"] == "sqlite" @@ -53,6 +54,25 @@ def test_cgenff_dir_default(): assert s.cgenff_dir is not None +def test_gromacs_env_prepends_forcefield_dir(tmp_path, monkeypatch): + """gromacs_env should add the configured force field directory to GMXLIB.""" + monkeypatch.setenv("MDFACTORY_DATA_DIR", str(tmp_path / "data")) + s = Settings() + env = s.gromacs_env({"GMXLIB": "/existing/path"}) + assert env["GMXLIB"].split(":") == [ + str(tmp_path / "data" / "forcefields"), + "/existing/path", + ] + + +def test_config_template_documents_gromacs_section(): + """The sample config should expose all user-facing GROMACS settings.""" + template = Path("config_templates/config.ini").read_text() + assert "[gromacs]" in template + assert "GMX_PATH =" in template + assert "FORCEFIELD_DIR =" in template + + def test_reload(): """reload() should re-read config from disk.""" s = Settings() diff --git a/mdfactory/tests/test_sync_config_local_paths.py b/mdfactory/tests/test_sync_config_local_paths.py index b09bef8..e4a3127 100644 --- a/mdfactory/tests/test_sync_config_local_paths.py +++ b/mdfactory/tests/test_sync_config_local_paths.py @@ -89,12 +89,16 @@ def fake_initialize(): ) # Mock questionary prompts in order: - # 1. confirm CGenFF -> False - # 2. text parameter store -> default - # 3. select backend -> "sqlite" - # 4. text RUN_DB_PATH -> "runs.db" - # 5. text ANALYSIS_DB_PATH -> "analysis.db" - # 6. confirm initialize -> True + # 1. confirm GROMACS -> True + # 2. text GMX_PATH -> "" + # 3. text FORCEFIELD_DIR -> default + # 4. confirm force field download -> False + # 5. confirm CGenFF -> False + # 6. text parameter store -> default + # 7. select backend -> "sqlite" + # 8. text RUN_DB_PATH -> "runs.db" + # 9. text ANALYSIS_DB_PATH -> "analysis.db" + # 10. confirm initialize -> True class FakeQuestion: def __init__(self, value): self._value = value @@ -102,8 +106,16 @@ def __init__(self, value): def ask(self): return self._value - answers_confirm = iter([False, True]) - answers_text = iter([str(tmp_path / "data" / "parameters"), "runs.db", "analysis.db"]) + answers_confirm = iter([True, False, False, True]) + answers_text = iter( + [ + "", + str(tmp_path / "data" / "forcefields"), + str(tmp_path / "data" / "parameters"), + "runs.db", + "analysis.db", + ] + ) answers_select = iter(["sqlite"]) monkeypatch.setattr( @@ -122,4 +134,10 @@ def ask(self): run_config_wizard() assert called["value"] is True - assert (tmp_path / "config.ini").exists() + config_path = tmp_path / "config.ini" + assert config_path.exists() + + config = configparser.ConfigParser() + config.read(config_path) + assert config["gromacs"]["GMX_PATH"] == "" + assert config["gromacs"]["FORCEFIELD_DIR"] == str(tmp_path / "data" / "forcefields") From 733a84d4dbe343267e15774f67f9036e5ff48327 Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 13:40:45 +0200 Subject: [PATCH 13/14] Fix proteinbox lint formatting --- mdfactory/models/input.py | 7 +++- mdfactory/models/parametrization.py | 12 ++---- mdfactory/setup/protein.py | 63 ++++++++++++++++++---------- mdfactory/simulation/openmm_utils.py | 10 ++--- mdfactory/tests/test_proteinbox.py | 34 +++++++-------- 5 files changed, 69 insertions(+), 57 deletions(-) diff --git a/mdfactory/models/input.py b/mdfactory/models/input.py index 9c1fbfe..b748b2b 100644 --- a/mdfactory/models/input.py +++ b/mdfactory/models/input.py @@ -14,7 +14,12 @@ MixedBoxComposition, ProteinBoxComposition, ) -from .parametrization import CgenffConfig, Pdb2gmxConfig, ParametrizationConfig, SmirnoffConfig +from .parametrization import ( + CgenffConfig, + ParametrizationConfig, + Pdb2gmxConfig, + SmirnoffConfig, +) # Map system_type to the corresponding model class # TODO: StrEnum? diff --git a/mdfactory/models/parametrization.py b/mdfactory/models/parametrization.py index 845637a..e14733e 100644 --- a/mdfactory/models/parametrization.py +++ b/mdfactory/models/parametrization.py @@ -59,15 +59,9 @@ class Pdb2gmxConfig(BaseModel): forcefield: str = Field( "charmm36m", description="Force field name as recognized by gmx pdb2gmx." ) - water_model: str = Field( - "tip3p", description="Water model name as recognized by gmx pdb2gmx." - ) - ignh: bool = Field( - True, description="Ignore hydrogens in input PDB (regenerate with pdb2gmx)." - ) - merge_all: bool = Field( - False, description="Merge all chains into a single moleculetype." - ) + water_model: str = Field("tip3p", description="Water model name as recognized by gmx pdb2gmx.") + ignh: bool = Field(True, description="Ignore hydrogens in input PDB (regenerate with pdb2gmx).") + merge_all: bool = Field(False, description="Merge all chains into a single moleculetype.") ParametrizationConfig = Annotated[ diff --git a/mdfactory/setup/protein.py b/mdfactory/setup/protein.py index a1696be..ddb3d4e 100644 --- a/mdfactory/setup/protein.py +++ b/mdfactory/setup/protein.py @@ -111,7 +111,9 @@ def _build_disulfide_prompt_input( raise ValueError(f"Invalid disulfide bond with identical residues: {pair}") sg_atoms = _read_cysteine_sg_atoms(pdb_path) - missing_resids = sorted({resid for pair in requested for resid in pair if resid not in sg_atoms}) + missing_resids = sorted( + {resid for pair in requested for resid in pair if resid not in sg_atoms} + ) if missing_resids: raise ValueError( f"Requested disulfide residues are not cysteine SG atoms in {pdb_path}: " @@ -189,20 +191,21 @@ def _apply_protonation_states( with open(pdb_path) as f_in, open(output_pdb, "w") as f_out: for line in f_in: - if line.startswith(("ATOM", "HETATM")) and len(line) >= 26: - current_resname = line[17:20].strip() + output_line = line + if output_line.startswith(("ATOM", "HETATM")) and len(output_line) >= 26: + current_resname = output_line[17:20].strip() try: - current_resid = int(line[22:26].strip()) + current_resid = int(output_line[22:26].strip()) except ValueError: - f_out.write(line) + f_out.write(output_line) continue for prefix, target_resid, new_name in overrides: if current_resname.startswith(prefix) and current_resid == target_resid: padded = f"{new_name:<3s}" - line = line[:17] + padded + line[20:] + output_line = output_line[:17] + padded + output_line[20:] break - f_out.write(line) + f_out.write(output_line) logger.info(f"Protonation states applied: {protonation_states}") return output_pdb @@ -314,6 +317,7 @@ def _get_gmx_search_paths() -> list[Path]: text=True, capture_output=True, timeout=10, + check=False, ) for line in result.stdout.splitlines(): if "Data prefix" in line: @@ -463,13 +467,20 @@ def run_pdb2gmx( posre_file = output_dir / "posre.itp" cmd = [ - gmx_bin, "pdb2gmx", - "-f", str(pdb_path), - "-o", str(structure_file), - "-p", str(topology_file), - "-i", str(posre_file), - "-ff", resolved_ff, - "-water", config.water_model, + gmx_bin, + "pdb2gmx", + "-f", + str(pdb_path), + "-o", + str(structure_file), + "-p", + str(topology_file), + "-i", + str(posre_file), + "-ff", + resolved_ff, + "-water", + config.water_model, ] if config.ignh: @@ -501,6 +512,7 @@ def run_pdb2gmx( capture_output=True, timeout=120, env=get_gromacs_env(), + check=False, ) if result.returncode != 0: @@ -634,9 +646,7 @@ def update_topology_molecules( if num_cl > 0: f.write(f"{'CL':<20s} {num_cl}\n") - logger.info( - f"Topology updated: {n_water} water, {num_na} Na+, {num_cl} Cl-" - ) + logger.info(f"Topology updated: {n_water} water, {num_na} Na+, {num_cl} Cl-") def validate_with_grompp(topology: Path, structure: Path, mdp: Path, cwd: Path) -> None: @@ -662,12 +672,18 @@ def validate_with_grompp(topology: Path, structure: Path, mdp: Path, cwd: Path) gmx_bin = str(check_gmx_available()) tpr_out = cwd / "check.tpr" cmd = [ - gmx_bin, "grompp", - "-f", str(mdp), - "-c", str(structure), - "-p", str(topology), - "-o", str(tpr_out), - "-maxwarn", "0", + gmx_bin, + "grompp", + "-f", + str(mdp), + "-c", + str(structure), + "-p", + str(topology), + "-o", + str(tpr_out), + "-maxwarn", + "0", ] result = subprocess.run( @@ -678,6 +694,7 @@ def validate_with_grompp(topology: Path, structure: Path, mdp: Path, cwd: Path) capture_output=True, timeout=60, env=get_gromacs_env(), + check=False, ) # Clean up grompp artifacts diff --git a/mdfactory/simulation/openmm_utils.py b/mdfactory/simulation/openmm_utils.py index 81fdb29..a180ecd 100644 --- a/mdfactory/simulation/openmm_utils.py +++ b/mdfactory/simulation/openmm_utils.py @@ -1,5 +1,5 @@ # ABOUTME: OpenMM simulation utilities for box compression, equilibration, and relaxation -# ABOUTME: Provides volume convergence monitoring, density-targeted compression, and protein restraints +# ABOUTME: Provides convergence checks, density-targeted compression, and protein restraints """OpenMM simulation utilities for box compression and equilibration.""" import sys @@ -675,9 +675,7 @@ def relax_with_protein_restraints( """ print("Loading GROMACS files for protein relaxation...") gro = app.PDBFile(str(pdb)) - top_file = app.GromacsTopFile( - str(top), periodicBoxVectors=gro.topology.getPeriodicBoxVectors() - ) + top_file = app.GromacsTopFile(str(top), periodicBoxVectors=gro.topology.getPeriodicBoxVectors()) print("Creating OpenMM system...") system = top_file.createSystem( @@ -687,9 +685,7 @@ def relax_with_protein_restraints( ) # Position restraints on protein atoms - restraint_force = mm.CustomExternalForce( - "k*((x-x0)^2+(y-y0)^2+(z-z0)^2)" - ) + restraint_force = mm.CustomExternalForce("k*((x-x0)^2+(y-y0)^2+(z-z0)^2)") restraint_force.addGlobalParameter( "k", restraint_k * unit.kilojoules_per_mole / unit.nanometer**2 ) diff --git a/mdfactory/tests/test_proteinbox.py b/mdfactory/tests/test_proteinbox.py index fd2a506..5043a1d 100644 --- a/mdfactory/tests/test_proteinbox.py +++ b/mdfactory/tests/test_proteinbox.py @@ -10,17 +10,13 @@ from mdfactory.models.composition import ProteinBoxComposition from mdfactory.models.input import BuildInput -from mdfactory.models.parametrization import ( - GromacsProteinParameterSet, - Pdb2gmxConfig, -) +from mdfactory.models.parametrization import Pdb2gmxConfig from mdfactory.models.species import ProteinSpecies from mdfactory.setup.protein import ( _apply_protonation_states, _build_disulfide_prompt_input, _sum_charges_from_itp, check_forcefield_available, - extract_charge_from_topology, run_pdb2gmx, update_topology_molecules, ) @@ -205,7 +201,8 @@ def test_default_parametrization_config(self, tmp_path): class TestTopologyParsing: def test_sum_charges_from_itp(self, tmp_path): itp = tmp_path / "protein.itp" - itp.write_text(textwrap.dedent("""\ + itp.write_text( + textwrap.dedent("""\ [ moleculetype ] Protein 3 @@ -218,13 +215,15 @@ def test_sum_charges_from_itp(self, tmp_path): [ bonds ] 1 2 - """)) + """) + ) charge = _sum_charges_from_itp(itp) assert abs(charge - (-0.090)) < 1e-6 def test_update_topology_molecules(self, tmp_path): top = tmp_path / "topol.top" - top.write_text(textwrap.dedent("""\ + top.write_text( + textwrap.dedent("""\ #include "charmm36m.ff/forcefield.itp" #include "protein.itp" @@ -233,7 +232,8 @@ def test_update_topology_molecules(self, tmp_path): [ molecules ] Protein_chain_A 1 - """)) + """) + ) update_topology_molecules(top, n_water=5000, num_na=10, num_cl=18) content = top.read_text() assert "SOL" in content @@ -245,15 +245,15 @@ def test_update_topology_molecules(self, tmp_path): def test_apply_protonation_states(self, tmp_path): pdb = tmp_path / "input.pdb" - pdb.write_text(textwrap.dedent("""\ + pdb.write_text( + textwrap.dedent("""\ ATOM 1 N HIS A 15 1.000 2.000 3.000 1.00 0.00 ATOM 2 CA HIS A 15 1.500 2.500 3.500 1.00 0.00 ATOM 3 N ALA A 16 2.000 3.000 4.000 1.00 0.00 ATOM 4 N GLU A 35 3.000 4.000 5.000 1.00 0.00 - """)) - result = _apply_protonation_states( - pdb, {"HIS15": "HIE", "GLU35": "GLH"}, tmp_path + """) ) + result = _apply_protonation_states(pdb, {"HIS15": "HIE", "GLU35": "GLH"}, tmp_path) content = result.read_text() # HIS at resid 15 should be renamed to HIE assert "HIE" in content @@ -265,9 +265,7 @@ def test_apply_protonation_states(self, tmp_path): def test_apply_protonation_states_translates_charmm_histidine_alias(self, tmp_path): pdb = tmp_path / "input.pdb" pdb.write_text(_pdb_atom(1, "N", "HIS", 15, 1.0, 2.0, 3.0)) - result = _apply_protonation_states( - pdb, {"HIS15": "HIE"}, tmp_path, forcefield="charmm36m" - ) + result = _apply_protonation_states(pdb, {"HIS15": "HIE"}, tmp_path, forcefield="charmm36m") assert "HSE" in result.read_text() def test_apply_protonation_states_rejects_four_character_names(self, tmp_path): @@ -317,11 +315,12 @@ def test_run_pdb2gmx_passes_disulfide_answers_to_subprocess(self, tmp_path, monk ) calls = {} - def fake_run(cmd, cwd, text, input, capture_output, timeout, env=None): + def fake_run(cmd, cwd, text, input, capture_output, timeout, env=None, check=None): calls["cmd"] = cmd calls["cwd"] = cwd calls["input"] = input calls["env"] = env + calls["check"] = check output_dir = Path(cwd) (output_dir / "processed.gro").write_text("mock gro\n") (output_dir / "topol.top").write_text("[ atoms ]\n") @@ -353,6 +352,7 @@ class Result: assert calls["input"] == "n\ny\n" assert calls["cwd"] == str((tmp_path / "pdb2gmx_output").resolve()) assert calls["env"] == {"GMXLIB": "/mock/forcefields"} + assert calls["check"] is False class TestMetadata: From 7de3b9e83acfb0263675ca669e4930b21f04b9ed Mon Sep 17 00:00:00 2001 From: MSiggel <11818778+MSiggel@users.noreply.github.com> Date: Sun, 31 May 2026 13:46:07 +0200 Subject: [PATCH 14/14] Mock gmx binary in proteinbox unit test --- mdfactory/tests/test_proteinbox.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mdfactory/tests/test_proteinbox.py b/mdfactory/tests/test_proteinbox.py index 5043a1d..69023bb 100644 --- a/mdfactory/tests/test_proteinbox.py +++ b/mdfactory/tests/test_proteinbox.py @@ -334,6 +334,9 @@ class Result: return Result() monkeypatch.setattr("mdfactory.setup.protein.subprocess.run", fake_run) + monkeypatch.setattr( + "mdfactory.setup.protein.check_gmx_available", lambda: Path("/mock/gmx") + ) monkeypatch.setattr("mdfactory.setup.protein.check_forcefield_available", lambda ff: None) monkeypatch.setattr( "mdfactory.setup.protein.get_gromacs_env", @@ -349,6 +352,7 @@ class Result: ) assert "-ss" in calls["cmd"] + assert calls["cmd"][:2] == ["/mock/gmx", "pdb2gmx"] assert calls["input"] == "n\ny\n" assert calls["cwd"] == str((tmp_path / "pdb2gmx_output").resolve()) assert calls["env"] == {"GMXLIB": "/mock/forcefields"}