Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions config_templates/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
[cgenff]
SILCSBIODIR =

[gromacs]
; GMX_PATH defaults to looking up "gmx" on PATH
GMX_PATH =
; FORCEFIELD_DIR defaults to <platformdirs.user_data_dir>/forcefields
FORCEFIELD_DIR =

[storage]
; PARAMETERS defaults to <platformdirs.user_data_dir>/parameters
PARAMETERS =
Expand Down
25 changes: 25 additions & 0 deletions examples/proteinbox/lysozyme_charmm36m.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
simulation_type: proteinbox
engine: gromacs
parametrization: pdb2gmx
parametrization_config:
type: pdb2gmx
forcefield: charmm36m
water_model: tip3p
ignh: true

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bad key name, have no idea what that means.


system:
protein:
resname: LYZ
count: 1
pdb_path: ./1aki.pdb
disulfide_bonds:
- [6, 127]
- [30, 115]
- [64, 80]
- [76, 94]
protonation_states:
HIS15: HIE

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be a list of key-value pairs, right?

box_padding: 12.0

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consistent naming, as in other system types?

ionization:
neutralize: true
concentration: 0.15
136 changes: 135 additions & 1 deletion mdfactory/build.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# ABOUTME: Core build pipeline for constructing MD simulation systems
# ABOUTME: Dispatches to mixedbox, bilayer, and LNP build routines
# ABOUTME: Dispatches to mixedbox, bilayer, LNP, and proteinbox build routines
"""Core build pipeline for constructing MD simulation systems."""

import os
from functools import partial
from pathlib import Path

import MDAnalysis as mda
import numpy as np
Expand Down Expand Up @@ -682,3 +683,136 @@ def ionize_solvated_system(ion_config, u_solvated, total_charge):
na_spec = SingleMoleculeSpecies(count=num_na, smiles="[Na+]", resname="NA")
cl_spec = SingleMoleculeSpecies(count=num_cl, smiles="[Cl-]", resname="CL")
return u_ionized, [na_spec, cl_spec]


@validate_call
def build_proteinbox(inp: BuildInput):
"""Build a protein-in-waterbox system: clean, parametrize, solvate, ionize, relax."""
import shutil

from loguru import logger

from .models.composition import ProteinBoxComposition
from .models.parametrization import Pdb2gmxConfig
from .setup.protein import (
check_gmx_available,
clean_pdb,
run_pdb2gmx,
update_topology_molecules,
validate_with_grompp,
)

if not isinstance(inp.system, ProteinBoxComposition):
raise TypeError("System must specify a ProteinBoxComposition.")

if not isinstance(inp.parametrization_config, Pdb2gmxConfig):
raise TypeError("Parametrization config must be Pdb2gmxConfig for proteinbox.")

# 1. Verify GROMACS is available
gmx_path = check_gmx_available()
logger.info(f"Using GROMACS: {gmx_path}")

protein = inp.system.protein
config = inp.parametrization_config

# 2. Clean PDB
pdb_input = protein.pdb_path
if not pdb_input.is_absolute():
pdb_input = pdb_input.resolve()

cleaned_pdb = Path("cleaned.pdb")
clean_pdb(pdb_input, cleaned_pdb)

# 3. Run pdb2gmx
pdb2gmx_dir = Path("pdb2gmx_output")
params = run_pdb2gmx(
pdb_path=cleaned_pdb,
config=config,
disulfide_bonds=protein.disulfide_bonds,
protonation_states=protein.protonation_states,
output_dir=pdb2gmx_dir,
)
logger.info(f"pdb2gmx output: {params.structure_file}")

# 4. Center protein in cubic box
u = mda.Universe(str(params.structure_file))
extent = u.atoms.positions.max(axis=0) - u.atoms.positions.min(axis=0)
box_size = extent.max() + 2 * inp.system.box_padding
u.dimensions = [box_size, box_size, box_size, 90, 90, 90]
center = np.array([box_size / 2, box_size / 2, box_size / 2])
u.atoms.translate(-u.atoms.center_of_geometry() + center)
logger.info(f"Protein centered in cubic box: {box_size:.1f} A")

# 5. Solvate (reuse existing solvation)
u_solvated = solvate(u, prune_in_z=False)
u_solvated.dimensions = u.dimensions
n_water = len(u_solvated.select_atoms("water").residues)
logger.info(f"Solvation added {n_water} water molecules.")

if n_water == 0:
raise ValueError("Solvation did not add any water molecules.")

# 6. Ionize (reuse existing ionization)
protein_charge = params.total_charge
u_ionized, ion_species = ionize_solvated_system(
inp.system.ionization, u_solvated, protein_charge
)
num_na = ion_species[0].count if len(ion_species) > 0 else 0
num_cl = ion_species[1].count if len(ion_species) > 1 else 0
n_water_final = len(u_ionized.select_atoms("water").residues)

# 7. Update topology [ molecules ] section
topology_dest = Path("topology.top")
shutil.copy(params.topology_file, topology_dest)
shutil.copy(params.position_restraint_file, Path("posre.itp"))
update_topology_molecules(topology_dest, n_water_final, num_na, num_cl)

# 8. Write coordinates for OpenMM relaxation
pre_relax_structure = Path("system_pre_relax.pdb").resolve()
u_ionized.atoms.write(str(pre_relax_structure))

# 9. OpenMM relaxation with protein position restraints
protein_atoms = u_ionized.select_atoms("protein and not name H*")
protein_indices = protein_atoms.indices.tolist()
topology_dest = topology_dest.resolve()
position_restraint_file = Path("posre.itp").resolve()

with working_directory("relaxation", create=True, cleanup=True) as wd:
shutil.copy(pre_relax_structure, wd / "system.pdb")
shutil.copy(topology_dest, wd / "topology.top")
shutil.copy(position_restraint_file, wd / "posre.itp")

# Copy forcefield directory if referenced by topology
# pdb2gmx topologies use #include from the GROMACS data dir, so OpenMM
# needs the GromacsTopFile to resolve them via GMXLIB or local copies
from .simulation.openmm_utils import relax_with_protein_restraints

u_relaxed = relax_with_protein_restraints(
u_ionized,
wd / "system.pdb",
wd / "topology.top",
protein_indices=protein_indices,
steps=10000,
)

logger.info("OpenMM relaxation complete.")

# 10. Write final output
u_relaxed.atoms.write("system.pdb")
logger.info("Final system written to system.pdb")

# 11. Validate with grompp (if em.mdp is available)
manager = RunScheduleManager()
manager.copy_run_files_with_check(
engine=inp.engine, system_type="proteinbox", target_folder=os.getcwd(), force_copy=True
)
logger.info("Run schedule files copied.")

em_mdp = Path("em.mdp")
if em_mdp.is_file():
try:
validate_with_grompp(topology_dest, Path("system.pdb"), em_mdp, Path.cwd())
except RuntimeError as e:
logger.warning(f"grompp validation failed (non-fatal): {e}")

logger.info("Proteinbox build complete.")
35 changes: 32 additions & 3 deletions mdfactory/models/composition.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# ABOUTME: Pydantic models for system composition (mixedbox, bilayer, LNP)
# ABOUTME: Pydantic models for system composition (mixedbox, bilayer, LNP, proteinbox)
# ABOUTME: Defines species counts, ionization, and composition validation
"""Pydantic models for system composition (mixedbox, bilayer, LNP)."""
"""Pydantic models for system composition (mixedbox, bilayer, LNP, proteinbox)."""

from typing import Optional

import numpy as np
from pydantic import BaseModel, ConfigDict, Field, model_validator

from .species import LipidSpecies, SingleMoleculeSpecies, Species
from .species import LipidSpecies, ProteinSpecies, SingleMoleculeSpecies, Species


def distribute_counts(fractions: list[float], total: int) -> list[int]:
Expand Down Expand Up @@ -375,3 +375,32 @@ def get_species_with_counts(self) -> list[Species]:
else:
species_map[key] = spec.model_copy()
return list(species_map.values())


class ProteinBoxComposition(BaseModel):
"""Define a protein-in-waterbox system: one protein centered in a cubic box with ions."""

model_config = ConfigDict(extra="forbid")

protein: ProteinSpecies
box_padding: float = Field(
10.0, description="Minimum distance from protein to box edge in Angstroms.", ge=0.0
)
ionization: IonizationConfig = Field(
default_factory=IonizationConfig, description="Configuration for ionization."
)

@property
def species(self) -> list[ProteinSpecies]:
"""Return protein as a single-element species list for metadata compatibility."""
return [self.protein]

@property
def total_count(self) -> int:
"""Return 1 (one protein) for metadata compatibility."""
return 1

@property
def charge(self) -> int:
"""Protein charge is not pre-computable; return 0 as placeholder."""
return 0
27 changes: 22 additions & 5 deletions mdfactory/models/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,35 @@

from pydantic import BaseModel, Field, model_validator

from .composition import BilayerComposition, LNPComposition, MixedBoxComposition
from .parametrization import CgenffConfig, ParametrizationConfig, SmirnoffConfig
from .composition import (
BilayerComposition,
LNPComposition,
MixedBoxComposition,
ProteinBoxComposition,
)
from .parametrization import (
CgenffConfig,
ParametrizationConfig,
Pdb2gmxConfig,
SmirnoffConfig,
)

# Map system_type to the corresponding model class
# TODO: StrEnum?
type_mapping = {
"mixedbox": MixedBoxComposition,
"bilayer": BilayerComposition,
"lnp": LNPComposition,
"proteinbox": ProteinBoxComposition,
}


class BuildInput(BaseModel):
"""Represent a complete simulation build specification with composition and parametrization."""

simulation_type: Literal["mixedbox", "bilayer", "lnp"]
system: MixedBoxComposition | BilayerComposition | LNPComposition
parametrization: Literal["cgenff", "smirnoff"] = Field(
simulation_type: Literal["mixedbox", "bilayer", "lnp", "proteinbox"]
system: MixedBoxComposition | BilayerComposition | LNPComposition | ProteinBoxComposition
parametrization: Literal["cgenff", "smirnoff", "pdb2gmx"] = Field(
"cgenff", description="Parametrization to use."
)
parametrization_config: ParametrizationConfig | None = Field(
Expand Down Expand Up @@ -74,6 +85,10 @@ def metadata(self) -> dict[str, Any]:
elif self.simulation_type == "mixedbox":
system_specific["target_density"] = self.system.target_density
system_specific["ionization"] = self.system.ionization.model_dump()
elif self.simulation_type == "proteinbox":
system_specific["box_padding"] = self.system.box_padding
system_specific["ionization"] = self.system.ionization.model_dump()
system_specific["pdb_path"] = str(self.system.protein.pdb_path)

return {
"hash": self.hash,
Expand Down Expand Up @@ -162,4 +177,6 @@ def set_default_parametrization_config(self) -> "BuildInput":
object.__setattr__(self, "parametrization_config", SmirnoffConfig())
elif self.parametrization == "cgenff":
object.__setattr__(self, "parametrization_config", CgenffConfig())
elif self.parametrization == "pdb2gmx":
object.__setattr__(self, "parametrization_config", Pdb2gmxConfig())
return self
31 changes: 30 additions & 1 deletion mdfactory/models/parametrization.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,24 @@ class CgenffConfig(BaseModel):
# CGenFF uses SILCSBIODIR from config.ini, no additional settings needed


class Pdb2gmxConfig(BaseModel):
"""Configuration for pdb2gmx protein parametrization."""

model_config = ConfigDict(extra="forbid", frozen=True)

type: Literal["pdb2gmx"] = Field("pdb2gmx", description="Config type discriminator.")
forcefield: str = Field(
"charmm36m", description="Force field name as recognized by gmx pdb2gmx."
)
water_model: str = Field("tip3p", description="Water model name as recognized by gmx pdb2gmx.")
ignh: bool = Field(True, description="Ignore hydrogens in input PDB (regenerate with pdb2gmx).")
merge_all: bool = Field(False, description="Merge all chains into a single moleculetype.")


ParametrizationConfig = Annotated[
Annotated[SmirnoffConfig, Tag("smirnoff")] | Annotated[CgenffConfig, Tag("cgenff")],
Annotated[SmirnoffConfig, Tag("smirnoff")]
| Annotated[CgenffConfig, Tag("cgenff")]
| Annotated[Pdb2gmxConfig, Tag("pdb2gmx")],
Discriminator("type"),
]

Expand Down Expand Up @@ -91,3 +107,16 @@ def from_data_row(cls, data_row):
raise ValueError("Invalid parameter_data_type for GromacsSingleMoleculeParameterSet.")
data = json.loads(data_row["parameter_data"])
return cls(**data)


class GromacsProteinParameterSet(BaseModel):
"""Store GROMACS topology output from pdb2gmx for a protein system."""

model_config = ConfigDict(extra="forbid")

topology_file: Path
structure_file: Path
position_restraint_file: Path
forcefield: str
water_model: str
total_charge: int
27 changes: 25 additions & 2 deletions mdfactory/models/species.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# ABOUTME: Pydantic models for molecular species (small molecules and lipids)
# ABOUTME: Pydantic models for molecular species (small molecules, lipids, proteins)
# ABOUTME: Provides SMILES-based identity, charge, and molecular object properties
"""Pydantic models for molecular species (small molecules and lipids)."""
"""Pydantic models for molecular species (small molecules, lipids, proteins)."""

import hashlib
import io
import warnings
from functools import cached_property
from pathlib import Path
from typing import Optional

from pydantic import BaseModel, Field, model_validator
Expand Down Expand Up @@ -196,3 +197,25 @@ def rdkit_molecule(self) -> "Chem.rdchem.Mol":
# u, tail_atom_ids=self.tail_atoms, head_atom_ids=self.head_atoms, z_axis=[1, 0, 0]
# )
# return u


class ProteinSpecies(Species):
"""Represent a protein species identified by a PDB structure file."""

count: Optional[int] = Field(1, description="Number of protein copies (always 1).")
fraction: Optional[float] = Field(1.0, description="Fraction (always 1.0 for protein).")
pdb_path: Path = Field(..., description="Path to the input PDB file.")
disulfide_bonds: list[tuple[int, int]] = Field(
default_factory=list,
description="Pairs of residue IDs forming disulfide bonds, e.g. [(6, 127), (30, 115)].",
)
protonation_states: dict[str, str] = Field(
default_factory=dict,
description="Residue-specific protonation states, e.g. {'HIS15': 'HIE', 'GLU35': 'GLH'}.",
)

@property
def charge(self) -> int:
raise NotImplementedError(
"Protein charge is determined by pdb2gmx topology output, not pre-computable."
)
Loading
Loading