Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ on:
pull_request:
branches:
- main
- develop
workflow_dispatch:

jobs:
Expand Down
7 changes: 7 additions & 0 deletions config_templates/config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,10 @@ ANALYSIS_NAME = analysis
RUN_DB_PATH = /Group Functions/mdfactory/runs
ANALYSIS_DB_PATH = /Group Functions/mdfactory/analysis
ARTIFACT_DB_PATH = /Group Functions/mdfactory/artifacts

[slurm]
; Optional manual overrides. When empty, values are autodiscovered via sinfo/sacctmgr.
ACCOUNT =
PARTITION_CPU =
PARTITION_GPU =
DEFAULT_QOS =
130 changes: 130 additions & 0 deletions mdfactory/analysis/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def __init__(
roots: list[Path | str] | Path | str,
trajectory_file: str = "prod.xtc",
structure_file: str = "system.pdb",
min_status: str = "production",
):
"""Initialize store with one or more root paths.

Expand All @@ -51,6 +52,9 @@ def __init__(
Trajectory filename to discover
structure_file : str
Structure filename to discover
min_status : str
Minimum simulation status to include in discovery. One of:
"build", "equilibrated", "production", "completed".

"""
# Normalize roots to list of Paths
Expand All @@ -61,6 +65,7 @@ def __init__(

self.trajectory_file = trajectory_file
self.structure_file = structure_file
self.min_status = min_status

self._simulations: dict[str, Simulation] = {} # hash -> Simulation
self._discovery_df: pd.DataFrame | None = None
Expand Down Expand Up @@ -105,6 +110,7 @@ def discover(self, refresh: bool = False) -> pd.DataFrame:
root,
trajectory_file=self.trajectory_file,
structure_file=self.structure_file,
min_status=self.min_status,
)
dfs.append(df)

Expand Down Expand Up @@ -240,13 +246,137 @@ def build_metadata_table(
"path": path,
**flattened,
}

# Merge tags into metadata row
if build_input.tags:
for tag_key, tag_val in build_input.tags.items():
metadata_row[f"tag_{tag_key}"] = tag_val

metadata_rows.append(metadata_row)

metadata_df = pd.DataFrame(metadata_rows)
logger.info(f"Built metadata table with {len(metadata_df)} rows")

return metadata_df

def search(
    self,
    *,
    simulation_type: str | None = None,
    status: str | None = None,
    hash_prefix: str | None = None,
    tags: dict[str, str] | None = None,
    smiles: str | None = None,
) -> pd.DataFrame:
    """Search and filter discovered simulations.

    Applies all provided filters conjunctively (AND logic). A filter left
    as ``None`` is not applied.

    Parameters
    ----------
    simulation_type : str | None
        Filter by simulation type (exact match).
    status : str | None
        Filter by minimum status threshold: a simulation is kept when its
        position in ``STATUS_ORDER`` is at or after that of ``status``.
    hash_prefix : str | None
        Filter by hash prefix (case-insensitive).
    tags : dict[str, str] | None
        Filter by tag key-value pairs (all must match). Simulations
        without tags never match a tag filter.
    smiles : str | None
        Filter by SMILES substructure match against any species.

    Returns
    -------
    pd.DataFrame
        Filtered DataFrame with columns: hash, path, simulation_type,
        status, tags (dict or None).

    Raises
    ------
    ValueError
        If ``status`` is given and is not a member of ``STATUS_ORDER``.
        This is only checked when at least one simulation was discovered.
    ImportError
        If ``smiles`` is given and the chemistry utilities cannot be
        imported.

    """
    from loguru import logger

    result_columns = ["hash", "path", "simulation_type", "status", "tags"]

    self._ensure_discovered()

    if len(self._discovery_df) == 0:
        logger.info("No simulations to search")
        return pd.DataFrame(columns=result_columns)

    # Pre-validate and import dependencies before iterating
    from .constants import STATUS_ORDER

    # The threshold index does not depend on the row, so resolve it once.
    min_status_idx = None
    if status is not None:
        if status not in STATUS_ORDER:
            raise ValueError(f"Invalid status '{status}'. Must be one of: {STATUS_ORDER}")
        min_status_idx = STATUS_ORDER.index(status)

    smiles_substructure_match = None
    if smiles is not None:
        try:
            from mdfactory.utils.chemistry_utilities import (
                smiles_substructure_match,
            )
        except ImportError as err:
            # Chain the original failure so the real import error stays visible.
            raise ImportError(
                "RDKit is required for SMILES search. "
                "Install it via conda: conda install -c conda-forge rdkit"
            ) from err

    # Upper-case the prefix once instead of once per row.
    prefix_upper = hash_prefix.upper() if hash_prefix is not None else None

    # Build result rows from discovery
    rows = []
    for _, row in self._discovery_df.iterrows():
        sim = row["simulation"]
        bi = sim.build_input
        sim_hash = row["hash"]
        sim_path = row["path"]
        sim_status = sim.status

        # Filter: simulation_type
        if simulation_type is not None and bi.simulation_type != simulation_type:
            continue

        # Filter: status (minimum threshold)
        if min_status_idx is not None and STATUS_ORDER.index(sim_status) < min_status_idx:
            continue

        # Filter: hash prefix
        if prefix_upper is not None and not sim_hash.upper().startswith(prefix_upper):
            continue

        # Filter: tags
        if tags is not None:
            if bi.tags is None:
                continue
            if not all(bi.tags.get(k) == v for k, v in tags.items()):
                continue

        # Filter: SMILES substructure (species without a SMILES are ignored)
        if smiles is not None:
            candidates = (getattr(species, "smiles", None) for species in bi.system.species)
            if not any(
                candidate and smiles_substructure_match(smiles, candidate)
                for candidate in candidates
            ):
                continue

        rows.append(
            {
                "hash": sim_hash,
                "path": sim_path,
                "simulation_type": bi.simulation_type,
                "status": sim_status,
                "tags": bi.tags,
            }
        )

    logger.info(f"Search returned {len(rows)} results")
    return pd.DataFrame(rows, columns=result_columns)

def load_analysis_with_metadata(
self,
analysis_name: str,
Expand Down
46 changes: 9 additions & 37 deletions mdfactory/analysis/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

import os
import shutil
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Iterable
Expand All @@ -16,42 +15,15 @@
from mdfactory.analysis.artifacts import ARTIFACT_REGISTRY
from mdfactory.analysis.simulation import ANALYSIS_REGISTRY, Simulation


@dataclass(frozen=True)
class SlurmConfig:
"""Configuration for submitit/SLURM execution."""

account: str
partition: str = "cpu"
time: str = "2h"
cpus_per_task: int = 4
mem_gb: int = 8
qos: str | None = None
constraint: str | None = None
job_name_prefix: str = "mdfactory-analysis"


def normalize_slurm_time(value: str) -> str:
    """Normalize SLURM time strings to accepted formats.

    Surrounding whitespace is stripped first. The result depends on the
    shape of the remaining text (unit suffixes are case-insensitive):

    * contains ``:``  -> returned unchanged
    * ``<N>d``        -> ``N-00:00:00``
    * ``<N>h``        -> ``HH:00:00``
    * ``<N>m``        -> ``HH:MM:00``
    * digits only     -> treated as minutes, ``HH:MM:00``
    * anything else   -> returned unchanged

    Raises
    ------
    ValueError
        If a ``d``/``h``/``m`` suffix is present but the part before it
        is not accepted by ``int()``.
    """
    text = value.strip()
    # A colon means the caller already supplied a clock-style value.
    if ":" in text:
        return text

    folded = text.lower()
    # Slicing (rather than indexing) keeps the empty string safe.
    amount, suffix = folded[:-1], folded[-1:]

    if suffix == "d":
        return f"{int(amount)}-00:00:00"
    if suffix == "h":
        return f"{int(amount):02d}:00:00"

    # Both "<N>m" and a bare number are minute counts.
    if suffix == "m":
        total_minutes = int(amount)
    elif folded.isdigit():
        total_minutes = int(folded)
    else:
        return text

    whole_hours, leftover = divmod(total_minutes, 60)
    return f"{whole_hours:02d}:{leftover:02d}:00"
# SlurmConfig and normalize_slurm_time live in the performance package so that
# every SLURM-facing backend (submitit, Parsl, Nextflow) can share them.
# Re-exported here for backward compatibility:
# from mdfactory.analysis.submit import SlurmConfig # still works
# from mdfactory.analysis.submit import normalize_slurm_time # still works
from mdfactory.performance.slurm_config import ( # noqa: F401 (re-export)
SlurmConfig,
normalize_slurm_time,
)


def resolve_simulation_paths(
Expand Down
Loading
Loading