Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 28 additions & 5 deletions src/core/stokesian_dynamics/sd_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "BoxGeometry.hpp"
#include "Particle.hpp"
#include "communication.hpp"
#include "errorhandling.hpp"
#include "system/System.hpp"
#include "thermostat.hpp"

Expand Down Expand Up @@ -134,6 +135,14 @@ void StokesianDynamics::propagate_vel_pos(
f_host.resize(6 * n_part);
a_host.resize(n_part);

// Make sure every present particle type has a registered radius. We must
// NOT throw here: this branch runs only on rank 0 and sits between the
// collective gather_buffer (above) and the collective scatter_buffer
// (below). An asymmetric throw would leave the other ranks blocked
// forever in MPI_Scatterv. Instead, register a runtime error (turned
// into a coordinated cross-rank error by check_runtime_errors() in the
// integration loop) and still fall through to the collective scatter.
bool missing_radius = false;
std::size_t i = 0;
for (auto const &p : parts_buffer) {
x_host[6 * i + 0] = p.pos[0];
Expand All @@ -152,15 +161,29 @@ void StokesianDynamics::propagate_vel_pos(
f_host[6 * i + 4] = p.ext_force.torque[1];
f_host[6 * i + 5] = p.ext_force.torque[2];

a_host[i] = radii.at(p.type);
auto const radius_it = radii.find(p.type);
if (radius_it == radii.end()) {
runtimeErrorMsg() << "Stokesian Dynamics: no radius defined for "
"particle type "
<< p.type;
missing_radius = true;
break;
}
a_host[i] = radius_it->second;

++i;
}

v_sd = sd_cpu(x_host, f_host, a_host, n_part, viscosity,
std::sqrt(kT / time_step),
static_cast<std::size_t>(stokesian.rng_counter()),
static_cast<std::size_t>(stokesian.rng_seed()), flags);
if (missing_radius) {
// Skip the solver and ship zeroed velocities; the registered runtime
// error aborts the run cleanly after the collective scatter below.
v_sd.assign(6 * n_part, 0.);
} else {
v_sd = sd_cpu(x_host, f_host, a_host, n_part, viscosity,
std::sqrt(kT / time_step),
static_cast<std::size_t>(stokesian.rng_counter()),
static_cast<std::size_t>(stokesian.rng_seed()), flags);
}
} else { // if (this_node == 0)
v_sd.resize(particles.size() * 6);
} // if (this_node == 0) {...} else
Expand Down
2 changes: 2 additions & 0 deletions testsuite/python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,8 @@ python_test(FILE propagation_brownian.py MAX_NUM_PROC 1)
python_test(FILE propagation_lb.py MAX_NUM_PROC 2 GPU_SLOTS 1)
python_test(FILE propagation_npt.py MAX_NUM_PROC 4 GPU_SLOTS 1)
python_test(FILE propagation_stokesian.py MAX_NUM_PROC 2)
python_test(FILE stokesian_missing_radius.py NO_MPI DEPENDENCIES
stokesian_missing_radius_child.py)
python_test(FILE integrator_symplectic_euler_langevin.py MAX_NUM_PROC 4)
python_test(FILE integrator_steepest_descent.py MAX_NUM_PROC 4)
python_test(FILE integrator_exceptions.py MAX_NUM_PROC 1)
Expand Down
101 changes: 101 additions & 0 deletions testsuite/python/stokesian_missing_radius.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#
# Copyright (C) 2026 The ESPResSo project
#
# This file is part of ESPResSo.
#
# ESPResSo is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ESPResSo is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
"""
Regression test for an MPI deadlock in Stokesian Dynamics.

When a particle has a type for which no radius was registered, the core
used to call ``radii.at(p.type)`` only inside the rank-0-only block of
``StokesianDynamics::propagate_vel_pos``, *after* the collective
``gather_buffer`` but *before* the collective ``scatter_buffer``. The
resulting ``std::out_of_range`` unwound out of the integration loop on
rank 0, while the other ranks blocked forever in the matching
``MPI_Scatterv`` -> indefinite deadlock.

This test is registered ``NO_MPI`` so that the parent process is a plain
single process. It then launches the offending scenario as a child
``mpiexec -n 2`` job under a hard timeout. OpenMPI forbids recursive
``mpiexec`` calls, hence the deadlock cannot be reproduced from within an
already-MPI parent. A hang (subprocess timeout) is the failure signature
of the bug; a clean coordinated runtime error (the child exits and prints
``OK``) is the fixed behaviour.
"""
import unittest as ut
import unittest_decorators as utx
import os
import sys
import shutil
import pathlib
import subprocess


@utx.skipIfMissingFeatures("STOKESIAN_DYNAMICS")
class StokesianMissingRadius(ut.TestCase):

# pypresso exports PYTHONPATH, so the plain interpreter in `sys.executable`
# can import espressomd in the child (same pattern as `caliper.py`)
interpreter = sys.executable
child = str(pathlib.Path(__file__).resolve().parent /
"stokesian_missing_radius_child.py")

@staticmethod
def _clean_mpi_env():
# importing espressomd already initialized MPI in this (parent)
# process; OpenMPI then refuses the nested ``mpiexec`` ("does not
# support recursive calls") unless the inherited MPI environment is
# stripped from the child's environment.
return {k: v for k, v in os.environ.items()
if not (k.startswith("OMPI_") or k.startswith("PMIX_") or
k.startswith("PMI_"))}

def test_missing_radius_no_deadlock(self):
"""
A particle whose type has no registered radius must trigger a
coordinated error on all ranks, not an MPI deadlock.
"""
mpiexec = shutil.which("mpiexec")
if mpiexec is None:
self.skipTest("mpiexec not available")

cmd = [mpiexec, "-n", "2", "--oversubscribe", "--bind-to", "none",
self.interpreter, self.child]
try:
result = subprocess.run(
cmd, timeout=45, capture_output=True, text=True,
env=self._clean_mpi_env())
except subprocess.TimeoutExpired as err:
self.fail(
"Stokesian Dynamics deadlocked on a particle with a missing "
f"radius (subprocess timed out).\nstdout:\n{err.stdout}\n"
f"stderr:\n{err.stderr}")

combined = (result.stdout or "") + (result.stderr or "")
# The child must NOT have completed integration silently and must NOT
# have aborted with a raw, uncoordinated std::out_of_range.
self.assertNotIn("unordered_map::at", combined,
msg=f"raw uncoordinated abort:\n{combined}")
self.assertNotIn("NO ERROR RAISED", combined,
msg=f"missing radius went unnoticed:\n{combined}")
# The child reports OK once it sees a clean, coordinated runtime error
# mentioning the missing radius.
self.assertIn("OK: coordinated runtime error raised", combined,
msg=f"unexpected child output:\n{combined}")


if __name__ == "__main__":
ut.main()
62 changes: 62 additions & 0 deletions testsuite/python/stokesian_missing_radius_child.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#
# Copyright (C) 2026 The ESPResSo project
#
# This file is part of ESPResSo.
#
# ESPResSo is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ESPResSo is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
"""
Helper script for ``propagation_stokesian.py``.

Sets up Stokesian Dynamics with a radius for type 0 only, but adds a
particle of type 1 (no radius defined). On the unfixed core this makes
rank 0 throw ``std::out_of_range`` from ``radii.at(p.type)`` *after* the
collective gather but *before* the collective scatter, so the other ranks
block forever in ``MPI_Scatterv`` -> MPI deadlock.

A correct core must raise a coordinated ESPResSo runtime error on all
ranks instead of hanging.

The script is meant to be launched under ``mpiexec`` with a hard timeout.
A clean coordinated error -> exit code 0 ("OK" printed). A hang/timeout
-> the launcher kills it (the deadlock signature).
"""
import espressomd

system = espressomd.System(box_l=[10.0, 10.0, 10.0])
system.cell_system.skin = 0.0
system.periodicity = [False, False, False]
system.time_step = 0.01
system.min_global_cut = 4.0

# place two particles in different regions so they can land on
# different MPI ranks under the default domain decomposition
system.part.add(pos=[1.0, 1.0, 1.0], type=0)
system.part.add(pos=[9.0, 9.0, 9.0], type=1)

system.thermostat.set_stokesian(kT=0.0, seed=42)
# radius defined for type 0 only; type 1 has no radius entry
system.integrator.set_stokesian_dynamics(viscosity=1.0, radii={0: 1.0})

try:
system.integrator.run(1)
except Exception as err:
# a clean, coordinated error is the desired behaviour
if "radius" in str(err).lower():
print("OK: coordinated runtime error raised")
else:
print(f"UNEXPECTED ERROR: {err}")
raise
else:
print("NO ERROR RAISED (integration completed unexpectedly)")
Loading