From fdd39cb53a91157599b9d3e7acd035564fe060ac Mon Sep 17 00:00:00 2001 From: Rudolf Weeber Date: Mon, 15 Jun 2026 12:55:00 +0200 Subject: [PATCH] core: fix Stokesian Dynamics MPI deadlock on missing per-type radius (bug-sweep #25) Bug: StokesianDynamics::propagate_vel_pos evaluated radii.at(p.type) inside the rank-0-only block, between the collective gather_buffer and the collective scatter_buffer. For a particle whose type has no registered radius, at() threw std::out_of_range on rank 0, which unwound out of the integration loop (no parallel_try_catch on that call path) before reaching MPI_Scatterv. The other ranks blocked forever in the matching MPI_Scatterv -> indefinite MPI deadlock (multi-rank) or an uncoordinated std::out_of_range abort (serial). Fix: replace radii.at() with radii.find() and, on a missing type, register a coordinated runtime error via runtimeErrorMsg() instead of throwing. The rank-0 branch still falls through to the collective scatter_buffer (shipping zeroed velocities), so every rank reaches MPI_Scatterv and no deadlock occurs. The integration loop's collective check_runtime_errors(comm_cart) then turns the registered message into a clean cross-rank ESPResSo runtime error ("Stokesian Dynamics: no radius defined for particle type N"). This mirrors the existing SD precondition checks in integrate.cpp. Test: testsuite/python/stokesian_missing_radius.py (NO_MPI) launches the offending scenario as a child mpiexec -n 2 job under a 45 s timeout; on the unfixed core it times out (deadlock), on the fixed core it exits cleanly with the coordinated error. Co-Authored-By: Claude Opus 4.8 --- src/core/stokesian_dynamics/sd_interface.cpp | 33 +++++- testsuite/python/CMakeLists.txt | 2 + testsuite/python/stokesian_missing_radius.py | 101 ++++++++++++++++++ .../python/stokesian_missing_radius_child.py | 62 +++++++++++ 4 files changed, 193 insertions(+), 5 deletions(-) create mode 100644 testsuite/python/stokesian_missing_radius.py create mode 100644 testsuite/python/stokesian_missing_radius_child.py diff --git a/src/core/stokesian_dynamics/sd_interface.cpp b/src/core/stokesian_dynamics/sd_interface.cpp index 5525df7c78..bedb56a03c 100644 --- a/src/core/stokesian_dynamics/sd_interface.cpp +++ b/src/core/stokesian_dynamics/sd_interface.cpp @@ -28,6 +28,7 @@ #include "BoxGeometry.hpp" #include "Particle.hpp" #include "communication.hpp" +#include "errorhandling.hpp" #include "system/System.hpp" #include "thermostat.hpp" @@ -134,6 +135,14 @@ void StokesianDynamics::propagate_vel_pos( f_host.resize(6 * n_part); a_host.resize(n_part); + // Make sure every present particle type has a registered radius. We must + // NOT throw here: this branch runs only on rank 0 and sits between the + // collective gather_buffer (above) and the collective scatter_buffer + // (below). An asymmetric throw would leave the other ranks blocked + // forever in MPI_Scatterv. Instead, register a runtime error (turned + // into a coordinated cross-rank error by check_runtime_errors() in the + // integration loop) and still fall through to the collective scatter. + bool missing_radius = false; std::size_t i = 0; for (auto const &p : parts_buffer) { x_host[6 * i + 0] = p.pos[0]; @@ -152,15 +161,29 @@ void StokesianDynamics::propagate_vel_pos( f_host[6 * i + 4] = p.ext_force.torque[1]; f_host[6 * i + 5] = p.ext_force.torque[2]; - a_host[i] = radii.at(p.type); + auto const radius_it = radii.find(p.type); + if (radius_it == radii.end()) { + runtimeErrorMsg() << "Stokesian Dynamics: no radius defined for " + "particle type " + << p.type; + missing_radius = true; + break; + } + a_host[i] = radius_it->second; ++i; } - v_sd = sd_cpu(x_host, f_host, a_host, n_part, viscosity, - std::sqrt(kT / time_step), - static_cast(stokesian.rng_counter()), - static_cast(stokesian.rng_seed()), flags); + if (missing_radius) { + // Skip the solver and ship zeroed velocities; the registered runtime + // error aborts the run cleanly after the collective scatter below. + v_sd.assign(6 * n_part, 0.); + } else { + v_sd = sd_cpu(x_host, f_host, a_host, n_part, viscosity, + std::sqrt(kT / time_step), + static_cast(stokesian.rng_counter()), + static_cast(stokesian.rng_seed()), flags); + } } else { // if (this_node == 0) v_sd.resize(particles.size() * 6); } // if (this_node == 0) {...} else diff --git a/testsuite/python/CMakeLists.txt b/testsuite/python/CMakeLists.txt index 951e2c19ec..68234868a9 100644 --- a/testsuite/python/CMakeLists.txt +++ b/testsuite/python/CMakeLists.txt @@ -449,6 +449,8 @@ python_test(FILE propagation_brownian.py MAX_NUM_PROC 1) python_test(FILE propagation_lb.py MAX_NUM_PROC 2 GPU_SLOTS 1) python_test(FILE propagation_npt.py MAX_NUM_PROC 4 GPU_SLOTS 1) python_test(FILE propagation_stokesian.py MAX_NUM_PROC 2) +python_test(FILE stokesian_missing_radius.py NO_MPI DEPENDENCIES + stokesian_missing_radius_child.py) python_test(FILE integrator_symplectic_euler_langevin.py MAX_NUM_PROC 4) python_test(FILE integrator_steepest_descent.py MAX_NUM_PROC 4) python_test(FILE integrator_exceptions.py MAX_NUM_PROC 1) diff --git a/testsuite/python/stokesian_missing_radius.py b/testsuite/python/stokesian_missing_radius.py new file mode 100644 index 0000000000..f29fb028e5 --- /dev/null +++ b/testsuite/python/stokesian_missing_radius.py @@ -0,0 +1,101 @@ +# +# Copyright (C) 2026 The ESPResSo project +# +# This file is part of ESPResSo. +# +# ESPResSo is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# ESPResSo is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +""" +Regression test for an MPI deadlock in Stokesian Dynamics. + +When a particle has a type for which no radius was registered, the core +used to call ``radii.at(p.type)`` only inside the rank-0-only block of +``StokesianDynamics::propagate_vel_pos``, *after* the collective +``gather_buffer`` but *before* the collective ``scatter_buffer``. The +resulting ``std::out_of_range`` unwound out of the integration loop on +rank 0, while the other ranks blocked forever in the matching +``MPI_Scatterv`` -> indefinite deadlock. + +This test is registered ``NO_MPI`` so that the parent process is a plain +single process. It then launches the offending scenario as a child +``mpiexec -n 2`` job under a hard timeout. OpenMPI forbids recursive +``mpiexec`` calls, hence the deadlock cannot be reproduced from within an +already-MPI parent. A hang (subprocess timeout) is the failure signature +of the bug; a clean coordinated runtime error (the child exits and prints +``OK``) is the fixed behaviour. +""" +import unittest as ut +import unittest_decorators as utx +import os +import sys +import shutil +import pathlib +import subprocess + + +@utx.skipIfMissingFeatures("STOKESIAN_DYNAMICS") +class StokesianMissingRadius(ut.TestCase): + + # pypresso exports PYTHONPATH, so the plain interpreter in `sys.executable` + # can import espressomd in the child (same pattern as `caliper.py`) + interpreter = sys.executable + child = str(pathlib.Path(__file__).resolve().parent / + "stokesian_missing_radius_child.py") + + @staticmethod + def _clean_mpi_env(): + # importing espressomd already initialized MPI in this (parent) + # process; OpenMPI then refuses the nested ``mpiexec`` ("does not + # support recursive calls") unless the inherited MPI environment is + # stripped from the child's environment. + return {k: v for k, v in os.environ.items() + if not (k.startswith("OMPI_") or k.startswith("PMIX_") or + k.startswith("PMI_"))} + + def test_missing_radius_no_deadlock(self): + """ + A particle whose type has no registered radius must trigger a + coordinated error on all ranks, not an MPI deadlock. + """ + mpiexec = shutil.which("mpiexec") + if mpiexec is None: + self.skipTest("mpiexec not available") + + cmd = [mpiexec, "-n", "2", "--oversubscribe", "--bind-to", "none", + self.interpreter, self.child] + try: + result = subprocess.run( + cmd, timeout=45, capture_output=True, text=True, + env=self._clean_mpi_env()) + except subprocess.TimeoutExpired as err: + self.fail( + "Stokesian Dynamics deadlocked on a particle with a missing " + f"radius (subprocess timed out).\nstdout:\n{err.stdout}\n" + f"stderr:\n{err.stderr}") + + combined = (result.stdout or "") + (result.stderr or "") + # The child must NOT have completed integration silently and must NOT + # have aborted with a raw, uncoordinated std::out_of_range. + self.assertNotIn("unordered_map::at", combined, + msg=f"raw uncoordinated abort:\n{combined}") + self.assertNotIn("NO ERROR RAISED", combined, + msg=f"missing radius went unnoticed:\n{combined}") + # The child reports OK once it sees a clean, coordinated runtime error + # mentioning the missing radius. + self.assertIn("OK: coordinated runtime error raised", combined, + msg=f"unexpected child output:\n{combined}") + + +if __name__ == "__main__": + ut.main() diff --git a/testsuite/python/stokesian_missing_radius_child.py b/testsuite/python/stokesian_missing_radius_child.py new file mode 100644 index 0000000000..bf7052d4a3 --- /dev/null +++ b/testsuite/python/stokesian_missing_radius_child.py @@ -0,0 +1,62 @@ +# +# Copyright (C) 2026 The ESPResSo project +# +# This file is part of ESPResSo. +# +# ESPResSo is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# ESPResSo is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +""" +Helper script for ``propagation_stokesian.py``. + +Sets up Stokesian Dynamics with a radius for type 0 only, but adds a +particle of type 1 (no radius defined). On the unfixed core this makes +rank 0 throw ``std::out_of_range`` from ``radii.at(p.type)`` *after* the +collective gather but *before* the collective scatter, so the other ranks +block forever in ``MPI_Scatterv`` -> MPI deadlock. + +A correct core must raise a coordinated ESPResSo runtime error on all +ranks instead of hanging. + +The script is meant to be launched under ``mpiexec`` with a hard timeout. +A clean coordinated error -> exit code 0 ("OK" printed). A hang/timeout +-> the launcher kills it (the deadlock signature). +""" +import espressomd + +system = espressomd.System(box_l=[10.0, 10.0, 10.0]) +system.cell_system.skin = 0.0 +system.periodicity = [False, False, False] +system.time_step = 0.01 +system.min_global_cut = 4.0 + +# place two particles in different regions so they can land on +# different MPI ranks under the default domain decomposition +system.part.add(pos=[1.0, 1.0, 1.0], type=0) +system.part.add(pos=[9.0, 9.0, 9.0], type=1) + +system.thermostat.set_stokesian(kT=0.0, seed=42) +# radius defined for type 0 only; type 1 has no radius entry +system.integrator.set_stokesian_dynamics(viscosity=1.0, radii={0: 1.0}) + +try: + system.integrator.run(1) +except Exception as err: + # a clean, coordinated error is the desired behaviour + if "radius" in str(err).lower(): + print("OK: coordinated runtime error raised") + else: + print(f"UNEXPECTED ERROR: {err}") + raise +else: + print("NO ERROR RAISED (integration completed unexpectedly)")