From 40ea929010beb46b1471b06243525d24937936a9 Mon Sep 17 00:00:00 2001 From: Kapil Sharma Date: Tue, 10 Mar 2026 09:09:03 -0700 Subject: [PATCH] Fix SIGABRT in gloo_test from thread-unsafe GTEST_SKIP and throwing destructor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: `gloo_test` crashes with `terminate called recursively` (exit code 134) on CI runners that compile with IBVERBS support but lack RDMA hardware at runtime (e.g., GitHub Actions ubuntu-latest with `-DUSE_IBVERBS=ON`). There are two interacting bugs: 1. **GTEST_SKIP() called from worker threads** (`base_test.h`): `BaseTest::spawn()` runs `contextSize` threads via `spawnThreads()`, and each thread calls `GTEST_SKIP()` when `createDevice()` returns nullptr. GTest assertion/skip macros are not thread-safe — concurrent calls race on GTest's internal `TestPartResultReporterInterface`, corrupting state. 2. **IBVERBS Device destructor throws** (`ibverbs/device.cc`): On CI runners with software RDMA devices (rxe/siw), `CreateDevice(IBVERBS)` succeeds but device teardown can fail. `Device::~Device()` uses `GLOO_ENFORCE_EQ` which throws `EnforceNotMet` from a destructor. In C++11+ this is guaranteed to call `std::terminate` — the compiler warns: `'throw' will always call 'terminate' [-Wterminate]`. Fixes: - **base_test.h**: Replace the in-thread `GTEST_SKIP()` with a `std::atomic` flag. Worker threads set the flag and return early when the transport is unavailable. After all threads join, the main thread checks the flag and calls `GTEST_SKIP()` — safe because it runs single-threaded. - **ibverbs/device.cc**: Replace `GLOO_ENFORCE_EQ` (which throws) in `Device::~Device()` with `GLOO_ERROR` (which logs). Destructors must never throw. Differential Revision: D95934130 --- gloo/test/base_test.h | 16 ++++++++++++++-- gloo/transport/ibverbs/device.cc | 14 +++++++++++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/gloo/test/base_test.h b/gloo/test/base_test.h index 307b4a8a6..7da5cfdea 100644 --- a/gloo/test/base_test.h +++ b/gloo/test/base_test.h @@ -10,6 +10,7 @@ #include +#include #include #include #include @@ -112,6 +113,13 @@ class BaseTest : public ::testing::Test { device_creator, std::function)> fn, int base = 2) { + // Track whether workers found the transport unavailable so we can call + // GTEST_SKIP() from the main thread after joining. GTEST_SKIP() is not + // thread-safe and must not be called from worker threads — doing so + // causes a data race on GTest internals that can manifest as + // "terminate called recursively" (SIGABRT / exit code 134). + std::atomic transportUnavailable{false}; + Barrier barrier(size); auto store = std::make_shared<::gloo::rendezvous::HashStore>(); @@ -119,11 +127,11 @@ class BaseTest : public ::testing::Test { auto context = std::make_shared<::gloo::rendezvous::Context>(rank, size, base); - // Create device per thread to avoid collisions then they are using the + // Create device per thread to avoid collisions when they are using the // socket address. auto device = device_creator(transport); if (!device) { - GTEST_SKIP() << "Skipping test: transport not available"; + transportUnavailable.store(true); return; } context->connectFullMesh(store, device); @@ -150,6 +158,10 @@ class BaseTest : public ::testing::Test { context->closeConnections(); } }); + + if (transportUnavailable.load()) { + GTEST_SKIP() << "Skipping test: transport not available"; + } } void spawn( diff --git a/gloo/transport/ibverbs/device.cc b/gloo/transport/ibverbs/device.cc index 48bd2782a..65cacc4d6 100644 --- a/gloo/transport/ibverbs/device.cc +++ b/gloo/transport/ibverbs/device.cc @@ -154,14 +154,22 @@ Device::~Device() { done_ = true; loop_->join(); + // Log errors instead of throwing — destructors are implicitly noexcept + // in C++11+, so a throw here calls std::terminate(). rv = ibv_destroy_comp_channel(comp_channel_); - GLOO_ENFORCE_EQ(rv, 0, strerror(errno)); + if (rv != 0) { + GLOO_ERROR("ibv_destroy_comp_channel: ", strerror(errno)); + } rv = ibv_dealloc_pd(pd_); - GLOO_ENFORCE_EQ(rv, 0, strerror(errno)); + if (rv != 0) { + GLOO_ERROR("ibv_dealloc_pd: ", strerror(errno)); + } rv = ibv_close_device(context_); - GLOO_ENFORCE_EQ(rv, 0, strerror(errno)); + if (rv != 0) { + GLOO_ERROR("ibv_close_device: ", strerror(errno)); + } } std::string Device::str() const {