diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh index 37b8375197..51a1ec1d56 100644 --- a/cpp/include/raft/core/bitset.cuh +++ b/cpp/include/raft/core/bitset.cuh @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -164,6 +165,8 @@ void bitset_view::repeat(const raft::resources& res, index_t times, bitset_t* output_device_ptr) const { + // Only a copy and kernel run below this point. + if (resource::get_dry_run_flag(res)) { return; } constexpr index_t bits_per_element = sizeof(bitset_t) * 8; if (bitset_len_ % bits_per_element == 0) { diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp index d6b3fb7b63..8b6f8ab70c 100644 --- a/cpp/include/raft/core/bitset.hpp +++ b/cpp/include/raft/core/bitset.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -131,9 +132,11 @@ struct bitset_view { auto count_gpu_scalar = raft::make_device_scalar(res, 0.0); count(res, count_gpu_scalar.view()); index_t count_cpu = 0; - raft::update_host( - &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res)); - resource::sync_stream(res); + if (!resource::get_dry_run_flag(res)) { + raft::update_host( + &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res)); + resource::sync_stream(res); + } return count_cpu; } @@ -406,9 +409,11 @@ struct bitset { auto count_gpu_scalar = raft::make_device_scalar(res, 0.0); count(res, count_gpu_scalar.view()); index_t count_cpu = 0; - raft::update_host( - &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res)); - resource::sync_stream(res); + if (!resource::get_dry_run_flag(res)) { + raft::update_host( + &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res)); 
+ resource::sync_stream(res); + } return count_cpu; } /** diff --git a/cpp/include/raft/core/coo_matrix.hpp b/cpp/include/raft/core/coo_matrix.hpp index 62ad6fda0a..0c8dff62c4 100644 --- a/cpp/include/raft/core/coo_matrix.hpp +++ b/cpp/include/raft/core/coo_matrix.hpp @@ -1,6 +1,6 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -179,8 +179,8 @@ class coordinate_structure : public coordinate_structure_tget_n_rows() + 1); - c_indices_.resize(nnz); + c_indptr_.reallocate(this->get_n_rows() + 1); + c_indices_.reallocate(nnz); } protected: diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp index 8905f4c29b..058fcaac34 100644 --- a/cpp/include/raft/core/detail/copy.hpp +++ b/cpp/include/raft/core/detail/copy.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -398,6 +399,10 @@ mdspan_copyable_t copy(resources const& res, DstType&& dst, Sr RAFT_EXPECTS(src.extent(i) == dst.extent(i), "Must copy between mdspans of the same shape"); } + // Dry-run guard: raft::copy is a pure data-movement utility with no + // allocations that callers would need tracked. + if (resource::get_dry_run_flag(res)) { return; } + if constexpr (config::use_intermediate_src) { #ifndef RAFT_DISABLE_CUDA // Copy to intermediate source on device, then perform necessary diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp index 9a9871a3ab..ff60d99c10 100644 --- a/cpp/include/raft/core/device_container_policy.hpp +++ b/cpp/include/raft/core/device_container_policy.hpp @@ -126,6 +126,29 @@ class device_uvector { void resize(size_type size) { data_.resize(size, data_.stream()); } + /** + * @brief Resize the internal buffer without copying old data. + * + * Unlike resize(), this never copies old data. 
+ * Thus, unlike in resize(), there's no point in time where the old and the new buffers are both + * alive, and the peak memory usage is lower. + * + * Unlike resize(), this deallocates the old buffer even if the new size is smaller. + * This ensures the memory is released promptly. + */ + void reallocate(size_type size) + { + if (size != data_.size()) { + auto stream = data_.stream(); + auto mr = data_.memory_resource(); + // Resize and shrink rmm::device_uvector: force deallocation without copying old data + data_.resize(0, data_.stream()); + data_.shrink_to_fit(data_.stream()); + // Assign a new value after the old one is deallocated + data_ = rmm::device_uvector(size, stream, mr); + } + } + [[nodiscard]] auto data() noexcept -> pointer { return data_.data(); } [[nodiscard]] auto data() const noexcept -> const_pointer { return data_.data(); } }; diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp index c575546be2..40b48f0f6e 100644 --- a/cpp/include/raft/core/device_mdarray.hpp +++ b/cpp/include/raft/core/device_mdarray.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -163,7 +164,7 @@ auto make_device_scalar(raft::resources const& handle, ElementType const& v) using policy_t = typename device_scalar::container_policy_type; policy_t policy{}; auto scalar = device_scalar{handle, extents, policy}; - scalar(0) = v; + if (!resource::get_dry_run_flag(handle)) { scalar(0) = v; } return scalar; } diff --git a/cpp/include/raft/core/host_container_policy.hpp b/cpp/include/raft/core/host_container_policy.hpp index 47db081771..87a0acea77 100644 --- a/cpp/include/raft/core/host_container_policy.hpp +++ b/cpp/include/raft/core/host_container_policy.hpp @@ -104,6 +104,27 @@ requires cuda::mr::synchronous_resource_with *this = std::move(new_container); } + /** + * @brief Resize the internal buffer without copying old data. + * + * Unlike resize(), this never copies old data. 
+ * Thus, unlike in resize(), there's no point in time where the old and the new buffers are both + * alive, and the peak memory usage is lower. + * + * Unlike resize(), this deallocates the old buffer even if the new size is smaller. + * This ensures the memory is released promptly. + */ + void reallocate(size_type count) + { + if (bytesize_ == sizeof(value_type) * count) { return; } + if (data_ != nullptr) { + mr_.deallocate_sync(data_, bytesize_); + data_ = nullptr; + } + auto tmp = host_container{count, mr_}; + std::swap(tmp, *this); + } + [[nodiscard]] auto data() noexcept -> pointer { return data_; } [[nodiscard]] auto data() const noexcept -> const_pointer { return data_; } }; diff --git a/cpp/include/raft/core/host_mdarray.hpp b/cpp/include/raft/core/host_mdarray.hpp index 535d4f47bf..b0751b28d3 100644 --- a/cpp/include/raft/core/host_mdarray.hpp +++ b/cpp/include/raft/core/host_mdarray.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -223,7 +224,7 @@ auto make_host_scalar(raft::resources const& res, ElementType const& v) using policy_t = typename host_scalar::container_policy_type; policy_t policy; auto scalar = host_scalar{res, extents, policy}; - scalar(0) = v; + if (!resource::get_dry_run_flag(res)) { scalar(0) = v; } return scalar; } diff --git a/cpp/include/raft/core/managed_mdarray.hpp b/cpp/include/raft/core/managed_mdarray.hpp index 21db4b52aa..b52d7c6eba 100644 --- a/cpp/include/raft/core/managed_mdarray.hpp +++ b/cpp/include/raft/core/managed_mdarray.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -117,7 +118,7 @@ auto make_managed_scalar(raft::resources const& handle, ElementType const& v) using policy_t = typename managed_scalar::container_policy_type; policy_t policy{}; auto scalar = managed_scalar{handle, extents, policy}; - scalar(0) = v; + if (!resource::get_dry_run_flag(handle)) { scalar(0) = v; } return scalar; } diff --git a/cpp/include/raft/core/pinned_mdarray.hpp 
b/cpp/include/raft/core/pinned_mdarray.hpp index f01f00f897..3f1ae81244 100644 --- a/cpp/include/raft/core/pinned_mdarray.hpp +++ b/cpp/include/raft/core/pinned_mdarray.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -117,7 +118,7 @@ auto make_pinned_scalar(raft::resources const& handle, ElementType const& v) using policy_t = typename pinned_scalar::container_policy_type; policy_t policy{}; auto scalar = pinned_scalar{handle, extents, policy}; - scalar(0) = v; + if (!resource::get_dry_run_flag(handle)) { scalar(0) = v; } return scalar; } diff --git a/cpp/include/raft/core/resource/cuda_stream.hpp b/cpp/include/raft/core/resource/cuda_stream.hpp index 690bd610f9..454082d7c3 100644 --- a/cpp/include/raft/core/resource/cuda_stream.hpp +++ b/cpp/include/raft/core/resource/cuda_stream.hpp @@ -1,10 +1,11 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include +#include #include #include #include @@ -82,13 +83,18 @@ inline void set_cuda_stream(resources const& res, rmm::cuda_stream_view stream_v */ inline void sync_stream(const resources& res, rmm::cuda_stream_view stream) { + if (raft::resource::get_dry_run_flag(res)) { return; } interruptible::synchronize(stream); } /** * @brief synchronize main stream on the resources instance */ -inline void sync_stream(const resources& res) { sync_stream(res, get_cuda_stream(res)); } +inline void sync_stream(const resources& res) +{ + if (raft::resource::get_dry_run_flag(res)) { return; } + sync_stream(res, get_cuda_stream(res)); +} /** * @} diff --git a/cpp/include/raft/core/resource/dry_run_flag.hpp b/cpp/include/raft/core/resource/dry_run_flag.hpp new file mode 100644 index 0000000000..4d0c9e27b5 --- /dev/null +++ b/cpp/include/raft/core/resource/dry_run_flag.hpp @@ -0,0 +1,89 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA 
CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include + +#include + +namespace raft::resource { + +/** + * @defgroup dry_run_flag Dry-run flag resource + * @{ + */ + +/** + * @brief Resource that holds a boolean dry-run flag. + * + * When the dry-run flag is set, algorithms should skip kernel execution + * and only perform allocations to measure memory usage. + */ +class dry_run_flag_resource : public resource { + public: + dry_run_flag_resource() = default; + explicit dry_run_flag_resource(bool value) : flag_(value) {} + ~dry_run_flag_resource() override = default; + + auto get_resource() -> void* override { return &flag_; } + + void set(bool value) { flag_ = value; } + [[nodiscard]] auto get() const -> bool { return flag_; } + + private: + bool flag_{false}; +}; + +/** + * @brief Factory that creates a dry_run_flag_resource. + */ +class dry_run_flag_resource_factory : public resource_factory { + public: + explicit dry_run_flag_resource_factory(bool initial_value = false) : initial_value_(initial_value) + { + } + + auto get_resource_type() -> resource_type override { return resource_type::DRY_RUN_FLAG; } + auto make_resource() -> resource* override { return new dry_run_flag_resource(initial_value_); } + + private: + bool initial_value_; +}; + +/** + * @brief Get the dry-run flag from a resources handle. + * + * @param res raft resources object + * @return true if dry-run mode is active + */ +inline auto get_dry_run_flag(resources const& res) -> bool +{ + if (!res.has_resource_factory(resource_type::DRY_RUN_FLAG)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::DRY_RUN_FLAG); +} + +/** + * @brief Set the dry-run flag on a resources handle. 
+ * + * @param res raft resources object + * @param value true to enable dry-run mode, false to disable + */ +inline void set_dry_run_flag(resources const& res, bool value) +{ + if (!res.has_resource_factory(resource_type::DRY_RUN_FLAG)) { + res.add_resource_factory(std::make_shared(value)); + } else { + // The resource may already be instantiated; update it directly + auto* flag = res.get_resource(resource_type::DRY_RUN_FLAG); + *flag = value; + } +} + +/** @} */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index cda3c8ecae..105adc4018 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -40,6 +40,7 @@ enum resource_type { MULTI_GPU, // resource that tracks resource of each device in multi-gpu world PINNED_MEMORY_RESOURCE, // memory resource for pinned (page-locked) host allocations MANAGED_MEMORY_RESOURCE, // resource for managed (unified) allocations + DRY_RUN_FLAG, // dry-run mode flag for allocation profiling LAST_KEY // reserved for the last key }; diff --git a/cpp/include/raft/core/sparse_types.hpp b/cpp/include/raft/core/sparse_types.hpp index c0de7ca673..acdf14cf2a 100644 --- a/cpp/include/raft/core/sparse_types.hpp +++ b/cpp/include/raft/core/sparse_types.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -177,7 +177,7 @@ class sparse_matrix { ~sparse_matrix() noexcept(std::is_nothrow_destructible::value) = default; - void initialize_sparsity(nnz_type nnz) { c_elements_.resize(nnz); }; + void initialize_sparsity(nnz_type nnz) { c_elements_.reallocate(nnz); }; raft::span get_elements() { diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index 6e299182da..66a3af3b52 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2022, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __CLASS_LABELS_H @@ -7,11 +7,37 @@ #pragma once +#include +#include +#include #include namespace raft { namespace label { +/** + * Get unique class labels. + * + * The y array is assumed to store class labels. The unique values are selected + * from this array. + * + * @tparam value_t numeric type of the arrays with class labels + * @param [in] handle raft resources handle (dry-run aware) + * @param [inout] unique output unique labels + * @param [in] y device array of labels, size [n] + * @param [in] n number of labels + * @returns number of unique labels (upper bound in dry-run mode) + */ +template +int getUniquelabels(raft::resources const& handle, + rmm::device_uvector& unique, + value_t* y, + size_t n) +{ + return detail::getUniquelabels( + resource::get_dry_run_flag(handle), unique, y, n, resource::get_cuda_stream(handle)); +} + /** * Get unique class labels. * diff --git a/cpp/include/raft/label/detail/classlabels.cuh b/cpp/include/raft/label/detail/classlabels.cuh index 8b0a296eb3..ab66d82a32 100644 --- a/cpp/include/raft/label/detail/classlabels.cuh +++ b/cpp/include/raft/label/detail/classlabels.cuh @@ -29,15 +29,17 @@ namespace detail { * from this array. 
* * \tparam value_t numeric type of the arrays with class labels - * \param [in] y device array of labels, size [n] - * \param [in] n number of labels + * \param [in] dry_run if true, perform allocations but skip CUDA work * \param [out] unique device array of unique labels, unallocated on entry, * on exit it has size [n_unique] - * \param [out] n_unique number of unique labels + * \param [in] y device array of labels, size [n] + * \param [in] n number of labels * \param [in] stream cuda stream + * \return number of unique labels (upper bound when dry_run is true) */ template -int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream) +int getUniquelabels( + bool dry_run, rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream) { rmm::device_scalar d_num_selected(stream); rmm::device_uvector workspace(n, stream); @@ -53,6 +55,11 @@ int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, bytes = std::max(bytes, bytes2); rmm::device_uvector cub_storage(bytes, stream); + if (dry_run) { + if (unique.size() < n) { unique = rmm::device_uvector(n, stream); } + return static_cast(n); + } + // Select Unique classes cub::DeviceRadixSort::SortKeys( cub_storage.data(), bytes, y, workspace.data(), n, 0, sizeof(value_t) * 8, stream); @@ -72,6 +79,26 @@ int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, return n_unique; } +/** + * Get unique class labels. + * + * The y array is assumed to store class labels. The unique values are selected + * from this array. 
+ * + * \tparam value_t numeric type of the arrays with class labels + * \param [out] unique device array of unique labels, unallocated on entry, + * on exit it has size [n_unique] + * \param [in] y device array of labels, size [n] + * \param [in] n number of labels + * \param [in] stream cuda stream + * \return number of unique labels + */ +template +int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream) +{ + return getUniquelabels(false, unique, y, n, stream); +} + /** * Assign one versus rest labels. * diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index b1953470b0..c0e086f43f 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __ADD_H @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace raft { @@ -102,6 +103,7 @@ template > void add(raft::resources const& handle, InType in1, InType in2, OutType out) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -139,6 +141,7 @@ void add_scalar(raft::resources const& handle, OutType out, raft::device_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -174,6 +177,7 @@ void add_scalar(raft::resources const& handle, OutType out, raft::host_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index 3ed5ed7736..ca6548f28b 100644 --- 
a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __COALESCED_REDUCTION_H @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace raft { @@ -62,7 +63,7 @@ void coalescedReduction(OutType* dots, FinalLambda final_op = raft::identity_op()) { detail::coalescedReduction( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + false, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } /** @@ -120,30 +121,32 @@ void coalesced_reduction(raft::resources const& handle, RAFT_EXPECTS(static_cast(dots.size()) == data.extent(0), "Output should be equal to number of rows in Input"); - coalescedReduction(dots.data_handle(), - data.data_handle(), - data.extent(1), - data.extent(0), - init, - resource::get_cuda_stream(handle), - inplace, - main_op, - reduce_op, - final_op); + detail::coalescedReduction(resource::get_dry_run_flag(handle), + dots.data_handle(), + data.data_handle(), + data.extent(1), + data.extent(0), + init, + resource::get_cuda_stream(handle), + inplace, + main_op, + reduce_op, + final_op); } else if constexpr (std::is_same_v) { RAFT_EXPECTS(static_cast(dots.size()) == data.extent(1), "Output should be equal to number of columns in Input"); - coalescedReduction(dots.data_handle(), - data.data_handle(), - data.extent(0), - data.extent(1), - init, - resource::get_cuda_stream(handle), - inplace, - main_op, - reduce_op, - final_op); + detail::coalescedReduction(resource::get_dry_run_flag(handle), + dots.data_handle(), + data.data_handle(), + data.extent(0), + data.extent(1), + init, + resource::get_cuda_stream(handle), + inplace, + main_op, + reduce_op, + final_op); } } diff --git a/cpp/include/raft/linalg/detail/axpy.cuh 
b/cpp/include/raft/linalg/detail/axpy.cuh index 1ab690937d..40634b6428 100644 --- a/cpp/include/raft/linalg/detail/axpy.cuh +++ b/cpp/include/raft/linalg/detail/axpy.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -8,6 +8,7 @@ #include "cublas_wrappers.hpp" #include +#include #include #include @@ -24,6 +25,7 @@ void axpy(raft::resources const& handle, const int incy, cudaStream_t stream) { + if (resource::get_dry_run_flag(handle)) { return; } auto cublas_h = resource::get_cublas_handle(handle); cublas_device_pointer_mode pmode(cublas_h); RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, n, alpha, x, incx, y, incy, stream)); diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh index b05449f90a..d997377d54 100644 --- a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -53,6 +54,7 @@ void choleskyRank1Update(raft::resources const& handle, *n_bytes = offset + 1 * sizeof(math_t); return; } + if (resource::get_dry_run_flag(handle)) { return; } math_t* s = reinterpret_cast(((char*)workspace) + offset); math_t* L_22 = L + (n - 1) * ld + n - 1; diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index 2d513b433d..f44aa48cfb 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -498,7 +498,8 @@ template -void coalescedReductionThick(OutType* dots, +void coalescedReductionThick(bool dry_run, + OutType* dots, const InType* data, IdxType D, IdxType N, @@ -517,6 +518,8 @@ void coalescedReductionThick(OutType* dots, rmm::device_uvector buffer(N * ThickPolicy::BlocksPerRow, stream); + if (dry_run) { return; } + /* We apply a two-step reduction: * 1. coalescedReductionThickKernel reduces the [N x D] input data to [N x BlocksPerRow]. It * applies the main_op but not the final op. @@ -550,7 +553,8 @@ template -void coalescedReductionThickDispatcher(OutType* dots, +void coalescedReductionThickDispatcher(bool dry_run, + OutType* dots, const InType* data, IdxType D, IdxType N, @@ -564,7 +568,7 @@ void coalescedReductionThickDispatcher(OutType* dots, // Note: multiple elements per thread to take advantage of the sequential reduction and loop // unrolling coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + dry_run, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } // Primitive to perform reductions along the coalesced dimension of the matrix, i.e. 
reduce along @@ -579,7 +583,8 @@ template -void coalescedReduction(OutType* dots, +void coalescedReduction(bool dry_run, + OutType* dots, const InType* data, IdxType D, IdxType N, @@ -600,12 +605,16 @@ void coalescedReduction(OutType* dots, */ const IdxType numSMs = raft::getMultiProcessorCount(); if (D <= IdxType(512) || (N >= IdxType(16) * numSMs && D < IdxType(2048))) { + if (dry_run) { return; } coalescedReductionThinDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (N < numSMs && D >= IdxType(1 << 17)) { + // Must call through to coalescedReductionThick even in dry-run so workspace + // allocations are recorded (coalescedReductionThick allocates before guarding). coalescedReductionThickDispatcher( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + dry_run, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else { + if (dry_run) { return; } coalescedReductionMediumDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } diff --git a/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp b/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp index 469780ba1f..3ffa4ded84 100644 --- a/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -282,6 +283,8 @@ template batch_scope( "linalg::matmul(m = %d, n = %d, k = %d)", m, n, k); std::shared_ptr mm_desc{nullptr}; diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index 5b64add128..d8d31fc411 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -44,9 +45,13 @@ void eigDC_legacy(raft::resources const& handle, eig_vals, &lwork)); + // TODO(achirkin): Consider using the workspace resource for these temporary allocations. rmm::device_uvector d_work(lwork, stream); rmm::device_scalar d_dev_info(stream); + // The workspace is already allocated, no more allocation are foreseeable. + if (resource::get_dry_run_flag(handle)) { return; } + raft::matrix::copy(handle, make_device_matrix_view(in, n_rows, n_cols), make_device_matrix_view(eig_vectors, n_rows, n_cols)); @@ -115,6 +120,12 @@ void eigDC(raft::resources const& handle, rmm::device_scalar d_dev_info(stream_new); std::vector h_work(workspaceHost / sizeof(math_t)); + if (resource::get_dry_run_flag(handle)) { + // No more allocations beyond this points, but need to cleanup. + RAFT_CUSOLVER_TRY(cusolverDnDestroyParams(dn_params)); + return; + } + raft::copy(eig_vectors, in, n_rows * n_cols, stream_new); RAFT_CUSOLVER_TRY(cusolverDnxsyevd(cusolverH, @@ -181,7 +192,9 @@ void eigSelDC(raft::resources const& handle, rmm::device_uvector d_work(lwork, stream); rmm::device_scalar d_dev_info(stream); - rmm::device_uvector d_eig_vectors(0, stream); + rmm::device_uvector d_eig_vectors(memUsage == COPY_INPUT ? 
n_rows * n_cols : 0, stream); + + if (resource::get_dry_run_flag(handle)) { return; } if (memUsage == OVERWRITE_INPUT) { RAFT_CUSOLVER_TRY(cusolverDnsyevdx(cusolverH, @@ -202,7 +215,6 @@ void eigSelDC(raft::resources const& handle, d_dev_info.data(), stream)); } else if (memUsage == COPY_INPUT) { - d_eig_vectors.resize(n_rows * n_cols, stream); raft::matrix::copy(handle, make_device_matrix_view(in, n_rows, n_cols), make_device_matrix_view(eig_vectors, n_rows, n_cols)); @@ -279,6 +291,12 @@ void eigJacobi(raft::resources const& handle, rmm::device_uvector d_work(lwork, stream); rmm::device_scalar dev_info(stream); + if (resource::get_dry_run_flag(handle)) { + // No more allocations beyond this point, but need to cleanup. + RAFT_CUSOLVER_TRY(cusolverDnDestroySyevjInfo(syevj_params)); + return; + } + raft::matrix::copy(handle, make_device_matrix_view(in, n_rows, n_cols), make_device_matrix_view(eig_vectors, n_rows, n_cols)); diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp index 3233940a66..905ecab0c5 100644 --- a/cpp/include/raft/linalg/detail/gemv.hpp +++ b/cpp/include/raft/linalg/detail/gemv.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -8,6 +8,7 @@ #include "cublas_wrappers.hpp" #include +#include #include #include @@ -31,6 +32,7 @@ void gemv(raft::resources const& handle, const int incy, cudaStream_t stream) { + if (resource::get_dry_run_flag(handle)) { return; } cublasHandle_t cublas_h = resource::get_cublas_handle(handle); detail::cublas_device_pointer_mode pmode(cublas_h); RAFT_CUBLAS_TRY(detail::cublasgemv(cublas_h, @@ -109,6 +111,7 @@ void gemv(raft::resources const& handle, const math_t beta, cudaStream_t stream) { + if (resource::get_dry_run_flag(handle)) { return; } cublasHandle_t cublas_h = resource::get_cublas_handle(handle); cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; RAFT_CUBLAS_TRY( diff --git a/cpp/include/raft/linalg/detail/lstsq.cuh b/cpp/include/raft/linalg/detail/lstsq.cuh index 176c7763ba..1d37119a45 100644 --- a/cpp/include/raft/linalg/detail/lstsq.cuh +++ b/cpp/include/raft/linalg/detail/lstsq.cuh @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -130,6 +131,9 @@ void lstsqSvdQR(raft::resources const& handle, + 1 // devInfo , stream); + + if (resource::get_dry_run_flag(handle)) { return; } + math_t* cusolverWorkSet = workset.data(); math_t* U = cusolverWorkSet + cusolverWorkSetSize; math_t* Vt = U + n_rows * minmn; @@ -204,6 +208,12 @@ void lstsqSvdJacobi(raft::resources const& handle, + 1 // devInfo , stream); + + if (resource::get_dry_run_flag(handle)) { + RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params)); + return; + } + math_t* cusolverWorkSet = workset.data(); math_t* U = cusolverWorkSet + cusolverWorkSetSize; math_t* V = U + n_rows * minmn; @@ -248,21 +258,27 @@ void lstsqEig(raft::resources const& handle, { rmm::cuda_stream_view mainStream = rmm::cuda_stream_view(stream); rmm::cuda_stream_view multAbStream = resource::get_next_usable_stream(handle); + bool dry_run = resource::get_dry_run_flag(handle); bool concurrent; - // Check if the two streams can run 
concurrently. This is needed because a legacy default stream - // would synchronize with other blocking streams. To avoid synchronization in such case, we try to - // use an additional stream from the pool. - if (!are_implicitly_synchronized(mainStream, multAbStream)) { - concurrent = true; - } else if (resource::get_stream_pool_size(handle) > 1) { - mainStream = resource::get_next_usable_stream(handle); - concurrent = true; + if (dry_run) { + concurrent = false; } else { - multAbStream = mainStream; - concurrent = false; + // Check if the two streams can run concurrently. This is needed because a legacy default stream + // would synchronize with other blocking streams. To avoid synchronization in such case, we try + // to use an additional stream from the pool. + if (!are_implicitly_synchronized(mainStream, multAbStream)) { + concurrent = true; + } else if (resource::get_stream_pool_size(handle) > 1) { + mainStream = resource::get_next_usable_stream(handle); + concurrent = true; + } else { + multAbStream = mainStream; + concurrent = false; + } } rmm::device_uvector workset(n_cols * n_cols * 3 + n_cols * 2, mainStream); + // the event is created only if the given raft handle is capable of running // at least two CUDA streams without implicit synchronization. 
DeviceEvent worksetDone(concurrent); @@ -302,8 +318,8 @@ void lstsqEig(raft::resources const& handle, raft::common::nvtx::pop_range(); // QS <- Q invS - raft::linalg::matrixVectorOp( - QS, Q, S, n_cols, n_cols, DivideByNonZero(), mainStream); + raft::linalg::detail::matrixVectorOp( + dry_run, QS, Q, S, n_cols, n_cols, DivideByNonZero(), mainStream); // covA <- QS Q* == Q invS Q* == inv(A* A) raft::linalg::gemm(handle, QS, @@ -392,6 +408,8 @@ void lstsqQR(raft::resources const& handle, rmm::device_uvector d_work(lwork, stream); + if (resource::get_dry_run_flag(handle)) { return; } + // #TODO: Call from public API when ready RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf( cusolverH, m, n, A, lda, d_tau.data(), d_work.data(), lwork, d_info.data(), stream)); diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 3153de5396..5678f8e39b 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -206,6 +207,7 @@ template > void map(const raft::resources& res, OutType out, Func f, InTypes... ins) { + if (resource::get_dry_run_flag(res)) { return; } RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); (map_check_shape(out, ins), ...); diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh index 64de01a3fe..3275410bac 100644 --- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -1,11 +1,12 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #pragma once #include +#include #include namespace raft { @@ -19,7 +20,8 @@ template -void matrixVectorOp(MatT* out, +void matrixVectorOp(bool dry_run, + MatT* out, const MatT* matrix, const VecT* vec, IdxType D, @@ -27,6 +29,7 @@ void matrixVectorOp(MatT* out, Lambda op, cudaStream_t stream) { + if (dry_run) { return; } raft::resources handle; resource::set_cuda_stream(handle, stream); constexpr raft::Apply apply = @@ -56,7 +59,8 @@ template -void matrixVectorOp(MatT* out, +void matrixVectorOp(bool dry_run, + MatT* out, const MatT* matrix, const Vec1T* vec1, const Vec2T* vec2, @@ -65,6 +69,7 @@ void matrixVectorOp(MatT* out, Lambda op, cudaStream_t stream) { + if (dry_run) { return; } raft::resources handle; resource::set_cuda_stream(handle, stream); constexpr raft::Apply apply = diff --git a/cpp/include/raft/linalg/detail/norm.cuh b/cpp/include/raft/linalg/detail/norm.cuh index ea7f5c8d28..549ecda0f5 100644 --- a/cpp/include/raft/linalg/detail/norm.cuh +++ b/cpp/include/raft/linalg/detail/norm.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -19,18 +19,23 @@ template -void rowNormCaller( - OutType* dots, const Type* data, IdxType D, IdxType N, cudaStream_t stream, Lambda fin_op) +void rowNormCaller(bool dry_run, + OutType* dots, + const Type* data, + IdxType D, + IdxType N, + cudaStream_t stream, + Lambda fin_op) { if constexpr (norm_type == L1Norm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op); + reduce( + dry_run, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op); } else if constexpr (norm_type == L2Norm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op); + reduce( + dry_run, dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op); } else if constexpr (norm_type == LinfNorm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op); + reduce( + dry_run, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op); } else { THROW("Unsupported norm type: %d", norm_type); } @@ -42,18 +47,23 @@ template -void colNormCaller( - OutType* dots, const Type* data, IdxType D, IdxType N, cudaStream_t stream, Lambda fin_op) +void colNormCaller(bool dry_run, + OutType* dots, + const Type* data, + IdxType D, + IdxType N, + cudaStream_t stream, + Lambda fin_op) { if constexpr (norm_type == L1Norm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op); + reduce( + dry_run, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op); } else if constexpr (norm_type == L2Norm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op); + reduce( + dry_run, dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op); } else if constexpr (norm_type == 
LinfNorm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op); + reduce( + dry_run, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op); } else { THROW("Unsupported norm type: %d", norm_type); } diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh index 63cba5d73c..14b453203c 100644 --- a/cpp/include/raft/linalg/detail/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -9,6 +9,7 @@ #include "cusolver_wrappers.hpp" #include +#include #include #include @@ -39,15 +40,26 @@ void qrGetQ_inplace( { RAFT_EXPECTS(n_rows >= n_cols, "QR decomposition expects n_rows >= n_cols."); cusolverDnHandle_t cusolver = resource::get_cusolver_dn_handle(handle); + auto is_dry_run = resource::get_dry_run_flag(handle); rmm::device_uvector tau(n_cols, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * n_cols, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * n_cols, stream)); + } rmm::device_scalar dev_info(stream); - int ws_size; + int ws_size_Dngeqrf; + int ws_size_Dnorgqr; + + RAFT_CUSOLVER_TRY( + cusolverDngeqrf_bufferSize(cusolver, n_rows, n_cols, Q, n_rows, &ws_size_Dngeqrf)); + RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize( + cusolver, n_rows, n_cols, n_cols, Q, n_rows, tau.data(), &ws_size_Dnorgqr)); + + rmm::device_uvector workspace(std::max(ws_size_Dngeqrf, ws_size_Dnorgqr), stream); + + if (is_dry_run) { return; } - RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize(cusolver, n_rows, n_cols, Q, n_rows, &ws_size)); - rmm::device_uvector workspace(ws_size, stream); RAFT_CUSOLVER_TRY(cusolverDngeqrf(cusolver, n_rows, n_cols,
workspace.data(), - ws_size, + ws_size_Dngeqrf, dev_info.data(), stream)); - RAFT_CUSOLVER_TRY( - cusolverDnorgqr_bufferSize(cusolver, n_rows, n_cols, n_cols, Q, n_rows, tau.data(), &ws_size)); - workspace.resize(ws_size, stream); RAFT_CUSOLVER_TRY(cusolverDnorgqr(cusolver, n_rows, n_cols, @@ -70,7 +79,7 @@ void qrGetQ_inplace( n_rows, tau.data(), workspace.data(), - ws_size, + ws_size_Dnorgqr, dev_info.data(), stream)); } @@ -83,7 +92,7 @@ void qrGetQ(raft::resources const& handle, int n_cols, cudaStream_t stream) { - raft::copy(Q, M, n_rows * n_cols, stream); + if (!resource::get_dry_run_flag(handle)) { raft::copy(Q, M, n_rows * n_cols, stream); } qrGetQ_inplace(handle, Q, n_rows, n_cols, stream); } @@ -99,19 +108,32 @@ void qrGetQR(raft::resources const& handle, cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); int m = n_rows, n = n_cols; + int R_full_nrows = m, R_full_ncols = n; + int Q_nrows = m, Q_ncols = n; + int Lwork_Dngeqrf, Lwork_Dnorgqr; rmm::device_uvector R_full(m * n, stream); rmm::device_uvector tau(std::min(m, n), stream); + rmm::device_scalar devInfo(stream); + + RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize( + cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork_Dngeqrf)); + RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(cusolverH, + Q_nrows, + Q_ncols, + std::min(Q_ncols, Q_nrows), + Q, + Q_nrows, + tau.data(), + &Lwork_Dnorgqr)); + + rmm::device_uvector workspace(std::max(Lwork_Dngeqrf, Lwork_Dnorgqr), stream); + + if (resource::get_dry_run_flag(handle)) { return; } + RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * std::min(m, n), stream)); - int R_full_nrows = m, R_full_ncols = n; RAFT_CUDA_TRY( cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); - int Lwork; - rmm::device_scalar devInfo(stream); - - RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize( - cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork)); - rmm::device_uvector 
workspace(Lwork, stream); RAFT_CUSOLVER_TRY(cusolverDngeqrf(cusolverH, R_full_nrows, R_full_ncols, @@ -119,7 +141,7 @@ void qrGetQR(raft::resources const& handle, R_full_nrows, tau.data(), workspace.data(), - Lwork, + Lwork_Dngeqrf, devInfo.data(), stream)); @@ -130,11 +152,7 @@ void qrGetQR(raft::resources const& handle, RAFT_CUDA_TRY( cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); - int Q_nrows = m, Q_ncols = n; - RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize( - cusolverH, Q_nrows, Q_ncols, std::min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork)); - workspace.resize(Lwork, stream); RAFT_CUSOLVER_TRY(cusolverDnorgqr(cusolverH, Q_nrows, Q_ncols, @@ -143,7 +161,7 @@ void qrGetQR(raft::resources const& handle, Q_nrows, tau.data(), workspace.data(), - Lwork, + Lwork_Dnorgqr, devInfo.data(), stream)); } diff --git a/cpp/include/raft/linalg/detail/reduce.cuh b/cpp/include/raft/linalg/detail/reduce.cuh index 4d90e32e99..a9ea95ca28 100644 --- a/cpp/include/raft/linalg/detail/reduce.cuh +++ b/cpp/include/raft/linalg/detail/reduce.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -21,7 +21,8 @@ template -void reduce(OutType* dots, +void reduce(bool dry_run, + OutType* dots, const InType* data, IdxType D, IdxType N, @@ -33,17 +34,19 @@ void reduce(OutType* dots, FinalLambda final_op = raft::identity_op()) { if constexpr (rowMajor && alongRows) { - raft::linalg::coalescedReduction( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + coalescedReduction( + dry_run, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if constexpr (rowMajor && !alongRows) { + if (dry_run) { return; } // no allocations in strided reduction raft::linalg::stridedReduction( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if constexpr (!rowMajor && alongRows) { + if (dry_run) { return; } // no allocations in strided reduction raft::linalg::stridedReduction( dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } else { - raft::linalg::coalescedReduction( - dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); + coalescedReduction( + dry_run, dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } } diff --git a/cpp/include/raft/linalg/detail/rsvd.cuh b/cpp/include/raft/linalg/detail/rsvd.cuh index 9dcdd1ed14..85b3a0cbcc 100644 --- a/cpp/include/raft/linalg/detail/rsvd.cuh +++ b/cpp/include/raft/linalg/detail/rsvd.cuh @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -85,6 +86,8 @@ void randomized_svd(const raft::resources& handle, auto h_workspace = raft::make_host_vector(workspaceHost); auto devInfo = raft::make_device_scalar(handle, 0); + if (resource::get_dry_run_flag(handle)) { return; } + RAFT_CUSOLVER_TRY(cusolverDnxgesvdr(cusolverH, jobu, jobv, @@ -154,6 +157,7 @@ void rsvdFixedRank(raft::resources const& handle, int max_sweeps, cudaStream_t stream) { + bool is_dry_run = resource::get_dry_run_flag(handle); cusolverDnHandle_t cusolverH = 
resource::get_cusolver_dn_handle(handle); cublasHandle_t cublasH = resource::get_cublas_handle(handle); @@ -171,7 +175,9 @@ void rsvdFixedRank(raft::resources const& handle, // Build temporary U, S, V matrices rmm::device_uvector S_vec_tmp(l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream)); + } // build random matrix rmm::device_uvector RN(n * l, stream); @@ -187,9 +193,11 @@ void rsvdFixedRank(raft::resources const& handle, rmm::device_uvector Z(n * l, stream); rmm::device_uvector Yorth(m * l, stream); rmm::device_uvector Zorth(n * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream)); + } // power sampling scheme for (int j = 1; j < q; j++) { @@ -236,30 +244,40 @@ void rsvdFixedRank(raft::resources const& handle, // orthogonalize on exit from loop to get Q rmm::device_uvector Q(m * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream)); + if (!is_dry_run) { RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream)); } raft::linalg::qrGetQ(handle, Y.data(), Q.data(), m, l, stream); // either QR of B^T method, or eigendecompose BB^T method if (!use_bbt) { // form Bt = Mt*Q : nxm * mxl = nxl rmm::device_uvector Bt(n * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream)); + } 
raft::linalg::gemm( handle, M, m, n, Q.data(), Bt.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); // compute QR factorization of Bt // M is mxn ; Q is mxn ; R is min(m,n) x min(m,n) */ rmm::device_uvector Qhat(n * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream)); + } rmm::device_uvector Rhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream)); + } raft::linalg::qrGetQR(handle, Bt.data(), Qhat.data(), Rhat.data(), n, l, stream); // compute SVD of Rhat (lxl) rmm::device_uvector Uhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + } rmm::device_uvector Vhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream)); + } if (use_jacobi) raft::linalg::svdJacobi(handle, Rhat.data(), @@ -350,9 +368,13 @@ void rsvdFixedRank(raft::resources const& handle, // compute eigendecomposition of BBt rmm::device_uvector Uhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + } rmm::device_uvector Uhat_dup(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream)); + } raft::matrix::upper_triangular( handle, @@ -397,9 +419,13 @@ void rsvdFixedRank(raft::resources const& handle, // 
Sigma^{-1}[(p+1):l, (p+1):l] nxl * lxk * kxk = nxk if (gen_right_vec) { rmm::device_uvector Sinv(k * k, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * k * k, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * k * k, stream)); + } rmm::device_uvector UhatSinv(l * k, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream)); + } math_t scalar = 1.0; raft::matrix::reciprocal( handle, diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh index ba831822d7..d4100cf473 100644 --- a/cpp/include/raft/linalg/detail/svd.cuh +++ b/cpp/include/raft/linalg/detail/svd.cuh @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,8 @@ void svdQR(raft::resources const& handle, RAFT_CUSOLVER_TRY(cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); rmm::device_uvector d_work(lwork, stream); + if (resource::get_dry_run_flag(handle)) { return; } + char jobu = 'S'; char jobvt = 'A'; @@ -216,6 +219,11 @@ void svdJacobi(raft::resources const& handle, rmm::device_uvector d_work(lwork, stream); + if (resource::get_dry_run_flag(handle)) { + RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params)); + return; + } + RAFT_CUSOLVER_TRY(cusolverDngesvdj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, @@ -280,16 +288,19 @@ bool evaluateSVDByL2Norm(raft::resources const& handle, math_t tol, cudaStream_t stream) { - cublasHandle_t cublasH = resource::get_cublas_handle(handle); - int m = n_rows, n = n_cols; + bool is_dry_run = resource::get_dry_run_flag(handle); // form product matrix rmm::device_uvector P_d(m * n, stream); rmm::device_uvector S_mat(k * k, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(S_mat.data(), 0, 
sizeof(math_t) * k * k, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); + } + + // These RAFT functions have their own dry-run guards at the leaf level raft::matrix::set_diagonal(handle, make_device_vector_view(S_vec, k), make_device_matrix_view(S_mat.data(), k, k)); @@ -307,8 +318,12 @@ bool evaluateSVDByL2Norm(raft::resources const& handle, // calculate percent error const math_t alpha = 1.0, beta = -1.0; rmm::device_uvector A_minus_P(m * n, stream); + + if (is_dry_run) { return false; } + RAFT_CUDA_TRY(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); + cublasHandle_t cublasH = resource::get_cublas_handle(handle); RAFT_CUBLAS_TRY(cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, diff --git a/cpp/include/raft/linalg/detail/transpose.cuh b/cpp/include/raft/linalg/detail/transpose.cuh index 9efac50763..beda4aaa04 100644 --- a/cpp/include/raft/linalg/detail/transpose.cuh +++ b/cpp/include/raft/linalg/detail/transpose.cuh @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -87,6 +88,7 @@ void transpose_half(raft::resources const& handle, const IndexType stride_out = 1) { if (n_cols == 0 || n_rows == 0) return; + if (resource::get_dry_run_flag(handle)) { return; } auto stream = resource::get_cuda_stream(handle); int dev_id, sm_count; @@ -134,6 +136,7 @@ void transpose(raft::resources const& handle, int n_cols, cudaStream_t stream) { + if (resource::get_dry_run_flag(handle)) { return; } int out_n_rows = n_cols; int out_n_cols = n_rows; @@ -188,6 +191,7 @@ void transpose_row_major_impl( raft::mdspan, LayoutPolicy, AccessorPolicy> in, raft::mdspan, LayoutPolicy, AccessorPolicy> out) { + if (resource::get_dry_run_flag(handle)) { return; } auto out_n_rows = in.extent(1); auto out_n_cols = in.extent(0); T constexpr kOne = 1; @@ -230,6 +234,7 @@ void transpose_col_major_impl( raft::mdspan, 
LayoutPolicy, AccessorPolicy> in, raft::mdspan, LayoutPolicy, AccessorPolicy> out) { + if (resource::get_dry_run_flag(handle)) { return; } auto out_n_rows = in.extent(1); auto out_n_cols = in.extent(0); T constexpr kOne = 1; diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh index 69600f016c..0a64b8db55 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __DIVIDE_H @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -61,6 +62,7 @@ void divide_scalar(raft::resources const& handle, OutType out, raft::host_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/dot.cuh b/cpp/include/raft/linalg/dot.cuh index af40c07459..086633745b 100644 --- a/cpp/include/raft/linalg/dot.cuh +++ b/cpp/include/raft/linalg/dot.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #ifndef __DOT_H @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,7 @@ void dot(raft::resources const& handle, { RAFT_EXPECTS(x.size() == y.size(), "Size mismatch between x and y input vectors in raft::linalg::dot"); + if (resource::get_dry_run_flag(handle)) { return; } RAFT_CUBLAS_TRY(detail::cublasdot(resource::get_cublas_handle(handle), x.size(), @@ -70,6 +72,7 @@ void dot(raft::resources const& handle, { RAFT_EXPECTS(x.size() == y.size(), "Size mismatch between x and y input vectors in raft::linalg::dot"); + if (resource::get_dry_run_flag(handle)) { return; } RAFT_CUBLAS_TRY(detail::cublasdot(resource::get_cublas_handle(handle), x.size(), diff --git a/cpp/include/raft/linalg/map_reduce.cuh b/cpp/include/raft/linalg/map_reduce.cuh index e5176dda01..3c206bc11b 100644 --- a/cpp/include/raft/linalg/map_reduce.cuh +++ b/cpp/include/raft/linalg/map_reduce.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __MAP_REDUCE_H @@ -11,6 +11,7 @@ #include #include +#include namespace raft::linalg { @@ -89,6 +90,7 @@ void map_reduce(raft::resources const& handle, ReduceLambda op, Args... args) { + if (resource::get_dry_run_flag(handle)) { return; } mapReduce( out.data_handle(), in.extent(0), diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 47a3cd9ce8..abd437ab91 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #ifndef __MATRIX_VECTOR_OP_H @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -56,7 +57,7 @@ void matrixVectorOp(MatT* out, Lambda op, cudaStream_t stream) { - detail::matrixVectorOp(out, matrix, vec, D, N, op, stream); + detail::matrixVectorOp(false, out, matrix, vec, D, N, op, stream); } /** @@ -100,7 +101,8 @@ void matrixVectorOp(MatT* out, Lambda op, cudaStream_t stream) { - detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, op, stream); + detail::matrixVectorOp( + false, out, matrix, vec1, vec2, D, N, op, stream); } /** @@ -156,13 +158,14 @@ void matrix_vector_op(raft::resources const& handle, "Size mismatch between matrix and vector"); } - matrixVectorOp(out.data_handle(), - matrix.data_handle(), - vec.data_handle(), - out.extent(1), - out.extent(0), - op, - resource::get_cuda_stream(handle)); + detail::matrixVectorOp(resource::get_dry_run_flag(handle), + out.data_handle(), + matrix.data_handle(), + vec.data_handle(), + out.extent(1), + out.extent(0), + op, + resource::get_cuda_stream(handle)); } /** @@ -221,14 +224,15 @@ void matrix_vector_op(raft::resources const& handle, "Size mismatch between matrix and vector"); } - matrixVectorOp(out.data_handle(), - matrix.data_handle(), - vec1.data_handle(), - vec2.data_handle(), - out.extent(1), - out.extent(0), - op, - resource::get_cuda_stream(handle)); + detail::matrixVectorOp(resource::get_dry_run_flag(handle), + out.data_handle(), + matrix.data_handle(), + vec1.data_handle(), + vec2.data_handle(), + out.extent(1), + out.extent(0), + op, + resource::get_cuda_stream(handle)); } /** @} */ // end of group matrix_vector_op diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh index 70c04ccc6b..85ca248cf5 100644 --- a/cpp/include/raft/linalg/mean_squared_error.cuh +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA 
CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __MSE_H @@ -11,6 +11,7 @@ #include #include +#include namespace raft { namespace linalg { @@ -57,6 +58,7 @@ void mean_squared_error(raft::resources const& handle, raft::device_scalar_view out, OutValueType weight) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(A.size() == B.size(), "Size mismatch between inputs"); meanSquaredError(out.data_handle(), diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh index 22c89a5883..325918868e 100644 --- a/cpp/include/raft/linalg/multiply.cuh +++ b/cpp/include/raft/linalg/multiply.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __MULTIPLY_H @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace raft { @@ -63,6 +64,7 @@ void multiply_scalar( OutType out, raft::host_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index e16fbf4353..c0839aca44 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #ifndef __NORM_H @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -54,7 +55,7 @@ void rowNorm(OutType* dots, cudaStream_t stream, Lambda fin_op = raft::identity_op()) { - detail::rowNormCaller(dots, data, D, N, stream, fin_op); + detail::rowNormCaller(false, dots, data, D, N, stream, fin_op); } /** @@ -85,7 +86,7 @@ void colNorm(OutType* dots, cudaStream_t stream, Lambda fin_op = raft::identity_op()) { - detail::colNormCaller(dots, data, D, N, stream, fin_op); + detail::colNormCaller(false, dots, data, D, N, stream, fin_op); } /** @@ -128,21 +129,23 @@ void norm(raft::resources const& handle, if constexpr (along_rows) { RAFT_EXPECTS(static_cast(out.size()) == in.extent(0), "Output should be equal to number of rows in Input"); - rowNorm(out.data_handle(), - in.data_handle(), - in.extent(1), - in.extent(0), - resource::get_cuda_stream(handle), - fin_op); + detail::rowNormCaller(resource::get_dry_run_flag(handle), + out.data_handle(), + in.data_handle(), + in.extent(1), + in.extent(0), + resource::get_cuda_stream(handle), + fin_op); } else { RAFT_EXPECTS(static_cast(out.size()) == in.extent(1), "Output should be equal to number of columns in Input"); - colNorm(out.data_handle(), - in.data_handle(), - in.extent(1), - in.extent(0), - resource::get_cuda_stream(handle), - fin_op); + detail::colNormCaller(resource::get_dry_run_flag(handle), + out.data_handle(), + in.data_handle(), + in.extent(1), + in.extent(0), + resource::get_cuda_stream(handle), + fin_op); } } diff --git a/cpp/include/raft/linalg/normalize.cuh b/cpp/include/raft/linalg/normalize.cuh index 730d5aff25..86b59751f5 100644 --- a/cpp/include/raft/linalg/normalize.cuh +++ b/cpp/include/raft/linalg/normalize.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,7 @@ void row_normalize(raft::resources const& handle, FinalLambda fin_op, ElementType eps = ElementType(1e-8)) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); RAFT_EXPECTS(in.extent(0) == out.extent(0), diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh index ae4820cda3..de6461bc83 100644 --- a/cpp/include/raft/linalg/power.cuh +++ b/cpp/include/raft/linalg/power.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2018-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __POWER_H @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -74,6 +75,7 @@ template > void power(raft::resources const& handle, InType in1, InType in2, OutType out) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -112,6 +114,7 @@ void power_scalar( OutType out, const raft::host_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index ce2c324f24..e3650469df 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #ifndef __REDUCE_H @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -71,7 +72,7 @@ void reduce(OutType* dots, FinalLambda final_op = raft::identity_op()) { detail::reduce( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + false, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } /** @@ -166,16 +167,18 @@ void reduce(raft::resources const& handle, "Output should be equal to number of columns in Input"); } - reduce(dots.data_handle(), - data.data_handle(), - data.extent(1), - data.extent(0), - init, - resource::get_cuda_stream(handle), - inplace, - main_op, - reduce_op, - final_op); + detail::reduce( + resource::get_dry_run_flag(handle), + dots.data_handle(), + data.data_handle(), + data.extent(1), + data.extent(0), + init, + resource::get_cuda_stream(handle), + inplace, + main_op, + reduce_op, + final_op); } /** @} */ // end of group reduction diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh index e0ac2d6544..eb90244cc3 100644 --- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #ifndef __REDUCE_COLS_BY_KEY @@ -11,6 +11,7 @@ #include #include +#include #include namespace raft { @@ -81,6 +82,7 @@ void reduce_cols_by_key( IndexType nkeys = 0, bool reset_sums = true) { + if (resource::get_dry_run_flag(handle)) { return; } if (nkeys > 0) { RAFT_EXPECTS(out.extent(1) == nkeys, "Output doesn't have nkeys columns"); } else { diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh index 7e7e91bcb9..685f8fb962 100644 --- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __REDUCE_ROWS_BY_KEY @@ -11,6 +11,7 @@ #include #include +#include #include namespace raft { @@ -147,6 +148,7 @@ void reduce_rows_by_key( std::optional> d_weights = std::nullopt, bool reset_sums = true) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(d_A.extent(0) == d_A.extent(0) && d_sums.extent(1) == n_unique_keys, "Output is not of size ncols * n_unique_keys"); RAFT_EXPECTS(d_keys.extent(0) == d_A.extent(1), "Keys is not of size nrows"); diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh index e0c232e62a..abf19e765e 100644 --- a/cpp/include/raft/linalg/sqrt.cuh +++ b/cpp/include/raft/linalg/sqrt.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2018-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #ifndef __SQRT_H @@ -10,6 +10,7 @@ #include #include #include +#include #include namespace raft { @@ -51,6 +52,7 @@ template > void sqrt(raft::resources const& handle, InType in, OutType out) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh index efbd80126e..eb34a99452 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -127,6 +128,7 @@ void strided_reduction(raft::resources const& handle, ReduceLambda reduce_op = raft::add_op(), FinalLambda final_op = raft::identity_op()) { + if (resource::get_dry_run_flag(handle)) { return; } if constexpr (std::is_same_v) { RAFT_EXPECTS(static_cast(dots.size()) == data.extent(1), "Output should be equal to number of columns in Input"); diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh index 1aba864100..08e5f38fbe 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace raft { @@ -98,6 +99,7 @@ template > void subtract(raft::resources const& handle, InType in1, InType in2, OutType out) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -136,6 +138,7 @@ void subtract_scalar( OutType out, raft::device_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -172,6 +175,7 @@ void subtract_scalar( OutType out, raft::host_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 69e2130adb..6cf4b3a266 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __UNARY_OP_H @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -109,6 +110,7 @@ template > void write_only_unary_op(const raft::resources& handle, OutType out, Lambda op) { + if (resource::get_dry_run_flag(handle)) { return; } return writeOnlyUnaryOp(out.data_handle(), out.size(), op, resource::get_cuda_stream(handle)); } diff --git a/cpp/include/raft/matrix/argmax.cuh b/cpp/include/raft/matrix/argmax.cuh index 36a8999b64..caa477fa8e 100644 --- a/cpp/include/raft/matrix/argmax.cuh +++ b/cpp/include/raft/matrix/argmax.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -7,6 +7,7 @@ #include #include +#include #include namespace raft::matrix { @@ -27,6 +28,7 @@ void argmax(raft::resources const& handle, raft::device_matrix_view in, raft::device_vector_view out) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(out.extent(0) == in.extent(0), "Size of output vector must equal number of rows in input matrix."); detail::argmax(in.data_handle(), diff --git a/cpp/include/raft/matrix/argmin.cuh b/cpp/include/raft/matrix/argmin.cuh index a168d3969a..9531b6a426 100644 --- a/cpp/include/raft/matrix/argmin.cuh +++ b/cpp/include/raft/matrix/argmin.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -7,6 +7,7 @@ #include #include +#include #include namespace raft::matrix { @@ -27,6 +28,7 @@ void argmin(raft::resources const& handle, raft::device_matrix_view in, raft::device_vector_view out) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(out.extent(0) == in.extent(0), "Size of output vector must equal number of rows in input matrix."); detail::argmin(in.data_handle(), diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh index 0347797a4c..7e5d95f3eb 100644 --- a/cpp/include/raft/matrix/col_wise_sort.cuh +++ b/cpp/include/raft/matrix/col_wise_sort.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #ifndef __COL_WISE_SORT_H @@ -10,6 +10,7 @@ #include #include #include +#include #include namespace raft::matrix { @@ -38,8 +39,16 @@ void sort_cols_per_row(const InType* in, cudaStream_t stream, InType* sortedKeys = nullptr) { - detail::sortColumnsPerRow( - in, out, n_rows, n_columns, bAllocWorkspace, workspacePtr, workspaceSize, stream, sortedKeys); + detail::sortColumnsPerRow(false, + in, + out, + n_rows, + n_columns, + bAllocWorkspace, + workspacePtr, + workspaceSize, + stream, + sortedKeys); } /** @@ -78,12 +87,14 @@ void sort_cols_per_row(raft::resources const& handle, "Input and `sorted_keys` matrices must have the same shape."); } + bool dry_run = resource::get_dry_run_flag(handle); size_t workspace_size = 0; bool alloc_workspace = false; in_t* keys = sorted_keys.has_value() ? sorted_keys.value().data_handle() : nullptr; - detail::sortColumnsPerRow(in.data_handle(), + detail::sortColumnsPerRow(dry_run, + in.data_handle(), out.data_handle(), in.extent(0), in.extent(1), @@ -96,7 +107,10 @@ void sort_cols_per_row(raft::resources const& handle, if (alloc_workspace) { auto workspace = raft::make_device_vector(handle, workspace_size); - detail::sortColumnsPerRow(in.data_handle(), + if (dry_run) { return; } + + detail::sortColumnsPerRow(dry_run, + in.data_handle(), out.data_handle(), in.extent(0), in.extent(1), diff --git a/cpp/include/raft/matrix/copy.cuh b/cpp/include/raft/matrix/copy.cuh index 8c3f00eca5..0aca60483a 100644 --- a/cpp/include/raft/matrix/copy.cuh +++ b/cpp/include/raft/matrix/copy.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -34,6 +35,7 @@ void copy_rows(raft::resources const& handle, raft::device_matrix_view out, raft::device_vector_view indices) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(in.extent(1) == out.extent(1), "Input and output matrices must have same number of columns"); RAFT_EXPECTS(indices.extent(0) == out.extent(0), @@ -59,6 +61,7 @@ void copy(raft::resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(in.extent(0) == out.extent(0) && in.extent(1) == out.extent(1), "Input and output matrix shapes must match."); @@ -79,6 +82,7 @@ void copy(raft::resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(in.extent(0) == out.extent(0) && in.extent(1) == out.extent(1), "Input and output matrix shapes must match."); @@ -100,6 +104,7 @@ void trunc_zero_origin(raft::resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(out.extent(0) <= in.extent(0) && out.extent(1) <= in.extent(1), "Output matrix must have less or equal number of rows and columns"); diff --git a/cpp/include/raft/matrix/detail/columnWiseSort.cuh b/cpp/include/raft/matrix/detail/columnWiseSort.cuh index a8f654557d..a36e9ee4da 100644 --- a/cpp/include/raft/matrix/detail/columnWiseSort.cuh +++ b/cpp/include/raft/matrix/detail/columnWiseSort.cuh @@ -163,7 +163,8 @@ cudaError_t layoutSortOffset(T* in, T value, int n_times, cudaStream_t stream) * @param sortedKeys: Optional, output matrix for sorted keys (input) */ template -void sortColumnsPerRow(const InType* in, +void sortColumnsPerRow(bool dry_run, + const InType* in, OutType* out, int n_rows, int n_columns, @@ -203,6 +204,8 @@ void 
sortColumnsPerRow(const InType* in, // more elements per thread --> more register pressure // 512(blockSize) * 8 elements per thread = 71 register / thread + if (dry_run) { return; } + // instantiate some kernel combinations if (n_columns <= 512) INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 128, 4, stream); @@ -250,6 +253,8 @@ void sortColumnsPerRow(const InType* in, // for segment offsets workspaceSize += raft::alignTo(sizeof(int) * (size_t)numSegments, memAlignWidth); } else { + if (dry_run) { return; } + size_t workspaceOffset = 0; if (!sortedKeys) { @@ -301,6 +306,8 @@ void sortColumnsPerRow(const InType* in, workspaceSize += raft::alignTo(sizeof(OutType) * (size_t)n_columns, memAlignWidth); } else { + if (dry_run) { return; } + size_t workspaceOffset = 0; bool userKeyOutputBuffer = true; diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh index 20bbc4271d..d931f433f4 100644 --- a/cpp/include/raft/matrix/detail/gather.cuh +++ b/cpp/include/raft/matrix/detail/gather.cuh @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -550,13 +551,15 @@ void gather(raft::resources const& res, device_vector_view indices, raft::device_matrix_view output) { + auto dry_run = resource::get_dry_run_flag(res); raft::common::nvtx::range fun_scope("gather"); IdxT n_dim = output.extent(1); IdxT n_train = output.extent(0); auto indices_host = raft::make_host_vector(n_train); - raft::copy( - indices_host.data_handle(), indices.data_handle(), n_train, resource::get_cuda_stream(res)); - resource::sync_stream(res); + if (!dry_run) { + raft::copy( + indices_host.data_handle(), indices.data_handle(), n_train, resource::get_cuda_stream(res)); + } const size_t buffer_size = 32768 * 1024; // bytes const size_t max_batch_size = @@ -568,6 +571,10 @@ void gather(raft::resources const& res, auto out_tmp1 = raft::make_pinned_matrix(res, max_batch_size, n_dim); auto out_tmp2 = raft::make_pinned_matrix(res, 
max_batch_size, n_dim); + if (dry_run) { return; } + + resource::sync_stream(res); + // Usually a limited number of threads provide sufficient bandwidth for gathering data. #if defined(_OPENMP) int n_threads = std::min(omp_get_max_threads(), 32); diff --git a/cpp/include/raft/matrix/detail/gather_inplace.cuh b/cpp/include/raft/matrix/detail/gather_inplace.cuh index beaad13657..ac9105b1cc 100644 --- a/cpp/include/raft/matrix/detail/gather_inplace.cuh +++ b/cpp/include/raft/matrix/detail/gather_inplace.cuh @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include #include @@ -38,12 +39,14 @@ void gatherInplaceImpl(raft::resources const& handle, // re-assign batch_size for default case if (batch_size == 0 || batch_size > n) batch_size = n; + auto scratch_space = raft::make_device_vector(handle, map_length * batch_size); + + if (resource::get_dry_run_flag(handle)) { return; } + auto exec_policy = resource::get_thrust_policy(handle); IndexT n_batches = raft::ceildiv(n, batch_size); - auto scratch_space = raft::make_device_vector(handle, map_length * batch_size); - for (IndexT bid = 0; bid < n_batches; bid++) { IndexT batch_offset = bid * batch_size; IndexT cols_per_batch = min(batch_size, n - batch_offset); diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index 05416d16be..9eefcf547e 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -6,6 +6,7 @@ #pragma once #include +#include #include #include #include @@ -186,10 +187,10 @@ template void ratio( raft::resources const& handle, const math_t* src, math_t* dest, IdxType len, cudaStream_t stream) { - auto d_src = src; - auto d_dest = dest; - rmm::device_scalar d_sum(stream); + if (resource::get_dry_run_flag(handle)) { return; } + auto d_src = src; + auto d_dest = dest; auto* d_sum_ptr = d_sum.data(); raft::linalg::mapThenSumReduce(d_sum_ptr, len, raft::identity_op{}, stream, src); raft::linalg::unaryOp( @@ -200,15 
+201,16 @@ template ( - data, data, vec, n_col, n_row, raft::mul_op(), stream); + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, n_col, n_row, raft::mul_op(), stream); } template void matrixVectorBinaryMultSkipZero( Type* data, const Type* vec, IdxType n_row, IdxType n_col, cudaStream_t stream) { - raft::linalg::matrixVectorOp( + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, @@ -227,8 +229,8 @@ template ( - data, data, vec, n_col, n_row, raft::div_op(), stream); + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, n_col, n_row, raft::div_op(), stream); } template @@ -240,7 +242,8 @@ void matrixVectorBinaryDivSkipZero(Type* data, bool return_zero = false) { if (return_zero) { - raft::linalg::matrixVectorOp( + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, @@ -254,7 +257,8 @@ void matrixVectorBinaryDivSkipZero(Type* data, }, stream); } else { - raft::linalg::matrixVectorOp( + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, @@ -274,16 +278,16 @@ template ( - data, data, vec, n_col, n_row, raft::add_op(), stream); + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, n_col, n_row, raft::add_op(), stream); } template void matrixVectorBinarySub( Type* data, const Type* vec, IdxType n_row, IdxType n_col, cudaStream_t stream) { - raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, raft::sub_op(), stream); + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, n_col, n_row, raft::sub_op(), stream); } // Computes an argmin/argmax column-wise in a DxN matrix diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index af42e12037..9e2989bee8 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -6,6 +6,7 @@ #pragma once #include +#include #include #include #include @@ -296,6 +297,7 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) template m_t 
getL2Norm(raft::resources const& handle, const m_t* in, idx_t size, cudaStream_t stream) { + if (resource::get_dry_run_flag(handle)) { return m_t{0}; } cublasHandle_t cublasH = resource::get_cublas_handle(handle); m_t normval = 0; RAFT_EXPECTS( diff --git a/cpp/include/raft/matrix/detail/scatter_inplace.cuh b/cpp/include/raft/matrix/detail/scatter_inplace.cuh index 7ffa697f71..0c3ea275b7 100644 --- a/cpp/include/raft/matrix/detail/scatter_inplace.cuh +++ b/cpp/include/raft/matrix/detail/scatter_inplace.cuh @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include #include @@ -63,12 +64,14 @@ void scatterInplaceImpl( // re-assign batch_size for default case if (batch_size == 0 || batch_size > n) batch_size = n; + auto scratch_space = raft::make_device_vector(handle, m * batch_size); + + if (resource::get_dry_run_flag(handle)) { return; } + auto exec_policy = resource::get_thrust_policy(handle); IndexT n_batches = raft::ceildiv(n, batch_size); - auto scratch_space = raft::make_device_vector(handle, m * batch_size); - for (IndexT bid = 0; bid < n_batches; bid++) { IndexT batch_offset = bid * batch_size; IndexT cols_per_batch = min(batch_size, n - batch_offset); diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh index 37411ba0bd..f7b13d9977 100644 --- a/cpp/include/raft/matrix/detail/select_k-inl.cuh +++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -125,6 +126,8 @@ void segmented_sort_by_key(raft::resources const& handle, auto d_temp_storage = raft::make_device_mdarray( handle, mr, raft::make_extents(temp_storage_bytes)); + if (resource::get_dry_run_flag(handle)) { return; } + if (asc) { // Run sorting operation cub::DeviceSegmentedRadixSort::SortPairs((void*)d_temp_storage.data_handle(), diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh index 
a6dd7e0ce5..28eb3b411c 100644 --- a/cpp/include/raft/matrix/detail/select_radix.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -873,7 +874,8 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt) } template -void radix_topk(const T* in, +void radix_topk(bool dry_run, + const T* in, const IdxT* in_idx, int batch_size, IdxT len, @@ -907,6 +909,8 @@ void radix_topk(const T* in, rmm::device_buffer bufs(max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr); + if (dry_run) { return; } + for (size_t offset = 0; offset < static_cast(batch_size); offset += max_chunk_size) { int chunk_size = std::min(max_chunk_size, batch_size - offset); RAFT_CUDA_TRY( @@ -1148,7 +1152,8 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in, // used. It's used when len is relatively small or when the number of blocks per row calculated by // `calc_grid_dim()` is 1. template -void radix_topk_one_block(const T* in, +void radix_topk_one_block(bool dry_run, + const T* in, const IdxT* in_idx, int batch_size, IdxT len, @@ -1170,6 +1175,8 @@ void radix_topk_one_block(const T* in, rmm::device_buffer bufs(max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr); + if (dry_run) { return; } + for (size_t offset = 0; offset < static_cast(batch_size); offset += max_chunk_size) { int chunk_size = std::min(max_chunk_size, batch_size - offset); const IdxT* chunk_len_i = len_i ? 
(len_i + offset) : nullptr; @@ -1266,9 +1273,11 @@ void select_k(raft::resources const& res, RAFT_EXPECTS(RowLayout::is_uniform || len_i != nullptr, "CSR layout requires a non-null indptr array (len_i)!"); - auto stream = resource::get_cuda_stream(res); - auto mr = resource::get_workspace_resource_ref(res); + bool dry_run = resource::get_dry_run_flag(res); + auto stream = resource::get_cuda_stream(res); + auto mr = resource::get_workspace_resource_ref(res); if (k == len && RowLayout::is_uniform) { + if (dry_run) { return; } RAFT_CUDA_TRY( cudaMemcpyAsync(out, in, sizeof(T) * batch_size * len, cudaMemcpyDeviceToDevice, stream)); if (in_idx) { @@ -1288,15 +1297,27 @@ void select_k(raft::resources const& res, if (len <= BlockSize * items_per_thread) { impl::radix_topk_one_block( - in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr); + dry_run, in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr); } else { unsigned grid_dim = impl::calc_grid_dim(batch_size, len, sm_cnt); if (grid_dim == 1) { - impl::radix_topk_one_block( - in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr); + impl::radix_topk_one_block(dry_run, + in, + in_idx, + batch_size, + len, + k, + out, + out_idx, + select_min, + len_i, + sm_cnt, + stream, + mr); } else { - impl::radix_topk(in, + impl::radix_topk(dry_run, + in, in_idx, batch_size, len, diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh index a480743664..5860de1c8b 100644 --- a/cpp/include/raft/matrix/detail/select_warpsort.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -1042,7 +1043,8 @@ template