From 99dc47ff8f667787643ececc3cfe4926e82191b3 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 18 Feb 2026 10:33:54 +0100 Subject: [PATCH 01/62] Add dry-run memory resources for allocation profiling without real memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a dry-run execution framework that replaces device and host memory resources with lightweight fake allocators to measure peak memory usage without holding real memory. New files: - dry_run_memory_resource.hpp: dry_run_allocator (lock-free bump allocator), dry_run_device_memory_resource, dry_run_host_memory_resource, dry_run_resource_manager (RAII), and dry_run_execute() helper. - dry_run_flag.hpp: boolean dry-run flag as a raft resource, allowing algorithms to skip kernel execution during profiling. - tests/util/dry_run_memory_resource.cpp: unit tests. The dry_run_allocator probes the upstream once to obtain a base address, then atomically bumps a pointer for each allocation — no mutex, no map, no real memory held after the initial probe. --- .../raft/core/resource/dry_run_flag.hpp | 89 +++++ .../raft/core/resource/resource_types.hpp | 3 +- .../raft/util/dry_run_memory_resource.hpp | 370 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 3 +- cpp/tests/util/dry_run_memory_resource.cpp | 249 ++++++++++++ 5 files changed, 712 insertions(+), 2 deletions(-) create mode 100644 cpp/include/raft/core/resource/dry_run_flag.hpp create mode 100644 cpp/include/raft/util/dry_run_memory_resource.hpp create mode 100644 cpp/tests/util/dry_run_memory_resource.cpp diff --git a/cpp/include/raft/core/resource/dry_run_flag.hpp b/cpp/include/raft/core/resource/dry_run_flag.hpp new file mode 100644 index 0000000000..4d0c9e27b5 --- /dev/null +++ b/cpp/include/raft/core/resource/dry_run_flag.hpp @@ -0,0 +1,89 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include + +#include + +namespace raft::resource { + +/** + * @defgroup dry_run_flag Dry-run flag resource + * @{ + */ + +/** + * @brief Resource that holds a boolean dry-run flag. + * + * When the dry-run flag is set, algorithms should skip kernel execution + * and only perform allocations to measure memory usage. + */ +class dry_run_flag_resource : public resource { + public: + dry_run_flag_resource() = default; + explicit dry_run_flag_resource(bool value) : flag_(value) {} + ~dry_run_flag_resource() override = default; + + auto get_resource() -> void* override { return &flag_; } + + void set(bool value) { flag_ = value; } + [[nodiscard]] auto get() const -> bool { return flag_; } + + private: + bool flag_{false}; +}; + +/** + * @brief Factory that creates a dry_run_flag_resource. + */ +class dry_run_flag_resource_factory : public resource_factory { + public: + explicit dry_run_flag_resource_factory(bool initial_value = false) : initial_value_(initial_value) + { + } + + auto get_resource_type() -> resource_type override { return resource_type::DRY_RUN_FLAG; } + auto make_resource() -> resource* override { return new dry_run_flag_resource(initial_value_); } + + private: + bool initial_value_; +}; + +/** + * @brief Get the dry-run flag from a resources handle. + * + * @param res raft resources object + * @return true if dry-run mode is active + */ +inline auto get_dry_run_flag(resources const& res) -> bool +{ + if (!res.has_resource_factory(resource_type::DRY_RUN_FLAG)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::DRY_RUN_FLAG); +} + +/** + * @brief Set the dry-run flag on a resources handle. + * + * @param res raft resources object + * @param value true to enable dry-run mode, false to disable + */ +inline void set_dry_run_flag(resources const& res, bool value) +{ + if (!res.has_resource_factory(resource_type::DRY_RUN_FLAG)) { + res.add_resource_factory(std::make_shared(value)); + } else { + // The resource may already be instantiated; update it directly + auto* flag = res.get_resource(resource_type::DRY_RUN_FLAG); + *flag = value; + } +} + +/** @} */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index f904c93bdf..f1ae719d19 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -38,6 +38,7 @@ enum resource_type { NCCL_COMM, // nccl comm ROOT_RANK, // root rank in multi-gpu world MULTI_GPU, // resource that tracks resource of each device in multi-gpu world + DRY_RUN_FLAG, // boolean flag indicating dry-run mode LAST_KEY // reserved for the last key }; diff --git a/cpp/include/raft/util/dry_run_memory_resource.hpp b/cpp/include/raft/util/dry_run_memory_resource.hpp new file mode 100644 index 0000000000..4b309fd10b --- /dev/null +++ b/cpp/include/raft/util/dry_run_memory_resource.hpp @@ -0,0 +1,370 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace raft::util { + +/** + * @defgroup dry_run_memory Dry-run memory resources + * @{ + */ + +/** + * @brief Statistics collected during a dry-run execution. + */ +struct dry_run_stats { + std::size_t device_workspace_peak; ///< Peak device workspace bytes + std::size_t device_large_workspace_peak; ///< Peak device large workspace bytes + std::size_t device_global_peak; ///< Peak device global allocation bytes + std::size_t host_peak; ///< Peak host allocation bytes +}; + +/** + * @brief Lock-free bump allocator that tracks peak usage without holding real memory. + * + * On first allocation, invokes a user-supplied probe callable to obtain a base address + * (typically by briefly allocating and freeing a small chunk from a real upstream). + * After that, every allocation bumps an atomic address by kProbeSize bytes to produce + * unique fake pointers. No real memory is held. + * + * Tracks total allocated bytes and peak usage for reporting. + */ +struct dry_run_allocator { + static constexpr std::size_t kProbeSize = 256; + + /** + * @brief Record an allocation of @p bytes and return a fake pointer. + * @tparam ProbeFn Callable with signature `void*()` that allocates and immediately frees + * a small chunk from a real upstream, returning the probed pointer. + * @param bytes The number of bytes to record as allocated. + * @param probe_fn Called exactly once (on the first allocation) to obtain a base address. + * @return A fake pointer (must not be dereferenced for data access). + */ + template + auto allocate(std::size_t bytes, ProbeFn&& probe_fn) -> void* + { + // Ensure the base address is probed exactly once. + if (address_.load(std::memory_order_relaxed) <= kAddressLocked) { + auto addr = kAddressUnset; + while (!address_.compare_exchange_weak( + addr, kAddressLocked, std::memory_order_relaxed, std::memory_order_relaxed)) { + if (addr > kAddressLocked) { + break; // The address is already set, so we can use it. + } + addr = kAddressUnset; // Otherwise, wait for the lock to be released + } + if (addr == kAddressUnset) { // We acquired the lock + try { + void* probe = probe_fn(); + address_.store(reinterpret_cast(probe), std::memory_order_relaxed); + } catch (...) { + address_.store(kAddressUnset, std::memory_order_relaxed); // release the lock + throw; + } + } + } + + // Bump the address atomically to produce a fake pointer. + void* ptr = reinterpret_cast(address_.fetch_add(kProbeSize, std::memory_order_relaxed)); + + // Track allocated bytes and update peak (lock-free). + auto new_total = allocated_bytes_.fetch_add(bytes, std::memory_order_relaxed) + bytes; + auto current_peak = peak_bytes_.load(std::memory_order_relaxed); + while (new_total > current_peak && + !peak_bytes_.compare_exchange_weak( + current_peak, new_total, std::memory_order_relaxed, std::memory_order_relaxed)) {} + return ptr; + } + + /** + * @brief Record a deallocation of @p bytes. + */ + void deallocate(std::size_t bytes) noexcept + { + allocated_bytes_.fetch_sub(bytes, std::memory_order_relaxed); + } + + /// @brief Get the current number of allocated (tracked) bytes. + [[nodiscard]] auto get_allocated_bytes() const noexcept -> std::size_t + { + return allocated_bytes_.load(std::memory_order_relaxed); + } + + /// @brief Get the peak number of allocated (tracked) bytes. + [[nodiscard]] auto get_peak_bytes() const noexcept -> std::size_t + { + return peak_bytes_.load(std::memory_order_relaxed); + } + + private: + static constexpr std::uintptr_t kAddressUnset = 0x0; + static constexpr std::uintptr_t kAddressLocked = 0x1; + + std::atomic address_{kAddressUnset}; + std::atomic allocated_bytes_{0}; + std::atomic peak_bytes_{0}; +}; + +/** + * @brief A device memory resource that tracks allocations without real memory. + * + * Wraps a dry_run_allocator behind the rmm::mr::device_memory_resource interface. + * On first use, briefly probes the upstream to obtain a plausible device base address. + * After that, every allocation bumps an atomic offset and returns a fake pointer. + * + * The returned pointers must NOT be dereferenced — they exist only to satisfy the + * allocator interface during a dry run. + */ +class dry_run_device_memory_resource : public rmm::mr::device_memory_resource { + public: + explicit dry_run_device_memory_resource(rmm::mr::device_memory_resource* upstream) + : upstream_(upstream) + { + } + ~dry_run_device_memory_resource() override = default; + + [[nodiscard]] auto get_allocated_bytes() const noexcept -> std::size_t + { + return alloc_.get_allocated_bytes(); + } + [[nodiscard]] auto get_peak_bytes() const noexcept -> std::size_t + { + return alloc_.get_peak_bytes(); + } + + private: + auto do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) -> void* override + { + return alloc_.allocate(bytes, [&] { + void* p = upstream_->allocate(stream, dry_run_allocator::kProbeSize); + upstream_->deallocate(stream, p, dry_run_allocator::kProbeSize); + return p; + }); + } + + void do_deallocate(void* /*ptr*/, + std::size_t bytes, + rmm::cuda_stream_view /*stream*/) noexcept override + { + alloc_.deallocate(bytes); + } + + [[nodiscard]] auto do_is_equal(rmm::mr::device_memory_resource const& other) const noexcept + -> bool override + { + return reinterpret_cast(this) == &other; + } + + rmm::mr::device_memory_resource* upstream_; + dry_run_allocator alloc_; +}; + +/** + * @brief A host memory resource (std::pmr) that tracks allocations without real memory. + * + * Wraps a dry_run_allocator behind the std::pmr::memory_resource interface. + * Analogous to dry_run_device_memory_resource but for host memory. + */ +class dry_run_host_memory_resource : public std::pmr::memory_resource { + public: + explicit dry_run_host_memory_resource(std::pmr::memory_resource* upstream) : upstream_(upstream) + { + } + ~dry_run_host_memory_resource() override = default; + + [[nodiscard]] auto get_allocated_bytes() const noexcept -> std::size_t + { + return alloc_.get_allocated_bytes(); + } + [[nodiscard]] auto get_peak_bytes() const noexcept -> std::size_t + { + return alloc_.get_peak_bytes(); + } + + private: + auto do_allocate(std::size_t bytes, std::size_t alignment) -> void* override + { + return alloc_.allocate(bytes, [&] { + void* p = upstream_->allocate(dry_run_allocator::kProbeSize, alignment); + upstream_->deallocate(p, dry_run_allocator::kProbeSize, alignment); + return p; + }); + } + + void do_deallocate(void* /*ptr*/, std::size_t bytes, std::size_t /*alignment*/) noexcept override + { + alloc_.deallocate(bytes); + } + + [[nodiscard]] auto do_is_equal(std::pmr::memory_resource const& other) const noexcept + -> bool override + { + return reinterpret_cast(this) == &other; + } + + std::pmr::memory_resource* upstream_; + dry_run_allocator alloc_; +}; + +/** + * @brief RAII manager that replaces memory resources with dry-run versions. + * + * On construction, saves all current memory resource state and replaces it with + * dry-run resources. On destruction, restores all original resources. + * + * This class only manages resources; the action to be dry-run is executed + * separately (see dry_run_execute()). + */ +class dry_run_resource_manager { + public: + /** + * @brief Set up dry-run resources on the given raft::resources handle. + * @param res The resources handle to modify. + */ + explicit dry_run_resource_manager(const raft::resources& res) : res_(res) + { + // Save original device resource state + orig_global_device_mr_ = rmm::mr::get_current_device_resource(); + orig_pmr_ = std::pmr::get_default_resource(); + + // Save workspace settings (use accessors that handle lazy initialization) + auto* workspace_mr = resource::get_workspace_resource(res); + workspace_limit_ = workspace_mr->get_allocation_limit(); + orig_workspace_upstream_ = orig_global_device_mr_; + + // Save large workspace + orig_large_workspace_mr_ = resource::get_large_workspace_resource(res); + + // Create dry-run resources + dry_run_workspace_ = std::make_shared(orig_workspace_upstream_); + dry_run_large_workspace_ = + std::make_shared(orig_large_workspace_mr_); + dry_run_global_ = std::make_shared(orig_global_device_mr_); + dry_run_host_ = std::make_unique(orig_pmr_); + + // Replace global device resource + rmm::mr::set_current_device_resource(dry_run_global_.get()); + // Replace global host resource + std::pmr::set_default_resource(dry_run_host_.get()); + + // Replace workspace resources + resource::set_workspace_resource(res, dry_run_workspace_, workspace_limit_, std::nullopt); + resource::set_large_workspace_resource(res, dry_run_large_workspace_); + + // Set dry-run flag + resource::set_dry_run_flag(res, true); + } + + ~dry_run_resource_manager() noexcept + { + // Restore dry-run flag + resource::set_dry_run_flag(res_, false); + + // Restore global resources + rmm::mr::set_current_device_resource(orig_global_device_mr_); + std::pmr::set_default_resource(orig_pmr_); + + // Restore workspace resources with original settings. + // Use non-owning shared_ptrs (void_op deleter) since lifetime is managed externally. + resource::set_workspace_resource( + res_, + std::shared_ptr(orig_workspace_upstream_, void_op{}), + workspace_limit_, + std::nullopt); + resource::set_large_workspace_resource( + res_, std::shared_ptr(orig_large_workspace_mr_, void_op{})); + } + + // Non-copyable, non-movable + dry_run_resource_manager(dry_run_resource_manager const&) = delete; + dry_run_resource_manager& operator=(dry_run_resource_manager const&) = delete; + dry_run_resource_manager(dry_run_resource_manager&&) = delete; + dry_run_resource_manager& operator=(dry_run_resource_manager&&) = delete; + + /** + * @brief Get the collected dry-run statistics. + * @return dry_run_stats with peak usage information. + */ + [[nodiscard]] auto get_stats() const -> dry_run_stats + { + return { + .device_workspace_peak = dry_run_workspace_->get_peak_bytes(), + .device_large_workspace_peak = dry_run_large_workspace_->get_peak_bytes(), + .device_global_peak = dry_run_global_->get_peak_bytes(), + .host_peak = dry_run_host_->get_peak_bytes(), + }; + } + + private: + const raft::resources& res_; + + // Original resources (saved in constructor) + rmm::mr::device_memory_resource* orig_global_device_mr_{nullptr}; + std::pmr::memory_resource* orig_pmr_{nullptr}; + std::optional workspace_limit_; + rmm::mr::device_memory_resource* orig_workspace_upstream_{nullptr}; + rmm::mr::device_memory_resource* orig_large_workspace_mr_{nullptr}; + + // Dry-run resources + std::shared_ptr dry_run_workspace_; + std::shared_ptr dry_run_large_workspace_; + std::shared_ptr dry_run_global_; + std::unique_ptr dry_run_host_; +}; + +/** + * @brief Execute an action in dry-run mode and return memory usage statistics. + * + * This function: + * 1. Replaces all memory resources with dry-run versions (RAII). + * 2. Executes the provided action. + * 3. Restores all original resources (RAII destructor). + * 4. Returns statistics about peak memory usage. + * + * The action receives the resources handle and can check the dry-run flag via + * `raft::resource::get_dry_run_flag(res)` to skip kernel execution. + * + * @tparam Action A callable with signature `void(const raft::resources&, Args...)`. + * @tparam Args Additional argument types to forward to the action. + * @param res The raft resources handle. + * @param action The action to execute in dry-run mode. + * @param args Additional arguments to forward to the action. + * @return dry_run_stats with peak memory usage from the dry run. + * + * @code{.cpp} + * raft::resources res; + * auto stats = raft::util::dry_run_execute(res, [](const raft::resources& r) { + * my_algorithm(r); + * }); + * std::cout << "Peak workspace: " << stats.device_workspace_peak << " bytes\n"; + * @endcode + */ +template +auto dry_run_execute(const raft::resources& res, Action&& action, Args&&... args) -> dry_run_stats +{ + dry_run_resource_manager manager(res); + std::forward(action)(res, std::forward(args)...); + return manager.get_stats(); +} + +/** @} */ + +} // namespace raft::util diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index dff227dd8b..0c807af35d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,6 +1,6 @@ # ============================================================================= # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # cmake-format: on # ============================================================================= @@ -319,6 +319,7 @@ if(BUILD_TESTS) util/popc.cu util/pow2_utils.cu util/reduction.cu + util/dry_run_memory_resource.cpp ) endif() diff --git a/cpp/tests/util/dry_run_memory_resource.cpp b/cpp/tests/util/dry_run_memory_resource.cpp new file mode 100644 index 0000000000..f997ace299 --- /dev/null +++ b/cpp/tests/util/dry_run_memory_resource.cpp @@ -0,0 +1,249 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include + +namespace raft::util { + +// ===== dry_run_device_memory_resource tests ===== + +TEST(DryRunDeviceMemoryResource, LazyAllocation) +{ + auto* upstream = rmm::mr::get_current_device_resource(); + dry_run_device_memory_resource mr(upstream); + + // Request 1 GiB; actual allocation should be at most 2 MiB + constexpr std::size_t kOneGiB = 1024UL * 1024UL * 1024UL; + void* ptr = mr.allocate(rmm::cuda_stream_view{}, kOneGiB); + ASSERT_NE(ptr, nullptr); + EXPECT_EQ(mr.get_allocated_bytes(), kOneGiB); + EXPECT_EQ(mr.get_peak_bytes(), kOneGiB); + + mr.deallocate(rmm::cuda_stream_view{}, ptr, kOneGiB); + EXPECT_EQ(mr.get_allocated_bytes(), 0); + EXPECT_EQ(mr.get_peak_bytes(), kOneGiB); // peak should not decrease +} + +TEST(DryRunDeviceMemoryResource, SmallAllocation) +{ + auto* upstream = rmm::mr::get_current_device_resource(); + dry_run_device_memory_resource mr(upstream); + + // Request less than 2MB - should allocate the actual requested size + constexpr std::size_t kSmall = 1024; + void* ptr = mr.allocate(rmm::cuda_stream_view{}, kSmall); + ASSERT_NE(ptr, nullptr); + EXPECT_EQ(mr.get_allocated_bytes(), kSmall); + + mr.deallocate(rmm::cuda_stream_view{}, ptr, kSmall); + EXPECT_EQ(mr.get_allocated_bytes(), 0); +} + +TEST(DryRunDeviceMemoryResource, PeakTracking) +{ + auto* upstream = rmm::mr::get_current_device_resource(); + dry_run_device_memory_resource mr(upstream); + + constexpr std::size_t kSize1 = 100UL * 1024UL * 1024UL; // 100 MiB + constexpr std::size_t kSize2 = 200UL * 1024UL * 1024UL; // 200 MiB + + void* p1 = mr.allocate(rmm::cuda_stream_view{}, kSize1); + void* p2 = mr.allocate(rmm::cuda_stream_view{}, kSize2); + EXPECT_EQ(mr.get_peak_bytes(), kSize1 + kSize2); + + mr.deallocate(rmm::cuda_stream_view{}, p1, kSize1); + EXPECT_EQ(mr.get_allocated_bytes(), kSize2); + EXPECT_EQ(mr.get_peak_bytes(), kSize1 + kSize2); // peak unchanged + + void* p3 = mr.allocate(rmm::cuda_stream_view{}, kSize1 / 2); + EXPECT_EQ(mr.get_peak_bytes(), kSize1 + kSize2); // still the previous peak + + mr.deallocate(rmm::cuda_stream_view{}, p2, kSize2); + mr.deallocate(rmm::cuda_stream_view{}, p3, kSize1 / 2); + EXPECT_EQ(mr.get_allocated_bytes(), 0); +} + +TEST(DryRunDeviceMemoryResource, MultipleAllocations) +{ + auto* upstream = rmm::mr::get_current_device_resource(); + dry_run_device_memory_resource mr(upstream); + + constexpr int kNumAllocs = 10; + constexpr std::size_t kEachSize = 50UL * 1024UL * 1024UL; // 50 MiB each + void* ptrs[kNumAllocs]; + + for (int i = 0; i < kNumAllocs; ++i) { + ptrs[i] = mr.allocate(rmm::cuda_stream_view{}, kEachSize); + ASSERT_NE(ptrs[i], nullptr); + } + EXPECT_EQ(mr.get_allocated_bytes(), kNumAllocs * kEachSize); + EXPECT_EQ(mr.get_peak_bytes(), kNumAllocs * kEachSize); + + for (int i = 0; i < kNumAllocs; ++i) { + mr.deallocate(rmm::cuda_stream_view{}, ptrs[i], kEachSize); + } + EXPECT_EQ(mr.get_allocated_bytes(), 0); +} + +// ===== dry_run_host_memory_resource tests ===== + +TEST(DryRunHostMemoryResource, LazyAllocation) +{ + auto* upstream = std::pmr::get_default_resource(); + dry_run_host_memory_resource mr(upstream); + + constexpr std::size_t kLarge = 1024UL * 1024UL * 1024UL; // 1 GiB + void* ptr = mr.allocate(kLarge); + ASSERT_NE(ptr, nullptr); + EXPECT_EQ(mr.get_allocated_bytes(), kLarge); + EXPECT_EQ(mr.get_peak_bytes(), kLarge); + + mr.deallocate(ptr, kLarge); + EXPECT_EQ(mr.get_allocated_bytes(), 0); + EXPECT_EQ(mr.get_peak_bytes(), kLarge); +} + +TEST(DryRunHostMemoryResource, PeakTracking) +{ + auto* upstream = std::pmr::get_default_resource(); + dry_run_host_memory_resource mr(upstream); + + constexpr std::size_t kSize1 = 100UL * 1024UL * 1024UL; + constexpr std::size_t kSize2 = 200UL * 1024UL * 1024UL; + + void* p1 = mr.allocate(kSize1); + void* p2 = mr.allocate(kSize2); + EXPECT_EQ(mr.get_peak_bytes(), kSize1 + kSize2); + + mr.deallocate(p1, kSize1); + mr.deallocate(p2, kSize2); + EXPECT_EQ(mr.get_allocated_bytes(), 0); + EXPECT_EQ(mr.get_peak_bytes(), kSize1 + kSize2); +} + +// ===== dry_run_flag resource tests ===== + +TEST(DryRunFlag, DefaultIsFalse) +{ + raft::resources res; + EXPECT_FALSE(resource::get_dry_run_flag(res)); +} + +TEST(DryRunFlag, SetAndGet) +{ + raft::resources res; + resource::set_dry_run_flag(res, true); + EXPECT_TRUE(resource::get_dry_run_flag(res)); + + resource::set_dry_run_flag(res, false); + EXPECT_FALSE(resource::get_dry_run_flag(res)); +} + +// ===== dry_run_resource_manager tests ===== + +TEST(DryRunResourceManager, SetsAndRestoresFlag) +{ + raft::resources res; + EXPECT_FALSE(resource::get_dry_run_flag(res)); + { + dry_run_resource_manager manager(res); + EXPECT_TRUE(resource::get_dry_run_flag(res)); + } + EXPECT_FALSE(resource::get_dry_run_flag(res)); +} + +TEST(DryRunResourceManager, RestoresGlobalDeviceResource) +{ + auto* original_mr = rmm::mr::get_current_device_resource(); + raft::resources res; + { + dry_run_resource_manager manager(res); + auto* current_mr = rmm::mr::get_current_device_resource(); + EXPECT_NE(current_mr, original_mr); + } + EXPECT_EQ(rmm::mr::get_current_device_resource(), original_mr); +} + +TEST(DryRunResourceManager, RestoresHostResource) +{ + auto* original_pmr = std::pmr::get_default_resource(); + raft::resources res; + { + dry_run_resource_manager manager(res); + auto* current_pmr = std::pmr::get_default_resource(); + EXPECT_NE(current_pmr, original_pmr); + } + EXPECT_EQ(std::pmr::get_default_resource(), original_pmr); +} + +TEST(DryRunResourceManager, StatsAccuracy) +{ + raft::resources res; + constexpr std::size_t kAllocSize = 64UL * 1024UL * 1024UL; // 64 MiB + + dry_run_resource_manager manager(res); + + // Allocate from global device resource + auto* mr = rmm::mr::get_current_device_resource(); + void* ptr = mr->allocate(rmm::cuda_stream_view{}, kAllocSize); + mr->deallocate(rmm::cuda_stream_view{}, ptr, kAllocSize); + + auto stats = manager.get_stats(); + EXPECT_EQ(stats.device_global_peak, kAllocSize); +} + +// ===== dry_run_execute tests ===== + +TEST(DryRunExecute, BasicExecution) +{ + raft::resources res; + bool action_ran = false; + + auto stats = dry_run_execute(res, [&](raft::resources const& r) { + action_ran = true; + EXPECT_TRUE(resource::get_dry_run_flag(r)); + + // Allocate via global device resource + auto* mr = rmm::mr::get_current_device_resource(); + constexpr std::size_t kSize = 32UL * 1024UL * 1024UL; + void* ptr = mr->allocate(rmm::cuda_stream_view{}, kSize); + mr->deallocate(rmm::cuda_stream_view{}, ptr, kSize); + }); + + EXPECT_TRUE(action_ran); + EXPECT_FALSE(resource::get_dry_run_flag(res)); + EXPECT_EQ(stats.device_global_peak, 32UL * 1024UL * 1024UL); +} + +TEST(DryRunExecute, ExceptionSafety) +{ + raft::resources res; + auto* original_mr = rmm::mr::get_current_device_resource(); + auto* original_pmr = std::pmr::get_default_resource(); + + EXPECT_THROW(dry_run_execute( + res, [](raft::resources const&) { throw std::runtime_error("test exception"); }), + std::runtime_error); + + // Resources should be restored even after exception + EXPECT_EQ(rmm::mr::get_current_device_resource(), original_mr); + EXPECT_EQ(std::pmr::get_default_resource(), original_pmr); + EXPECT_FALSE(resource::get_dry_run_flag(res)); +} + +} // namespace raft::util From 695a8a3aaec8e1583e0cba744bff10936aa4ab3a Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 18 Feb 2026 18:52:03 +0100 Subject: [PATCH 02/62] First batch of dry-run guards --- cpp/include/raft/linalg/add.cuh | 6 ++- .../raft/linalg/coalesced_reduction.cuh | 47 ++++++++++--------- cpp/include/raft/linalg/detail/axpy.cuh | 4 +- .../raft/linalg/detail/cholesky_r1_update.cuh | 4 +- .../linalg/detail/coalesced_reduction-inl.cuh | 18 +++++-- .../raft/linalg/detail/cublaslt_wrappers.hpp | 5 +- cpp/include/raft/linalg/detail/eig.cuh | 24 ++++++++-- cpp/include/raft/linalg/detail/gemv.hpp | 5 +- cpp/include/raft/linalg/detail/lstsq.cuh | 44 ++++++++++++----- cpp/include/raft/linalg/detail/map.cuh | 2 + .../raft/linalg/detail/matrix_vector_op.cuh | 11 +++-- cpp/include/raft/linalg/detail/reduce.cuh | 17 ++++--- cpp/include/raft/linalg/matrix_vector_op.cuh | 40 +++++++++------- cpp/include/raft/linalg/reduce.cuh | 27 ++++++----- cpp/include/raft/linalg/strided_reduction.cuh | 4 +- cpp/include/raft/matrix/detail/math.cuh | 32 +++++++------ cpp/include/raft/matrix/detail/matrix.cuh | 4 +- cpp/include/raft/matrix/linewise_op.cuh | 5 +- cpp/include/raft/stats/accuracy.cuh | 11 ++++- .../raft/stats/adjusted_rand_index.cuh | 8 ++-- .../raft/stats/detail/adjusted_rand_index.cuh | 24 +++++++++- cpp/include/raft/stats/detail/entropy.cuh | 21 +++++++-- cpp/include/raft/stats/detail/mean.cuh | 46 +++++++++--------- cpp/include/raft/stats/detail/rand_index.cuh | 5 +- cpp/include/raft/stats/entropy.cuh | 8 ++-- cpp/include/raft/stats/histogram.cuh | 7 ++- cpp/include/raft/stats/mean.cuh | 12 +++-- cpp/include/raft/stats/rand_index.cuh | 8 ++-- cpp/tests/stats/homogeneity_score.cu | 3 +- 29 files changed, 301 insertions(+), 151 deletions(-) diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh index b1953470b0..c0e086f43f 100644 --- a/cpp/include/raft/linalg/add.cuh +++ b/cpp/include/raft/linalg/add.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __ADD_H @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace raft { @@ -102,6 +103,7 @@ template > void add(raft::resources const& handle, InType in1, InType in2, OutType out) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -139,6 +141,7 @@ void add_scalar(raft::resources const& handle, OutType out, raft::device_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -174,6 +177,7 @@ void add_scalar(raft::resources const& handle, OutType out, raft::host_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh index 3ed5ed7736..ca6548f28b 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.cuh +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __COALESCED_REDUCTION_H @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace raft { @@ -62,7 +63,7 @@ void coalescedReduction(OutType* dots, FinalLambda final_op = raft::identity_op()) { detail::coalescedReduction( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + false, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } /** @@ -120,30 +121,32 @@ void coalesced_reduction(raft::resources const& handle, RAFT_EXPECTS(static_cast(dots.size()) == data.extent(0), "Output should be equal to number of rows in Input"); - coalescedReduction(dots.data_handle(), - data.data_handle(), - data.extent(1), - data.extent(0), - init, - resource::get_cuda_stream(handle), - inplace, - main_op, - reduce_op, - final_op); + detail::coalescedReduction(resource::get_dry_run_flag(handle), + dots.data_handle(), + data.data_handle(), + data.extent(1), + data.extent(0), + init, + resource::get_cuda_stream(handle), + inplace, + main_op, + reduce_op, + final_op); } else if constexpr (std::is_same_v) { RAFT_EXPECTS(static_cast(dots.size()) == data.extent(1), "Output should be equal to number of columns in Input"); - coalescedReduction(dots.data_handle(), - data.data_handle(), - data.extent(0), - data.extent(1), - init, - resource::get_cuda_stream(handle), - inplace, - main_op, - reduce_op, - final_op); + detail::coalescedReduction(resource::get_dry_run_flag(handle), + dots.data_handle(), + data.data_handle(), + data.extent(0), + data.extent(1), + init, + resource::get_cuda_stream(handle), + inplace, + main_op, + reduce_op, + final_op); } } diff --git a/cpp/include/raft/linalg/detail/axpy.cuh b/cpp/include/raft/linalg/detail/axpy.cuh index 1ab690937d..40634b6428 100644 --- a/cpp/include/raft/linalg/detail/axpy.cuh +++ b/cpp/include/raft/linalg/detail/axpy.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -8,6 +8,7 @@ #include "cublas_wrappers.hpp" #include +#include #include #include @@ -24,6 +25,7 @@ void axpy(raft::resources const& handle, const int incy, cudaStream_t stream) { + if (resource::get_dry_run_flag(handle)) { return; } auto cublas_h = resource::get_cublas_handle(handle); cublas_device_pointer_mode pmode(cublas_h); RAFT_CUBLAS_TRY(cublasaxpy(cublas_h, n, alpha, x, incx, y, incy, stream)); diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh index b05449f90a..d997377d54 100644 --- a/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -53,6 +54,7 @@ void choleskyRank1Update(raft::resources const& handle, *n_bytes = offset + 1 * sizeof(math_t); return; } + if (resource::get_dry_run_flag(handle)) { return; } math_t* s = reinterpret_cast(((char*)workspace) + offset); math_t* L_22 = L + (n - 1) * ld + n - 1; diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index 2d513b433d..593c09fc75 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -498,7 +498,8 @@ template -void coalescedReductionThick(OutType* dots, +void coalescedReductionThick(bool dry_run, + OutType* dots, const InType* data, IdxType D, IdxType N, @@ -517,6 +518,8 @@ void coalescedReductionThick(OutType* dots, rmm::device_uvector buffer(N * ThickPolicy::BlocksPerRow, stream); + if (dry_run) { return; } + /* We apply a two-step reduction: * 1. coalescedReductionThickKernel reduces the [N x D] input data to [N x BlocksPerRow]. It * applies the main_op but not the final op. @@ -550,7 +553,8 @@ template -void coalescedReductionThickDispatcher(OutType* dots, +void coalescedReductionThickDispatcher(bool dry_run, + OutType* dots, const InType* data, IdxType D, IdxType N, @@ -564,7 +568,7 @@ void coalescedReductionThickDispatcher(OutType* dots, // Note: multiple elements per thread to take advantage of the sequential reduction and loop // unrolling coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + dry_run, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } // Primitive to perform reductions along the coalesced dimension of the matrix, i.e. reduce along @@ -579,7 +583,8 @@ template -void coalescedReduction(OutType* dots, +void coalescedReduction(bool dry_run, + OutType* dots, const InType* data, IdxType D, IdxType N, @@ -600,12 +605,15 @@ void coalescedReduction(OutType* dots, */ const IdxType numSMs = raft::getMultiProcessorCount(); if (D <= IdxType(512) || (N >= IdxType(16) * numSMs && D < IdxType(2048))) { + if (dry_run) { return; } coalescedReductionThinDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if (N < numSMs && D >= IdxType(1 << 17)) { + if (dry_run) { return; } coalescedReductionThickDispatcher( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + dry_run, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else { + if (dry_run) { return; } coalescedReductionMediumDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } diff --git a/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp b/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp index 469780ba1f..3ffa4ded84 100644 --- a/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp +++ b/cpp/include/raft/linalg/detail/cublaslt_wrappers.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -282,6 +283,8 @@ template batch_scope( "linalg::matmul(m = %d, n = %d, k = %d)", m, n, k); std::shared_ptr mm_desc{nullptr}; diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index 5b64add128..d8d31fc411 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -44,9 +45,13 @@ void eigDC_legacy(raft::resources const& handle, eig_vals, &lwork)); + // TODO(achirkin): Consider using the workspace resource for these temporary allocations. rmm::device_uvector d_work(lwork, stream); rmm::device_scalar d_dev_info(stream); + // The workspace is already allocated, no more allocation are foreseeable. + if (resource::get_dry_run_flag(handle)) { return; } + raft::matrix::copy(handle, make_device_matrix_view(in, n_rows, n_cols), make_device_matrix_view(eig_vectors, n_rows, n_cols)); @@ -115,6 +120,12 @@ void eigDC(raft::resources const& handle, rmm::device_scalar d_dev_info(stream_new); std::vector h_work(workspaceHost / sizeof(math_t)); + if (resource::get_dry_run_flag(handle)) { + // No more allocations beyond this points, but need to cleanup. + RAFT_CUSOLVER_TRY(cusolverDnDestroyParams(dn_params)); + return; + } + raft::copy(eig_vectors, in, n_rows * n_cols, stream_new); RAFT_CUSOLVER_TRY(cusolverDnxsyevd(cusolverH, @@ -181,7 +192,9 @@ void eigSelDC(raft::resources const& handle, rmm::device_uvector d_work(lwork, stream); rmm::device_scalar d_dev_info(stream); - rmm::device_uvector d_eig_vectors(0, stream); + rmm::device_uvector d_eig_vectors(memUsage == COPY_INPUT ? n_rows * n_cols : 0, stream); + + if (resource::get_dry_run_flag(handle)) { return; } if (memUsage == OVERWRITE_INPUT) { RAFT_CUSOLVER_TRY(cusolverDnsyevdx(cusolverH, @@ -202,7 +215,6 @@ void eigSelDC(raft::resources const& handle, d_dev_info.data(), stream)); } else if (memUsage == COPY_INPUT) { - d_eig_vectors.resize(n_rows * n_cols, stream); raft::matrix::copy(handle, make_device_matrix_view(in, n_rows, n_cols), make_device_matrix_view(eig_vectors, n_rows, n_cols)); @@ -279,6 +291,12 @@ void eigJacobi(raft::resources const& handle, rmm::device_uvector d_work(lwork, stream); rmm::device_scalar dev_info(stream); + if (resource::get_dry_run_flag(handle)) { + // No more allocations beyond this points, but need to cleanup. + RAFT_CUSOLVER_TRY(cusolverDnDestroySyevjInfo(syevj_params)); + return; + } + raft::matrix::copy(handle, make_device_matrix_view(in, n_rows, n_cols), make_device_matrix_view(eig_vectors, n_rows, n_cols)); diff --git a/cpp/include/raft/linalg/detail/gemv.hpp b/cpp/include/raft/linalg/detail/gemv.hpp index 3233940a66..905ecab0c5 100644 --- a/cpp/include/raft/linalg/detail/gemv.hpp +++ b/cpp/include/raft/linalg/detail/gemv.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -8,6 +8,7 @@ #include "cublas_wrappers.hpp" #include +#include #include #include @@ -31,6 +32,7 @@ void gemv(raft::resources const& handle, const int incy, cudaStream_t stream) { + if (resource::get_dry_run_flag(handle)) { return; } cublasHandle_t cublas_h = resource::get_cublas_handle(handle); detail::cublas_device_pointer_mode pmode(cublas_h); RAFT_CUBLAS_TRY(detail::cublasgemv(cublas_h, @@ -109,6 +111,7 @@ void gemv(raft::resources const& handle, const math_t beta, cudaStream_t stream) { + if (resource::get_dry_run_flag(handle)) { return; } cublasHandle_t cublas_h = resource::get_cublas_handle(handle); cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; RAFT_CUBLAS_TRY( diff --git a/cpp/include/raft/linalg/detail/lstsq.cuh b/cpp/include/raft/linalg/detail/lstsq.cuh index 4a2aeb8d4c..c930bb8f4f 100644 --- a/cpp/include/raft/linalg/detail/lstsq.cuh +++ b/cpp/include/raft/linalg/detail/lstsq.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -133,6 +134,9 @@ void lstsqSvdQR(raft::resources const& handle, + 1 // devInfo , stream); + + if (resource::get_dry_run_flag(handle)) { return; } + math_t* cusolverWorkSet = workset.data(); math_t* U = cusolverWorkSet + cusolverWorkSetSize; math_t* Vt = U + n_rows * minmn; @@ -207,6 +211,12 @@ void lstsqSvdJacobi(raft::resources const& handle, + 1 // devInfo , stream); + + if (resource::get_dry_run_flag(handle)) { + RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params)); + return; + } + math_t* cusolverWorkSet = workset.data(); math_t* U = cusolverWorkSet + cusolverWorkSetSize; math_t* V = U + n_rows * minmn; @@ -251,21 +261,27 @@ void lstsqEig(raft::resources const& handle, { rmm::cuda_stream_view mainStream = rmm::cuda_stream_view(stream); rmm::cuda_stream_view multAbStream = resource::get_next_usable_stream(handle); + bool dry_run = resource::get_dry_run_flag(handle); bool concurrent; - // Check if the two streams can run concurrently. This is needed because a legacy default stream - // would synchronize with other blocking streams. To avoid synchronization in such case, we try to - // use an additional stream from the pool. - if (!are_implicitly_synchronized(mainStream, multAbStream)) { - concurrent = true; - } else if (resource::get_stream_pool_size(handle) > 1) { - mainStream = resource::get_next_usable_stream(handle); - concurrent = true; + if (dry_run) { + concurrent = false; } else { - multAbStream = mainStream; - concurrent = false; + // Check if the two streams can run concurrently. This is needed because a legacy default stream + // would synchronize with other blocking streams. To avoid synchronization in such case, we try + // to use an additional stream from the pool. + if (!are_implicitly_synchronized(mainStream, multAbStream)) { + concurrent = true; + } else if (resource::get_stream_pool_size(handle) > 1) { + mainStream = resource::get_next_usable_stream(handle); + concurrent = true; + } else { + multAbStream = mainStream; + concurrent = false; + } } rmm::device_uvector workset(n_cols * n_cols * 3 + n_cols * 2, mainStream); + // the event is created only if the given raft handle is capable of running // at least two CUDA streams without implicit synchronization. DeviceEvent worksetDone(concurrent); @@ -305,8 +321,8 @@ void lstsqEig(raft::resources const& handle, raft::common::nvtx::pop_range(); // QS <- Q invS - raft::linalg::matrixVectorOp( - QS, Q, S, n_cols, n_cols, DivideByNonZero(), mainStream); + raft::linalg::detail::matrixVectorOp( + dry_run, QS, Q, S, n_cols, n_cols, DivideByNonZero(), mainStream); // covA <- QS Q* == Q invS Q* == inv(A* A) raft::linalg::gemm(handle, QS, @@ -395,6 +411,8 @@ void lstsqQR(raft::resources const& handle, rmm::device_uvector d_work(lwork, stream); + if (resource::get_dry_run_flag(handle)) { return; } + // #TODO: Call from public API when ready RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf( cusolverH, m, n, A, lda, d_tau.data(), d_work.data(), lwork, d_info.data(), stream)); diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 3153de5396..5678f8e39b 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -206,6 +207,7 @@ template > void map(const raft::resources& res, OutType out, Func f, InTypes... ins) { + if (resource::get_dry_run_flag(res)) { return; } RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); (map_check_shape(out, ins), ...); diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh index 64de01a3fe..3275410bac 100644 --- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -1,11 +1,12 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include +#include #include namespace raft { @@ -19,7 +20,8 @@ template -void matrixVectorOp(MatT* out, +void matrixVectorOp(bool dry_run, + MatT* out, const MatT* matrix, const VecT* vec, IdxType D, @@ -27,6 +29,7 @@ void matrixVectorOp(MatT* out, Lambda op, cudaStream_t stream) { + if (dry_run) { return; } raft::resources handle; resource::set_cuda_stream(handle, stream); constexpr raft::Apply apply = @@ -56,7 +59,8 @@ template -void matrixVectorOp(MatT* out, +void matrixVectorOp(bool dry_run, + MatT* out, const MatT* matrix, const Vec1T* vec1, const Vec2T* vec2, @@ -65,6 +69,7 @@ void matrixVectorOp(MatT* out, Lambda op, cudaStream_t stream) { + if (dry_run) { return; } raft::resources handle; resource::set_cuda_stream(handle, stream); constexpr raft::Apply apply = diff --git a/cpp/include/raft/linalg/detail/reduce.cuh b/cpp/include/raft/linalg/detail/reduce.cuh index 4d90e32e99..811e10d70a 100644 --- a/cpp/include/raft/linalg/detail/reduce.cuh +++ b/cpp/include/raft/linalg/detail/reduce.cuh @@ -1,12 +1,12 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include -#include +#include #include namespace raft { @@ -21,7 +21,8 @@ template -void reduce(OutType* dots, +void reduce(bool dry_run, + OutType* dots, const InType* data, IdxType D, IdxType N, @@ -33,17 +34,19 @@ void reduce(OutType* dots, FinalLambda final_op = raft::identity_op()) { if constexpr (rowMajor && alongRows) { - raft::linalg::coalescedReduction( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + coalescedReduction( + dry_run, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if constexpr (rowMajor && !alongRows) { + if (dry_run) { return; } // no allocations in strided reduction raft::linalg::stridedReduction( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else if constexpr (!rowMajor && alongRows) { + if (dry_run) { return; } // no allocations in strided reduction raft::linalg::stridedReduction( dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } else { - raft::linalg::coalescedReduction( - dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); + coalescedReduction( + dry_run, dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op); } } diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 47a3cd9ce8..abd437ab91 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __MATRIX_VECTOR_OP_H @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -56,7 +57,7 @@ void matrixVectorOp(MatT* out, Lambda op, cudaStream_t stream) { - detail::matrixVectorOp(out, matrix, vec, D, N, op, stream); + detail::matrixVectorOp(false, out, matrix, vec, D, N, op, stream); } /** @@ -100,7 +101,8 @@ void matrixVectorOp(MatT* out, Lambda op, cudaStream_t stream) { - detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, op, stream); + detail::matrixVectorOp( + false, out, matrix, vec1, vec2, D, N, op, stream); } /** @@ -156,13 +158,14 @@ void matrix_vector_op(raft::resources const& handle, "Size mismatch between matrix and vector"); } - matrixVectorOp(out.data_handle(), - matrix.data_handle(), - vec.data_handle(), - out.extent(1), - out.extent(0), - op, - resource::get_cuda_stream(handle)); + detail::matrixVectorOp(resource::get_dry_run_flag(handle), + out.data_handle(), + matrix.data_handle(), + vec.data_handle(), + out.extent(1), + out.extent(0), + op, + resource::get_cuda_stream(handle)); } /** @@ -221,14 +224,15 @@ void matrix_vector_op(raft::resources const& handle, "Size mismatch between matrix and vector"); } - matrixVectorOp(out.data_handle(), - matrix.data_handle(), - vec1.data_handle(), - vec2.data_handle(), - out.extent(1), - out.extent(0), - op, - resource::get_cuda_stream(handle)); + detail::matrixVectorOp(resource::get_dry_run_flag(handle), + out.data_handle(), + matrix.data_handle(), + vec1.data_handle(), + vec2.data_handle(), + out.extent(1), + out.extent(0), + op, + resource::get_cuda_stream(handle)); } /** @} */ // end of group matrix_vector_op diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index ce2c324f24..e3650469df 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __REDUCE_H @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -71,7 +72,7 @@ void reduce(OutType* dots, FinalLambda final_op = raft::identity_op()) { detail::reduce( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); + false, dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } /** @@ -166,16 +167,18 @@ void reduce(raft::resources const& handle, "Output should be equal to number of columns in Input"); } - reduce(dots.data_handle(), - data.data_handle(), - data.extent(1), - data.extent(0), - init, - resource::get_cuda_stream(handle), - inplace, - main_op, - reduce_op, - final_op); + detail::reduce( + resource::get_dry_run_flag(handle), + dots.data_handle(), + data.data_handle(), + data.extent(1), + data.extent(0), + init, + resource::get_cuda_stream(handle), + inplace, + main_op, + reduce_op, + final_op); } /** @} */ // end of group reduction diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh index efbd80126e..eb34a99452 100644 --- a/cpp/include/raft/linalg/strided_reduction.cuh +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -127,6 +128,7 @@ void strided_reduction(raft::resources const& handle, ReduceLambda reduce_op = raft::add_op(), FinalLambda final_op = raft::identity_op()) { + if (resource::get_dry_run_flag(handle)) { return; } if constexpr (std::is_same_v) { RAFT_EXPECTS(static_cast(dots.size()) == data.extent(1), "Output should be equal to number of columns in Input"); diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index 05416d16be..9eefcf547e 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -6,6 +6,7 @@ #pragma once #include +#include #include #include #include @@ -186,10 +187,10 @@ template void ratio( raft::resources const& handle, const math_t* src, math_t* dest, IdxType len, cudaStream_t stream) { - auto d_src = src; - auto d_dest = dest; - rmm::device_scalar d_sum(stream); + if (resource::get_dry_run_flag(handle)) { return; } + auto d_src = src; + auto d_dest = dest; auto* d_sum_ptr = d_sum.data(); raft::linalg::mapThenSumReduce(d_sum_ptr, len, raft::identity_op{}, stream, src); raft::linalg::unaryOp( @@ -200,15 +201,16 @@ template ( - data, data, vec, n_col, n_row, raft::mul_op(), stream); + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, n_col, n_row, raft::mul_op(), stream); } template void matrixVectorBinaryMultSkipZero( Type* data, const Type* vec, IdxType n_row, IdxType n_col, cudaStream_t stream) { - raft::linalg::matrixVectorOp( + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, @@ -227,8 +229,8 @@ template ( - data, data, vec, n_col, n_row, raft::div_op(), stream); + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, n_col, n_row, raft::div_op(), stream); } template @@ -240,7 +242,8 @@ void matrixVectorBinaryDivSkipZero(Type* data, bool return_zero = false) { if (return_zero) { - raft::linalg::matrixVectorOp( + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, @@ -254,7 +257,8 @@ void matrixVectorBinaryDivSkipZero(Type* data, }, stream); } else { - raft::linalg::matrixVectorOp( + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, @@ -274,16 +278,16 @@ template ( - data, data, vec, n_col, n_row, raft::add_op(), stream); + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, n_col, n_row, raft::add_op(), stream); } template void matrixVectorBinarySub( Type* data, const Type* vec, IdxType n_row, IdxType n_col, cudaStream_t stream) { - raft::linalg::matrixVectorOp( - data, data, vec, n_col, n_row, raft::sub_op(), stream); + raft::linalg::detail::matrixVectorOp( + false, data, data, vec, n_col, n_row, raft::sub_op(), stream); } // Computes an argmin/argmax column-wise in a DxN matrix diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 0cdd9cfc58..6584681de0 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -1,11 +1,12 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include +#include #include #include #include @@ -296,6 +297,7 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) template m_t getL2Norm(raft::resources const& handle, const m_t* in, idx_t size, cudaStream_t stream) { + if (resource::get_dry_run_flag(handle)) { return m_t{0}; } cublasHandle_t cublasH = resource::get_cublas_handle(handle); m_t normval = 0; RAFT_EXPECTS( diff --git a/cpp/include/raft/matrix/linewise_op.cuh b/cpp/include/raft/matrix/linewise_op.cuh index 8b49f55d95..778f3faa13 100644 --- a/cpp/include/raft/matrix/linewise_op.cuh +++ b/cpp/include/raft/matrix/linewise_op.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -60,6 +61,7 @@ void linewise_op(raft::resources const& handle, Lambda op, vec_t... vecs) { + if (resource::get_dry_run_flag(handle)) { return; } constexpr auto is_rowmajor = std::is_same_v; constexpr auto is_colmajor = std::is_same_v; @@ -95,6 +97,7 @@ void linewise_op(raft::resources const& handle, Lambda op, vec_t... vecs) { + if (resource::get_dry_run_flag(handle)) { return; } constexpr auto is_rowmajor = std::is_same_v>; constexpr auto is_colmajor = std::is_same_v>; diff --git a/cpp/include/raft/stats/accuracy.cuh b/cpp/include/raft/stats/accuracy.cuh index 6b96d4a5e7..a83cfd6873 100644 --- a/cpp/include/raft/stats/accuracy.cuh +++ b/cpp/include/raft/stats/accuracy.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -10,6 +10,7 @@ #include #include +#include #include namespace raft { @@ -49,6 +50,14 @@ float accuracy(raft::resources const& handle, raft::device_vector_view predictions, raft::device_vector_view ref_predictions) { + if (resource::get_dry_run_flag(handle)) { + // detail::accuracy_score allocates this, but we can't pass 'dry-run' to it, because it doesn't + // accept raft::resources handle. + // We can't change the signature of it, because the overload above uses it too. + [[maybe_unused]] rmm::device_uvector diffs_array(predictions.extent(0), + resource::get_cuda_stream(handle)); + return 0; + } RAFT_EXPECTS(predictions.size() == ref_predictions.size(), "Size mismatch"); RAFT_EXPECTS(predictions.is_exhaustive(), "predictions must be contiguous"); RAFT_EXPECTS(ref_predictions.is_exhaustive(), "ref_predictions must be contiguous"); diff --git a/cpp/include/raft/stats/adjusted_rand_index.cuh b/cpp/include/raft/stats/adjusted_rand_index.cuh index 5ab6e14e14..6f0f38ccf8 100644 --- a/cpp/include/raft/stats/adjusted_rand_index.cuh +++ b/cpp/include/raft/stats/adjusted_rand_index.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ /** @@ -15,6 +15,7 @@ #include #include +#include #include namespace raft { @@ -37,7 +38,7 @@ double adjusted_rand_index(const T* firstClusterArray, cudaStream_t stream) { return detail::compute_adjusted_rand_index( - firstClusterArray, secondClusterArray, size, stream); + false, firstClusterArray, secondClusterArray, size, stream); } /** @@ -65,7 +66,8 @@ double adjusted_rand_index(raft::resources const& handle, RAFT_EXPECTS(first_cluster_array.is_exhaustive(), "first_cluster_array must be contiguous"); RAFT_EXPECTS(second_cluster_array.is_exhaustive(), "second_cluster_array must be contiguous"); - return detail::compute_adjusted_rand_index(first_cluster_array.data_handle(), + return detail::compute_adjusted_rand_index(resource::get_dry_run_flag(handle), + first_cluster_array.data_handle(), second_cluster_array.data_handle(), first_cluster_array.extent(0), resource::get_cuda_stream(handle)); diff --git a/cpp/include/raft/stats/detail/adjusted_rand_index.cuh b/cpp/include/raft/stats/detail/adjusted_rand_index.cuh index 3161b9c423..3cc6181229 100644 --- a/cpp/include/raft/stats/detail/adjusted_rand_index.cuh +++ b/cpp/include/raft/stats/detail/adjusted_rand_index.cuh @@ -104,13 +104,15 @@ int countUnique(const T* arr, int size, T& minLabel, T& maxLabel, cudaStream_t s * here * @tparam T data-type for input label arrays * @tparam MathT integral data-type used for computing n-choose-r + * @param dry_run: whether to run in dry-run mode * @param firstClusterArray: the array of classes * @param secondClusterArray: the array of classes * @param size: the size of the data points of type int * @param stream: the cudaStream object */ template -double compute_adjusted_rand_index(const T* firstClusterArray, +double compute_adjusted_rand_index(bool dry_run, + const T* firstClusterArray, const T* secondClusterArray, int size, cudaStream_t stream) @@ -119,6 +121,26 @@ double compute_adjusted_rand_index(const T* firstClusterArray, // 1 or 0 labels always have a perfect score. This also matches sklearn behavior. return 1.0; } + if (dry_run) { + // Upper bound on total extra allocations in terms of `size`. + // The label range nClasses = maxLabel - minLabel + 1 is bounded by `size` + // (at most `size` distinct labels with contiguous labeling). + // + // Allocations (all alive simultaneously): + // dContingencyMatrix: nClasses^2 * sizeof(MathT) <= size^2 * sizeof(MathT) + // a, b: 2 * nClasses * sizeof(MathT) <= 2 * size * sizeof(MathT) + // d_aCTwoSum, d_bCTwoSum, d_nChooseTwoSum: 3 * sizeof(MathT) + // workspaceBuff (SORT_AND_GATOMICS worst case): + // staging: 2 * alignTo(size * sizeof(T), 256) <= 2 * size * sizeof(T) + 512 + // CUB tmp: <= size * sizeof(T) + // + // Total <= (size^2 + 2*size + 3) * sizeof(MathT) + 3*size*sizeof(T) + 512 + // NB: this is a very generous bound, as nClasses is probably much smaller that the size. + auto n = static_cast(size); + auto alloc_upper_bound = (n * n + 2 * n + 3) * sizeof(MathT) + 3 * n * sizeof(T) + 512; + rmm::device_uvector dry_run_alloc(alloc_upper_bound, stream); + return 0.0; + } T minFirst, maxFirst, minSecond, maxSecond; auto nUniqFirst = countUnique(firstClusterArray, size, minFirst, maxFirst, stream); auto nUniqSecond = countUnique(secondClusterArray, size, minSecond, maxSecond, stream); diff --git a/cpp/include/raft/stats/detail/entropy.cuh b/cpp/include/raft/stats/detail/entropy.cuh index 3cfd0ad582..9fc124b475 100644 --- a/cpp/include/raft/stats/detail/entropy.cuh +++ b/cpp/include/raft/stats/detail/entropy.cuh @@ -45,6 +45,7 @@ struct entropyOp { * @brief function to calculate the bincounts of number of samples in every label * * @tparam LabelT: type of the labels + * @param dry_run: whether to run in dry-run mode * @param labels: the pointer to the array containing labels for every data sample * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster * @param nRows: number of data samples @@ -54,7 +55,8 @@ struct entropyOp { * @param stream: the cuda stream where to launch this kernel */ template -void countLabels(const LabelT* labels, +void countLabels(bool dry_run, + const LabelT* labels, double* binCountArray, int nRows, LabelT lowerLabelRange, @@ -79,6 +81,8 @@ void countLabels(const LabelT* labels, workspace.resize(temp_storage_bytes, stream); + if (dry_run) { return; } + RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace.data(), temp_storage_bytes, labels, @@ -94,6 +98,7 @@ void countLabels(const LabelT* labels, * @brief Function to calculate entropy * more info on entropy * + * @param dry_run: whether to run in dry-run mode * @param clusterArray: the array of classes of type T * @param size: the size of the data points of type int * @param lowerLabelRange: the lower bound of the range of labels @@ -102,7 +107,8 @@ void countLabels(const LabelT* labels, * @return the entropy score */ template -double entropy(const T* clusterArray, +double entropy(bool dry_run, + const T* clusterArray, const int size, const T lowerLabelRange, const T upperLabelRange, @@ -114,15 +120,20 @@ double entropy(const T* clusterArray, // declaring, allocating and initializing memory for bincount array and entropy values rmm::device_uvector prob(numUniqueClasses, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(prob.data(), 0, numUniqueClasses * sizeof(double), stream)); + if (!dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(prob.data(), 0, numUniqueClasses * sizeof(double), stream)); + } rmm::device_scalar d_entropy(stream); - RAFT_CUDA_TRY(cudaMemsetAsync(d_entropy.data(), 0, sizeof(double), stream)); + if (!dry_run) { RAFT_CUDA_TRY(cudaMemsetAsync(d_entropy.data(), 0, sizeof(double), stream)); } // workspace allocation rmm::device_uvector workspace(1, stream); // calculating the bincounts and populating the prob array - countLabels(clusterArray, prob.data(), size, lowerLabelRange, upperLabelRange, workspace, stream); + countLabels( + dry_run, clusterArray, prob.data(), size, lowerLabelRange, upperLabelRange, workspace, stream); + + if (dry_run) { return 0.0; } // scalar dividing by size raft::linalg::divideScalar( diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index 4f9420919d..9aa4d50170 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -14,36 +14,38 @@ namespace stats { namespace detail { template -void mean(Type* mu, const Type* data, IdxType D, IdxType N, cudaStream_t stream) +void mean(bool dry_run, Type* mu, const Type* data, IdxType D, IdxType N, cudaStream_t stream) { Type ratio = Type(1) / Type(N); - raft::linalg::reduce(mu, - data, - D, - N, - Type(0), - stream, - false, - raft::identity_op(), - raft::add_op(), - raft::mul_const_op(ratio)); + raft::linalg::detail::reduce(dry_run, + mu, + data, + D, + N, + Type(0), + stream, + false, + raft::identity_op(), + raft::add_op(), + raft::mul_const_op(ratio)); } template [[deprecated]] void mean( - Type* mu, const Type* data, IdxType D, IdxType N, bool sample, cudaStream_t stream) + bool dry_run, Type* mu, const Type* data, IdxType D, IdxType N, bool sample, cudaStream_t stream) { Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); - raft::linalg::reduce(mu, - data, - D, - N, - Type(0), - stream, - false, - raft::identity_op(), - raft::add_op(), - raft::mul_const_op(ratio)); + raft::linalg::detail::reduce(dry_run, + mu, + data, + D, + N, + Type(0), + stream, + false, + raft::identity_op(), + raft::add_op(), + raft::mul_const_op(ratio)); } } // namespace detail diff --git a/cpp/include/raft/stats/detail/rand_index.cuh b/cpp/include/raft/stats/detail/rand_index.cuh index fe8c5faac2..a2ef3c21ee 100644 --- a/cpp/include/raft/stats/detail/rand_index.cuh +++ b/cpp/include/raft/stats/detail/rand_index.cuh @@ -111,13 +111,15 @@ RAFT_KERNEL computeTheNumerator( /** * @brief Function to calculate RandIndex * more info on rand index + * @param dry_run: whether to run in dry-run mode * @param firstClusterArray: the array of classes of type T * @param secondClusterArray: the array of classes of type T * @param size: the size of the data points of type uint64_t * @param stream: the cudaStream object */ template -double compute_rand_index(const T* firstClusterArray, +double compute_rand_index(bool dry_run, + const T* firstClusterArray, const T* secondClusterArray, uint64_t size, cudaStream_t stream) @@ -129,6 +131,7 @@ double compute_rand_index(const T* firstClusterArray, // allocating and initializing memory for a and b in the GPU rmm::device_uvector arr_buf(2, stream); + if (dry_run) { return 0.0; } RAFT_CUDA_TRY(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); // kernel configuration diff --git a/cpp/include/raft/stats/entropy.cuh b/cpp/include/raft/stats/entropy.cuh index e47f33efbe..e68492f8d5 100644 --- a/cpp/include/raft/stats/entropy.cuh +++ b/cpp/include/raft/stats/entropy.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -9,6 +9,7 @@ #pragma once #include #include +#include #include namespace raft { @@ -33,7 +34,7 @@ double entropy(const T* clusterArray, const T upperLabelRange, cudaStream_t stream) { - return detail::entropy(clusterArray, size, lowerLabelRange, upperLabelRange, stream); + return detail::entropy(false, clusterArray, size, lowerLabelRange, upperLabelRange, stream); } /** @@ -60,7 +61,8 @@ double entropy(raft::resources const& handle, const value_t upper_label_range) { RAFT_EXPECTS(cluster_array.is_exhaustive(), "cluster_array must be contiguous"); - return detail::entropy(cluster_array.data_handle(), + return detail::entropy(resource::get_dry_run_flag(handle), + cluster_array.data_handle(), cluster_array.extent(0), lower_label_range, upper_label_range, diff --git a/cpp/include/raft/stats/histogram.cuh b/cpp/include/raft/stats/histogram.cuh index 36e2ff64e2..9449e863b9 100644 --- a/cpp/include/raft/stats/histogram.cuh +++ b/cpp/include/raft/stats/histogram.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -87,6 +88,10 @@ void histogram(raft::resources const& handle, raft::device_matrix_view bins, binner_op binner = IdentityBinner()) { + // Seems like neither implementation of histogram does any CUDA allocations. + // There is one allocation of std::vector inside Seive object in computeHashTableSize, + // but it doesn't go through sdt::pmr, so isn't counted. + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(std::is_integral_v && data.extent(0) <= std::numeric_limits::max(), "Index type not supported"); RAFT_EXPECTS(bins.extent(1) == data.extent(1), "Size mismatch"); diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh index 9739b45e88..cda32fdea0 100644 --- a/cpp/include/raft/stats/mean.cuh +++ b/cpp/include/raft/stats/mean.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -33,7 +33,7 @@ namespace stats { template void mean(Type* mu, const Type* data, IdxType D, IdxType N, cudaStream_t stream) { - detail::mean(mu, data, D, N, stream); + detail::mean(false, mu, data, D, N, stream); } /** @@ -58,7 +58,7 @@ template [[deprecated("'sample' parameter deprecated")]] void mean( Type* mu, const Type* data, IdxType D, IdxType N, bool sample, cudaStream_t stream) { - detail::mean(mu, data, D, N, sample, stream); + detail::mean(false, mu, data, D, N, sample, stream); } /** @@ -89,7 +89,8 @@ void mean(raft::resources const& handle, RAFT_EXPECTS(data.extent(1) == mu.extent(0), "Size mismatch between data and mu"); RAFT_EXPECTS(mu.is_exhaustive(), "mu must be contiguous"); RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); - detail::mean>(mu.data_handle(), + detail::mean>(raft::resource::get_dry_run_flag(handle), + mu.data_handle(), data.data_handle(), data.extent(1), data.extent(0), @@ -124,7 +125,8 @@ template RAFT_EXPECTS(data.extent(1) == mu.extent(0), "Size mismatch between data and mu"); RAFT_EXPECTS(mu.is_exhaustive(), "mu must be contiguous"); RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); - detail::mean>(mu.data_handle(), + detail::mean>(raft::resource::get_dry_run_flag(handle), + mu.data_handle(), data.data_handle(), data.extent(1), data.extent(0), diff --git a/cpp/include/raft/stats/rand_index.cuh b/cpp/include/raft/stats/rand_index.cuh index b58807c053..2d8b671cd2 100644 --- a/cpp/include/raft/stats/rand_index.cuh +++ b/cpp/include/raft/stats/rand_index.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __RAND_INDEX_H @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -26,7 +27,7 @@ namespace stats { template double rand_index(T* firstClusterArray, T* secondClusterArray, uint64_t size, cudaStream_t stream) { - return detail::compute_rand_index(firstClusterArray, secondClusterArray, size, stream); + return detail::compute_rand_index(false, firstClusterArray, secondClusterArray, size, stream); } /** @@ -53,7 +54,8 @@ double rand_index(raft::resources const& handle, "Size mismatch between first_cluster_array and second_cluster_array"); RAFT_EXPECTS(first_cluster_array.is_exhaustive(), "first_cluster_array must be contiguous"); RAFT_EXPECTS(second_cluster_array.is_exhaustive(), "second_cluster_array must be contiguous"); - return detail::compute_rand_index(first_cluster_array.data_handle(), + return detail::compute_rand_index(resource::get_dry_run_flag(handle), + first_cluster_array.data_handle(), second_cluster_array.data_handle(), second_cluster_array.extent(0), resource::get_cuda_stream(handle)); diff --git a/cpp/tests/stats/homogeneity_score.cu b/cpp/tests/stats/homogeneity_score.cu index 90b456d32a..0590913af2 100644 --- a/cpp/tests/stats/homogeneity_score.cu +++ b/cpp/tests/stats/homogeneity_score.cu @@ -1,10 +1,11 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #include "../test_utils.cuh" #include +#include #include #include #include From 42d8ad40ac7d524d9625be0e3d6e5f9c9a2f585c Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 19 Feb 2026 10:13:05 +0100 Subject: [PATCH 03/62] Dry run compliance for raft::linalg namespace --- cpp/include/raft/linalg/detail/norm.cuh | 46 +-- cpp/include/raft/linalg/detail/qr.cuh | 61 ++-- cpp/include/raft/linalg/detail/rsvd.cuh | 60 +++- cpp/include/raft/linalg/detail/svd.cuh | 27 +- cpp/include/raft/linalg/detail/transpose.cuh | 7 +- cpp/include/raft/linalg/divide.cuh | 4 +- cpp/include/raft/linalg/dot.cuh | 5 +- cpp/include/raft/linalg/map_reduce.cuh | 4 +- .../raft/linalg/mean_squared_error.cuh | 4 +- cpp/include/raft/linalg/multiply.cuh | 4 +- cpp/include/raft/linalg/norm.cuh | 33 +- cpp/include/raft/linalg/normalize.cuh | 4 +- cpp/include/raft/linalg/power.cuh | 5 +- .../raft/linalg/reduce_cols_by_key.cuh | 4 +- .../raft/linalg/reduce_rows_by_key.cuh | 4 +- cpp/include/raft/linalg/sqrt.cuh | 4 +- cpp/include/raft/linalg/subtract.cuh | 6 +- cpp/include/raft/linalg/unary_op.cuh | 4 +- cpp/include/raft/matrix/copy.cuh | 7 +- cpp/tests/linalg/rsvd.cu | 297 +++++++++++++++++- cpp/tests/linalg/svd.cu | 142 ++++++++- 21 files changed, 632 insertions(+), 100 deletions(-) diff --git a/cpp/include/raft/linalg/detail/norm.cuh b/cpp/include/raft/linalg/detail/norm.cuh index ea7f5c8d28..e748e38997 100644 --- a/cpp/include/raft/linalg/detail/norm.cuh +++ b/cpp/include/raft/linalg/detail/norm.cuh @@ -1,13 +1,13 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include +#include #include -#include namespace raft { namespace linalg { @@ -19,18 +19,23 @@ template -void rowNormCaller( - OutType* dots, const Type* data, IdxType D, IdxType N, cudaStream_t stream, Lambda fin_op) +void rowNormCaller(bool dry_run, + OutType* dots, + const Type* data, + IdxType D, + IdxType N, + cudaStream_t stream, + Lambda fin_op) { if constexpr (norm_type == L1Norm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op); + reduce( + dry_run, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op); } else if constexpr (norm_type == L2Norm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op); + reduce( + dry_run, dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op); } else if constexpr (norm_type == LinfNorm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op); + reduce( + dry_run, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op); } else { THROW("Unsupported norm type: %d", norm_type); } @@ -42,18 +47,23 @@ template -void colNormCaller( - OutType* dots, const Type* data, IdxType D, IdxType N, cudaStream_t stream, Lambda fin_op) +void colNormCaller(bool dry_run, + OutType* dots, + const Type* data, + IdxType D, + IdxType N, + cudaStream_t stream, + Lambda fin_op) { if constexpr (norm_type == L1Norm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op); + reduce( + dry_run, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::add_op(), fin_op); } else if constexpr (norm_type == L2Norm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op); + reduce( + dry_run, dots, data, D, N, (OutType)0, stream, false, raft::sq_op(), raft::add_op(), fin_op); } else if constexpr (norm_type == LinfNorm) { - raft::linalg::reduce( - dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op); + reduce( + false, dots, data, D, N, (OutType)0, stream, false, raft::abs_op(), raft::max_op(), fin_op); } else { THROW("Unsupported norm type: %d", norm_type); } diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh index 63cba5d73c..73657ab039 100644 --- a/cpp/include/raft/linalg/detail/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -9,6 +9,7 @@ #include "cusolver_wrappers.hpp" #include +#include #include #include @@ -44,10 +45,18 @@ void qrGetQ_inplace( RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * n_cols, stream)); rmm::device_scalar dev_info(stream); - int ws_size; + int ws_size_Dngeqrf; + int ws_size_Dnorgqr; + + RAFT_CUSOLVER_TRY( + cusolverDngeqrf_bufferSize(cusolver, n_rows, n_cols, Q, n_rows, &ws_size_Dngeqrf)); + RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize( + cusolver, n_rows, n_cols, n_cols, Q, n_rows, tau.data(), &ws_size_Dnorgqr)); + + rmm::device_uvector workspace(std::max(ws_size_Dngeqrf, ws_size_Dnorgqr), stream); + + if (resource::get_dry_run_flag(handle)) { return; } - RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize(cusolver, n_rows, n_cols, Q, n_rows, &ws_size)); - rmm::device_uvector workspace(ws_size, stream); RAFT_CUSOLVER_TRY(cusolverDngeqrf(cusolver, n_rows, n_cols, @@ -55,13 +64,10 @@ void qrGetQ_inplace( n_rows, tau.data(), workspace.data(), - ws_size, + ws_size_Dngeqrf, dev_info.data(), stream)); - RAFT_CUSOLVER_TRY( - cusolverDnorgqr_bufferSize(cusolver, n_rows, n_cols, n_cols, Q, n_rows, tau.data(), &ws_size)); - workspace.resize(ws_size, stream); RAFT_CUSOLVER_TRY(cusolverDnorgqr(cusolver, n_rows, n_cols, @@ -70,7 +76,7 @@ void qrGetQ_inplace( n_rows, tau.data(), workspace.data(), - ws_size, + ws_size_Dnorgqr, dev_info.data(), stream)); } @@ -83,7 +89,7 @@ void qrGetQ(raft::resources const& handle, int n_cols, cudaStream_t stream) { - raft::copy(Q, M, n_rows * n_cols, stream); + if (!resource::get_dry_run_flag(handle)) { raft::copy(Q, M, n_rows * n_cols, stream); } qrGetQ_inplace(handle, Q, n_rows, n_cols, stream); } @@ -99,19 +105,32 @@ void qrGetQR(raft::resources const& handle, cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); int m = n_rows, n = n_cols; + int R_full_nrows = m, R_full_ncols = n; + int Q_nrows = m, Q_ncols = n; + int Lwork_Dngeqrf, Lwork_Dnorgqr; rmm::device_uvector R_full(m * n, stream); rmm::device_uvector tau(std::min(m, n), stream); + rmm::device_scalar devInfo(stream); + + RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize( + cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork_Dngeqrf)); + RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize(cusolverH, + Q_nrows, + Q_ncols, + std::min(Q_ncols, Q_nrows), + Q, + Q_nrows, + tau.data(), + &Lwork_Dnorgqr)); + + rmm::device_uvector workspace(std::max(Lwork_Dngeqrf, Lwork_Dnorgqr), stream); + + if (resource::get_dry_run_flag(handle)) { return; } + RAFT_CUDA_TRY(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * std::min(m, n), stream)); - int R_full_nrows = m, R_full_ncols = n; RAFT_CUDA_TRY( cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); - int Lwork; - rmm::device_scalar devInfo(stream); - - RAFT_CUSOLVER_TRY(cusolverDngeqrf_bufferSize( - cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork)); - rmm::device_uvector workspace(Lwork, stream); RAFT_CUSOLVER_TRY(cusolverDngeqrf(cusolverH, R_full_nrows, R_full_ncols, @@ -119,7 +138,7 @@ void qrGetQR(raft::resources const& handle, R_full_nrows, tau.data(), workspace.data(), - Lwork, + Lwork_Dngeqrf, devInfo.data(), stream)); @@ -130,11 +149,7 @@ void qrGetQR(raft::resources const& handle, RAFT_CUDA_TRY( cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); - int Q_nrows = m, Q_ncols = n; - RAFT_CUSOLVER_TRY(cusolverDnorgqr_bufferSize( - cusolverH, Q_nrows, Q_ncols, std::min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork)); - workspace.resize(Lwork, stream); RAFT_CUSOLVER_TRY(cusolverDnorgqr(cusolverH, Q_nrows, Q_ncols, @@ -143,7 +158,7 @@ void qrGetQR(raft::resources const& handle, Q_nrows, tau.data(), workspace.data(), - Lwork, + Lwork_Dnorgqr, devInfo.data(), stream)); } diff --git a/cpp/include/raft/linalg/detail/rsvd.cuh b/cpp/include/raft/linalg/detail/rsvd.cuh index cc93c57ab5..9eea7fee62 100644 --- a/cpp/include/raft/linalg/detail/rsvd.cuh +++ b/cpp/include/raft/linalg/detail/rsvd.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2018-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -84,6 +85,8 @@ void randomized_svd(const raft::resources& handle, auto h_workspace = raft::make_host_vector(workspaceHost); auto devInfo = raft::make_device_scalar(handle, 0); + if (resource::get_dry_run_flag(handle)) { return; } + RAFT_CUSOLVER_TRY(cusolverDnxgesvdr(cusolverH, jobu, jobv, @@ -153,6 +156,7 @@ void rsvdFixedRank(raft::resources const& handle, int max_sweeps, cudaStream_t stream) { + bool is_dry_run = resource::get_dry_run_flag(handle); cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); cublasHandle_t cublasH = resource::get_cublas_handle(handle); @@ -170,7 +174,9 @@ void rsvdFixedRank(raft::resources const& handle, // Build temporary U, S, V matrices rmm::device_uvector S_vec_tmp(l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(S_vec_tmp.data(), 0, sizeof(math_t) * l, stream)); + } // build random matrix rmm::device_uvector RN(n * l, stream); @@ -186,9 +192,11 @@ void rsvdFixedRank(raft::resources const& handle, rmm::device_uvector Z(n * l, stream); rmm::device_uvector Yorth(m * l, stream); rmm::device_uvector Zorth(n * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Z.data(), 0, sizeof(math_t) * n * l, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(Yorth.data(), 0, sizeof(math_t) * m * l, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(Zorth.data(), 0, sizeof(math_t) * n * l, stream)); + } // power sampling scheme for (int j = 1; j < q; j++) { @@ -235,30 +243,40 @@ void rsvdFixedRank(raft::resources const& handle, // orthogonalize on exit from loop to get Q rmm::device_uvector Q(m * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream)); + if (!is_dry_run) { RAFT_CUDA_TRY(cudaMemsetAsync(Q.data(), 0, sizeof(math_t) * m * l, stream)); } raft::linalg::qrGetQ(handle, Y.data(), Q.data(), m, l, stream); // either QR of B^T method, or eigendecompose BB^T method if (!use_bbt) { // form Bt = Mt*Q : nxm * mxl = nxl rmm::device_uvector Bt(n * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Bt.data(), 0, sizeof(math_t) * n * l, stream)); + } raft::linalg::gemm( handle, M, m, n, Q.data(), Bt.data(), n, l, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); // compute QR factorization of Bt // M is mxn ; Q is mxn ; R is min(m,n) x min(m,n) */ rmm::device_uvector Qhat(n * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Qhat.data(), 0, sizeof(math_t) * n * l, stream)); + } rmm::device_uvector Rhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Rhat.data(), 0, sizeof(math_t) * l * l, stream)); + } raft::linalg::qrGetQR(handle, Bt.data(), Qhat.data(), Rhat.data(), n, l, stream); // compute SVD of Rhat (lxl) rmm::device_uvector Uhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + } rmm::device_uvector Vhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Vhat.data(), 0, sizeof(math_t) * l * l, stream)); + } if (use_jacobi) raft::linalg::svdJacobi(handle, Rhat.data(), @@ -349,9 +367,13 @@ void rsvdFixedRank(raft::resources const& handle, // compute eigendecomposition of BBt rmm::device_uvector Uhat(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat.data(), 0, sizeof(math_t) * l * l, stream)); + } rmm::device_uvector Uhat_dup(l * l, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Uhat_dup.data(), 0, sizeof(math_t) * l * l, stream)); + } raft::matrix::upper_triangular( handle, @@ -363,7 +385,7 @@ void rsvdFixedRank(raft::resources const& handle, handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream, tol, max_sweeps); else raft::linalg::eigDC(handle, Uhat_dup.data(), l, l, Uhat.data(), S_vec_tmp.data(), stream); - raft::matrix::seqRoot(S_vec_tmp.data(), l, stream); + if (!is_dry_run) { raft::matrix::seqRoot(S_vec_tmp.data(), l, stream); } auto S_vec_view = make_device_matrix_view(S_vec, 1, k); raft::matrix::slice( @@ -395,10 +417,14 @@ void rsvdFixedRank(raft::resources const& handle, // Sigma^{-1}[(p+1):l, (p+1):l] nxl * lxk * kxk = nxk if (gen_right_vec) { rmm::device_uvector Sinv(k * k, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * k * k, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(Sinv.data(), 0, sizeof(math_t) * k * k, stream)); + } rmm::device_uvector UhatSinv(l * k, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream)); - raft::matrix::reciprocal(S_vec_tmp.data(), l, stream); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(UhatSinv.data(), 0, sizeof(math_t) * l * k, stream)); + } + if (!is_dry_run) { raft::matrix::reciprocal(S_vec_tmp.data(), l, stream); } raft::matrix::set_diagonal(handle, make_device_vector_view(S_vec_tmp.data() + p, k), make_device_matrix_view(Sinv.data(), k, k)); diff --git a/cpp/include/raft/linalg/detail/svd.cuh b/cpp/include/raft/linalg/detail/svd.cuh index 19b569398d..bb164f65bf 100644 --- a/cpp/include/raft/linalg/detail/svd.cuh +++ b/cpp/include/raft/linalg/detail/svd.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ void svdQR(raft::resources const& handle, RAFT_CUSOLVER_TRY(cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); rmm::device_uvector d_work(lwork, stream); + if (resource::get_dry_run_flag(handle)) { return; } + char jobu = 'S'; char jobvt = 'A'; @@ -139,6 +142,8 @@ void svdEig(raft::resources const& handle, raft::matrix::row_reverse(handle, make_device_matrix_view(S, n_cols, idx_t(1))); + if (resource::get_dry_run_flag(handle)) { return; } + raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true); if (gen_left_vec) { @@ -208,6 +213,11 @@ void svdJacobi(raft::resources const& handle, rmm::device_uvector d_work(lwork, stream); + if (resource::get_dry_run_flag(handle)) { + RAFT_CUSOLVER_TRY(cusolverDnDestroyGesvdjInfo(gesvdj_params)); + return; + } + RAFT_CUSOLVER_TRY(cusolverDngesvdj(cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, @@ -272,16 +282,19 @@ bool evaluateSVDByL2Norm(raft::resources const& handle, math_t tol, cudaStream_t stream) { - cublasHandle_t cublasH = resource::get_cublas_handle(handle); - int m = n_rows, n = n_cols; + bool is_dry_run = resource::get_dry_run_flag(handle); // form product matrix rmm::device_uvector P_d(m * n, stream); rmm::device_uvector S_mat(k * k, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); + if (!is_dry_run) { + RAFT_CUDA_TRY(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); + } + + // These RAFT functions have their own dry-run guards at the leaf level raft::matrix::set_diagonal(handle, make_device_vector_view(S_vec, k), make_device_matrix_view(S_mat.data(), k, k)); @@ -299,8 +312,12 @@ bool evaluateSVDByL2Norm(raft::resources const& handle, // calculate percent error const math_t alpha = 1.0, beta = -1.0; rmm::device_uvector A_minus_P(m * n, stream); + + if (is_dry_run) { return false; } + RAFT_CUDA_TRY(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); + cublasHandle_t cublasH = resource::get_cublas_handle(handle); RAFT_CUBLAS_TRY(cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, diff --git a/cpp/include/raft/linalg/detail/transpose.cuh b/cpp/include/raft/linalg/detail/transpose.cuh index 8f91a57525..6f4c4a0d1e 100644 --- a/cpp/include/raft/linalg/detail/transpose.cuh +++ b/cpp/include/raft/linalg/detail/transpose.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -87,6 +88,7 @@ void transpose_half(raft::resources const& handle, const IndexType stride_out = 1) { if (n_cols == 0 || n_rows == 0) return; + if (resource::get_dry_run_flag(handle)) { return; } auto stream = resource::get_cuda_stream(handle); int dev_id, sm_count; @@ -134,6 +136,7 @@ void transpose(raft::resources const& handle, int n_cols, cudaStream_t stream) { + if (resource::get_dry_run_flag(handle)) { return; } int out_n_rows = n_cols; int out_n_cols = n_rows; @@ -188,6 +191,7 @@ void transpose_row_major_impl( raft::mdspan, LayoutPolicy, AccessorPolicy> in, raft::mdspan, LayoutPolicy, AccessorPolicy> out) { + if (resource::get_dry_run_flag(handle)) { return; } auto out_n_rows = in.extent(1); auto out_n_cols = in.extent(0); T constexpr kOne = 1; @@ -230,6 +234,7 @@ void transpose_col_major_impl( raft::mdspan, LayoutPolicy, AccessorPolicy> in, raft::mdspan, LayoutPolicy, AccessorPolicy> out) { + if (resource::get_dry_run_flag(handle)) { return; } auto out_n_rows = in.extent(1); auto out_n_cols = in.extent(0); T constexpr kOne = 1; diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh index 69600f016c..0a64b8db55 100644 --- a/cpp/include/raft/linalg/divide.cuh +++ b/cpp/include/raft/linalg/divide.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __DIVIDE_H @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -61,6 +62,7 @@ void divide_scalar(raft::resources const& handle, OutType out, raft::host_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/dot.cuh b/cpp/include/raft/linalg/dot.cuh index af40c07459..086633745b 100644 --- a/cpp/include/raft/linalg/dot.cuh +++ b/cpp/include/raft/linalg/dot.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __DOT_H @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,7 @@ void dot(raft::resources const& handle, { RAFT_EXPECTS(x.size() == y.size(), "Size mismatch between x and y input vectors in raft::linalg::dot"); + if (resource::get_dry_run_flag(handle)) { return; } RAFT_CUBLAS_TRY(detail::cublasdot(resource::get_cublas_handle(handle), x.size(), @@ -70,6 +72,7 @@ void dot(raft::resources const& handle, { RAFT_EXPECTS(x.size() == y.size(), "Size mismatch between x and y input vectors in raft::linalg::dot"); + if (resource::get_dry_run_flag(handle)) { return; } RAFT_CUBLAS_TRY(detail::cublasdot(resource::get_cublas_handle(handle), x.size(), diff --git a/cpp/include/raft/linalg/map_reduce.cuh b/cpp/include/raft/linalg/map_reduce.cuh index e5176dda01..3c206bc11b 100644 --- a/cpp/include/raft/linalg/map_reduce.cuh +++ b/cpp/include/raft/linalg/map_reduce.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __MAP_REDUCE_H @@ -11,6 +11,7 @@ #include #include +#include namespace raft::linalg { @@ -89,6 +90,7 @@ void map_reduce(raft::resources const& handle, ReduceLambda op, Args... args) { + if (resource::get_dry_run_flag(handle)) { return; } mapReduce( out.data_handle(), in.extent(0), diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh index 70c04ccc6b..85ca248cf5 100644 --- a/cpp/include/raft/linalg/mean_squared_error.cuh +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __MSE_H @@ -11,6 +11,7 @@ #include #include +#include namespace raft { namespace linalg { @@ -57,6 +58,7 @@ void mean_squared_error(raft::resources const& handle, raft::device_scalar_view out, OutValueType weight) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(A.size() == B.size(), "Size mismatch between inputs"); meanSquaredError(out.data_handle(), diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh index 22c89a5883..325918868e 100644 --- a/cpp/include/raft/linalg/multiply.cuh +++ b/cpp/include/raft/linalg/multiply.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __MULTIPLY_H @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace raft { @@ -63,6 +64,7 @@ void multiply_scalar( OutType out, raft::host_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh index e16fbf4353..c0839aca44 100644 --- a/cpp/include/raft/linalg/norm.cuh +++ b/cpp/include/raft/linalg/norm.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __NORM_H @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -54,7 +55,7 @@ void rowNorm(OutType* dots, cudaStream_t stream, Lambda fin_op = raft::identity_op()) { - detail::rowNormCaller(dots, data, D, N, stream, fin_op); + detail::rowNormCaller(false, dots, data, D, N, stream, fin_op); } /** @@ -85,7 +86,7 @@ void colNorm(OutType* dots, cudaStream_t stream, Lambda fin_op = raft::identity_op()) { - detail::colNormCaller(dots, data, D, N, stream, fin_op); + detail::colNormCaller(false, dots, data, D, N, stream, fin_op); } /** @@ -128,21 +129,23 @@ void norm(raft::resources const& handle, if constexpr (along_rows) { RAFT_EXPECTS(static_cast(out.size()) == in.extent(0), "Output should be equal to number of rows in Input"); - rowNorm(out.data_handle(), - in.data_handle(), - in.extent(1), - in.extent(0), - resource::get_cuda_stream(handle), - fin_op); + detail::rowNormCaller(resource::get_dry_run_flag(handle), + out.data_handle(), + in.data_handle(), + in.extent(1), + in.extent(0), + resource::get_cuda_stream(handle), + fin_op); } else { RAFT_EXPECTS(static_cast(out.size()) == in.extent(1), "Output should be equal to number of columns in Input"); - colNorm(out.data_handle(), - in.data_handle(), - in.extent(1), - in.extent(0), - resource::get_cuda_stream(handle), - fin_op); + detail::colNormCaller(resource::get_dry_run_flag(handle), + out.data_handle(), + in.data_handle(), + in.extent(1), + in.extent(0), + resource::get_cuda_stream(handle), + fin_op); } } diff --git a/cpp/include/raft/linalg/normalize.cuh b/cpp/include/raft/linalg/normalize.cuh index 730d5aff25..86b59751f5 100644 --- a/cpp/include/raft/linalg/normalize.cuh +++ b/cpp/include/raft/linalg/normalize.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,7 @@ void row_normalize(raft::resources const& handle, FinalLambda fin_op, ElementType eps = ElementType(1e-8)) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); RAFT_EXPECTS(in.extent(0) == out.extent(0), diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh index ae4820cda3..de6461bc83 100644 --- a/cpp/include/raft/linalg/power.cuh +++ b/cpp/include/raft/linalg/power.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2018-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __POWER_H @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -74,6 +75,7 @@ template > void power(raft::resources const& handle, InType in1, InType in2, OutType out) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -112,6 +114,7 @@ void power_scalar( OutType out, const raft::host_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh index e0ac2d6544..eb90244cc3 100644 --- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __REDUCE_COLS_BY_KEY @@ -11,6 +11,7 @@ #include #include +#include #include namespace raft { @@ -81,6 +82,7 @@ void reduce_cols_by_key( IndexType nkeys = 0, bool reset_sums = true) { + if (resource::get_dry_run_flag(handle)) { return; } if (nkeys > 0) { RAFT_EXPECTS(out.extent(1) == nkeys, "Output doesn't have nkeys columns"); } else { diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh index 7e7e91bcb9..685f8fb962 100644 --- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh +++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __REDUCE_ROWS_BY_KEY @@ -11,6 +11,7 @@ #include #include +#include #include namespace raft { @@ -147,6 +148,7 @@ void reduce_rows_by_key( std::optional> d_weights = std::nullopt, bool reset_sums = true) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(d_A.extent(0) == d_A.extent(0) && d_sums.extent(1) == n_unique_keys, "Output is not of size ncols * n_unique_keys"); RAFT_EXPECTS(d_keys.extent(0) == d_A.extent(1), "Keys is not of size nrows"); diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh index e0c232e62a..abf19e765e 100644 --- a/cpp/include/raft/linalg/sqrt.cuh +++ b/cpp/include/raft/linalg/sqrt.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2018-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __SQRT_H @@ -10,6 +10,7 @@ #include #include #include +#include #include namespace raft { @@ -51,6 +52,7 @@ template > void sqrt(raft::resources const& handle, InType in, OutType out) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh index 1aba864100..08e5f38fbe 100644 --- a/cpp/include/raft/linalg/subtract.cuh +++ b/cpp/include/raft/linalg/subtract.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace raft { @@ -98,6 +99,7 @@ template > void subtract(raft::resources const& handle, InType in1, InType in2, OutType out) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -136,6 +138,7 @@ void subtract_scalar( OutType out, raft::device_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; @@ -172,6 +175,7 @@ void subtract_scalar( OutType out, raft::host_scalar_view scalar) { + if (resource::get_dry_run_flag(handle)) { return; } using in_value_t = typename InType::value_type; using out_value_t = typename OutType::value_type; diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 69e2130adb..6cf4b3a266 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __UNARY_OP_H @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -109,6 +110,7 @@ template > void write_only_unary_op(const raft::resources& handle, OutType out, Lambda op) { + if (resource::get_dry_run_flag(handle)) { return; } return writeOnlyUnaryOp(out.data_handle(), out.size(), op, resource::get_cuda_stream(handle)); } diff --git a/cpp/include/raft/matrix/copy.cuh b/cpp/include/raft/matrix/copy.cuh index 8c3f00eca5..0aca60483a 100644 --- a/cpp/include/raft/matrix/copy.cuh +++ b/cpp/include/raft/matrix/copy.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -34,6 +35,7 @@ void copy_rows(raft::resources const& handle, raft::device_matrix_view out, raft::device_vector_view indices) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(in.extent(1) == out.extent(1), "Input and output matrices must have same number of columns"); RAFT_EXPECTS(indices.extent(0) == out.extent(0), @@ -59,6 +61,7 @@ void copy(raft::resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(in.extent(0) == out.extent(0) && in.extent(1) == out.extent(1), "Input and output matrix shapes must match."); @@ -79,6 +82,7 @@ void copy(raft::resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(in.extent(0) == out.extent(0) && in.extent(1) == out.extent(1), "Input and output matrix shapes must match."); @@ -100,6 +104,7 @@ void trunc_zero_origin(raft::resources const& handle, raft::device_matrix_view in, raft::device_matrix_view out) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(out.extent(0) <= in.extent(0) && out.extent(1) <= in.extent(1), "Output matrix must have less or equal number of rows and columns"); diff --git a/cpp/tests/linalg/rsvd.cu b/cpp/tests/linalg/rsvd.cu index 6f125afa8e..d8599d601e 100644 --- a/cpp/tests/linalg/rsvd.cu +++ b/cpp/tests/linalg/rsvd.cu @@ -1,18 +1,18 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #include "../test_utils.cuh" +#include #include #include #include #include #include #include - -#include +#include #include @@ -312,5 +312,296 @@ INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdTestSquareMatrixNormF, ::testing::ValuesI INSTANTIATE_TEST_CASE_P(RsvdTests, RsvdTestSquareMatrixNormD, ::testing::ValuesIn(inputs_dx)); +// =================================================================== +// Dry-run tests for RSVD public API functions +// =================================================================== + +TEST(RsvdDryRun, FixedRankQRWithBothVectors) +{ + raft::resources res; + + constexpr int n_rows = 256; + constexpr int n_cols = 128; + constexpr int k = 50; + constexpr int p = 10; + + // Pre-allocate input/output buffers (outside dry-run) + auto M = raft::make_device_matrix(res, n_rows, n_cols); + auto S_vec = raft::make_device_vector(res, k); + auto U = raft::make_device_matrix(res, n_rows, k); + auto V = raft::make_device_matrix(res, k, n_cols); + + // Run rsvd_fixed_rank in dry-run mode (QR, no BBT, no Jacobi, both U and V) + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::rsvd_fixed_rank( + handle, raft::make_const_mdspan(M.view()), S_vec.view(), p, U.view(), V.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0) << "Expected non-zero peak device memory allocation"; +} + +TEST(RsvdDryRun, FixedRankSymmetricWithBothVectors) +{ + raft::resources res; + + constexpr int n_rows = 256; + constexpr int n_cols = 128; + constexpr int k = 50; + constexpr int p = 10; + + // Pre-allocate input/output buffers (outside dry-run) + auto M = raft::make_device_matrix(res, n_rows, n_cols); + auto S_vec = raft::make_device_vector(res, k); + auto U = raft::make_device_matrix(res, n_rows, k); + auto V = raft::make_device_matrix(res, k, n_cols); + + // Run rsvd_fixed_rank_symmetric in dry-run mode (QR, with BBT, no Jacobi, both U and V) + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::rsvd_fixed_rank_symmetric( + handle, raft::make_const_mdspan(M.view()), S_vec.view(), p, U.view(), V.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0) << "Expected non-zero peak device memory allocation"; +} + +TEST(RsvdDryRun, FixedRankJacobiWithBothVectors) +{ + raft::resources res; + + constexpr int n_rows = 256; + constexpr int n_cols = 128; + constexpr int k = 50; + constexpr int p = 10; + constexpr float tol = 1e-7f; + constexpr int max_sweeps = 100; + + // Pre-allocate input/output buffers (outside dry-run) + auto M = raft::make_device_matrix(res, n_rows, n_cols); + auto S_vec = raft::make_device_vector(res, k); + auto U = raft::make_device_matrix(res, n_rows, k); + auto V = raft::make_device_matrix(res, k, n_cols); + + // Run rsvd_fixed_rank_jacobi in dry-run mode (QR, no BBT, with Jacobi, both U and V) + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::rsvd_fixed_rank_jacobi(handle, + raft::make_const_mdspan(M.view()), + S_vec.view(), + p, + tol, + max_sweeps, + U.view(), + V.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0) << "Expected non-zero peak device memory allocation"; +} + +TEST(RsvdDryRun, FixedRankSymmetricJacobiWithBothVectors) +{ + raft::resources res; + + constexpr int n_rows = 256; + constexpr int n_cols = 128; + constexpr int k = 50; + constexpr int p = 10; + constexpr float tol = 1e-7f; + constexpr int max_sweeps = 100; + + // Pre-allocate input/output buffers (outside dry-run) + auto M = raft::make_device_matrix(res, n_rows, n_cols); + auto S_vec = raft::make_device_vector(res, k); + auto U = raft::make_device_matrix(res, n_rows, k); + auto V = raft::make_device_matrix(res, k, n_cols); + + // Run rsvd_fixed_rank_symmetric_jacobi in dry-run mode (QR, with BBT, with Jacobi, both U and V) + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::rsvd_fixed_rank_symmetric_jacobi(handle, + raft::make_const_mdspan(M.view()), + S_vec.view(), + p, + tol, + max_sweeps, + U.view(), + V.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0) << "Expected non-zero peak device memory allocation"; +} + +TEST(RsvdDryRun, FixedRankWithOnlyU) +{ + raft::resources res; + + constexpr int n_rows = 256; + constexpr int n_cols = 128; + constexpr int k = 50; + constexpr int p = 10; + + // Pre-allocate input/output buffers (outside dry-run) + auto M = raft::make_device_matrix(res, n_rows, n_cols); + auto S_vec = raft::make_device_vector(res, k); + auto U = raft::make_device_matrix(res, n_rows, k); + + // Run rsvd_fixed_rank in dry-run mode (only U, no V) + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::rsvd_fixed_rank( + handle, raft::make_const_mdspan(M.view()), S_vec.view(), p, U.view(), std::nullopt); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0) << "Expected non-zero peak device memory allocation"; +} + +TEST(RsvdDryRun, FixedRankWithOnlyV) +{ + raft::resources res; + + constexpr int n_rows = 256; + constexpr int n_cols = 128; + constexpr int k = 50; + constexpr int p = 10; + + // Pre-allocate input/output buffers (outside dry-run) + auto M = raft::make_device_matrix(res, n_rows, n_cols); + auto S_vec = raft::make_device_vector(res, k); + auto V = raft::make_device_matrix(res, k, n_cols); + + // Run rsvd_fixed_rank in dry-run mode (only V, no U) + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::rsvd_fixed_rank( + handle, raft::make_const_mdspan(M.view()), S_vec.view(), p, std::nullopt, V.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0) << "Expected non-zero peak device memory allocation"; +} + +TEST(RsvdDryRun, FixedRankWithNoVectors) +{ + raft::resources res; + + constexpr int n_rows = 256; + constexpr int n_cols = 128; + constexpr int k = 50; + constexpr int p = 10; + + // Pre-allocate input/output buffers (outside dry-run) + auto M = raft::make_device_matrix(res, n_rows, n_cols); + auto S_vec = raft::make_device_vector(res, k); + + // Run rsvd_fixed_rank in dry-run mode (no U, no V - only singular values) + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::rsvd_fixed_rank( + handle, raft::make_const_mdspan(M.view()), S_vec.view(), p, std::nullopt, std::nullopt); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0) << "Expected non-zero peak device memory allocation"; +} + +TEST(RsvdDryRun, PercWithBothVectors) +{ + raft::resources res; + + constexpr int n_rows = 256; + constexpr int n_cols = 128; + constexpr float PC_perc = 0.2f; + constexpr float UpS_perc = 0.05f; + constexpr int k = static_cast(std::min(n_rows, n_cols) * PC_perc); + + // Pre-allocate input/output buffers (outside dry-run) + auto M = raft::make_device_matrix(res, n_rows, n_cols); + auto S_vec = raft::make_device_vector(res, k); + auto U = raft::make_device_matrix(res, n_rows, k); + auto V = raft::make_device_matrix(res, k, n_cols); + + // Run rsvd_perc in dry-run mode (percentage-based, QR, no BBT, no Jacobi, both U and V) + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::rsvd_perc(handle, + raft::make_const_mdspan(M.view()), + S_vec.view(), + PC_perc, + UpS_perc, + U.view(), + V.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0) << "Expected non-zero peak device memory allocation"; +} + +TEST(RsvdDryRun, PercSymmetricJacobiWithBothVectors) +{ + raft::resources res; + + constexpr int n_rows = 256; + constexpr int n_cols = 128; + constexpr float PC_perc = 0.2f; + constexpr float UpS_perc = 0.05f; + constexpr float tol = 1e-7f; + constexpr int max_sweeps = 100; + constexpr int k = static_cast(std::min(n_rows, n_cols) * PC_perc); + + // Pre-allocate input/output buffers (outside dry-run) + auto M = raft::make_device_matrix(res, n_rows, n_cols); + auto S_vec = raft::make_device_vector(res, k); + auto U = raft::make_device_matrix(res, n_rows, k); + auto V = raft::make_device_matrix(res, k, n_cols); + + // Run rsvd_perc_symmetric_jacobi in dry-run mode (percentage-based, QR, with BBT, with Jacobi, + // both U and V) + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::rsvd_perc_symmetric_jacobi(handle, + raft::make_const_mdspan(M.view()), + S_vec.view(), + PC_perc, + UpS_perc, + tol, + max_sweeps, + U.view(), + V.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0) << "Expected non-zero peak device memory allocation"; +} + +TEST(RsvdDryRun, TallMatrix) +{ + raft::resources res; + + constexpr int n_rows = 512; + constexpr int n_cols = 128; + constexpr int k = 50; + constexpr int p = 10; + + // Pre-allocate input/output buffers (outside dry-run) + auto M = raft::make_device_matrix(res, n_rows, n_cols); + auto S_vec = raft::make_device_vector(res, k); + auto U = raft::make_device_matrix(res, n_rows, k); + auto V = raft::make_device_matrix(res, k, n_cols); + + // Run rsvd_fixed_rank_jacobi in dry-run mode on a tall matrix + constexpr float tol = 1e-7f; + constexpr int max_sweeps = 100; + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::rsvd_fixed_rank_jacobi(handle, + raft::make_const_mdspan(M.view()), + S_vec.view(), + p, + tol, + max_sweeps, + U.view(), + V.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0) << "Expected non-zero peak device memory allocation"; +} + } // end namespace linalg } // end namespace raft diff --git a/cpp/tests/linalg/svd.cu b/cpp/tests/linalg/svd.cu index 544263768d..897fdf2c3e 100644 --- a/cpp/tests/linalg/svd.cu +++ b/cpp/tests/linalg/svd.cu @@ -1,15 +1,17 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #include "../test_utils.cuh" +#include #include #include #include #include #include +#include #include @@ -197,11 +199,141 @@ INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF, ::testing::ValuesIn(inputsf INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecD, ::testing::ValuesIn(inputsd2)); -// INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF, -// ::testing::ValuesIn(inputsf2)); +// =================================================================== +// Dry-run tests for SVD public API functions +// =================================================================== -// INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecD, -//::testing::ValuesIn(inputsd2)); +TEST(SvdDryRun, QrWithBothVectors) +{ + raft::resources res; + constexpr int n_rows = 256, n_cols = 128; + + auto in = raft::make_device_matrix(res, n_rows, n_cols); + auto sing_vals = raft::make_device_vector(res, n_cols); + auto U = raft::make_device_matrix(res, n_rows, n_cols); + auto V = raft::make_device_matrix(res, n_cols, n_cols); + + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::svd_qr( + handle, raft::make_const_mdspan(in.view()), sing_vals.view(), U.view(), V.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0); +} + +TEST(SvdDryRun, QrWithOnlyU) +{ + raft::resources res; + constexpr int n_rows = 256, n_cols = 128; + + auto in = raft::make_device_matrix(res, n_rows, n_cols); + auto sing_vals = raft::make_device_vector(res, n_cols); + auto U = raft::make_device_matrix(res, n_rows, n_cols); + + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::svd_qr( + handle, raft::make_const_mdspan(in.view()), sing_vals.view(), U.view(), std::nullopt); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0); +} + +TEST(SvdDryRun, QrWithOnlyV) +{ + raft::resources res; + constexpr int n_rows = 256, n_cols = 128; + + auto in = raft::make_device_matrix(res, n_rows, n_cols); + auto sing_vals = raft::make_device_vector(res, n_cols); + auto V = raft::make_device_matrix(res, n_cols, n_cols); + + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::svd_qr( + handle, raft::make_const_mdspan(in.view()), sing_vals.view(), std::nullopt, V.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0); +} + +TEST(SvdDryRun, QrTransposeRightVecWithBothVectors) +{ + raft::resources res; + constexpr int n_rows = 256, n_cols = 128; + + auto in = raft::make_device_matrix(res, n_rows, n_cols); + auto sing_vals = raft::make_device_vector(res, n_cols); + auto U = raft::make_device_matrix(res, n_rows, n_cols); + auto V = raft::make_device_matrix(res, n_cols, n_cols); + + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::svd_qr_transpose_right_vec( + handle, raft::make_const_mdspan(in.view()), sing_vals.view(), U.view(), V.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0); +} + +TEST(SvdDryRun, EigWithBothVectors) +{ + raft::resources res; + constexpr int n_rows = 256, n_cols = 128; + + auto in = raft::make_device_matrix(res, n_rows, n_cols); + auto S = raft::make_device_vector(res, n_cols); + auto V = raft::make_device_matrix(res, n_cols, n_cols); + auto U = raft::make_device_matrix(res, n_rows, n_cols); + + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::svd_eig(handle, raft::make_const_mdspan(in.view()), S.view(), V.view(), U.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0); +} + +TEST(SvdDryRun, EigWithOnlyV) +{ + raft::resources res; + constexpr int n_rows = 256, n_cols = 128; + + auto in = raft::make_device_matrix(res, n_rows, n_cols); + auto S = raft::make_device_vector(res, n_cols); + auto V = raft::make_device_matrix(res, n_cols, n_cols); + + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::svd_eig( + handle, raft::make_const_mdspan(in.view()), S.view(), V.view(), std::nullopt); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0); +} + +TEST(SvdDryRun, Reconstruction) +{ + raft::resources res; + constexpr int n_rows = 256, n_cols = 128, k = 64; + + auto U = raft::make_device_matrix(res, n_rows, k); + auto S = raft::make_device_matrix(res, k, k); + auto V = raft::make_device_matrix(res, k, n_cols); + auto out = raft::make_device_matrix(res, n_rows, n_cols); + + auto stats = raft::util::dry_run_execute(res, [&](raft::resources const& handle) { + raft::linalg::svd_reconstruction(handle, + raft::make_const_mdspan(U.view()), + raft::make_const_mdspan(S.view()), + raft::make_const_mdspan(V.view()), + out.view()); + }); + + EXPECT_FALSE(raft::resource::get_dry_run_flag(res)); + EXPECT_GT(stats.device_global_peak, 0); +} } // end namespace linalg } // end namespace raft From 6db7ec87526cd70a9496cc7b391a04f5e958c28c Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 19 Feb 2026 10:13:39 +0100 Subject: [PATCH 04/62] Update developer guide with the dry run protocol --- docs/source/developer_guide.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md index 33c7a254f3..15fb5c51a7 100644 --- a/docs/source/developer_guide.md +++ b/docs/source/developer_guide.md @@ -292,6 +292,25 @@ Sometimes, we need to temporarily change the log pattern (eg: for reporting deci 4. Before creating a new primitive, check to see if one exists already. If one exists but the API isn't flexible enough to include your use-case, consider first refactoring the existing primitive. If that is not possible without an extreme number of changes, consider how the public API could be made more flexible. If the new primitive is different enough from all existing primitives, consider whether an existing public API could invoke the new primitive as an option or argument. If the new primitive is different enough from what exists already, add a header for the new public API function to the appropriate subdirectory and namespace. +## Dry Run Protocol + +The dry run protocol defines a mechanism to simulate the execution of algorithms to get a precise estimate of the memory requirements for a real execution with the same parameters. + +In dry run mode: +- no CUDA work happens in any CUDA stream +- no expensive CPU algorithms are allowed to run +- no real allocations happen in any of: + - `rmm` default device resource (device mdarrays and `rmm::device_uvector`) + - `std::pmr` default (host) resource (host mdarrays) + - workspace memory resources managed by `raft::resources`. +All attempted allocations in the above resources are tracked and reported, thus enabling planning of the memory usage with a relatively small overhead of simulated execution. + +To keep the dry run mode functional, the developers must follow the protocol: +- Any function that takes `raft::resources` handle as an argument can run in dry run mode. + It's always safe to call such functions without any precautions. +- Any other expensive function or any function involving CUDA-calls must be guarded by `resource::get_dry_run_flag(res)` +- Allocations through rmm or raft memory resources must NOT be guarded to accurately track the allocation statistics. + ## Header organization of expensive function templates RAFT is a heavily templated library. Several core functions are expensive to compile and we want to prevent duplicate compilation of this functionality. To limit build time, RAFT provides a precompiled library (libraft.so) where expensive function templates are instantiated for the most commonly used template parameters. To prevent (1) accidental instantiation of these templates and (2) unnecessary dependency on the internals of these templates, we use a split header structure and define macros to control template instantiation. This section describes the macros and header structure. From d91a1c61c610f8ce0ff60812851a2568ea929e7a Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 19 Feb 2026 13:11:48 +0100 Subject: [PATCH 05/62] BREAKING CHANGE: replaced pinned_container with host_container using pinned_memory_resource Add pinned and managed resources to the raft::resources handle to make it possible to customize / temporarily replace these resources --- .../raft/core/managed_container_policy.hpp | 20 +--- .../raft/core/pinned_container_policy.hpp | 79 ++------------- .../raft/core/resource/cuda_stream.hpp | 10 +- .../core/resource/managed_memory_resource.hpp | 82 ++++++++++++++++ .../core/resource/pinned_memory_resource.hpp | 96 +++++++++++++++++++ .../raft/core/resource/resource_types.hpp | 2 + .../raft/pmr/pinned_memory_resource.hpp | 50 ++++++++++ .../raft/util/dry_run_memory_resource.hpp | 44 +++++++-- cpp/tests/util/dry_run_memory_resource.cpp | 69 ++++++++++++- 9 files changed, 353 insertions(+), 99 deletions(-) create mode 100644 cpp/include/raft/core/resource/managed_memory_resource.hpp create mode 100644 cpp/include/raft/core/resource/pinned_memory_resource.hpp create mode 100644 cpp/include/raft/pmr/pinned_memory_resource.hpp diff --git a/cpp/include/raft/core/managed_container_policy.hpp b/cpp/include/raft/core/managed_container_policy.hpp index aac9dbae9c..0059d42ce2 100644 --- a/cpp/include/raft/core/managed_container_policy.hpp +++ b/cpp/include/raft/core/managed_container_policy.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -9,18 +9,15 @@ #include #include #include -#include +#include #include -#include - namespace raft { /** * @brief A container policy for managed mdarray. */ template -class managed_uvector_policy { - public: +struct managed_uvector_policy { using element_type = ElementType; using container_type = device_uvector; using pointer = typename container_type::pointer; @@ -33,7 +30,8 @@ class managed_uvector_policy { auto create(raft::resources const& res, size_t n) -> container_type { - return container_type(n, resource::get_cuda_stream(res), mr_); + return container_type( + n, resource::get_cuda_stream(res), raft::resource::get_managed_memory_resource(res)); } [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference @@ -48,14 +46,6 @@ class managed_uvector_policy { [[nodiscard]] auto make_accessor_policy() noexcept { return accessor_policy{}; } [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; } - - private: - static auto* get_default_memory_resource() - { - auto static result = rmm::mr::managed_memory_resource{}; - return &result; - } - rmm::mr::managed_memory_resource* mr_{get_default_memory_resource()}; }; } // namespace raft diff --git a/cpp/include/raft/core/pinned_container_policy.hpp b/cpp/include/raft/core/pinned_container_policy.hpp index db417164f0..33b18b7abe 100644 --- a/cpp/include/raft/core/pinned_container_policy.hpp +++ b/cpp/include/raft/core/pinned_container_policy.hpp @@ -1,93 +1,27 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once +#include #include #include -#include #ifndef RAFT_DISABLE_CUDA -#include - -#include +#include #else #include #endif namespace raft { #ifndef RAFT_DISABLE_CUDA - -/** - * @brief A thin wrapper over cudaMallocHost/cudaFreeHost for implementing the pinned mdarray - * container policy. - * - */ -template -struct pinned_container { - using value_type = std::remove_cv_t; - - private: - value_type* data_ = nullptr; - - public: - using size_type = std::size_t; - - using reference = value_type&; - using const_reference = value_type const&; - - using pointer = value_type*; - using const_pointer = value_type const*; - - using iterator = pointer; - using const_iterator = const_pointer; - - explicit pinned_container(std::size_t size) - { - RAFT_CUDA_TRY(cudaMallocHost(&data_, size * sizeof(value_type))); - } - ~pinned_container() noexcept - { - if (data_ != nullptr) { RAFT_CUDA_TRY_NO_THROW(cudaFreeHost(data_)); } - } - - pinned_container(pinned_container&& other) { std::swap(this->data_, other.data_); } - pinned_container& operator=(pinned_container&& other) - { - std::swap(this->data_, other.data_); - return *this; - } - pinned_container(pinned_container const&) = delete; // Copying disallowed: one array one owner - pinned_container& operator=(pinned_container const&) = delete; - - /** - * @brief Index operator that returns a reference to the actual data. - */ - template - auto operator[](Index i) noexcept -> reference - { - return data_[i]; - } - /** - * @brief Index operator that returns a reference to the actual data. - */ - template - auto operator[](Index i) const noexcept -> const_reference - { - return data_[i]; - } - - [[nodiscard]] auto data() noexcept -> pointer { return data_; } - [[nodiscard]] auto data() const noexcept -> const_pointer { return data_; } -}; - /** * @brief A container policy for pinned mdarray. */ template struct pinned_vector_policy { using element_type = ElementType; - using container_type = pinned_container; + using container_type = host_container; using pointer = typename container_type::pointer; using const_pointer = typename container_type::const_pointer; using reference = typename container_type::reference; @@ -95,7 +29,10 @@ struct pinned_vector_policy { using accessor_policy = cuda::std::default_accessor; using const_accessor_policy = cuda::std::default_accessor; - auto create(raft::resources const&, size_t n) -> container_type { return container_type(n); } + auto create(raft::resources const& res, size_t n) -> container_type + { + return container_type(n, raft::resource::get_pinned_memory_resource(res)); + } [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference { diff --git a/cpp/include/raft/core/resource/cuda_stream.hpp b/cpp/include/raft/core/resource/cuda_stream.hpp index 690bd610f9..454082d7c3 100644 --- a/cpp/include/raft/core/resource/cuda_stream.hpp +++ b/cpp/include/raft/core/resource/cuda_stream.hpp @@ -1,10 +1,11 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include +#include #include #include #include @@ -82,13 +83,18 @@ inline void set_cuda_stream(resources const& res, rmm::cuda_stream_view stream_v */ inline void sync_stream(const resources& res, rmm::cuda_stream_view stream) { + if (raft::resource::get_dry_run_flag(res)) { return; } interruptible::synchronize(stream); } /** * @brief synchronize main stream on the resources instance */ -inline void sync_stream(const resources& res) { sync_stream(res, get_cuda_stream(res)); } +inline void sync_stream(const resources& res) +{ + if (raft::resource::get_dry_run_flag(res)) { return; } + sync_stream(res, get_cuda_stream(res)); +} /** * @} diff --git a/cpp/include/raft/core/resource/managed_memory_resource.hpp b/cpp/include/raft/core/resource/managed_memory_resource.hpp new file mode 100644 index 0000000000..34c2f61a9e --- /dev/null +++ b/cpp/include/raft/core/resource/managed_memory_resource.hpp @@ -0,0 +1,82 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include +#include + +#include +#include + +#include + +namespace raft::resource { + +/** + * @defgroup managed_memory_resource Managed memory resource + * @{ + */ + +/** + * @brief Factory that creates a device_memory_resource for managed (unified) memory. + * + * Defaults to a lazily initialized static rmm::mr::managed_memory_resource. + */ +class managed_memory_resource_factory : public resource_factory { + public: + explicit managed_memory_resource_factory( + std::shared_ptr mr = {nullptr}) + : mr_{mr ? std::move(mr) : default_resource()} + { + } + + auto get_resource_type() -> resource_type override + { + return resource_type::MANAGED_MEMORY_RESOURCE; + } + auto make_resource() -> resource* override { return new device_memory_resource(mr_); } + + private: + std::shared_ptr mr_; + + static auto default_resource() -> std::shared_ptr + { + static auto result = std::make_shared(); + return result; + } +}; + +/** + * @brief Get the managed memory resource from a resources handle. + * + * The default is a static rmm::mr::managed_memory_resource. + * + * @param res raft resources object + * @return pointer to the managed rmm::mr::device_memory_resource + */ +inline auto get_managed_memory_resource(resources const& res) -> rmm::mr::device_memory_resource* +{ + if (!res.has_resource_factory(resource_type::MANAGED_MEMORY_RESOURCE)) { + res.add_resource_factory(std::make_shared()); + } + return res.get_resource(resource_type::MANAGED_MEMORY_RESOURCE); +} + +/** + * @brief Set the managed memory resource on a resources handle. + * + * @param res raft resources object + * @param mr the managed memory resource to use + */ +inline void set_managed_memory_resource(resources const& res, + std::shared_ptr mr) +{ + res.add_resource_factory(std::make_shared(std::move(mr))); +} + +/** @} */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/pinned_memory_resource.hpp b/cpp/include/raft/core/resource/pinned_memory_resource.hpp new file mode 100644 index 0000000000..7fe0e33760 --- /dev/null +++ b/cpp/include/raft/core/resource/pinned_memory_resource.hpp @@ -0,0 +1,96 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include +#include + +#include +#include + +namespace raft::resource { + +/** + * @defgroup pinned_memory_resource Pinned memory resource + * @{ + */ + +/** + * @brief Resource that holds a std::pmr::memory_resource for host allocations. + */ +class host_memory_resource : public resource { + public: + explicit host_memory_resource(std::shared_ptr mr) : mr_(std::move(mr)) + { + } + ~host_memory_resource() override = default; + + auto get_resource() -> void* override { return mr_.get(); } + + private: + std::shared_ptr mr_; +}; + +/** + * @brief Factory that creates a host_memory_resource. + * + * Defaults to a lazily initialized static pinned memory resource + * (cudaMallocHost/cudaFreeHost). + */ +class pinned_memory_resource_factory : public resource_factory { + public: + explicit pinned_memory_resource_factory(std::shared_ptr mr = {nullptr}) + : mr_{mr ? std::move(mr) : default_resource()} + { + } + + auto get_resource_type() -> resource_type override + { + return resource_type::PINNED_MEMORY_RESOURCE; + } + auto make_resource() -> resource* override { return new host_memory_resource(mr_); } + + private: + std::shared_ptr mr_; + + static auto default_resource() -> std::shared_ptr + { + static auto result = std::make_shared(); + return result; + } +}; + +/** + * @brief Get the pinned memory resource from a resources handle. + * + * The default is a static pinned_memory_resource backed by cudaMallocHost/cudaFreeHost. + * + * @param res raft resources object + * @return pointer to the pinned std::pmr::memory_resource + */ +inline auto get_pinned_memory_resource(resources const& res) -> std::pmr::memory_resource* +{ + if (!res.has_resource_factory(resource_type::PINNED_MEMORY_RESOURCE)) { + res.add_resource_factory(std::make_shared()); + } + return res.get_resource(resource_type::PINNED_MEMORY_RESOURCE); +} + +/** + * @brief Set the pinned memory resource on a resources handle. + * + * @param res raft resources object + * @param mr the pinned memory resource to use + */ +inline void set_pinned_memory_resource(resources const& res, + std::shared_ptr mr) +{ + res.add_resource_factory(std::make_shared(std::move(mr))); +} + +/** @} */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index f1ae719d19..6f91e3d626 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -39,6 +39,8 @@ enum resource_type { ROOT_RANK, // root rank in multi-gpu world MULTI_GPU, // resource that tracks resource of each device in multi-gpu world DRY_RUN_FLAG, // boolean flag indicating dry-run mode + PINNED_MEMORY_RESOURCE, // std::pmr memory resource for pinned (page-locked) host allocations + MANAGED_MEMORY_RESOURCE, // rmm device memory resource for managed (unified) allocations LAST_KEY // reserved for the last key }; diff --git a/cpp/include/raft/pmr/pinned_memory_resource.hpp b/cpp/include/raft/pmr/pinned_memory_resource.hpp new file mode 100644 index 0000000000..460d8e03e0 --- /dev/null +++ b/cpp/include/raft/pmr/pinned_memory_resource.hpp @@ -0,0 +1,50 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include + +#include + +#include +#include + +namespace raft::pmr { + +/** + * @brief A std::pmr::memory_resource backed by cudaMallocHost / cudaFreeHost. + * + * This provides CUDA-pinned (page-locked) host memory through the polymorphic + * memory resource interface. It can be used with host_container_policy to + * create pinned mdarrays that are compatible with dry-run tracking. + */ +class pinned_memory_resource : public std::pmr::memory_resource { + public: + pinned_memory_resource() noexcept = default; + ~pinned_memory_resource() noexcept override = default; + + protected: + void* do_allocate(std::size_t bytes, std::size_t /*alignment*/) override + { + if (bytes == 0) { return nullptr; } + void* ptr = nullptr; + RAFT_CUDA_TRY(cudaMallocHost(&ptr, bytes)); + return ptr; + } + + void do_deallocate(void* ptr, std::size_t /*bytes*/, std::size_t /*alignment*/) noexcept override + { + if (ptr == nullptr) { return; } + RAFT_CUDA_TRY_NO_THROW(cudaFreeHost(ptr)); + } + + bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override + { + return dynamic_cast(&other) != nullptr; + } +}; + +} // namespace raft::pmr diff --git a/cpp/include/raft/util/dry_run_memory_resource.hpp b/cpp/include/raft/util/dry_run_memory_resource.hpp index 4b309fd10b..997f072593 100644 --- a/cpp/include/raft/util/dry_run_memory_resource.hpp +++ b/cpp/include/raft/util/dry_run_memory_resource.hpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include @@ -34,7 +36,9 @@ struct dry_run_stats { std::size_t device_workspace_peak; ///< Peak device workspace bytes std::size_t device_large_workspace_peak; ///< Peak device large workspace bytes std::size_t device_global_peak; ///< Peak device global allocation bytes - std::size_t host_peak; ///< Peak host allocation bytes + std::size_t device_managed_peak; ///< Peak device managed allocation bytes + std::size_t host_peak; ///< Peak host (default pmr) allocation bytes + std::size_t host_pinned_peak; ///< Peak host pinned allocation bytes }; /** @@ -230,6 +234,9 @@ class dry_run_host_memory_resource : public std::pmr::memory_resource { * On construction, saves all current memory resource state and replaces it with * dry-run resources. On destruction, restores all original resources. * + * Global resources (rmm device, std::pmr host) are replaced globally. + * Handle-local resources (workspace, pinned, managed) are replaced only on the handle. + * * This class only manages resources; the action to be dry-run is executed * separately (see dry_run_execute()). */ @@ -241,10 +248,14 @@ class dry_run_resource_manager { */ explicit dry_run_resource_manager(const raft::resources& res) : res_(res) { - // Save original device resource state + // Save original global resource state orig_global_device_mr_ = rmm::mr::get_current_device_resource(); orig_pmr_ = std::pmr::get_default_resource(); + // Save handle-local resources + orig_pinned_mr_ = resource::get_pinned_memory_resource(res); + orig_managed_mr_ = resource::get_managed_memory_resource(res); + // Save workspace settings (use accessors that handle lazy initialization) auto* workspace_mr = resource::get_workspace_resource(res); workspace_limit_ = workspace_mr->get_allocation_limit(); @@ -257,15 +268,18 @@ class dry_run_resource_manager { dry_run_workspace_ = std::make_shared(orig_workspace_upstream_); dry_run_large_workspace_ = std::make_shared(orig_large_workspace_mr_); - dry_run_global_ = std::make_shared(orig_global_device_mr_); - dry_run_host_ = std::make_unique(orig_pmr_); + dry_run_global_ = std::make_shared(orig_global_device_mr_); + dry_run_managed_ = std::make_shared(orig_managed_mr_); + dry_run_host_ = std::make_unique(orig_pmr_); + dry_run_pinned_ = std::make_shared(orig_pinned_mr_); - // Replace global device resource + // Replace global resources rmm::mr::set_current_device_resource(dry_run_global_.get()); - // Replace global host resource std::pmr::set_default_resource(dry_run_host_.get()); - // Replace workspace resources + // Replace handle-local resources + resource::set_pinned_memory_resource(res, dry_run_pinned_); + resource::set_managed_memory_resource(res, dry_run_managed_); resource::set_workspace_resource(res, dry_run_workspace_, workspace_limit_, std::nullopt); resource::set_large_workspace_resource(res, dry_run_large_workspace_); @@ -282,6 +296,12 @@ class dry_run_resource_manager { rmm::mr::set_current_device_resource(orig_global_device_mr_); std::pmr::set_default_resource(orig_pmr_); + // Restore handle-local resources + resource::set_pinned_memory_resource( + res_, std::shared_ptr(orig_pinned_mr_, void_op{})); + resource::set_managed_memory_resource( + res_, std::shared_ptr(orig_managed_mr_, void_op{})); + // Restore workspace resources with original settings. // Use non-owning shared_ptrs (void_op deleter) since lifetime is managed externally. resource::set_workspace_resource( @@ -309,16 +329,22 @@ class dry_run_resource_manager { .device_workspace_peak = dry_run_workspace_->get_peak_bytes(), .device_large_workspace_peak = dry_run_large_workspace_->get_peak_bytes(), .device_global_peak = dry_run_global_->get_peak_bytes(), + .device_managed_peak = dry_run_managed_->get_peak_bytes(), .host_peak = dry_run_host_->get_peak_bytes(), + .host_pinned_peak = dry_run_pinned_->get_peak_bytes(), }; } private: const raft::resources& res_; - // Original resources (saved in constructor) + // Original global resources rmm::mr::device_memory_resource* orig_global_device_mr_{nullptr}; std::pmr::memory_resource* orig_pmr_{nullptr}; + + // Original handle-local resources + std::pmr::memory_resource* orig_pinned_mr_{nullptr}; + rmm::mr::device_memory_resource* orig_managed_mr_{nullptr}; std::optional workspace_limit_; rmm::mr::device_memory_resource* orig_workspace_upstream_{nullptr}; rmm::mr::device_memory_resource* orig_large_workspace_mr_{nullptr}; @@ -327,7 +353,9 @@ class dry_run_resource_manager { std::shared_ptr dry_run_workspace_; std::shared_ptr dry_run_large_workspace_; std::shared_ptr dry_run_global_; + std::shared_ptr dry_run_managed_; std::unique_ptr dry_run_host_; + std::shared_ptr dry_run_pinned_; }; /** diff --git a/cpp/tests/util/dry_run_memory_resource.cpp b/cpp/tests/util/dry_run_memory_resource.cpp index f997ace299..51afefe0b6 100644 --- a/cpp/tests/util/dry_run_memory_resource.cpp +++ b/cpp/tests/util/dry_run_memory_resource.cpp @@ -5,6 +5,8 @@ #include #include +#include +#include #include #include #include @@ -191,6 +193,30 @@ TEST(DryRunResourceManager, RestoresHostResource) EXPECT_EQ(std::pmr::get_default_resource(), original_pmr); } +TEST(DryRunResourceManager, RestoresPinnedResource) +{ + raft::resources res; + auto* original_pinned = resource::get_pinned_memory_resource(res); + { + dry_run_resource_manager manager(res); + auto* current_pinned = resource::get_pinned_memory_resource(res); + EXPECT_NE(current_pinned, original_pinned); + } + EXPECT_EQ(resource::get_pinned_memory_resource(res), original_pinned); +} + +TEST(DryRunResourceManager, RestoresManagedResource) +{ + raft::resources res; + auto* original_managed = resource::get_managed_memory_resource(res); + { + dry_run_resource_manager manager(res); + auto* current_managed = resource::get_managed_memory_resource(res); + EXPECT_NE(current_managed, original_managed); + } + EXPECT_EQ(resource::get_managed_memory_resource(res), original_managed); +} + TEST(DryRunResourceManager, StatsAccuracy) { raft::resources res; @@ -207,6 +233,38 @@ TEST(DryRunResourceManager, StatsAccuracy) EXPECT_EQ(stats.device_global_peak, kAllocSize); } +TEST(DryRunResourceManager, PinnedStatsAccuracy) +{ + raft::resources res; + constexpr std::size_t kAllocSize = 64UL * 1024UL * 1024UL; // 64 MiB + + dry_run_resource_manager manager(res); + + // Allocate from pinned resource in the handle + auto* mr = resource::get_pinned_memory_resource(res); + void* ptr = mr->allocate(kAllocSize); + mr->deallocate(ptr, kAllocSize); + + auto stats = manager.get_stats(); + EXPECT_EQ(stats.host_pinned_peak, kAllocSize); +} + +TEST(DryRunResourceManager, ManagedStatsAccuracy) +{ + raft::resources res; + constexpr std::size_t kAllocSize = 64UL * 1024UL * 1024UL; // 64 MiB + + dry_run_resource_manager manager(res); + + // Allocate from managed resource in the handle + auto* mr = resource::get_managed_memory_resource(res); + void* ptr = mr->allocate(rmm::cuda_stream_view{}, kAllocSize); + mr->deallocate(rmm::cuda_stream_view{}, ptr, kAllocSize); + + auto stats = manager.get_stats(); + EXPECT_EQ(stats.device_managed_peak, kAllocSize); +} + // ===== dry_run_execute tests ===== TEST(DryRunExecute, BasicExecution) @@ -233,16 +291,21 @@ TEST(DryRunExecute, BasicExecution) TEST(DryRunExecute, ExceptionSafety) { raft::resources res; - auto* original_mr = rmm::mr::get_current_device_resource(); - auto* original_pmr = std::pmr::get_default_resource(); + auto* original_mr = rmm::mr::get_current_device_resource(); + auto* original_pmr = std::pmr::get_default_resource(); + auto* original_pinned = resource::get_pinned_memory_resource(res); + auto* original_managed = resource::get_managed_memory_resource(res); EXPECT_THROW(dry_run_execute( res, [](raft::resources const&) { throw std::runtime_error("test exception"); }), std::runtime_error); - // Resources should be restored even after exception + // Global resources should be restored even after exception EXPECT_EQ(rmm::mr::get_current_device_resource(), original_mr); EXPECT_EQ(std::pmr::get_default_resource(), original_pmr); + // Handle-local resources should be restored even after exception + EXPECT_EQ(resource::get_pinned_memory_resource(res), original_pinned); + EXPECT_EQ(resource::get_managed_memory_resource(res), original_managed); EXPECT_FALSE(resource::get_dry_run_flag(res)); } From 1a114f6e11e5ad8737ff16decb6e35f6e31579ca Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 19 Feb 2026 14:57:36 +0100 Subject: [PATCH 06/62] Dry run compliance for raft::matrix namespace --- cpp/include/raft/matrix/argmax.cuh | 4 +- cpp/include/raft/matrix/argmin.cuh | 4 +- cpp/include/raft/matrix/col_wise_sort.cuh | 24 +++++++--- .../raft/matrix/detail/columnWiseSort.cuh | 9 +++- cpp/include/raft/matrix/detail/gather.cuh | 15 +++++-- .../raft/matrix/detail/gather_inplace.cuh | 9 ++-- .../raft/matrix/detail/scatter_inplace.cuh | 9 ++-- .../raft/matrix/detail/select_k-inl.cuh | 3 ++ .../raft/matrix/detail/select_radix.cuh | 39 ++++++++++++---- .../raft/matrix/detail/select_warpsort.cuh | 17 ++++--- cpp/include/raft/matrix/detail/shift.cuh | 6 ++- cpp/include/raft/matrix/diagonal.cuh | 7 ++- cpp/include/raft/matrix/gather.cuh | 7 ++- cpp/include/raft/matrix/init.cuh | 5 ++- cpp/include/raft/matrix/matrix.cuh | 5 ++- cpp/include/raft/matrix/norm.cuh | 4 +- cpp/include/raft/matrix/power.cuh | 7 ++- cpp/include/raft/matrix/print.cuh | 4 +- cpp/include/raft/matrix/reciprocal.cuh | 5 ++- cpp/include/raft/matrix/reverse.cuh | 5 ++- cpp/include/raft/matrix/sign_flip.cuh | 4 +- cpp/include/raft/matrix/slice.cuh | 4 +- cpp/include/raft/matrix/sqrt.cuh | 5 +++ cpp/include/raft/matrix/threshold.cuh | 5 ++- cpp/include/raft/matrix/triangular.cuh | 4 +- cpp/include/raft/random/detail/rng_impl.cuh | 26 ++++++++++- .../random/detail/rng_impl_deprecated.cuh | 5 ++- cpp/include/raft/random/rng.cuh | 45 +++++++++++++++++-- cpp/tests/matrix/sample_rows.cu | 41 ++++++++++++++++- 29 files changed, 273 insertions(+), 54 deletions(-) diff --git a/cpp/include/raft/matrix/argmax.cuh b/cpp/include/raft/matrix/argmax.cuh index 36a8999b64..caa477fa8e 100644 --- a/cpp/include/raft/matrix/argmax.cuh +++ b/cpp/include/raft/matrix/argmax.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -7,6 +7,7 @@ #include #include +#include #include namespace raft::matrix { @@ -27,6 +28,7 @@ void argmax(raft::resources const& handle, raft::device_matrix_view in, raft::device_vector_view out) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(out.extent(0) == in.extent(0), "Size of output vector must equal number of rows in input matrix."); detail::argmax(in.data_handle(), diff --git a/cpp/include/raft/matrix/argmin.cuh b/cpp/include/raft/matrix/argmin.cuh index a168d3969a..9531b6a426 100644 --- a/cpp/include/raft/matrix/argmin.cuh +++ b/cpp/include/raft/matrix/argmin.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -7,6 +7,7 @@ #include #include +#include #include namespace raft::matrix { @@ -27,6 +28,7 @@ void argmin(raft::resources const& handle, raft::device_matrix_view in, raft::device_vector_view out) { + if (resource::get_dry_run_flag(handle)) { return; } RAFT_EXPECTS(out.extent(0) == in.extent(0), "Size of output vector must equal number of rows in input matrix."); detail::argmin(in.data_handle(), diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh index 0347797a4c..7e5d95f3eb 100644 --- a/cpp/include/raft/matrix/col_wise_sort.cuh +++ b/cpp/include/raft/matrix/col_wise_sort.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #ifndef __COL_WISE_SORT_H @@ -10,6 +10,7 @@ #include #include #include +#include #include namespace raft::matrix { @@ -38,8 +39,16 @@ void sort_cols_per_row(const InType* in, cudaStream_t stream, InType* sortedKeys = nullptr) { - detail::sortColumnsPerRow( - in, out, n_rows, n_columns, bAllocWorkspace, workspacePtr, workspaceSize, stream, sortedKeys); + detail::sortColumnsPerRow(false, + in, + out, + n_rows, + n_columns, + bAllocWorkspace, + workspacePtr, + workspaceSize, + stream, + sortedKeys); } /** @@ -78,12 +87,14 @@ void sort_cols_per_row(raft::resources const& handle, "Input and `sorted_keys` matrices must have the same shape."); } + bool dry_run = resource::get_dry_run_flag(handle); size_t workspace_size = 0; bool alloc_workspace = false; in_t* keys = sorted_keys.has_value() ? sorted_keys.value().data_handle() : nullptr; - detail::sortColumnsPerRow(in.data_handle(), + detail::sortColumnsPerRow(dry_run, + in.data_handle(), out.data_handle(), in.extent(0), in.extent(1), @@ -96,7 +107,10 @@ void sort_cols_per_row(raft::resources const& handle, if (alloc_workspace) { auto workspace = raft::make_device_vector(handle, workspace_size); - detail::sortColumnsPerRow(in.data_handle(), + if (dry_run) { return; } + + detail::sortColumnsPerRow(dry_run, + in.data_handle(), out.data_handle(), in.extent(0), in.extent(1), diff --git a/cpp/include/raft/matrix/detail/columnWiseSort.cuh b/cpp/include/raft/matrix/detail/columnWiseSort.cuh index a8f654557d..a36e9ee4da 100644 --- a/cpp/include/raft/matrix/detail/columnWiseSort.cuh +++ b/cpp/include/raft/matrix/detail/columnWiseSort.cuh @@ -163,7 +163,8 @@ cudaError_t layoutSortOffset(T* in, T value, int n_times, cudaStream_t stream) * @param sortedKeys: Optional, output matrix for sorted keys (input) */ template -void sortColumnsPerRow(const InType* in, +void sortColumnsPerRow(bool dry_run, + const InType* in, OutType* out, int n_rows, int n_columns, @@ -203,6 +204,8 @@ void sortColumnsPerRow(const InType* in, // more elements per thread --> more register pressure // 512(blockSize) * 8 elements per thread = 71 register / thread + if (dry_run) { return; } + // instantiate some kernel combinations if (n_columns <= 512) INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 128, 4, stream); @@ -250,6 +253,8 @@ void sortColumnsPerRow(const InType* in, // for segment offsets workspaceSize += raft::alignTo(sizeof(int) * (size_t)numSegments, memAlignWidth); } else { + if (dry_run) { return; } + size_t workspaceOffset = 0; if (!sortedKeys) { @@ -301,6 +306,8 @@ void sortColumnsPerRow(const InType* in, workspaceSize += raft::alignTo(sizeof(OutType) * (size_t)n_columns, memAlignWidth); } else { + if (dry_run) { return; } + size_t workspaceOffset = 0; bool userKeyOutputBuffer = true; diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh index 26128074e1..fb9fbb5978 100644 --- a/cpp/include/raft/matrix/detail/gather.cuh +++ b/cpp/include/raft/matrix/detail/gather.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -550,13 +551,15 @@ void gather(raft::resources const& res, device_vector_view indices, raft::device_matrix_view output) { + auto dry_run = resource::get_dry_run_flag(res); raft::common::nvtx::range fun_scope("gather"); IdxT n_dim = output.extent(1); IdxT n_train = output.extent(0); auto indices_host = raft::make_host_vector(n_train); - raft::copy( - indices_host.data_handle(), indices.data_handle(), n_train, resource::get_cuda_stream(res)); - resource::sync_stream(res); + if (!dry_run) { + raft::copy( + indices_host.data_handle(), indices.data_handle(), n_train, resource::get_cuda_stream(res)); + } const size_t buffer_size = 32768 * 1024; // bytes const size_t max_batch_size = @@ -568,6 +571,10 @@ void gather(raft::resources const& res, auto out_tmp1 = raft::make_pinned_matrix(res, max_batch_size, n_dim); auto out_tmp2 = raft::make_pinned_matrix(res, max_batch_size, n_dim); + if (dry_run) { return; } + + resource::sync_stream(res); + // Usually a limited number of threads provide sufficient bandwidth for gathering data. #if defined(_OPENMP) int n_threads = std::min(omp_get_max_threads(), 32); diff --git a/cpp/include/raft/matrix/detail/gather_inplace.cuh b/cpp/include/raft/matrix/detail/gather_inplace.cuh index cdc13df020..5835be2e01 100644 --- a/cpp/include/raft/matrix/detail/gather_inplace.cuh +++ b/cpp/include/raft/matrix/detail/gather_inplace.cuh @@ -1,10 +1,11 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include +#include #include #include #include @@ -38,12 +39,14 @@ void gatherInplaceImpl(raft::resources const& handle, // re-assign batch_size for default case if (batch_size == 0 || batch_size > n) batch_size = n; + auto scratch_space = raft::make_device_vector(handle, map_length * batch_size); + + if (resource::get_dry_run_flag(handle)) { return; } + auto exec_policy = resource::get_thrust_policy(handle); IndexT n_batches = raft::ceildiv(n, batch_size); - auto scratch_space = raft::make_device_vector(handle, map_length * batch_size); - for (IndexT bid = 0; bid < n_batches; bid++) { IndexT batch_offset = bid * batch_size; IndexT cols_per_batch = min(batch_size, n - batch_offset); diff --git a/cpp/include/raft/matrix/detail/scatter_inplace.cuh b/cpp/include/raft/matrix/detail/scatter_inplace.cuh index 16a4a254b1..38341eb9c8 100644 --- a/cpp/include/raft/matrix/detail/scatter_inplace.cuh +++ b/cpp/include/raft/matrix/detail/scatter_inplace.cuh @@ -1,10 +1,11 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once #include +#include #include #include #include @@ -63,12 +64,14 @@ void scatterInplaceImpl( // re-assign batch_size for default case if (batch_size == 0 || batch_size > n) batch_size = n; + auto scratch_space = raft::make_device_vector(handle, m * batch_size); + + if (resource::get_dry_run_flag(handle)) { return; } + auto exec_policy = resource::get_thrust_policy(handle); IndexT n_batches = raft::ceildiv(n, batch_size); - auto scratch_space = raft::make_device_vector(handle, m * batch_size); - for (IndexT bid = 0; bid < n_batches; bid++) { IndexT batch_offset = bid * batch_size; IndexT cols_per_batch = min(batch_size, n - batch_offset); diff --git a/cpp/include/raft/matrix/detail/select_k-inl.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh index a16933f97b..cf638516fd 100644 --- a/cpp/include/raft/matrix/detail/select_k-inl.cuh +++ b/cpp/include/raft/matrix/detail/select_k-inl.cuh @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -125,6 +126,8 @@ void segmented_sort_by_key(raft::resources const& handle, auto d_temp_storage = raft::make_device_mdarray( handle, mr, raft::make_extents(temp_storage_bytes)); + if (resource::get_dry_run_flag(handle)) { return; } + if (asc) { // Run sorting operation cub::DeviceSegmentedRadixSort::SortPairs((void*)d_temp_storage.data_handle(), diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh index f28b2acc57..ce11a51175 100644 --- a/cpp/include/raft/matrix/detail/select_radix.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -871,7 +872,8 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt) } template -void radix_topk(const T* in, +void radix_topk(bool dry_run, + const T* in, const IdxT* in_idx, int batch_size, IdxT len, @@ -905,6 +907,8 @@ void radix_topk(const T* in, rmm::device_buffer bufs(max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr); + if (dry_run) { return; } + for (size_t offset = 0; offset < static_cast(batch_size); offset += max_chunk_size) { int chunk_size = std::min(max_chunk_size, batch_size - offset); RAFT_CUDA_TRY( @@ -1151,7 +1155,8 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in, // used. It's used when len is relatively small or when the number of blocks per row calculated by // `calc_grid_dim()` is 1. template -void radix_topk_one_block(const T* in, +void radix_topk_one_block(bool dry_run, + const T* in, const IdxT* in_idx, int batch_size, IdxT len, @@ -1173,6 +1178,8 @@ void radix_topk_one_block(const T* in, rmm::device_buffer bufs(max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr); + if (dry_run) { return; } + for (size_t offset = 0; offset < static_cast(batch_size); offset += max_chunk_size) { int chunk_size = std::min(max_chunk_size, batch_size - offset); const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr; @@ -1270,9 +1277,11 @@ void select_k(raft::resources const& res, RAFT_EXPECTS(!(!len_or_indptr && (len_i == nullptr)), "When `len_or_indptr` is false, `len_i` must not be nullptr!"); - auto stream = resource::get_cuda_stream(res); - auto mr = resource::get_workspace_resource(res); + bool dry_run = resource::get_dry_run_flag(res); + auto stream = resource::get_cuda_stream(res); + auto mr = resource::get_workspace_resource(res); if (k == len && len_or_indptr) { + if (dry_run) { return; } RAFT_CUDA_TRY( cudaMemcpyAsync(out, in, sizeof(T) * batch_size * len, cudaMemcpyDeviceToDevice, stream)); if (in_idx) { @@ -1292,15 +1301,27 @@ void select_k(raft::resources const& res, if (len <= BlockSize * items_per_thread) { impl::radix_topk_one_block( - in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr); + dry_run, in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr); } else { unsigned grid_dim = impl::calc_grid_dim(batch_size, len, sm_cnt); if (grid_dim == 1) { - impl::radix_topk_one_block( - in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr); + impl::radix_topk_one_block(dry_run, + in, + in_idx, + batch_size, + len, + k, + out, + out_idx, + select_min, + len_i, + sm_cnt, + stream, + mr); } else { - impl::radix_topk(in, + impl::radix_topk(dry_run, + in, in_idx, batch_size, len, diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh index 1393a44073..5d9a20c8aa 100644 --- a/cpp/include/raft/matrix/detail/select_warpsort.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -1035,7 +1036,8 @@ void calc_launch_parameter(raft::resources const& res, } template