Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions cpp/include/rapidsmpf/memory/pinned_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,25 @@ class PinnedMemoryResource final
/// @brief Sentinel value indicating that pinned host memory is disabled.
static constexpr std::nullopt_t Disabled = std::nullopt;

/**
* @brief Fraction of total host memory per GPU used as the initial pinned pool size
* when no explicit `pinned_initial_pool_size` option is provided.
*
* Applied as: `initial_pool_size = get_host_memory_per_gpu() *
* DEFAULT_INIT_POOL_SIZE_FACTOR`.
*/
static constexpr double DEFAULT_INIT_POOL_SIZE_FACTOR = 0.1;

/**
* @brief Fraction of total host memory per GPU used as the maximum pinned pool size
* when no explicit `pinned_max_pool_size` option is provided.
*
* Applied as: `max_pool_size = get_host_memory_per_gpu() *
* DEFAULT_MAX_POOL_SIZE_FACTOR`. `get_host_memory_per_gpu()` is computed as total
* host memory divided by the number of GPUs visible to the system.
*/
static constexpr double DEFAULT_MAX_POOL_SIZE_FACTOR = 0.8;

/**
* @brief Create a pinned memory resource if the system supports pinned memory.
*
Expand All @@ -118,6 +137,13 @@ class PinnedMemoryResource final
/**
* @brief Construct from configuration options.
*
* Recognized options:
* - `pinned_memory` (bool): enables pinned memory; defaults to `true`.
* - `pinned_initial_pool_size` (nbytes string): initial pool size; defaults to
* `get_host_memory_per_gpu() * DEFAULT_INIT_POOL_SIZE_FACTOR`.
* - `pinned_max_pool_size` (nbytes string or empty): maximum pool size; defaults to
* `get_host_memory_per_gpu() * DEFAULT_MAX_POOL_SIZE_FACTOR`.
*
* @param options Configuration options.
*
* @return A `PinnedMemoryResource` if pinned memory is enabled and supported,
Expand Down
6 changes: 6 additions & 0 deletions cpp/include/rapidsmpf/system_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,11 @@ std::vector<int> get_current_numa_nodes() noexcept;
*/
std::uint64_t get_numa_node_host_memory(int numa_id = get_current_numa_node()) noexcept;

/**
* @brief Get the amount of host memory per GPU.
*
 * Computed as total host memory divided by the number of GPUs visible to the
 * system.
 *
* @return Amount of host memory per GPU in bytes.
*/
std::uint64_t get_host_memory_per_gpu();

} // namespace rapidsmpf
18 changes: 14 additions & 4 deletions cpp/src/memory/pinned_memory_resource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,21 +74,31 @@ std::optional<PinnedMemoryResource> PinnedMemoryResource::from_options(
config::Options options
) {
bool const pinned_memory = options.get<bool>("pinned_memory", [](auto const& s) {
return parse_string<bool>(s.empty() ? "True" : s);
return s.empty() ? true : parse_string<bool>(s);
});

if (pinned_memory && is_pinned_memory_resources_supported()) {
PinnedPoolProperties pool_properties{
.initial_pool_size = options.get<size_t>(
"pinned_initial_pool_size",
[](auto const& s) { return s.empty() ? 0 : parse_nbytes_unsigned(s); }
[](auto const& s) {
return s.empty() ? safe_cast<size_t>(
get_host_memory_per_gpu()
* DEFAULT_INIT_POOL_SIZE_FACTOR
)
: parse_nbytes_unsigned(s);
}
),
.max_pool_size = options.get<std::optional<size_t>>(
"pinned_max_pool_size", [](auto const& s) -> std::optional<size_t> {
auto parsed = parse_optional(s);
const auto parsed = parse_optional(s);
if (parsed.has_value() && !parsed->empty()) {
return parse_nbytes_unsigned(*parsed);
} else {
return safe_cast<size_t>(
get_host_memory_per_gpu() * DEFAULT_MAX_POOL_SIZE_FACTOR
);
}
return std::nullopt;
}
)
};
Expand Down
22 changes: 22 additions & 0 deletions cpp/src/system_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <sched.h>
#include <unistd.h>

#include <cucascade/memory/topology_discovery.hpp>

#include <rapidsmpf/error.hpp>
#include <rapidsmpf/system_info.hpp>

Expand Down Expand Up @@ -79,4 +81,24 @@ std::uint64_t get_numa_node_host_memory([[maybe_unused]] int numa_id) noexcept {
return safe_cast<std::uint64_t>(ret);
}

namespace {
const auto& get_topology() {
static const auto topo = [] {
cucascade::memory::topology_discovery discovery;
RAPIDSMPF_EXPECTS(
discovery.discover(),
"get_host_memory_per_gpu(): failed to discover system topology",
std::runtime_error
);
return discovery;
}();
return topo.get_topology();
}
} // namespace

std::uint64_t get_host_memory_per_gpu() {
    // NOTE(review): on coherent systems (e.g. Grace-Hopper/Grace-Blackwell)
    // with the NVIDIA driver in NUMA mode and HMM enabled, GPU memory can be
    // exposed as host memory, so get_total_host_memory() may over-count —
    // confirm behavior on such platforms.
    std::uint64_t const gpu_count = get_topology().num_gpus;
    // Treat "no GPUs visible" as a single GPU so we never divide by zero.
    return gpu_count == 0 ? get_total_host_memory()
                          : get_total_host_memory() / gpu_count;
}

} // namespace rapidsmpf
1 change: 0 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,6 @@ add_executable(
"${PROJECT_SOURCE_DIR}/src/bootstrap/bootstrap.cpp"
"${PROJECT_SOURCE_DIR}/src/bootstrap/file_backend.cpp"
"${PROJECT_SOURCE_DIR}/src/bootstrap/utils.cpp"
"${PROJECT_SOURCE_DIR}/src/system_info.cpp"
"$<$<BOOL:${RAPIDSMPF_HAVE_SLURM}>:${PROJECT_SOURCE_DIR}/src/bootstrap/slurm_backend.cpp>"
)
set_target_properties(
Expand Down
24 changes: 23 additions & 1 deletion cpp/tests/test_host_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <rapidsmpf/config.hpp>
#include <rapidsmpf/cuda_stream.hpp>
#include <rapidsmpf/memory/pinned_memory_resource.hpp>
#include <rapidsmpf/utils/misc.hpp>
Expand Down Expand Up @@ -265,7 +266,7 @@ std::size_t discover_pinned_pool_actual_size(

} // namespace

TEST(PinnedResourceMaxSize, max_pool_size_limit) {
TEST(PinnedResource, max_pool_size_limit) {
// Ensure CUDA device context is initialized (required for pinned memory pools).
RAPIDSMPF_CUDA_TRY(cudaFree(nullptr));
auto stream = cudf::get_default_stream();
Expand All @@ -292,3 +293,24 @@ TEST(PinnedResourceMaxSize, max_pool_size_limit) {
EXPECT_THROW(alloc_and_dealloc(actual_pool_size + 1), cuda::cuda_error);
stream.synchronize();
}

TEST(PinnedResource, from_default_options) {
    // Constructing from empty options must apply the documented defaults.
    auto mr = rapidsmpf::PinnedMemoryResource::from_options(rapidsmpf::config::Options{});
    if (mr == rapidsmpf::PinnedMemoryResource::Disabled) {
        GTEST_SKIP() << "PinnedMemoryResource is not supported";
    }
    auto const per_gpu = rapidsmpf::get_host_memory_per_gpu();
    auto const expected_initial = rapidsmpf::safe_cast<std::size_t>(
        per_gpu * rapidsmpf::PinnedMemoryResource::DEFAULT_INIT_POOL_SIZE_FACTOR
    );
    auto const expected_max = rapidsmpf::safe_cast<std::size_t>(
        per_gpu * rapidsmpf::PinnedMemoryResource::DEFAULT_MAX_POOL_SIZE_FACTOR
    );
    EXPECT_EQ(mr->properties().initial_pool_size, expected_initial);
    EXPECT_EQ(mr->properties().max_pool_size.value(), expected_max);
}
20 changes: 11 additions & 9 deletions docs/source/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ rapidsmpf::config::Options options{rapidsmpf::config::get_environment_variables(

- **`pinned_memory`**
- **Environment Variable**: `RAPIDSMPF_PINNED_MEMORY`
- **Default**: `false`
- **Default**: `true`
- **Description**: Enables pinned host memory if it is available on the system.
Pinned host memory provides higher bandwidth and lower latency for device-to-host
transfers compared to regular pageable host memory. When enabled, RapidsMPF
Expand All @@ -105,17 +105,19 @@ rapidsmpf::config::Options options{rapidsmpf::config::get_environment_variables(

- **`pinned_initial_pool_size`**
- **Environment Variable**: `RAPIDSMPF_PINNED_INITIAL_POOL_SIZE`
- **Default**: `0`
- **Description**: Initial size (in bytes) of the pinned host memory pool when
`pinned_memory` is enabled. A value of `0` means the pool starts empty and grows
on demand. Accepts byte counts (e.g. `"1GiB"`, `"512MiB"`).
- **Default**: 10% of per-GPU host memory (`get_host_memory_per_gpu() * 0.1`)
- **Description**: Initial size of the pinned host memory pool when `pinned_memory` is
enabled. When unset or empty, the pool is pre-allocated to 10% of total host memory
divided by the number of GPUs in the system. Accepts byte counts
(e.g. `"1GiB"`, `"512MiB"`).

- **`pinned_max_pool_size`**
- **Environment Variable**: `RAPIDSMPF_PINNED_MAX_POOL_SIZE`
- **Default**: `"disabled"`
- **Description**: Maximum size (in bytes) of the pinned host memory pool when
`pinned_memory` is enabled. When unset or empty, the pool is allowed to grow
without an upper bound. Accepts byte counts (e.g. `"4GiB"`, `"2048MiB"`).
- **Default**: 80% of per-GPU host memory (`get_host_memory_per_gpu() * 0.8`)
- **Description**: Maximum size of the pinned host memory pool when `pinned_memory` is
enabled. When unset or empty, the pool is capped at 80% of total host memory divided
by the number of GPUs in the system. Accepts byte counts
(e.g. `"4GiB"`, `"2048MiB"`).

- **`spill_device_limit`**
- **Environment Variable**: `RAPIDSMPF_SPILL_DEVICE_LIMIT`
Expand Down
1 change: 1 addition & 0 deletions python/rapidsmpf/rapidsmpf/utils/system_info.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
def get_total_host_memory() -> int: ...
def get_current_numa_node() -> int: ...
def get_numa_node_host_memory(numa_id: int | None = None) -> int: ...
def get_host_memory_per_gpu() -> int: ...
24 changes: 24 additions & 0 deletions python/rapidsmpf/rapidsmpf/utils/system_info.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ cdef extern from "<rapidsmpf/system_info.hpp>" nogil:
cdef uint64_t cpp_get_numa_node_host_memory \
"rapidsmpf::get_numa_node_host_memory"(int numa_id) noexcept

cdef uint64_t cpp_get_host_memory_per_gpu \
"rapidsmpf::get_host_memory_per_gpu"() except+


def get_total_host_memory():
"""
Expand Down Expand Up @@ -84,3 +87,24 @@ def get_numa_node_host_memory(numa_id = None):
else:
_numa_id = numa_id
return cpp_get_numa_node_host_memory(_numa_id)


def get_host_memory_per_gpu():
    """
    Get the total host memory divided by the number of GPUs in the system.

    Returns the amount of host memory attributed to each GPU, computed as
    total host memory divided by the number of GPUs visible to the system.
    Falls back to total host memory when only one GPU (or none) is present.

    Returns
    -------
    int
        Host memory per GPU in bytes.

    Raises
    ------
    RuntimeError
        If system topology discovery fails.
    """
    return cpp_get_host_memory_per_gpu()
Loading