Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 94 additions & 8 deletions unified-runtime/source/adapters/level_zero/v2/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,30 @@ void *ur_discrete_buffer_handle_t::allocateOnDevice(ur_device_handle_t hDevice,
return ptr;
}

// Returns the allocation that backs this buffer on hDevice, creating it from
// the context's default USM pool the first time it is requested.
//
// Unlike allocateOnDevice, this function does not touch
// activeAllocationDevice; the caller is responsible for updating it at the
// correct point in the migration flow.
//
// hDevice: device whose allocation is requested. hDevice->Id.value() is used
//          to index deviceAllocations.
// size:    number of bytes to allocate when no allocation exists yet for
//          hDevice; unused when one is already present.
// Returns the device pointer, which remains owned by deviceAllocations.
// Allocation failures are reported through UR_CALL_THROWS.
void *ur_discrete_buffer_handle_t::ensureDeviceAlloc(ur_device_handle_t hDevice,
                                                     size_t size) {
  // Bind the per-device slot once instead of indexing the container twice.
  auto &slot = deviceAllocations[hDevice->Id.value()];
  if (void *existing = slot.get()) {
    return existing;
  }

  // Initialize the out-pointer so it is never observed in an indeterminate
  // state, matching how the staging buffer pointer is handled in getDevicePtr.
  void *devPtr = nullptr;
  UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
      hContext, hDevice, nullptr, UR_USM_TYPE_DEVICE, size, &devPtr));

  // The deleter keeps its own copy of the context handle and uses a parameter
  // name distinct from the local pointer to avoid shadowing.
  slot = usm_unique_ptr_t(devPtr, [hContext = this->hContext](void *mem) {
    auto ret = hContext->getDefaultUSMPool()->free(mem);
    if (ret != UR_RESULT_SUCCESS) {
      UR_LOG(ERR, "Failed to free device memory: {}", ret);
    }
  });

  return devPtr;
}

ur_result_t
ur_discrete_buffer_handle_t::migrateBufferTo(ur_device_handle_t hDevice,
void *src, size_t size) {
Expand Down Expand Up @@ -340,8 +364,8 @@ void *ur_discrete_buffer_handle_t::getActiveDeviceAlloc(size_t offset) {

void *ur_discrete_buffer_handle_t::getDevicePtr(
ur_device_handle_t hDevice, device_access_mode_t /*access*/, size_t offset,
size_t /*size*/, ze_command_list_handle_t /*cmdList*/,
wait_list_view & /*waitListView*/) {
size_t /*size*/, ze_command_list_handle_t cmdList,
wait_list_view &waitListView) {
TRACK_SCOPE_LATENCY("ur_discrete_buffer_handle_t::getDevicePtr");

if (!activeAllocationDevice) {
Expand All @@ -366,12 +390,74 @@ void *ur_discrete_buffer_handle_t::getDevicePtr(
activeAllocationDevice) != p2pDevices.end();

if (!p2pAccessible) {
// TODO: migrate buffer through the host
UR_LOG(WARN,
"p2p is not accessible: requesting device ptr:{} cannot access "
"allocation on device ptr:{}",
(void *)hDevice, (void *)activeAllocationDevice);
throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
// Allocate a USM HOST staging buffer for the migration.
auto bufferSize = getSize();
void *hostBuf = nullptr;
UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
hContext, nullptr, nullptr, UR_USM_TYPE_HOST, bufferSize, &hostBuf));
usm_unique_ptr_t hostBufPtr(
hostBuf, [hContext = this->hContext](void *ptr) {
auto ret = hContext->getDefaultUSMPool()->free(ptr);
if (ret != UR_RESULT_SUCCESS) {
UR_LOG(ERR, "Failed to free migration staging buffer: {}", ret);
}
});

if (cmdList) {
// Order the migration relative to both the explicit wait events and any
// in-flight work already on the destination command list, then drain it
// so the host can safely read from the source device.
if (waitListView.num > 0) {
ZE2UR_CALL_THROWS(zeCommandListAppendWaitOnEvents,
(cmdList, waitListView.num, waitListView.handles));
}
ZE2UR_CALL_THROWS(zeCommandListHostSynchronize, (cmdList, UINT64_MAX));
waitListView.clear();

// The synchronization above guarantees any previous async staging copy
// on this command list has completed; release the old staging buffer now
// to prevent unbounded memory growth across repeated migrations.
migrationStagingBuffer.reset();

// The destination device's command list cannot access source device
// memory (P2P is not available), so use the source device's own
// synchronous command list for the device->host copy.
UR_CALL_THROWS(synchronousZeCopy(hContext, activeAllocationDevice,
hostBuf, getActiveDeviceAlloc(),
bufferSize));

// Use ensureDeviceAlloc instead of allocateOnDevice: the latter has a
// side-effect of setting activeAllocationDevice = hDevice immediately,
// before the async copy is enqueued. activeAllocationDevice must only
// be updated after the copy is successfully scheduled (see below).
void *dstDevPtr = ensureDeviceAlloc(hDevice, bufferSize);

// Host memory is accessible by all devices; enqueue the host->dest
// copy on the provided command list to keep the destination side async.
ZE2UR_CALL_THROWS(
zeCommandListAppendMemoryCopy,
(cmdList, dstDevPtr, hostBuf, bufferSize, nullptr, 0, nullptr));

// Keep exactly one staging buffer alive; it is released at the start
// of the next migration (after zeCommandListHostSynchronize) or when
// the buffer is released.
migrationStagingBuffer = std::move(hostBufPtr);
} else {
// Synchronous fallback when no command list is available.
for (uint32_t i = 0; i < waitListView.num; i++) {
ZE2UR_CALL_THROWS(zeEventHostSynchronize,
(waitListView.handles[i], UINT64_MAX));
}
waitListView.clear();

UR_CALL_THROWS(synchronousZeCopy(hContext, activeAllocationDevice,
hostBuf, getActiveDeviceAlloc(),
bufferSize));
UR_CALL_THROWS(migrateBufferTo(hDevice, hostBuf, bufferSize));
}

activeAllocationDevice = hDevice;
return getActiveDeviceAlloc(offset);
}

// TODO: see if it's better to migrate the memory to the specified device
Expand Down
11 changes: 11 additions & 0 deletions unified-runtime/source/adapters/level_zero/v2/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,19 @@ struct ur_discrete_buffer_handle_t : ur_mem_buffer_t {

std::vector<host_allocation_desc_t> hostAllocations;

// USM HOST staging buffer for the most recent async host-based migration
// (when P2P is not accessible). Released at the start of the next migration
// after zeCommandListHostSynchronize guarantees the previous async copy has
// completed, or when the buffer is released. Only one buffer is kept at a
// time, preventing unbounded memory growth for long-lived buffers.
usm_unique_ptr_t migrationStagingBuffer;

void *getActiveDeviceAlloc(size_t offset = 0);
void *allocateOnDevice(ur_device_handle_t hDevice, size_t size);
// Ensures a device allocation exists for hDevice and returns its pointer.
// Unlike allocateOnDevice, does NOT update activeAllocationDevice, so it
// is safe to call before the data migration is complete.
void *ensureDeviceAlloc(ur_device_handle_t hDevice, size_t size);
ur_result_t migrateBufferTo(ur_device_handle_t hDevice, void *src,
size_t size);
};
Expand Down
6 changes: 6 additions & 0 deletions unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ add_l0_v2_devices_test(memory_residency
${PROJECT_SOURCE_DIR}/source/adapters/level_zero/ur_level_zero.cpp
)

# Registers the discrete_buffer_host_migration test, built from its own source
# file plus the Level Zero adapter's common.cpp and ur_level_zero.cpp.
# NOTE(review): target naming and link settings come from
# add_l0_v2_devices_test, which is defined elsewhere - confirm there.
add_l0_v2_devices_test(discrete_buffer_host_migration
discrete_buffer_host_migration.cpp
${PROJECT_SOURCE_DIR}/source/adapters/level_zero/common.cpp
${PROJECT_SOURCE_DIR}/source/adapters/level_zero/ur_level_zero.cpp
)

add_l0_v2_devices_test(batched_queue
batched_queue_test.cpp
${PROJECT_SOURCE_DIR}/source/adapters/level_zero/adapter.cpp
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Part of the LLVM Project, under the Apache License v2.0 with LLVM
// Exceptions. See https://llvm.org/LICENSE.txt for license information.
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Level Zero v2 adapter regression tests for discrete-buffer host migration
// (see urEnqueueMemBufferMultiDeviceMigration.cpp).

// RUN: %with-v2 ./discrete_buffer_host_migration-test
// REQUIRES: v2

#include "../../../conformance/enqueue/urEnqueueMemBufferMultiDeviceMigration.cpp"
27 changes: 12 additions & 15 deletions unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -546,15 +546,20 @@ TEST_P(urMemoryMultiResidencyTest, p2pReadFailsAfterRevokingAccess) {
}

// Verify that a USM allocation on devices[0] is NOT made resident on
// devices[1] when P2P access has not been enabled. The feature under test
// restricts residency, not hardware access: Level Zero hardware can still
// transfer data cross-device via the interconnect regardless of residency
// state, so the copy result is not checked here. The observable guarantee
// is that devices[1] free memory must not decrease by a full allocSize,
// proving the allocation was never pinned on the peer device.
// devices[1] when P2P access has not been enabled. This test runs after
// several P2P enable/disable cycles to confirm that the residency restriction
// is still enforced once P2P is turned back off.
//
// The memory check is done immediately after urUSMDeviceAlloc, without
// creating a queue or issuing any GPU work. Waiting for GPU operations to
// complete (e.g. urQueueFinish) introduces a timing window during which
// background activity — async driver cleanup from earlier tests, other
// concurrent GPU workloads on shared CI hardware — can change the free-memory
// reading on devices[1] and cause spurious failures. The allocation step
// alone is sufficient to trigger any peer-residency side-effects, so
// measuring immediately after it keeps the window as short as possible.
TEST_P(urMemoryMultiResidencyTest, allocationNotResidentOnPeerWithoutP2P) {
constexpr size_t allocSize = kAllocSize;
static constexpr uint8_t fillPattern = 0xAB;

uint64_t initialMemFreePeer = 0;
ASSERT_SUCCESS(urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE,
Expand All @@ -571,14 +576,6 @@ TEST_P(urMemoryMultiResidencyTest, allocationNotResidentOnPeerWithoutP2P) {
ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr,
allocSize, &srcPtr));

ur_queue_handle_t srcQueue = nullptr;
ASSERT_SUCCESS(urQueueCreate(context, devices[0], nullptr, &srcQueue));
ASSERT_SUCCESS(urEnqueueUSMFill(srcQueue, srcPtr, sizeof(fillPattern),
&fillPattern, allocSize, 0, nullptr,
nullptr));
ASSERT_SUCCESS(urQueueFinish(srcQueue));
urQueueRelease(srcQueue);

uint64_t currentMemFreePeer = 0;
ur_result_t memRes =
urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE,
Expand Down
1 change: 1 addition & 0 deletions unified-runtime/test/conformance/enqueue/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ add_conformance_kernels_test(enqueue
urEnqueueMemBufferCopy.cpp
urEnqueueMemBufferFill.cpp
urEnqueueMemBufferMap.cpp
urEnqueueMemBufferMultiDeviceMigration.cpp
urEnqueueMemBufferRead.cpp
urEnqueueMemBufferReadRect.cpp
urEnqueueMemBufferWrite.cpp
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// Part of the LLVM Project, under the Apache License v2.0 with LLVM
// Exceptions. See https://llvm.org/LICENSE.txt for license information.
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Multi-device buffer tests that stress host-staged migration when a discrete
// buffer is accessed from different devices/queues (for example when device
// peer access is not available). Corresponds to L0 v2 discrete-buffer
// getDevicePtr migration ordering.
//
// The tests cover two migration paths inside getDevicePtr:
// - Async path (cmdList != nullptr): triggered by urEnqueueMem* operations.
// - Sync fallback (cmdList == nullptr): triggered by urMemGetNativeHandle.

#include <uur/fixtures.h>
#include <vector>

// Reuse the multi-device buffer/queue fixture under a suite-specific name.
// The tests in this file use the members `queues`, `devices`, `buffer`, `size`
// and `count`, which are presumably provided by that fixture - confirm against
// uur::urMultiDeviceMemBufferQueueTest.
using urEnqueueMemBufferMultiDeviceMigrationTest =
uur::urMultiDeviceMemBufferQueueTest;
// Instantiate the parameterized suite so its TEST_P cases are registered.
UUR_INSTANTIATE_PLATFORM_TEST_SUITE(urEnqueueMemBufferMultiDeviceMigrationTest);

// Fills the whole buffer through queues[0], then reads it back through
// queues[1] with a blocking read that waits on the fill event, and checks that
// every element carries the fill pattern.
TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
       AsyncFillThenReadOnSecondQueueWithWait) {
  if (queues.size() < 2) {
    GTEST_SKIP() << "Test requires at least 2 device queues";
  }

  const uint32_t pattern = 0xA5A5A501;

  // Producer side: enqueue the fill on the first queue and keep its event.
  ur_event_handle_t fillDone = nullptr;
  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &pattern,
                                        sizeof(pattern), 0, size, 0, nullptr,
                                        &fillDone));
  ASSERT_NE(fillDone, nullptr);

  // Consumer side: the read on the second queue depends only on the event.
  std::vector<uint32_t> readBack(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
                                        readBack.data(), 1, &fillDone,
                                        nullptr));

  ASSERT_SUCCESS(urEventRelease(fillDone));

  size_t index = 0;
  for (const uint32_t value : readBack) {
    ASSERT_EQ(pattern, value) << "Mismatch at index " << index;
    ++index;
  }
}

// Bounces the buffer between two queues: fill on queues[0] and verify via
// queues[1], then fill on queues[1] and verify via queues[0]. Each read is
// blocking and waits on the event of the preceding fill.
TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
       PingPongFillBetweenTwoDeviceQueues) {
  if (queues.size() < 2) {
    GTEST_SKIP() << "Test requires at least 2 device queues";
  }

  // Leg 1: queues[0] writes, queues[1] reads.
  const uint32_t pattern1 = 0xC001D00u;
  ur_event_handle_t firstFillDone = nullptr;
  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &pattern1,
                                        sizeof(pattern1), 0, size, 0, nullptr,
                                        &firstFillDone));
  ASSERT_NE(firstFillDone, nullptr);

  std::vector<uint32_t> firstRead(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
                                        firstRead.data(), 1, &firstFillDone,
                                        nullptr));
  ASSERT_SUCCESS(urEventRelease(firstFillDone));
  for (const uint32_t value : firstRead) {
    ASSERT_EQ(pattern1, value);
  }

  // Leg 2: roles swapped - queues[1] writes, queues[0] reads.
  const uint32_t pattern2 = 0xD00DAD00u;
  ur_event_handle_t secondFillDone = nullptr;
  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[1], buffer, &pattern2,
                                        sizeof(pattern2), 0, size, 0, nullptr,
                                        &secondFillDone));
  ASSERT_NE(secondFillDone, nullptr);

  std::vector<uint32_t> secondRead(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, true, 0, size,
                                        secondRead.data(), 1, &secondFillDone,
                                        nullptr));
  ASSERT_SUCCESS(urEventRelease(secondFillDone));
  for (const uint32_t value : secondRead) {
    ASSERT_EQ(pattern2, value);
  }
}

// Chains operations across two queues using events only: a fill on queues[0]
// is verified from queues[1], then a host write issued on queues[1] is
// verified from queues[0].
TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
       ChainedAsyncOpsAcrossQueuesWithEvents) {
  if (queues.size() < 2) {
    GTEST_SKIP() << "Test requires at least 2 device queues";
  }

  // Step A: fill on the first queue, read back through the second.
  const uint32_t patternA = 0x11111111u;
  ur_event_handle_t fillDone = nullptr;
  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &patternA,
                                        sizeof(patternA), 0, size, 0, nullptr,
                                        &fillDone));
  ASSERT_NE(fillDone, nullptr);

  std::vector<uint32_t> seenAfterFill(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
                                        seenAfterFill.data(), 1, &fillDone,
                                        nullptr));
  ASSERT_SUCCESS(urEventRelease(fillDone));
  for (const uint32_t value : seenAfterFill) {
    ASSERT_EQ(patternA, value);
  }

  // Step B: overwrite from host memory on the second queue, then read back
  // through the first queue, waiting on the write's event.
  const uint32_t patternB = 0x22222222u;
  std::vector<uint32_t> hostSource(count, patternB);
  ur_event_handle_t writeDone = nullptr;
  ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[1], buffer, true, 0, size,
                                         hostSource.data(), 0, nullptr,
                                         &writeDone));
  ASSERT_NE(writeDone, nullptr);

  std::vector<uint32_t> seenAfterWrite(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, true, 0, size,
                                        seenAfterWrite.data(), 1, &writeDone,
                                        nullptr));
  ASSERT_SUCCESS(urEventRelease(writeDone));
  for (const uint32_t value : seenAfterWrite) {
    ASSERT_EQ(patternB, value);
  }
}

// Covers the synchronous fallback migration in getDevicePtr
// (cmdList == nullptr), which urMemGetNativeHandle triggers.
// Sequence: fill the buffer on device 0 and wait for completion, request the
// buffer's native pointer for device 1 to force a synchronous host-staged
// migration, then read the buffer through device 1's queue and verify it.
TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
       SyncFallbackMigrationViaNativeHandle) {
  if (queues.size() < 2) {
    GTEST_SKIP() << "Test requires at least 2 device queues";
  }

  const uint32_t pattern = 0xDEADBEEFu;

  // No event is requested here; completion is guaranteed by urQueueFinish.
  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &pattern,
                                        sizeof(pattern), 0, size, 0, nullptr,
                                        nullptr));
  ASSERT_SUCCESS(urQueueFinish(queues[0]));

  // Asking for the native handle on the other device calls getDevicePtr with
  // cmdList == nullptr, i.e. the synchronous device->host->device path.
  ur_native_handle_t nativeHandle = 0;
  ASSERT_SUCCESS(urMemGetNativeHandle(buffer, devices[1], &nativeHandle));
  ASSERT_NE(nativeHandle, static_cast<ur_native_handle_t>(0));

  std::vector<uint32_t> readBack(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
                                        readBack.data(), 0, nullptr, nullptr));

  size_t index = 0;
  for (const uint32_t value : readBack) {
    ASSERT_EQ(pattern, value) << "Mismatch at index " << index;
    ++index;
  }
}
Loading