diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp
--- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp
@@ -251,6 +251,30 @@ void *ur_discrete_buffer_handle_t::allocateOnDevice(ur_device_handle_t hDevice,
   return ptr;
 }
 
+void *ur_discrete_buffer_handle_t::ensureDeviceAlloc(ur_device_handle_t hDevice,
+                                                     size_t size) {
+  auto id = hDevice->Id.value();
+  if (void *existing = deviceAllocations[id].get()) {
+    return existing;
+  }
+
+  // Allocate without touching activeAllocationDevice; the caller is
+  // responsible for updating it at the correct point in the migration flow.
+  void *ptr = nullptr;
+  UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
+      hContext, hDevice, nullptr, UR_USM_TYPE_DEVICE, size, &ptr));
+
+  deviceAllocations[id] =
+      usm_unique_ptr_t(ptr, [hContext = this->hContext](void *ptr) {
+        auto ret = hContext->getDefaultUSMPool()->free(ptr);
+        if (ret != UR_RESULT_SUCCESS) {
+          UR_LOG(ERR, "Failed to free device memory: {}", ret);
+        }
+      });
+
+  return ptr;
+}
+
 ur_result_t
 ur_discrete_buffer_handle_t::migrateBufferTo(ur_device_handle_t hDevice,
                                              void *src, size_t size) {
@@ -340,8 +364,8 @@ void *ur_discrete_buffer_handle_t::getActiveDeviceAlloc(size_t offset) {
 
 void *ur_discrete_buffer_handle_t::getDevicePtr(
     ur_device_handle_t hDevice, device_access_mode_t /*access*/, size_t offset,
-    size_t /*size*/, ze_command_list_handle_t /*cmdList*/,
-    wait_list_view & /*waitListView*/) {
+    size_t /*size*/, ze_command_list_handle_t cmdList,
+    wait_list_view &waitListView) {
   TRACK_SCOPE_LATENCY("ur_discrete_buffer_handle_t::getDevicePtr");
 
   if (!activeAllocationDevice) {
@@ -366,12 +390,74 @@ void *ur_discrete_buffer_handle_t::getDevicePtr(
                                  activeAllocationDevice) != p2pDevices.end();
 
   if (!p2pAccessible) {
-    // TODO: migrate buffer through the host
-    UR_LOG(WARN,
-           "p2p is not accessible: requesting device ptr:{} cannot access "
-           "allocation on device ptr:{}",
-           (void *)hDevice, (void *)activeAllocationDevice);
-    throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    // P2P is unavailable, so migrate through a USM host staging buffer:
+    // source device -> host -> requesting device.
+    auto bufferSize = getSize();
+    void *hostBuf = nullptr;
+    UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
+        hContext, nullptr, nullptr, UR_USM_TYPE_HOST, bufferSize, &hostBuf));
+    // Owns the staging buffer until this scope is left.
+    usm_unique_ptr_t hostBufPtr(
+        hostBuf, [hContext = this->hContext](void *ptr) {
+          auto ret = hContext->getDefaultUSMPool()->free(ptr);
+          if (ret != UR_RESULT_SUCCESS) {
+            UR_LOG(ERR, "Failed to free migration staging buffer: {}", ret);
+          }
+        });
+
+    if (cmdList) {
+      // Order the migration relative to both the explicit wait events and any
+      // in-flight work already on the destination command list, then drain it
+      // so the host can safely read from the source device.
+      // NOTE(review): zeCommandListHostSynchronize is specified for immediate
+      // command lists only; confirm no caller passes a regular command list.
+      if (waitListView.num > 0) {
+        ZE2UR_CALL_THROWS(zeCommandListAppendWaitOnEvents,
+                          (cmdList, waitListView.num, waitListView.handles));
+      }
+      ZE2UR_CALL_THROWS(zeCommandListHostSynchronize, (cmdList, UINT64_MAX));
+      waitListView.clear();
+
+      // The destination device's command list cannot access source device
+      // memory (P2P is not available), so use the source device's own
+      // synchronous command list for the device->host copy.
+      UR_CALL_THROWS(synchronousZeCopy(hContext, activeAllocationDevice,
+                                       hostBuf, getActiveDeviceAlloc(),
+                                       bufferSize));
+
+      // Use ensureDeviceAlloc instead of allocateOnDevice: the latter has a
+      // side-effect of setting activeAllocationDevice = hDevice immediately.
+      // activeAllocationDevice must only be updated once the data has
+      // actually been moved (see below).
+      void *dstDevPtr = ensureDeviceAlloc(hDevice, bufferSize);
+
+      // Host memory is accessible by all devices, so the host->dest copy can
+      // be issued on the provided command list.
+      ZE2UR_CALL_THROWS(
+          zeCommandListAppendMemoryCopy,
+          (cmdList, dstDevPtr, hostBuf, bufferSize, nullptr, 0, nullptr));
+
+      // Drain the copy before hostBufPtr releases the staging buffer. A later
+      // migration may arrive on a different command list, so synchronizing
+      // that list would not prove this copy has finished; deferring the free
+      // until then could release memory the device is still reading.
+      ZE2UR_CALL_THROWS(zeCommandListHostSynchronize, (cmdList, UINT64_MAX));
+    } else {
+      // Synchronous fallback when no command list is available.
+      for (uint32_t i = 0; i < waitListView.num; i++) {
+        ZE2UR_CALL_THROWS(zeEventHostSynchronize,
+                          (waitListView.handles[i], UINT64_MAX));
+      }
+      waitListView.clear();
+
+      UR_CALL_THROWS(synchronousZeCopy(hContext, activeAllocationDevice,
+                                       hostBuf, getActiveDeviceAlloc(),
+                                       bufferSize));
+      UR_CALL_THROWS(migrateBufferTo(hDevice, hostBuf, bufferSize));
+    }
+
+    activeAllocationDevice = hDevice;
+    return getActiveDeviceAlloc(offset);
   }
 
   // TODO: see if it's better to migrate the memory to the specified device
diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.hpp b/unified-runtime/source/adapters/level_zero/v2/memory.hpp
--- a/unified-runtime/source/adapters/level_zero/v2/memory.hpp
+++ b/unified-runtime/source/adapters/level_zero/v2/memory.hpp
@@ -173,6 +173,10 @@ struct ur_discrete_buffer_handle_t : ur_mem_buffer_t {
 
   void *getActiveDeviceAlloc(size_t offset = 0);
   void *allocateOnDevice(ur_device_handle_t hDevice, size_t size);
+  // Ensures a device allocation exists for hDevice and returns its pointer.
+  // Unlike allocateOnDevice, does NOT update activeAllocationDevice, so it
+  // is safe to call before the data migration is complete.
+  void *ensureDeviceAlloc(ur_device_handle_t hDevice, size_t size);
   ur_result_t migrateBufferTo(ur_device_handle_t hDevice, void *src,
                               size_t size);
 };
diff --git a/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt b/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt
index 2e9397c7c5741..233b94ed1735d 100644
--- a/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt
+++ b/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt
@@ -66,6 +66,12 @@ add_l0_v2_devices_test(memory_residency
     ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/ur_level_zero.cpp
 )
 
+add_l0_v2_devices_test(discrete_buffer_host_migration
+    discrete_buffer_host_migration.cpp
+    ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/common.cpp
+    ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/ur_level_zero.cpp
+)
+
 add_l0_v2_devices_test(batched_queue
     batched_queue_test.cpp
     ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/adapter.cpp
diff --git a/unified-runtime/test/adapters/level_zero/v2/discrete_buffer_host_migration.cpp b/unified-runtime/test/adapters/level_zero/v2/discrete_buffer_host_migration.cpp
new file mode 100644
index 0000000000000..66836a1b6c7a8
--- /dev/null
+++ b/unified-runtime/test/adapters/level_zero/v2/discrete_buffer_host_migration.cpp
@@ -0,0 +1,12 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions. See https://llvm.org/LICENSE.txt for license information.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Level Zero v2 adapter regression tests for discrete-buffer host migration
+// (see urEnqueueMemBufferMultiDeviceMigration.cpp).
+
+// RUN: %with-v2 ./discrete_buffer_host_migration-test
+// REQUIRES: v2
+
+#include "../../../conformance/enqueue/urEnqueueMemBufferMultiDeviceMigration.cpp"
diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp
index 83797172a67b9..2f49189ebfbb7 100644
--- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp
+++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp
@@ -546,15 +546,20 @@ TEST_P(urMemoryMultiResidencyTest, p2pReadFailsAfterRevokingAccess) {
 }
 
 // Verify that a USM allocation on devices[0] is NOT made resident on
-// devices[1] when P2P access has not been enabled. The feature under test
-// restricts residency, not hardware access: Level Zero hardware can still
-// transfer data cross-device via the interconnect regardless of residency
-// state, so the copy result is not checked here. The observable guarantee
-// is that devices[1] free memory must not decrease by a full allocSize,
-// proving the allocation was never pinned on the peer device.
+// devices[1] when P2P access has not been enabled. This test runs after
+// several P2P enable/disable cycles to confirm that the residency restriction
+// is still enforced once P2P is turned back off.
+//
+// The memory check is done immediately after urUSMDeviceAlloc, without
+// creating a queue or issuing any GPU work. Waiting for GPU operations to
+// complete (e.g. urQueueFinish) introduces a timing window during which
+// background activity — async driver cleanup from earlier tests, other
+// concurrent GPU workloads on shared CI hardware — can change the free-memory
+// reading on devices[1] and cause spurious failures. The allocation step
+// alone is sufficient to trigger any peer-residency side-effects, so
+// measuring immediately after it keeps the window as short as possible.
 TEST_P(urMemoryMultiResidencyTest, allocationNotResidentOnPeerWithoutP2P) {
   constexpr size_t allocSize = kAllocSize;
-  static constexpr uint8_t fillPattern = 0xAB;
 
   uint64_t initialMemFreePeer = 0;
   ASSERT_SUCCESS(urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE,
@@ -571,14 +576,6 @@ TEST_P(urMemoryMultiResidencyTest, allocationNotResidentOnPeerWithoutP2P) {
   ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr,
                                   allocSize, &srcPtr));
 
-  ur_queue_handle_t srcQueue = nullptr;
-  ASSERT_SUCCESS(urQueueCreate(context, devices[0], nullptr, &srcQueue));
-  ASSERT_SUCCESS(urEnqueueUSMFill(srcQueue, srcPtr, sizeof(fillPattern),
-                                  &fillPattern, allocSize, 0, nullptr,
-                                  nullptr));
-  ASSERT_SUCCESS(urQueueFinish(srcQueue));
-  urQueueRelease(srcQueue);
-
   uint64_t currentMemFreePeer = 0;
   ur_result_t memRes =
       urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE,
diff --git a/unified-runtime/test/conformance/enqueue/CMakeLists.txt b/unified-runtime/test/conformance/enqueue/CMakeLists.txt
index a2471bb108e56..0d8012d271818 100644
--- a/unified-runtime/test/conformance/enqueue/CMakeLists.txt
+++ b/unified-runtime/test/conformance/enqueue/CMakeLists.txt
@@ -14,6 +14,7 @@ add_conformance_kernels_test(enqueue
     urEnqueueMemBufferCopy.cpp
     urEnqueueMemBufferFill.cpp
     urEnqueueMemBufferMap.cpp
+    urEnqueueMemBufferMultiDeviceMigration.cpp
     urEnqueueMemBufferRead.cpp
     urEnqueueMemBufferReadRect.cpp
     urEnqueueMemBufferWrite.cpp
diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueMemBufferMultiDeviceMigration.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueMemBufferMultiDeviceMigration.cpp
new file mode 100644
--- /dev/null
+++ b/unified-runtime/test/conformance/enqueue/urEnqueueMemBufferMultiDeviceMigration.cpp
@@ -0,0 +1,148 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions. See https://llvm.org/LICENSE.txt for license information.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Multi-device buffer tests that stress host-staged migration when a discrete
+// buffer is accessed from different devices/queues (for example when device
+// peer access is not available). Corresponds to L0 v2 discrete-buffer
+// getDevicePtr migration ordering.
+//
+// The tests cover two migration paths inside getDevicePtr:
+//   - Command-list path (cmdList != nullptr): urEnqueueMem* operations.
+//   - Sync fallback (cmdList == nullptr): triggered by urMemGetNativeHandle.
+
+#include <uur/fixtures.h>
+#include <vector>
+
+using urEnqueueMemBufferMultiDeviceMigrationTest =
+    uur::urMultiDeviceMemBufferQueueTest;
+UUR_INSTANTIATE_PLATFORM_TEST_SUITE(urEnqueueMemBufferMultiDeviceMigrationTest);
+
+TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
+       AsyncFillThenReadOnSecondQueueWithWait) {
+  if (queues.size() < 2) {
+    GTEST_SKIP() << "Test requires at least 2 device queues";
+  }
+
+  const uint32_t pattern = 0xA5A5A501;
+  ur_event_handle_t fillEv = nullptr;
+  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &pattern,
+                                        sizeof(pattern), 0, size, 0, nullptr,
+                                        &fillEv));
+  ASSERT_NE(fillEv, nullptr);
+
+  std::vector<uint32_t> output(count, 0);
+  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
+                                        output.data(), 1, &fillEv, nullptr));
+
+  ASSERT_SUCCESS(urEventRelease(fillEv));
+
+  for (size_t i = 0; i < count; ++i) {
+    ASSERT_EQ(pattern, output[i]) << "Mismatch at index " << i;
+  }
+}
+
+TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
+       PingPongFillBetweenTwoDeviceQueues) {
+  if (queues.size() < 2) {
+    GTEST_SKIP() << "Test requires at least 2 device queues";
+  }
+
+  const uint32_t pattern1 = 0xC001D00u;
+  ur_event_handle_t evFill1 = nullptr;
+  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &pattern1,
+                                        sizeof(pattern1), 0, size, 0, nullptr,
+                                        &evFill1));
+  ASSERT_NE(evFill1, nullptr);
+
+  std::vector<uint32_t> stage1(count, 0);
+  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
+                                        stage1.data(), 1, &evFill1, nullptr));
+  ASSERT_SUCCESS(urEventRelease(evFill1));
+  for (size_t i = 0; i < count; ++i) {
+    ASSERT_EQ(pattern1, stage1[i]);
+  }
+
+  const uint32_t pattern2 = 0xD00DAD00u;
+  ur_event_handle_t evFill2 = nullptr;
+  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[1], buffer, &pattern2,
+                                        sizeof(pattern2), 0, size, 0, nullptr,
+                                        &evFill2));
+  ASSERT_NE(evFill2, nullptr);
+
+  std::vector<uint32_t> stage2(count, 0);
+  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, true, 0, size,
+                                        stage2.data(), 1, &evFill2, nullptr));
+  ASSERT_SUCCESS(urEventRelease(evFill2));
+  for (size_t i = 0; i < count; ++i) {
+    ASSERT_EQ(pattern2, stage2[i]);
+  }
+}
+
+TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
+       ChainedAsyncOpsAcrossQueuesWithEvents) {
+  if (queues.size() < 2) {
+    GTEST_SKIP() << "Test requires at least 2 device queues";
+  }
+
+  const uint32_t patternA = 0x11111111u;
+  ur_event_handle_t evFill = nullptr;
+  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &patternA,
+                                        sizeof(patternA), 0, size, 0, nullptr,
+                                        &evFill));
+  ASSERT_NE(evFill, nullptr);
+
+  std::vector<uint32_t> verifyA(count, 0);
+  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
+                                        verifyA.data(), 1, &evFill, nullptr));
+  ASSERT_SUCCESS(urEventRelease(evFill));
+  for (size_t i = 0; i < count; ++i) {
+    ASSERT_EQ(patternA, verifyA[i]);
+  }
+
+  const uint32_t patternB = 0x22222222u;
+  std::vector<uint32_t> hostB(count, patternB);
+  ur_event_handle_t evWrite = nullptr;
+  ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[1], buffer, true, 0, size,
+                                         hostB.data(), 0, nullptr, &evWrite));
+  ASSERT_NE(evWrite, nullptr);
+
+  std::vector<uint32_t> verifyB(count, 0);
+  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, true, 0, size,
+                                        verifyB.data(), 1, &evWrite, nullptr));
+  ASSERT_SUCCESS(urEventRelease(evWrite));
+  for (size_t i = 0; i < count; ++i) {
+    ASSERT_EQ(patternB, verifyB[i]);
+  }
+}
+
+// Exercise the synchronous fallback migration path in getDevicePtr
+// (cmdList == nullptr), which is triggered by urMemGetNativeHandle.
+// Fill the buffer on device 0, then request its native pointer on device 1 to
+// force a synchronous host-staged migration, then verify the data on device 1.
+TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
+       SyncFallbackMigrationViaNativeHandle) {
+  if (queues.size() < 2) {
+    GTEST_SKIP() << "Test requires at least 2 device queues";
+  }
+
+  const uint32_t pattern = 0xDEADBEEFu;
+  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &pattern,
+                                        sizeof(pattern), 0, size, 0, nullptr,
+                                        nullptr));
+  ASSERT_SUCCESS(urQueueFinish(queues[0]));
+
+  // urMemGetNativeHandle calls getDevicePtr with cmdList == nullptr,
+  // triggering the synchronous device->host->device migration path.
+  ur_native_handle_t nativePtr = 0;
+  ASSERT_SUCCESS(urMemGetNativeHandle(buffer, devices[1], &nativePtr));
+  ASSERT_NE(nativePtr, (ur_native_handle_t)0);
+
+  std::vector<uint32_t> output(count, 0);
+  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
+                                        output.data(), 0, nullptr, nullptr));
+  for (size_t i = 0; i < count; ++i) {
+    ASSERT_EQ(pattern, output[i]) << "Mismatch at index " << i;
+  }
+}