From f165e713a5a583efb5a26d633e23667a44150897 Mon Sep 17 00:00:00 2001 From: Mateusz Jablonski Date: Thu, 12 Jun 2025 15:33:25 +0000 Subject: [PATCH] fix: cache canAccessPeer result of each pair of devices query peer access only once per pair of devices Related-To: NEO-14938 Signed-off-by: Mateusz Jablonski --- level_zero/core/source/device/device_imp.cpp | 65 +++++++++-------- level_zero/core/source/device/device_imp.h | 3 +- .../core/source/driver/driver_handle_imp.h | 5 ++ .../unit_tests/mt_tests/device/CMakeLists.txt | 11 +++ .../mt_tests/device/test_mt_device.cpp | 72 +++++++++++++++++++ 5 files changed, 127 insertions(+), 29 deletions(-) create mode 100644 level_zero/core/test/unit_tests/mt_tests/device/CMakeLists.txt create mode 100644 level_zero/core/test/unit_tests/mt_tests/device/test_mt_device.cpp diff --git a/level_zero/core/source/device/device_imp.cpp b/level_zero/core/source/device/device_imp.cpp index 900cfcfe3f..cc66aefb65 100644 --- a/level_zero/core/source/device/device_imp.cpp +++ b/level_zero/core/source/device/device_imp.cpp @@ -94,11 +94,8 @@ ze_result_t DeviceImp::getStatus() { return ZE_RESULT_SUCCESS; } -ze_result_t DeviceImp::submitCopyForP2P(ze_device_handle_t hPeerDevice, ze_bool_t *value) { - DeviceImp *pPeerDevice = static_cast(Device::fromHandle(hPeerDevice)); - uint32_t peerRootDeviceIndex = pPeerDevice->getNEODevice()->getRootDeviceIndex(); - *value = false; - +bool DeviceImp::submitCopyForP2P(DeviceImp *peerDevice, ze_result_t &ret) { + auto canAccessPeer = false; ze_command_list_handle_t commandList = nullptr; ze_command_list_desc_t listDescriptor = {}; listDescriptor.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC; @@ -116,7 +113,7 @@ ze_result_t DeviceImp::submitCopyForP2P(ze_device_handle_t hPeerDevice, ze_bool_ queueDescriptor.ordinal = 0; queueDescriptor.index = 0; - auto ret = this->createInternalCommandList(&listDescriptor, &commandList); + ret = this->createInternalCommandList(&listDescriptor, &commandList); UNRECOVERABLE_IF(ret != ZE_RESULT_SUCCESS); ret = this->createInternalCommandQueue(&queueDescriptor, &commandQueue); UNRECOVERABLE_IF(ret != ZE_RESULT_SUCCESS); @@ -146,7 +143,7 @@ ze_result_t DeviceImp::submitCopyForP2P(ze_device_handle_t hPeerDevice, ze_bool_ peerDeviceDesc.pNext = nullptr; contextImp->allocDeviceMem(this->toHandle(), &deviceDesc, 8, 1, &memory); - contextImp->allocDeviceMem(hPeerDevice, &peerDeviceDesc, 8, 1, &peerMemory); + contextImp->allocDeviceMem(peerDevice->toHandle(), &peerDeviceDesc, 8, 1, &peerMemory); CmdListMemoryCopyParams memoryCopyParams = {}; ret = L0::CommandList::fromHandle(commandList)->appendMemoryCopy(peerMemory, memory, 8, nullptr, 0, nullptr, memoryCopyParams); @@ -155,12 +152,10 @@ ze_result_t DeviceImp::submitCopyForP2P(ze_device_handle_t hPeerDevice, ze_bool_ if (ret == ZE_RESULT_SUCCESS) { ret = L0::CommandQueue::fromHandle(commandQueue)->executeCommandLists(1, &commandList, nullptr, true, nullptr, nullptr); if (ret == ZE_RESULT_SUCCESS) { - this->crossAccessEnabledDevices[peerRootDeviceIndex] = true; - pPeerDevice->crossAccessEnabledDevices[this->getNEODevice()->getRootDeviceIndex()] = true; ret = L0::CommandQueue::fromHandle(commandQueue)->synchronize(std::numeric_limits::max()); if (ret == ZE_RESULT_SUCCESS) { - *value = true; + canAccessPeer = true; } } } @@ -172,43 +167,57 @@ ze_result_t DeviceImp::submitCopyForP2P(ze_device_handle_t hPeerDevice, ze_bool_ L0::CommandQueue::fromHandle(commandQueue)->destroy(); L0::Context::fromHandle(context)->destroy(); - if (ret == ZE_RESULT_ERROR_DEVICE_LOST) { - return ZE_RESULT_ERROR_DEVICE_LOST; + if (ret != ZE_RESULT_ERROR_DEVICE_LOST) { + ret = ZE_RESULT_SUCCESS; } - return ZE_RESULT_SUCCESS; + return canAccessPeer; } ze_result_t DeviceImp::canAccessPeer(ze_device_handle_t hPeerDevice, ze_bool_t *value) { - *value = false; - DeviceImp *pPeerDevice = static_cast(Device::fromHandle(hPeerDevice)); uint32_t peerRootDeviceIndex = pPeerDevice->getNEODevice()->getRootDeviceIndex(); + auto rootDeviceIndex = getRootDeviceIndex(); + + auto retVal = ZE_RESULT_SUCCESS; if (NEO::debugManager.flags.ForceZeDeviceCanAccessPerReturnValue.get() != -1) { *value = !!NEO::debugManager.flags.ForceZeDeviceCanAccessPerReturnValue.get(); - return ZE_RESULT_SUCCESS; + return retVal; } - if (this->crossAccessEnabledDevices.find(peerRootDeviceIndex) != this->crossAccessEnabledDevices.end()) { - *value = this->crossAccessEnabledDevices[peerRootDeviceIndex]; - return ZE_RESULT_SUCCESS; - } - - if (this->getNEODevice()->getRootDeviceIndex() == peerRootDeviceIndex) { + if (rootDeviceIndex == peerRootDeviceIndex) { *value = true; - return ZE_RESULT_SUCCESS; + return retVal; } + auto lock = static_cast(driverHandle)->obtainPeerAccessQueryLock(); + if (this->crossAccessEnabledDevices.find(peerRootDeviceIndex) == this->crossAccessEnabledDevices.end()) { + retVal = queryPeerAccess(pPeerDevice); + } + *value = this->crossAccessEnabledDevices[peerRootDeviceIndex]; + return retVal; +} + +ze_result_t DeviceImp::queryPeerAccess(DeviceImp *peerDevice) { + auto retVal = ZE_RESULT_SUCCESS; + auto rootDeviceIndex = getRootDeviceIndex(); + auto peerRootDeviceIndex = peerDevice->getRootDeviceIndex(); + + auto setPeerAccess = [&](bool value) { + this->crossAccessEnabledDevices[peerRootDeviceIndex] = value; + peerDevice->crossAccessEnabledDevices[rootDeviceIndex] = value; + }; + uint32_t latency = std::numeric_limits::max(); uint32_t bandwidth = 0; - ze_result_t res = queryFabricStats(pPeerDevice, latency, bandwidth); + ze_result_t res = queryFabricStats(peerDevice, latency, bandwidth); if (res == ZE_RESULT_ERROR_UNSUPPORTED_FEATURE || bandwidth == 0) { - return submitCopyForP2P(hPeerDevice, value); + setPeerAccess(submitCopyForP2P(peerDevice, retVal)); + } else { + setPeerAccess(true); } - - *value = true; - return ZE_RESULT_SUCCESS; + return retVal; } ze_result_t DeviceImp::createCommandList(const ze_command_list_desc_t *desc, diff --git a/level_zero/core/source/device/device_imp.h b/level_zero/core/source/device/device_imp.h index 250e0fe78e..4bab954c54 100644 --- a/level_zero/core/source/device/device_imp.h +++ b/level_zero/core/source/device/device_imp.h @@ -33,7 +33,6 @@ class CacheReservation; struct DeviceImp : public Device, NEO::NonCopyableAndNonMovableClass { DeviceImp(); ze_result_t getStatus() override; - ze_result_t submitCopyForP2P(ze_device_handle_t hPeerDevice, ze_bool_t *value); MOCKABLE_VIRTUAL ze_result_t queryFabricStats(DeviceImp *pPeerDevice, uint32_t &latency, uint32_t &bandwidth); ze_result_t canAccessPeer(ze_device_handle_t hPeerDevice, ze_bool_t *value) override; ze_result_t createCommandList(const ze_command_list_desc_t *desc, @@ -181,6 +180,8 @@ struct DeviceImp : public Device, NEO::NonCopyableAndNonMovableClass { std::optional tryGetCopyEngineOrdinal() const; protected: + ze_result_t queryPeerAccess(DeviceImp *peerDevice); + bool submitCopyForP2P(DeviceImp *hPeerDevice, ze_result_t &result); ze_result_t getGlobalTimestampsUsingSubmission(uint64_t *hostTimestamp, uint64_t *deviceTimestamp); ze_result_t getGlobalTimestampsUsingOsInterface(uint64_t *hostTimestamp, uint64_t *deviceTimestamp); const char *getDeviceMemoryName(); diff --git a/level_zero/core/source/driver/driver_handle_imp.h b/level_zero/core/source/driver/driver_handle_imp.h index 3f3b2cd6e6..01c2a5b2f1 100644 --- a/level_zero/core/source/driver/driver_handle_imp.h +++ b/level_zero/core/source/driver/driver_handle_imp.h @@ -125,6 +125,10 @@ struct DriverHandleImp : public DriverHandle { void initHostUsmAllocPool(); void initDeviceUsmAllocPool(NEO::Device &device); + std::unique_lock obtainPeerAccessQueryLock() { + return std::unique_lock(peerAccessQueryMutex); + } + std::unique_ptr hostPointerManager; std::mutex sharedMakeResidentAllocationsLock; @@ -174,6 +178,7 @@ struct DriverHandleImp : public DriverHandle { // not based on the lifetime of the object of a class. std::unordered_map errorDescs; std::mutex errorDescsMutex; + std::mutex peerAccessQueryMutex; int setErrorDescription(const std::string &str) override; ze_result_t getErrorDescription(const char **ppString) override; ze_result_t clearErrorDescription() override; diff --git a/level_zero/core/test/unit_tests/mt_tests/device/CMakeLists.txt b/level_zero/core/test/unit_tests/mt_tests/device/CMakeLists.txt new file mode 100644 index 0000000000..cc68b1c64a --- /dev/null +++ b/level_zero/core/test/unit_tests/mt_tests/device/CMakeLists.txt @@ -0,0 +1,11 @@ +# +# Copyright (C) 2025 Intel Corporation +# +# SPDX-License-Identifier: MIT +# + +target_sources(${TARGET_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt + ${CMAKE_CURRENT_SOURCE_DIR}/test_mt_device.cpp +) +add_subdirectories() diff --git a/level_zero/core/test/unit_tests/mt_tests/device/test_mt_device.cpp b/level_zero/core/test/unit_tests/mt_tests/device/test_mt_device.cpp new file mode 100644 index 0000000000..249d0ee968 --- /dev/null +++ b/level_zero/core/test/unit_tests/mt_tests/device/test_mt_device.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2025 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_stream/command_stream_receiver.h" +#include "shared/test/common/test_macros/test.h" + +#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" + +namespace L0 { +namespace ult { + +using MultiDeviceMtTest = Test; + +TEST_F(MultiDeviceMtTest, givenTwoDevicesWhenCanAccessPeerIsCalledManyTimesFromMultiThreadsInBothWaysThenPeerAccessIsQueriedOnlyOnce) { + L0::Device *device0 = driverHandle->devices[0]; + L0::Device *device1 = driverHandle->devices[1]; + + auto taskCount0 = device0->getNEODevice()->getInternalEngine().commandStreamReceiver->peekLatestFlushedTaskCount(); + auto taskCount1 = device1->getNEODevice()->getInternalEngine().commandStreamReceiver->peekLatestFlushedTaskCount(); + + EXPECT_EQ(taskCount0, taskCount1); + EXPECT_EQ(0u, taskCount0); + + std::atomic_bool started = false; + constexpr int numThreads = 8; + constexpr int iterationCount = 20; + std::vector threads; + + auto threadBody = [&](int threadId) { + while (!started.load()) { + std::this_thread::yield(); + } + + auto device = device0; + auto peerDevice = device1; + + if (threadId & 1) { + device = device1; + peerDevice = device0; + } + for (auto i = 0; i < iterationCount; i++) { + ze_bool_t canAccess = false; + ze_result_t res = device->canAccessPeer(peerDevice->toHandle(), &canAccess); + EXPECT_EQ(ZE_RESULT_SUCCESS, res); + EXPECT_TRUE(canAccess); + } + }; + + for (int i = 0; i < numThreads; ++i) { + threads.push_back(std::thread(threadBody, i)); + } + + started = true; + + for (auto &thread : threads) { + thread.join(); + } + + taskCount0 = device0->getNEODevice()->getInternalEngine().commandStreamReceiver->peekLatestFlushedTaskCount(); + taskCount1 = device1->getNEODevice()->getInternalEngine().commandStreamReceiver->peekLatestFlushedTaskCount(); + + EXPECT_NE(taskCount0, taskCount1); + + EXPECT_GE(2u, std::max(taskCount0, taskCount1)); + EXPECT_EQ(0u, std::min(taskCount0, taskCount1)); +} +} // namespace ult +} // namespace L0