diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl
index f0367ba4fe..9e0e690a32 100644
--- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl
+++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl
@@ -63,10 +63,7 @@ ze_result_t CommandQueueHw::executeCommandLists(
 
     if (NEO::DebugManager.flags.ForceMemoryPrefetchForKmdMigratedSharedAllocations.get()) {
         auto svmAllocMgr = device->getDriverHandle()->getSvmAllocsManager();
-        for (auto &allocation : svmAllocMgr->getSVMAllocs()->allocations) {
-            NEO::SvmAllocationData allocData = allocation.second;
-            svmAllocMgr->prefetchMemory(*device->getNEODevice(), *csr, allocData);
-        }
+        svmAllocMgr->prefetchSVMAllocs(*device->getNEODevice(), *csr);
     }
 
     if (this->clientId == CommandQueue::clientNotRegistered) {
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index 9f9110a76b..081027bd64 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -16,6 +16,7 @@
 #include "shared/source/helpers/timestamp_packet.h"
 #include "shared/source/memory_manager/internal_allocation_storage.h"
 #include "shared/source/memory_manager/surface.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/source/os_interface/os_context.h"
 #include "shared/source/program/sync_buffer_handler.h"
 #include "shared/source/program/sync_buffer_handler.inl"
@@ -147,6 +148,15 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency,
     TagNodeBase *hwTimeStamps = nullptr;
     CommandStreamReceiver &computeCommandStreamReceiver = getGpgpuCommandStreamReceiver();
 
+    if (NEO::DebugManager.flags.ForceMemoryPrefetchForKmdMigratedSharedAllocations.get()) {
+        // Debug-flag path: prefetch every SVM allocation tracked by the context's manager.
+        // Guard against a context that has no SVMAllocsManager instead of dereferencing null.
+        auto pSvmAllocMgr = this->context->getSVMAllocsManager();
+        if (pSvmAllocMgr != nullptr) {
+            pSvmAllocMgr->prefetchSVMAllocs(this->getDevice(), computeCommandStreamReceiver);
+        }
+    }
+
     EventBuilder eventBuilder;
     setupEvent(eventBuilder, event, commandType);
 
diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
index 8296eac344..7c4aef8b22 100644
--- a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
@@ -11,11 +11,13 @@
 #include "shared/source/helpers/pause_on_gpu_properties.h"
 #include "shared/source/helpers/preamble.h"
 #include "shared/source/memory_manager/allocation_properties.h"
+#include "shared/source/memory_manager/unified_memory_manager.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/common/helpers/kernel_binary_helper.h"
 #include "shared/test/common/helpers/raii_gfx_core_helper.h"
 #include "shared/test/common/helpers/unit_test_helper.h"
 #include "shared/test/common/mocks/mock_csr.h"
+#include "shared/test/common/mocks/mock_memory_manager.h"
 #include "shared/test/common/mocks/mock_submissions_aggregator.h"
 
 #include "opencl/source/api/api.h"
@@ -1682,6 +1684,40 @@ HWTEST_F(EnqueueKernelTest, whenEnqueueKernelWithEngineHintsThenEpilogRequiredIs
     EXPECT_EQ(csr.recordedDispatchFlags.engineHints, 1u);
 }
 
+HWTEST_F(EnqueueKernelTest, GivenForceMemoryPrefetchForKmdMigratedSharedAllocationsWhenEnqueingKernelWithoutSharedAllocationsThenMemoryPrefetchIsNotCalled) {
+    DebugManagerStateRestore stateRestore;
+    DebugManager.flags.UseKmdMigration.set(true);
+    DebugManager.flags.ForceMemoryPrefetchForKmdMigratedSharedAllocations.set(true);
+
+    MockKernelWithInternals mockKernel(*pClDevice);
+    size_t gws[3] = {1, 1, 1};
+
+    pCmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
+
+    auto memoryManager = static_cast<MockMemoryManager *>(context->getMemoryManager());
+    EXPECT_FALSE(memoryManager->setMemPrefetchCalled);
+}
+
+HWTEST_F(EnqueueKernelTest, GivenForceMemoryPrefetchForKmdMigratedSharedAllocationsWhenEnqueingKernelWithSharedAllocationsThenMemoryPrefetchIsCalled) {
+    DebugManagerStateRestore stateRestore;
+    DebugManager.flags.UseKmdMigration.set(true);
+    DebugManager.flags.ForceMemoryPrefetchForKmdMigratedSharedAllocations.set(true);
+
+    SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::SHARED_UNIFIED_MEMORY, context->getRootDeviceIndices(), context->getDeviceBitfields());
+    auto ptr = context->getSVMAllocsManager()->createSharedUnifiedMemoryAllocation(4096u, unifiedMemoryProperties, pCmdQ);
+    ASSERT_NE(nullptr, ptr);
+
+    MockKernelWithInternals mockKernel(*pClDevice);
+    size_t gws[3] = {1, 1, 1};
+
+    pCmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
+
+    auto memoryManager = static_cast<MockMemoryManager *>(context->getMemoryManager());
+    EXPECT_TRUE(memoryManager->setMemPrefetchCalled);
+
+    context->getSVMAllocsManager()->freeSVMAlloc(ptr);
+}
+
 struct PauseOnGpuTests : public EnqueueKernelTest {
     void SetUp() override {
         EnqueueKernelTest::SetUp();
diff --git a/shared/source/memory_manager/unified_memory_manager.cpp b/shared/source/memory_manager/unified_memory_manager.cpp
index ad327727e5..6b0a8226cd 100644
--- a/shared/source/memory_manager/unified_memory_manager.cpp
+++ b/shared/source/memory_manager/unified_memory_manager.cpp
@@ -802,6 +802,19 @@ void SVMAllocsManager::prefetchMemory(Device &device, CommandStreamReceiver &com
     }
 }
 
+// Calls prefetchMemory() for every allocation tracked in SVMAllocs, on behalf of
+// the given device and command stream receiver. A shared lock on mtx is held
+// for the whole walk.
+void SVMAllocsManager::prefetchSVMAllocs(Device &device, CommandStreamReceiver &commandStreamReceiver) {
+    std::shared_lock<std::shared_mutex> lock(mtx);
+    for (auto &allocation : this->SVMAllocs.allocations) {
+        // NOTE(review): bound by reference to avoid copying every SvmAllocationData
+        // on each call; assumes prefetchMemory only reads svmData - confirm.
+        auto &allocData = allocation.second;
+        this->prefetchMemory(device, commandStreamReceiver, allocData);
+    }
+}
+
 std::unique_lock<std::mutex> SVMAllocsManager::obtainOwnership() {
     return std::unique_lock<std::mutex>(mtxForIndirectAccess);
 }
diff --git a/shared/source/memory_manager/unified_memory_manager.h b/shared/source/memory_manager/unified_memory_manager.h
index 18d8a3284a..e9187f48be 100644
--- a/shared/source/memory_manager/unified_memory_manager.h
+++ b/shared/source/memory_manager/unified_memory_manager.h
@@ -197,6 +197,7 @@ class SVMAllocsManager {
     MOCKABLE_VIRTUAL void makeIndirectAllocationsResident(CommandStreamReceiver &commandStreamReceiver, TaskCountType taskCount);
     void prepareIndirectAllocationForDestruction(SvmAllocationData *);
     void prefetchMemory(Device &device, CommandStreamReceiver &commandStreamReceiver, SvmAllocationData &svmData);
+    void prefetchSVMAllocs(Device &device, CommandStreamReceiver &commandStreamReceiver);
     std::unique_lock<std::mutex> obtainOwnership();
 
     std::map<CommandStreamReceiver *, InternalAllocationsTracker> indirectAllocationsResidency;
diff --git a/shared/test/unit_test/memory_manager/unified_memory_manager_tests.cpp b/shared/test/unit_test/memory_manager/unified_memory_manager_tests.cpp
index cbe8f4ca83..76d22779be 100644
--- a/shared/test/unit_test/memory_manager/unified_memory_manager_tests.cpp
+++ b/shared/test/unit_test/memory_manager/unified_memory_manager_tests.cpp
@@ -209,3 +209,29 @@ TEST_F(SVMLocalMemoryAllocatorTest, givenKmdMigratedSharedAllocationWhenPrefetch
 
     svmManager->freeSVMAlloc(ptr);
 }
+
+TEST_F(SVMLocalMemoryAllocatorTest, givenForceMemoryPrefetchForKmdMigratedSharedAllocationsWhenSVMAllocsIsCalledThenPrefetchSharedUnifiedMemoryInSvmAllocsManager) {
+    DebugManagerStateRestore restore;
+    DebugManager.flags.UseKmdMigration.set(1);
+    DebugManager.flags.ForceMemoryPrefetchForKmdMigratedSharedAllocations.set(true);
+
+    std::unique_ptr<UltDeviceFactory> deviceFactory(new UltDeviceFactory(1, 2));
+    auto device = deviceFactory->rootDevices[0];
+    auto svmManager = std::make_unique<MockSVMAllocsManager>(device->getMemoryManager(), false);
+    auto csr = std::make_unique<MockCommandStreamReceiver>(*device->getExecutionEnvironment(), device->getRootDeviceIndex(), device->getDeviceBitfield());
+    csr->setupContext(*device->getDefaultEngine().osContext);
+    void *cmdQ = reinterpret_cast<void *>(0x12345);
+
+    SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::SHARED_UNIFIED_MEMORY, rootDeviceIndices, deviceBitfields);
+
+    auto ptr = svmManager->createSharedUnifiedMemoryAllocation(4096, unifiedMemoryProperties, &cmdQ);
+    ASSERT_NE(nullptr, ptr);
+
+    svmManager->prefetchSVMAllocs(*device, *csr);
+
+    auto mockMemoryManager = static_cast<MockMemoryManager *>(device->getMemoryManager());
+    EXPECT_TRUE(mockMemoryManager->setMemPrefetchCalled);
+    EXPECT_EQ(1u, mockMemoryManager->memPrefetchSubDeviceIds.size());
+
+    svmManager->freeSVMAlloc(ptr);
+}