From b31c3bb3ca81762b20de65674e064223b77f70e6 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Wed, 9 Apr 2025 08:58:37 +0000 Subject: [PATCH] fix: remove doubled operations when executing regular on immediate - immediate flush does not need to perform the steps below for command list execution - handling indirect allocations - performing page fault migration - making unified shared memory resident - performing prefetching when required Related-To: NEO-10356 Signed-off-by: Zbigniew Zdanowicz --- .../source/cmdlist/cmdlist_hw_immediate.inl | 47 ++++++------ .../test/unit_tests/mocks/mock_cmdqueue.h | 6 ++ .../sources/cmdlist/test_cmdlist_6.cpp | 34 +++++++++ .../sources/cmdqueue/test_cmdqueue_3.cpp | 61 ++++++++++++++++ .../sources/context/test_context.cpp | 64 ++++++++++++++++ .../xe_hpc_core/test_cmdlist_xe_hpc_core.cpp | 73 +++++++++++++++++++ .../memory_manager/mock_prefetch_manager.h | 4 +- 7 files changed, 265 insertions(+), 24 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 6f93d1814e..b67725c3f6 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -451,6 +451,8 @@ inline ze_result_t CommandListCoreFamilyImmediate::executeCommand auto csr = cmdQImp->getCsr(); auto lockCSR = outerLock != nullptr ? 
std::move(*outerLock) : csr->obtainUniqueOwnership(); + std::unique_lock lockForIndirect; + if (appendOperation != NEO::AppendOperations::cmdList) { if (NEO::ApiSpecificConfig::isSharedAllocPrefetchEnabled()) { auto svmAllocMgr = this->device->getDriverHandle()->getSvmAllocsManager(); @@ -458,35 +460,34 @@ inline ze_result_t CommandListCoreFamilyImmediate::executeCommand } cmdQ->registerCsrClient(); - } - std::unique_lock lockForIndirect; - if (this->hasIndirectAllocationsAllowed()) { - cmdQ->handleIndirectAllocationResidency(this->getUnifiedMemoryControls(), lockForIndirect, performMigration); - } - - if (performMigration) { - auto deviceImp = static_cast(this->device); - auto pageFaultManager = deviceImp->getDriverHandle()->getMemoryManager()->getPageFaultManager(); - if (pageFaultManager == nullptr) { - performMigration = false; + if (this->hasIndirectAllocationsAllowed()) { + cmdQ->handleIndirectAllocationResidency(this->getUnifiedMemoryControls(), lockForIndirect, performMigration); } - } - cmdQ->makeResidentAndMigrate(performMigration, this->commandContainer.getResidencyContainer()); + if (performMigration) { + auto deviceImp = static_cast(this->device); + auto pageFaultManager = deviceImp->getDriverHandle()->getMemoryManager()->getPageFaultManager(); + if (pageFaultManager == nullptr) { + performMigration = false; + } + } - static_cast *>(this->cmdQImmediate)->patchCommands(*this, 0u, false); + cmdQ->makeResidentAndMigrate(performMigration, this->commandContainer.getResidencyContainer()); - if (performMigration) { - this->migrateSharedAllocations(); - } + if (performMigration) { + this->migrateSharedAllocations(); + } - if (this->performMemoryPrefetch) { - auto prefetchManager = this->device->getDriverHandle()->getMemoryManager()->getPrefetchManager(); - prefetchManager->migrateAllocationsToGpu(this->getPrefetchContext(), - *this->device->getDriverHandle()->getSvmAllocsManager(), - *this->device->getNEODevice(), - *csr); + if (this->performMemoryPrefetch) { 
+ auto prefetchManager = this->device->getDriverHandle()->getMemoryManager()->getPrefetchManager(); + prefetchManager->migrateAllocationsToGpu(this->getPrefetchContext(), + *this->device->getDriverHandle()->getSvmAllocsManager(), + *this->device->getNEODevice(), + *csr); + } + + static_cast *>(this->cmdQImmediate)->patchCommands(*this, 0u, false); } NEO::CompletionStamp completionStamp; diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h index 98e972f772..06ffefbcc5 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h @@ -144,6 +144,11 @@ struct MockCommandQueueHw : public L0::CommandQueueHw { return returnCode; } + void handleIndirectAllocationResidency(UnifiedMemoryControls unifiedMemoryControls, std::unique_lock &lockForIndirect, bool performMigration) override { + handleIndirectAllocationResidencyCalledTimes++; + BaseClass::handleIndirectAllocationResidency(unifiedMemoryControls, lockForIndirect, performMigration); + } + NEO::GraphicsAllocation *recordedGlobalStatelessAllocation = nullptr; NEO::ScratchSpaceController *recordedScratchController = nullptr; uint32_t synchronizedCalled = 0; @@ -151,6 +156,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw { ze_result_t synchronizeReturnValue{ZE_RESULT_SUCCESS}; std::optional reserveLinearStreamSizeReturnValue{}; std::optional submitBatchBufferReturnValue{}; + uint32_t handleIndirectAllocationResidencyCalledTimes = 0; bool recordedLockScratchController = false; }; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp index 83248e5d9c..661673bc99 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp @@ -34,6 +34,7 @@ #include 
"level_zero/core/test/unit_tests/mocks/mock_event.h" #include "level_zero/core/test/unit_tests/mocks/mock_image.h" #include "level_zero/core/test/unit_tests/mocks/mock_kernel.h" +#include "level_zero/core/test/unit_tests/sources/helper/ze_object_utils.h" #include "test_traits_common.h" @@ -1281,6 +1282,39 @@ HWTEST2_F(CommandListTest, givenCmdListWithIndirectAccessWhenExecutingCommandLis commandListImmediate.cmdQImmediate = oldCommandQueue; } +HWTEST2_F(CommandListTest, givenRegularCmdListWithIndirectAccessWhenExecutingRegularOnImmediateCommandListThenHandleIndirectAccessCalled, MatchAny) { + ze_command_queue_desc_t desc = {}; + desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; + + ze_result_t returnValue; + std::unique_ptr commandListImmediate(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::compute, returnValue)); + auto &mockCommandListImmediate = static_cast &>(*commandListImmediate); + + MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield()); + auto mockCmdQHw = makeZeUniquePtr>(device, device->getNEODevice()->getDefaultEngine().commandStreamReceiver, &desc); + mockCmdQHw->initialize(false, false, true); + + auto oldCommandQueue = mockCommandListImmediate.cmdQImmediate; + mockCommandListImmediate.cmdQImmediate = mockCmdQHw.get(); + + std::unique_ptr commandListRegular(CommandList::create(productFamily, + device, + NEO::EngineGroupType::compute, + 0u, + returnValue, false)); + ASSERT_NE(nullptr, commandListRegular); + auto &mockCommandListRegular = static_cast &>(*commandListRegular); + mockCommandListRegular.indirectAllocationsAllowed = true; + commandListRegular->close(); + + auto cmdListHandle = commandListRegular->toHandle(); + returnValue = commandListImmediate->appendCommandLists(1, &cmdListHandle, nullptr, 0, nullptr); + ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue); + + 
EXPECT_EQ(mockCmdQHw->handleIndirectAllocationResidencyCalledTimes, 1u); + mockCommandListImmediate.cmdQImmediate = oldCommandQueue; +} + HWTEST2_F(CommandListTest, givenCmdListWithNoIndirectAccessWhenExecutingCommandListImmediateWithFlushTaskThenHandleIndirectAccessNotCalled, MatchAny) { ze_command_queue_desc_t desc = {}; desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_3.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_3.cpp index f5596b9254..05220f5d30 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_3.cpp @@ -581,6 +581,67 @@ HWTEST_F(CommandQueueIndirectAllocations, givenDeviceThatSupportsSubmittingIndir device->getDriverHandle()->getSvmAllocsManager()->freeSVMAlloc(deviceAlloc); } +HWTEST_F(CommandQueueIndirectAllocations, + givenDeviceThatSupportsSubmittingIndirectAllocationsAsPackWhenIndirectAccessIsUsedWhenExecutingRegularOnImmediateThenWholePackIsMadeResidentWithRegularCommandList) { + DebugManagerStateRestore restorer; + NEO::debugManager.flags.EnableFlushTaskSubmission.set(true); + + MockCsrHw2 csr(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr.initializeTagAllocation(); + csr.setupContext(*neoDevice->getDefaultEngine().osContext); + + ze_result_t returnValue; + ze_command_queue_desc_t desc = {}; + desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; + std::unique_ptr commandListImmediate(CommandList::createImmediate(productFamily, + device, + &desc, + false, + NEO::EngineGroupType::compute, + returnValue)); + ASSERT_NE(nullptr, commandListImmediate); + + std::unique_ptr commandListRegular(CommandList::create(productFamily, + device, + NEO::EngineGroupType::compute, + 0u, + returnValue, false)); + ASSERT_NE(nullptr, commandListRegular); + + void *deviceAlloc = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + auto result = 
context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &deviceAlloc); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + auto gpuAlloc = device->getDriverHandle()->getSvmAllocsManager()->getSVMAllocs()->get(deviceAlloc)->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex()); + ASSERT_NE(nullptr, gpuAlloc); + + createKernel(); + kernel->unifiedMemoryControls.indirectDeviceAllocationsAllowed = true; + kernel->kernelHasIndirectAccess = true; + EXPECT_TRUE(kernel->getUnifiedMemoryControls().indirectDeviceAllocationsAllowed); + + static_cast(driverHandle->getMemoryManager())->overrideAllocateAsPackReturn = 1u; + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + result = commandListRegular->appendLaunchKernel(kernel->toHandle(), + groupCount, + nullptr, 0, nullptr, + launchParams, false); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + commandListRegular->close(); + + auto cmdListHandle = commandListRegular->toHandle(); + result = commandListImmediate->appendCommandLists(1, &cmdListHandle, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_TRUE(gpuAlloc->isResident(csr.getOsContext().getContextId())); + EXPECT_EQ(GraphicsAllocation::objectAlwaysResident, gpuAlloc->getResidencyTaskCount(csr.getOsContext().getContextId())); + + device->getDriverHandle()->getSvmAllocsManager()->freeSVMAlloc(deviceAlloc); +} + HWTEST_F(CommandQueueIndirectAllocations, givenImmediateCommandListAndFlushTaskWithIndirectAllocsAsPackDisabledThenLaunchKernelWorks) { DebugManagerStateRestore restorer; NEO::debugManager.flags.EnableFlushTaskSubmission.set(true); diff --git a/level_zero/core/test/unit_tests/sources/context/test_context.cpp b/level_zero/core/test/unit_tests/sources/context/test_context.cpp index 2640c33b8a..e8d55036ca 100644 --- a/level_zero/core/test/unit_tests/sources/context/test_context.cpp +++ b/level_zero/core/test/unit_tests/sources/context/test_context.cpp @@ -956,6 +956,70 @@ 
HWTEST_F(ContextMakeMemoryResidentAndMigrationTests, context->freeMem(dstBuffer); } +HWTEST_F(ContextMakeMemoryResidentAndMigrationTests, + GivenImmediateCommandListWhenExecutingRegularCommandListsHavingSharedAllocationWithMigrationOnImmediateThenMemoryFromMakeResidentIsMovedToGpuOnce) { + DriverHandleImp *driverHandleImp = static_cast(hostDriverHandle.get()); + size_t previousSize = driverHandleImp->sharedMakeResidentAllocations.size(); + + mockMemoryInterface->makeResidentResult = NEO::MemoryOperationsStatus::success; + + ze_result_t res = context->makeMemoryResident(device, ptr, size); + EXPECT_EQ(ZE_RESULT_SUCCESS, res); + + size_t currentSize = driverHandleImp->sharedMakeResidentAllocations.size(); + EXPECT_EQ(previousSize + 1, currentSize); + + const ze_command_queue_desc_t desc = {}; + MockCsrHw2 csr(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr.initializeTagAllocation(); + csr.setupContext(*neoDevice->getDefaultEngine().osContext); + + ze_result_t result = ZE_RESULT_SUCCESS; + + DebugManagerStateRestore restorer; + NEO::debugManager.flags.EnableFlushTaskSubmission.set(true); + + std::unique_ptr commandListImmediate(CommandList::createImmediate(productFamily, + device, + &desc, + false, + NEO::EngineGroupType::compute, + result)); + ASSERT_NE(nullptr, commandListImmediate); + + void *dstBuffer = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + ze_host_mem_alloc_desc_t hostDesc = {}; + result = context->allocSharedMem(device->toHandle(), &deviceDesc, &hostDesc, 16384u, 4090u, &dstBuffer); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + std::unique_ptr commandListRegular(CommandList::create(productFamily, + device, + NEO::EngineGroupType::compute, + 0u, + result, false)); + + int one = 1; + result = commandListRegular->appendMemoryFill(dstBuffer, reinterpret_cast(&one), sizeof(one), 4090u, + nullptr, 0, nullptr, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + commandListRegular->close(); + + auto cmdListHandle = 
commandListRegular->toHandle(); + result = commandListImmediate->appendCommandLists(1, &cmdListHandle, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_EQ(mockPageFaultManager->moveAllocationToGpuDomainCalledTimes, 1u); + EXPECT_EQ(mockPageFaultManager->migratedAddress, ptr); + + mockMemoryInterface->evictResult = NEO::MemoryOperationsStatus::success; + res = context->evictMemory(device, ptr, size); + EXPECT_EQ(ZE_RESULT_SUCCESS, res); + + context->freeMem(ptr); + context->freeMem(dstBuffer); +} + HWTEST_F(ContextMakeMemoryResidentAndMigrationTests, whenExecutingImmediateCommandListsHavingHostAllocationWithMigrationThenMemoryFromMakeResidentIsMovedToGpu) { DriverHandleImp *driverHandleImp = static_cast(hostDriverHandle.get()); diff --git a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp index 4123788624..87445f83a2 100644 --- a/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp +++ b/level_zero/core/test/unit_tests/xe_hpc_core/test_cmdlist_xe_hpc_core.cpp @@ -646,6 +646,79 @@ HWTEST2_F(CommandListStatePrefetchXeHpcCore, givenAppendMemoryPrefetchForKmdMigr commandQueue->destroy(); } +HWTEST2_F(CommandListStatePrefetchXeHpcCore, givenAppendMemoryPrefetchForKmdMigratedSharedAllocationsSetOnRegularCmdListWhenPrefetchApiIsCalledOnUnifiedSharedMemoryAndRegularExecutedOnImmediateThenCallMigrateAllocationsToGpuOnce, IsXeHpcCore) { + DebugManagerStateRestore restore; + debugManager.flags.UseKmdMigration.set(1); + + neoDevice->deviceBitfield = 0b1000; + + auto memoryManager = static_cast(device->getDriverHandle()->getMemoryManager()); + memoryManager->prefetchManager.reset(new MockPrefetchManager()); + + createKernel(); + ze_result_t returnValue; + ze_command_queue_desc_t queueDesc = {}; + + ze_command_list_handle_t commandListHandle = CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0u, returnValue, 
false)->toHandle(); + auto commandList = CommandList::fromHandle(commandListHandle); + std::unique_ptr commandListImmediate(CommandList::createImmediate(productFamily, + device, + &queueDesc, + false, + NEO::EngineGroupType::compute, + returnValue)); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + + auto eventPool = std::unique_ptr(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue)); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + auto event = std::unique_ptr(Event::create(eventPool.get(), &eventDesc, device)); + + size_t size = 10; + size_t alignment = 1u; + void *ptr = nullptr; + + ze_device_mem_alloc_desc_t deviceDesc = {}; + ze_host_mem_alloc_desc_t hostDesc = {}; + auto result = context->allocSharedMem(device->toHandle(), &deviceDesc, &hostDesc, size, alignment, &ptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_NE(nullptr, ptr); + + result = commandList->appendMemoryPrefetch(ptr, size); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto prefetchManager = static_cast(memoryManager->prefetchManager.get()); + EXPECT_EQ(1u, commandList->getPrefetchContext().allocations.size()); + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + result = commandList->appendLaunchKernel(kernel->toHandle(), groupCount, event->toHandle(), 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + commandList->close(); + + result = commandListImmediate->appendCommandLists(1, &commandListHandle, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_TRUE(memoryManager->setMemPrefetchCalled); + EXPECT_EQ(3u, memoryManager->memPrefetchSubDeviceIds[0]); + + EXPECT_TRUE(prefetchManager->migrateAllocationsToGpuCalled); + EXPECT_EQ(1u, prefetchManager->migrateAllocationsToGpuCalledCount); + EXPECT_EQ(1u, 
commandList->getPrefetchContext().allocations.size()); + + commandList->reset(); + EXPECT_TRUE(prefetchManager->removeAllocationsCalled); + EXPECT_EQ(0u, commandList->getPrefetchContext().allocations.size()); + + context->freeMem(ptr); + commandList->destroy(); +} + HWTEST2_F(CommandListStatePrefetchXeHpcCore, givenAppendMemoryPrefetchForKmdMigratedSharedAllocationsSetWhenPrefetchApiIsCalledForUnifiedSharedMemoryOnCmdListCopyOnlyThenCallMigrateAllocationsToGpu, IsXeHpcCore) { DebugManagerStateRestore restore; debugManager.flags.UseKmdMigration.set(1); diff --git a/shared/test/common/memory_manager/mock_prefetch_manager.h b/shared/test/common/memory_manager/mock_prefetch_manager.h index cc684415b2..a4ff463153 100644 --- a/shared/test/common/memory_manager/mock_prefetch_manager.h +++ b/shared/test/common/memory_manager/mock_prefetch_manager.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2022-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -17,6 +17,7 @@ class MockPrefetchManager : public PrefetchManager { void migrateAllocationsToGpu(PrefetchContext &prefetchContext, SVMAllocsManager &unifiedMemoryManager, Device &device, CommandStreamReceiver &csr) override { PrefetchManager::migrateAllocationsToGpu(prefetchContext, unifiedMemoryManager, device, csr); migrateAllocationsToGpuCalled = true; + migrateAllocationsToGpuCalledCount++; } void removeAllocations(PrefetchContext &prefetchContext) override { @@ -24,6 +25,7 @@ class MockPrefetchManager : public PrefetchManager { removeAllocationsCalled = true; } + uint32_t migrateAllocationsToGpuCalledCount = 0; bool migrateAllocationsToGpuCalled = false; bool removeAllocationsCalled = false; };