From a2012e04dc6a1032a1ebe298df2bbb854fde46da Mon Sep 17 00:00:00 2001 From: Vinod Tipparaju Date: Fri, 16 Jul 2021 16:17:08 +0530 Subject: [PATCH] Add pageFault migration support for immediate cmdlist submission via flushTask. Move logic for makeResident & pageFault migration to command place for re-use. Signed-off-by: Vinod Tipparaju --- level_zero/core/source/cmdlist/cmdlist.cpp | 29 ++++ level_zero/core/source/cmdlist/cmdlist.h | 3 + .../source/cmdlist/cmdlist_hw_immediate.inl | 24 +++- .../core/source/cmdqueue/cmdqueue_hw.inl | 23 +--- .../sources/context/test_context.cpp | 126 ++++++++++++++++++ .../sources/debugger/test_l0_debugger.cpp | 46 +++++++ 6 files changed, 231 insertions(+), 20 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.cpp b/level_zero/core/source/cmdlist/cmdlist.cpp index 263c75aeac..e16f8b9b90 100644 --- a/level_zero/core/source/cmdlist/cmdlist.cpp +++ b/level_zero/core/source/cmdlist/cmdlist.cpp @@ -7,10 +7,13 @@ #include "level_zero/core/source/cmdlist/cmdlist.h" +#include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/command_stream/preemption.h" #include "shared/source/device/device_info.h" #include "shared/source/memory_manager/memory_manager.h" +#include "level_zero/core/source/device/device_imp.h" + namespace L0 { CommandList::~CommandList() { @@ -124,4 +127,30 @@ NEO::PreemptionMode CommandList::obtainFunctionPreemptionMode(Kernel *kernel) { return NEO::PreemptionHelper::taskPreemptionMode(device->getDevicePreemptionMode(), flags); } +void CommandList::makeResidentAndMigrate(bool performMigration) { + for (auto alloc : commandContainer.getResidencyContainer()) { + if (csr->getResidencyAllocations().end() == + std::find(csr->getResidencyAllocations().begin(), csr->getResidencyAllocations().end(), alloc)) { + csr->makeResident(*alloc); + + if (performMigration && + (alloc->getAllocationType() == NEO::GraphicsAllocation::AllocationType::SVM_GPU || + alloc->getAllocationType() == NEO::GraphicsAllocation::AllocationType::SVM_CPU)) { + auto pageFaultManager = device->getDriverHandle()->getMemoryManager()->getPageFaultManager(); + pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast(alloc->getGpuAddress())); + } + } + } +} + +void CommandList::migrateSharedAllocations() { + auto deviceImp = static_cast(device); + DriverHandleImp *driverHandleImp = static_cast(deviceImp->getDriverHandle()); + std::lock_guard lock(driverHandleImp->sharedMakeResidentAllocationsLock); + auto pageFaultManager = device->getDriverHandle()->getMemoryManager()->getPageFaultManager(); + for (auto alloc : driverHandleImp->sharedMakeResidentAllocations) { + pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast(alloc.second->getGpuAddress())); + } +} + } // namespace L0 diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 518e62f97d..21f39bcf7d 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -248,6 +248,9 @@ struct CommandList : _ze_command_list_handle_t { uint32_t threadArbitrationPolicy = NEO::ThreadArbitrationPolicy::RoundRobin; bool isFlushTaskSubmissionEnabled = false; + void makeResidentAndMigrate(bool); + void migrateSharedAllocations(); + protected: std::map hostPtrMap; NEO::EngineGroupType engineGroupType; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 2686d456f4..bf08804752 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -65,6 +65,20 @@ ze_result_t CommandListCoreFamilyImmediate::executeCommandListImm this->csr->setRequiredScratchSizes(this->getCommandListPerThreadScratchSize(), this->getCommandListPerThreadScratchSize()); + if (performMigration) { + auto deviceImp = static_cast(this->device); + auto pageFaultManager = deviceImp->getDriverHandle()->getMemoryManager()->getPageFaultManager(); + if (pageFaultManager == nullptr) { + performMigration = false; + } + } + + this->makeResidentAndMigrate(performMigration); + + if (performMigration) { + this->migrateSharedAllocations(); + } + auto completionStamp = this->csr->flushTask( *commandStream, commandStreamStart, @@ -319,9 +333,17 @@ ze_result_t CommandListCoreFamilyImmediate::appendEventReset(ze_e template ze_result_t CommandListCoreFamilyImmediate::appendPageFaultCopy(NEO::GraphicsAllocation *dstptr, NEO::GraphicsAllocation *srcptr, size_t size, bool flushHost) { + if (this->isFlushTaskSubmissionEnabled) { + checkAvailableSpace(); + } + auto ret = CommandListCoreFamily::appendPageFaultCopy(dstptr, srcptr, size, flushHost); if (ret == ZE_RESULT_SUCCESS) { - executeCommandListImmediate(false); + if (this->isFlushTaskSubmissionEnabled) { + executeCommandListImmediateWithFlushTask(false); + } else { + executeCommandListImmediate(false); + } } return ret; } diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 1fda1c8d82..54008a8a41 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -359,28 +359,13 @@ ze_result_t CommandQueueHw::executeCommandLists( commandList->getPrintfFunctionContainer().begin(), commandList->getPrintfFunctionContainer().end()); - for (auto alloc : commandList->commandContainer.getResidencyContainer()) { - if (csr->getResidencyAllocations().end() == - std::find(csr->getResidencyAllocations().begin(), csr->getResidencyAllocations().end(), alloc)) { - csr->makeResident(*alloc); - - if (performMigration) { - if (alloc && - (alloc->getAllocationType() == NEO::GraphicsAllocation::AllocationType::SVM_GPU || - alloc->getAllocationType() == NEO::GraphicsAllocation::AllocationType::SVM_CPU)) { - pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast(alloc->getGpuAddress())); - } - } - } - } + commandList->csr = csr; + commandList->makeResidentAndMigrate(performMigration); } if (performMigration) { - DriverHandleImp *driverHandleImp = static_cast(device->getDriverHandle()); - std::lock_guard lock(driverHandleImp->sharedMakeResidentAllocationsLock); - for (auto alloc : driverHandleImp->sharedMakeResidentAllocations) { - pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast(alloc.second->getGpuAddress())); - } + auto commandList = CommandList::fromHandle(phCommandLists[0]); + commandList->migrateSharedAllocations(); } if (stateSipRequired) { diff --git a/level_zero/core/test/unit_tests/sources/context/test_context.cpp b/level_zero/core/test/unit_tests/sources/context/test_context.cpp index 187a41465d..e4669e57b8 100644 --- a/level_zero/core/test/unit_tests/sources/context/test_context.cpp +++ b/level_zero/core/test/unit_tests/sources/context/test_context.cpp @@ -7,9 +7,12 @@ #include "shared/test/common/mocks/mock_command_stream_receiver.h" #include "shared/test/common/mocks/mock_compilers.h" +#include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/unit_test/page_fault_manager/mock_cpu_page_fault_manager.h" +#include "opencl/test/unit_test/mocks/mock_command_queue.h" #include "opencl/test/unit_test/mocks/mock_memory_manager.h" +#include "opencl/test/unit_test/mocks/mock_svm_manager.h" #include "test.h" #include "level_zero/core/source/context/context_imp.h" @@ -396,6 +399,8 @@ struct ContextMakeMemoryResidentAndMigrationTests : public ContextMakeMemoryResi ContextMakeMemoryResidentTests::SetUp(); mockMemoryManager = std::make_unique(); mockPageFaultManager = new MockResidentTestsPageFaultManager; + svmManager = std::make_unique(mockMemoryManager.get(), false); + mockMemoryManager->pageFaultManager.reset(mockPageFaultManager); memoryManager = device->getDriverHandle()->getMemoryManager(); device->getDriverHandle()->setMemoryManager(mockMemoryManager.get()); @@ -420,6 +425,7 @@ struct ContextMakeMemoryResidentAndMigrationTests : public ContextMakeMemoryResi void *ptr = nullptr; std::unique_ptr mockMemoryManager; + std::unique_ptr svmManager; MockResidentTestsPageFaultManager *mockPageFaultManager = nullptr; NEO::MemoryManager *memoryManager = nullptr; }; @@ -526,6 +532,126 @@ HWTEST_F(ContextMakeMemoryResidentAndMigrationTests, context->freeMem(ptr); } +HWTEST_F(ContextMakeMemoryResidentAndMigrationTests, + whenExecutingImmediateCommandListsHavingSharedAllocationWithMigrationThenMemoryFromMakeResidentIsMovedToGpu) { + DriverHandleImp *driverHandleImp = static_cast(hostDriverHandle.get()); + size_t previousSize = driverHandleImp->sharedMakeResidentAllocations.size(); + + EXPECT_CALL(*mockMemoryInterface, makeResident) + .WillRepeatedly(testing::Return(NEO::MemoryOperationsStatus::SUCCESS)); + ze_result_t res = context->makeMemoryResident(device, ptr, size); + EXPECT_EQ(ZE_RESULT_SUCCESS, res); + + size_t currentSize = driverHandleImp->sharedMakeResidentAllocations.size(); + EXPECT_EQ(previousSize + 1, currentSize); + + const ze_command_queue_desc_t desc = {}; + MockCsrHw2 csr(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr.initializeTagAllocation(); + csr.setupContext(*neoDevice->getDefaultEngine().osContext); + + ze_result_t result = ZE_RESULT_SUCCESS; + + DebugManagerStateRestore restorer; + NEO::DebugManager.flags.EnableFlushTaskSubmission.set(true); + + std::unique_ptr commandList0(CommandList::createImmediate(productFamily, + device, + &desc, + false, + NEO::EngineGroupType::RenderCompute, + result)); + ASSERT_NE(nullptr, commandList0); + + void *dst_buffer = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + ze_host_mem_alloc_desc_t hostDesc = {}; + result = context->allocSharedMem(device->toHandle(), &deviceDesc, &hostDesc, 16384u, 4090u, &dst_buffer); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + int one = 1; + result = commandList0->appendMemoryFill(dst_buffer, reinterpret_cast(&one), sizeof(one), 4090u, + nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_EQ(mockPageFaultManager->moveAllocationToGpuDomainCalledTimes, 1u); + EXPECT_EQ(mockPageFaultManager->migratedAddress, ptr); + + EXPECT_CALL(*mockMemoryInterface, evict) + .WillRepeatedly(testing::Return(NEO::MemoryOperationsStatus::SUCCESS)); + res = context->evictMemory(device, ptr, size); + EXPECT_EQ(ZE_RESULT_SUCCESS, res); + + context->freeMem(ptr); + context->freeMem(dst_buffer); +} + +HWTEST_F(ContextMakeMemoryResidentAndMigrationTests, + whenExecutingImmediateCommandListsHavingHostAllocationWithMigrationThenMemoryFromMakeResidentIsMovedToGpu) { + MockCommandQueue cmdQ; + DriverHandleImp *driverHandleImp = static_cast(hostDriverHandle.get()); + size_t previousSize = driverHandleImp->sharedMakeResidentAllocations.size(); + + EXPECT_CALL(*mockMemoryInterface, makeResident) + .WillRepeatedly(testing::Return(NEO::MemoryOperationsStatus::SUCCESS)); + ze_result_t res = context->makeMemoryResident(device, ptr, size); + EXPECT_EQ(ZE_RESULT_SUCCESS, res); + + size_t currentSize = driverHandleImp->sharedMakeResidentAllocations.size(); + EXPECT_EQ(previousSize + 1, currentSize); + + const ze_command_queue_desc_t desc = {}; + MockCsrHw2 csr(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr.initializeTagAllocation(); + csr.setupContext(*neoDevice->getDefaultEngine().osContext); + + ze_result_t result = ZE_RESULT_SUCCESS; + + DebugManagerStateRestore restorer; + NEO::DebugManager.flags.EnableFlushTaskSubmission.set(true); + + std::unique_ptr commandList0(CommandList::createImmediate(productFamily, + device, + &desc, + false, + NEO::EngineGroupType::RenderCompute, + result)); + ASSERT_NE(nullptr, commandList0); + + DebugManagerStateRestore restore; + DebugManager.flags.AllocateSharedAllocationsWithCpuAndGpuStorage.set(true); + + std::set rootDeviceIndices{mockRootDeviceIndex}; + std::map deviceBitfields{{mockRootDeviceIndex, mockDeviceBitfield}}; + + NEO::SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::SHARED_UNIFIED_MEMORY, rootDeviceIndices, deviceBitfields); + auto sharedPtr = svmManager->createSharedUnifiedMemoryAllocation(4096u, unifiedMemoryProperties, &cmdQ); + EXPECT_NE(nullptr, sharedPtr); + + auto allocation = svmManager->getSVMAlloc(sharedPtr); + auto gpuAllocation = allocation->gpuAllocations.getGraphicsAllocation(mockRootDeviceIndex); + + auto &commandContainer = commandList0->commandContainer; + commandContainer.addToResidencyContainer(gpuAllocation); + commandContainer.addToResidencyContainer(allocation->cpuAllocation); + + void *dst_buffer = nullptr; + ze_host_mem_alloc_desc_t hostDesc = {}; + result = context->allocHostMem(&hostDesc, 4096u, 0u, &dst_buffer); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + int one = 1; + result = commandList0->appendMemoryFill(dst_buffer, reinterpret_cast(&one), sizeof(one), 4090u, + nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_EQ(mockPageFaultManager->moveAllocationToGpuDomainCalledTimes, 3u); + + context->freeMem(ptr); + svmManager->freeSVMAlloc(sharedPtr); + context->freeMem(dst_buffer); +} + TEST_F(ContextTest, whenGettingDriverThenDriverIsRetrievedSuccessfully) { ze_context_handle_t hContext; ze_context_desc_t desc; diff --git a/level_zero/core/test/unit_tests/sources/debugger/test_l0_debugger.cpp b/level_zero/core/test/unit_tests/sources/debugger/test_l0_debugger.cpp index 5468d41df0..847b4c6487 100644 --- a/level_zero/core/test/unit_tests/sources/debugger/test_l0_debugger.cpp +++ b/level_zero/core/test/unit_tests/sources/debugger/test_l0_debugger.cpp @@ -1087,6 +1087,52 @@ HWTEST_F(L0DebuggerSimpleTest, givenUseCsrImmediateSubmissionDisabledForRegularC commandQueue->destroy(); } +HWTEST2_F(L0DebuggerSimpleTest, givenUseCsrImmediateSubmissionEnabledCommandListAndAppendPageFaultCopyThenSuccessIsReturned, IsSklOrAbove) { + DebugManagerStateRestore restorer; + NEO::DebugManager.flags.EnableFlushTaskSubmission.set(true); + + size_t size = (sizeof(uint32_t) * 4); + ze_command_queue_desc_t queueDesc = {}; + ze_result_t returnValue = ZE_RESULT_SUCCESS; + auto commandList = CommandList::createImmediate(productFamily, device, &queueDesc, true, NEO::EngineGroupType::RenderCompute, returnValue); + ASSERT_NE(nullptr, commandList); + + NEO::GraphicsAllocation srcPtr(0, NEO::GraphicsAllocation::AllocationType::INTERNAL_HOST_MEMORY, + reinterpret_cast(0x1234), size, 0, sizeof(uint32_t), + MemoryPool::System4KBPages); + NEO::GraphicsAllocation dstPtr(0, NEO::GraphicsAllocation::AllocationType::INTERNAL_HOST_MEMORY, + reinterpret_cast(0x2345), size, 0, sizeof(uint32_t), + MemoryPool::System4KBPages); + + auto result = commandList->appendPageFaultCopy(&dstPtr, &srcPtr, 0x100, false); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + commandList->destroy(); +} + +HWTEST2_F(L0DebuggerSimpleTest, givenUseCsrImmediateSubmissionDisabledCommandListAndAppendPageFaultCopyThenSuccessIsReturned, IsSklOrAbove) { + DebugManagerStateRestore restorer; + NEO::DebugManager.flags.EnableFlushTaskSubmission.set(false); + + size_t size = (sizeof(uint32_t) * 4); + ze_command_queue_desc_t queueDesc = {}; + ze_result_t returnValue = ZE_RESULT_SUCCESS; + auto commandList = CommandList::createImmediate(productFamily, device, &queueDesc, true, NEO::EngineGroupType::RenderCompute, returnValue); + ASSERT_NE(nullptr, commandList); + + NEO::GraphicsAllocation srcPtr(0, NEO::GraphicsAllocation::AllocationType::INTERNAL_HOST_MEMORY, + reinterpret_cast(0x1234), size, 0, sizeof(uint32_t), + MemoryPool::System4KBPages); + NEO::GraphicsAllocation dstPtr(0, NEO::GraphicsAllocation::AllocationType::INTERNAL_HOST_MEMORY, + reinterpret_cast(0x2345), size, 0, sizeof(uint32_t), + MemoryPool::System4KBPages); + + auto result = commandList->appendPageFaultCopy(&dstPtr, &srcPtr, 0x100, false); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + commandList->destroy(); +} + HWTEST_F(L0DebuggerSimpleTest, givenNonZeroGpuVasWhenProgrammingSbaTrackingThenCorrectCmdsAreAddedToStream) { using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; auto debugger = std::make_unique>(neoDevice);