From 5807d512b3e6ec0dce7cd91caa5f9b41816b9581 Mon Sep 17 00:00:00 2001 From: Maciej Plewka Date: Thu, 10 Aug 2023 15:40:21 +0000 Subject: [PATCH] fix: Reuse private allocations during cmdList dispatch Related-To: NEO-8201 Signed-off-by: Maciej Plewka --- level_zero/core/source/cmdlist/cmdlist.h | 3 +- level_zero/core/source/cmdlist/cmdlist_hw.h | 3 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 31 +++-- .../source/cmdlist/cmdlist_hw_immediate.h | 1 + .../source/cmdlist/cmdlist_hw_immediate.inl | 9 ++ .../cmdlist/cmdlist_hw_skl_to_tgllp.inl | 2 +- .../cmdlist/cmdlist_hw_xehp_and_later.inl | 3 +- level_zero/core/source/kernel/kernel_imp.cpp | 8 +- level_zero/core/source/kernel/kernel_imp.h | 2 + .../core/test/unit_tests/mocks/mock_cmdlist.h | 34 ++++- .../core/test/unit_tests/mocks/mock_module.h | 1 + .../sources/cmdlist/test_cmdlist_7.cpp | 116 ++++++++++++++++++ .../test_cmdlist_append_launch_kernel_2.cpp | 10 +- .../cmdlist/test_cmdlist_append_memory.cpp | 34 ++--- ...test_cmdlist_copy_event_xehp_and_later.cpp | 16 +-- .../command_stream_receiver.cpp | 7 ++ .../command_stream/command_stream_receiver.h | 3 + .../mocks/mock_command_stream_receiver.h | 1 + .../common/test_macros/mock_method_macros.h | 10 ++ .../command_stream_receiver_tests.cpp | 10 ++ 20 files changed, 256 insertions(+), 48 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 649aa01b97..7c22b88a46 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -21,6 +21,7 @@ #include #include +#include #include struct _ze_command_list_handle_t {}; @@ -355,7 +356,7 @@ struct CommandList : _ze_command_list_handle_t { MOCKABLE_VIRTUAL void synchronizeEventList(uint32_t numWaitEvents, ze_event_handle_t *waitEventList); std::map hostPtrMap; - std::vector ownedPrivateAllocations; + std::unordered_map ownedPrivateAllocations; std::vector patternAllocations; std::vector printfKernelContainer; diff 
--git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 61fe149783..792c12c462 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -309,7 +309,8 @@ struct CommandListCoreFamily : CommandListImp { return (this->pipeControlMultiKernelEventSync && splitKernel) || compactL3FlushEvent(dcFlush); } - void allocateKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread); + MOCKABLE_VIRTUAL void allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, std::unordered_map &privateAllocsToReuse); + virtual void allocateOrReuseKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread); CmdListEventOperation estimateEventPostSync(Event *event, uint32_t operations); void dispatchPostSyncCopy(uint64_t gpuAddress, uint32_t value, bool workloadPartition); void dispatchPostSyncCompute(uint64_t gpuAddress, uint32_t value, bool workloadPartition); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index fef5e84998..6009e0501e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -20,6 +20,7 @@ #include "shared/source/helpers/definitions/command_encoder_args.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/hw_info.h" +#include "shared/source/helpers/kernel_helpers.h" #include "shared/source/helpers/logical_state_helper.h" #include "shared/source/helpers/pipe_control_args.h" #include "shared/source/helpers/preamble.h" @@ -54,6 +55,7 @@ #include "CL/cl.h" #include +#include namespace L0 { @@ -71,8 +73,8 @@ inline ze_result_t parseErrorCode(NEO::CommandContainer::ErrorCode returnValue) template CommandListCoreFamily::~CommandListCoreFamily() { clearCommandsToPatch(); - for (auto alloc : this->ownedPrivateAllocations) { - device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc); + 
for (auto &alloc : this->ownedPrivateAllocations) { + device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc.second); } this->ownedPrivateAllocations.clear(); for (auto &patternAlloc : this->patternAllocations) { @@ -129,8 +131,8 @@ ze_result_t CommandListCoreFamily::reset() { this->returnPoints.clear(); } - for (auto alloc : this->ownedPrivateAllocations) { - device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc); + for (auto &alloc : this->ownedPrivateAllocations) { + device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc.second); } this->ownedPrivateAllocations.clear(); cmdListCurrentStartOffset = 0; @@ -3172,16 +3174,27 @@ ze_result_t CommandListCoreFamily::appendWriteToMemory(void *desc } template -void CommandListCoreFamily::allocateKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread) { +void CommandListCoreFamily::allocateOrReuseKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread) { L0::KernelImp *kernelImp = static_cast(kernel); if (sizePerHwThread != 0U && kernelImp->getParentModule().shouldAllocatePrivateMemoryPerDispatch()) { - auto privateMemoryGraphicsAllocation = kernel->allocatePrivateMemoryGraphicsAllocation(); - kernel->patchCrossthreadDataWithPrivateAllocation(privateMemoryGraphicsAllocation); - this->commandContainer.addToResidencyContainer(privateMemoryGraphicsAllocation); - this->ownedPrivateAllocations.push_back(privateMemoryGraphicsAllocation); + allocateOrReuseKernelPrivateMemory(kernel, sizePerHwThread, ownedPrivateAllocations); } } +template +void CommandListCoreFamily::allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, std::unordered_map &privateAllocsToReuse) { + L0::KernelImp *kernelImp = static_cast(kernel); + NEO::GraphicsAllocation *privateAlloc = nullptr; + + if (privateAllocsToReuse[sizePerHwThread] != nullptr) { + privateAlloc = privateAllocsToReuse[sizePerHwThread]; + } else { + privateAlloc = 
kernelImp->allocatePrivateMemoryGraphicsAllocation(); + privateAllocsToReuse[sizePerHwThread] = privateAlloc; + } + kernelImp->patchAndMoveToResidencyContainerPrivateSurface(privateAlloc); +} + template CmdListEventOperation CommandListCoreFamily::estimateEventPostSync(Event *event, uint32_t operations) { CmdListEventOperation ret; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h index 6158a702d3..9d0e72fb02 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h @@ -191,6 +191,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily::setupFlushMethod(const NEO:: } } +template +void CommandListCoreFamilyImmediate::allocateOrReuseKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread) { + L0::KernelImp *kernelImp = static_cast(kernel); + if (sizePerHwThread != 0U && kernelImp->getParentModule().shouldAllocatePrivateMemoryPerDispatch()) { + auto ownership = this->csr->obtainUniqueOwnership(); + this->allocateOrReuseKernelPrivateMemory(kernel, sizePerHwThread, this->csr->getOwnedPrivateAllocations()); + } +} + } // namespace L0 diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl index e11878a49c..ed38a003b2 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl @@ -91,7 +91,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K kernel->patchGlobalOffset(); - this->allocateKernelPrivateMemoryIfNeeded(kernel, kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize); + this->allocateOrReuseKernelPrivateMemoryIfNeeded(kernel, kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize); if (!launchParams.isIndirect) { kernel->setGroupCount(threadGroupDimensions->groupCountX, diff --git 
a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 3dd00b8bb2..dbe4112b3f 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -152,8 +152,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K auto kernelPreemptionMode = obtainKernelPreemptionMode(kernel); kernel->patchGlobalOffset(); - - this->allocateKernelPrivateMemoryIfNeeded(kernel, kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize); + this->allocateOrReuseKernelPrivateMemoryIfNeeded(kernel, kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize); if (launchParams.isIndirect && threadGroupDimensions) { prepareIndirectParams(threadGroupDimensions); diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index f554a2a8b4..a25f27d52a 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -909,6 +909,11 @@ void KernelImp::setInlineSamplers() { } } +void KernelImp::patchAndMoveToResidencyContainerPrivateSurface(NEO::GraphicsAllocation *alloc) { + this->patchCrossthreadDataWithPrivateAllocation(alloc); + this->residencyContainer.push_back(alloc); +} + ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { this->kernelImmData = module->getKernelImmutableData(desc->pKernelName); if (this->kernelImmData == nullptr) { @@ -1017,8 +1022,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { auto &kernelAttributes = kernelDescriptor.kernelAttributes; if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) { this->privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation(); - this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation); - 
this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation); + this->patchAndMoveToResidencyContainerPrivateSurface(this->privateMemoryGraphicsAllocation); } this->createPrintfBuffer(); diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h index 8e0fa2f0f9..437927589f 100644 --- a/level_zero/core/source/kernel/kernel_imp.h +++ b/level_zero/core/source/kernel/kernel_imp.h @@ -90,6 +90,8 @@ struct KernelImp : Kernel { void setInlineSamplers(); + void patchAndMoveToResidencyContainerPrivateSurface(NEO::GraphicsAllocation *alloc); + ze_result_t initialize(const ze_kernel_desc_t *desc); const uint8_t *getPerThreadData() const override { return perThreadDataForWholeThreadGroup; } diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 871578cd25..674ac6a126 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -15,6 +15,8 @@ #include "level_zero/core/test/unit_tests/mocks/mock_device.h" #include "level_zero/core/test/unit_tests/white_box.h" +#include + namespace NEO { class GraphicsAllocation; } @@ -30,7 +32,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using BaseClass = ::L0::CommandListCoreFamily; using BaseClass::addFlushRequiredCommand; - using BaseClass::allocateKernelPrivateMemoryIfNeeded; + using BaseClass::allocateOrReuseKernelPrivateMemoryIfNeeded; using BaseClass::appendBlitFill; using BaseClass::appendCopyImageBlit; using BaseClass::appendDispatchOffsetRegister; @@ -500,12 +502,14 @@ struct MockCommandList : public CommandList { }; template -class MockAppendMemoryCopy : public CommandListCoreFamily { +class MockCommandListCoreFamily : public CommandListCoreFamily { public: using BaseClass = CommandListCoreFamily; + using BaseClass::allocateOrReuseKernelPrivateMemoryIfNeeded; using 
BaseClass::commandContainer; using BaseClass::dcFlushSupport; using BaseClass::device; + using BaseClass::ownedPrivateAllocations; ADDMETHOD(appendMemoryCopyKernelWithGA, ze_result_t, false, ZE_RESULT_SUCCESS, (void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc, @@ -526,6 +530,19 @@ class MockAppendMemoryCopy : public CommandListCoreFamily { uint64_t srcOffset, uint64_t size)); + ADDMETHOD_VOIDRETURN(allocateOrReuseKernelPrivateMemory, + false, + (L0::Kernel * kernel, + uint32_t sizePerHwThread, + std::unordered_map &privateAllocsToReuse), + (kernel, sizePerHwThread, privateAllocsToReuse)); + + ADDMETHOD_VOIDRETURN(allocateOrReuseKernelPrivateMemoryIfNeeded, + false, + (L0::Kernel * kernel, + uint32_t sizePerHwThread), + (kernel, sizePerHwThread)); + AlignedAllocationData getAlignedAllocationData(L0::Device *device, const void *buffer, uint64_t bufferSize, bool allowHostCopy) override { return L0::CommandListCoreFamily::getAlignedAllocationData(device, buffer, bufferSize, allowHostCopy); } @@ -610,6 +627,19 @@ class MockCommandListImmediateHw : public WhiteBox<::L0::CommandListCoreFamilyIm checkAssertCalled++; } + ADDMETHOD_VOIDRETURN(allocateOrReuseKernelPrivateMemory, + false, + (L0::Kernel * kernel, + uint32_t sizePerHwThread, + std::unordered_map &privateAllocsToReuse), + (kernel, sizePerHwThread, privateAllocsToReuse)); + + ADDMETHOD_VOIDRETURN(allocateOrReuseKernelPrivateMemoryIfNeeded, + false, + (L0::Kernel * kernel, + uint32_t sizePerHwThread), + (kernel, sizePerHwThread)); + uint32_t checkAssertCalled = 0; bool callBaseExecute = false; diff --git a/level_zero/core/test/unit_tests/mocks/mock_module.h b/level_zero/core/test/unit_tests/mocks/mock_module.h index 915e7c1aa0..1a17058325 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_module.h +++ b/level_zero/core/test/unit_tests/mocks/mock_module.h @@ -48,6 +48,7 @@ constexpr inline MockModuleTranslationUnit *toMockPtr(L0::ModuleTranslationUnit template <> struct WhiteBox<::L0::Module> : public 
::L0::ModuleImp { using BaseClass = ::L0::ModuleImp; + using BaseClass::allocatePrivateMemoryPerDispatch; using BaseClass::BaseClass; using BaseClass::builtFromSPIRv; using BaseClass::copyPatchedSegments; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp index 651dc71304..5610ee0457 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp @@ -10,6 +10,7 @@ #include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/helpers/unit_test_helper.h" #include "shared/test/common/libult/ult_command_stream_receiver.h" +#include "shared/test/common/mocks/mock_command_stream_receiver.h" #include "shared/test/common/mocks/mock_ostime.h" #include "shared/test/common/mocks/ult_device_factory.h" #include "shared/test/common/test_macros/hw_test.h" @@ -3189,5 +3190,120 @@ HWTEST2_F(CommandListMappedTimestampTest, givenEventIsAddedToMappedEventListWhen EXPECT_EQ(0u, commandList->peekMappedEventList().size()); } +template +class MockCommandListCoreFamilyIfPrivateNeeded : public BaseMock { + public: + void allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, std::unordered_map &privateAllocsToReuse) override { + passedContainer = &privateAllocsToReuse; + BaseMock::allocateOrReuseKernelPrivateMemory(kernel, sizePerHwThread, privateAllocsToReuse); + } + std::unordered_map *passedContainer; +}; + +HWTEST2_F(CommandListCreate, givenPrivatePerDispatchDisabledWhenAllocatingPrivateMemoryThenAllocateIsNotCalled, IsAtLeastSkl) { + auto commandList = std::make_unique>>(); + commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true; + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + mockModule.allocatePrivateMemoryPerDispatch = false; + 
commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, 0x1000); + EXPECT_EQ(commandList->allocateOrReuseKernelPrivateMemoryCalled, 0u); +} + +HWTEST2_F(CommandListCreate, givenPrivatePerDispatchEnabledWhenAllocatingPrivateMemoryThenAllocateIsCalled, IsAtLeastSkl) { + auto commandList = std::make_unique>>(); + commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true; + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + mockModule.allocatePrivateMemoryPerDispatch = true; + commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, 0x1000); + EXPECT_EQ(commandList->allocateOrReuseKernelPrivateMemoryCalled, 1u); +} + +HWTEST2_F(CommandListCreate, givenPrivatePerDispatchEnabledWhenAllocatingPrivateMemoryThenCmdListMaprIsPassed, IsAtLeastSkl) { + auto commandList = std::make_unique>>(); + commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true; + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + mockModule.allocatePrivateMemoryPerDispatch = true; + commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, 0x1000); + EXPECT_EQ(commandList->passedContainer, &commandList->ownedPrivateAllocations); +} + +HWTEST2_F(CommandListCreate, givenImmediateListAndPrivatePerDispatchDisabledWhenAllocatingPrivateMemoryCalledThenAllocateIsNotCalled, IsAtLeastSkl) { + auto commandList = std::make_unique>>(); + commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true; + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + mockModule.allocatePrivateMemoryPerDispatch = false; + commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, 0x1000); + EXPECT_EQ(commandList->allocateOrReuseKernelPrivateMemoryCalled, 0u); +} + +HWTEST2_F(CommandListCreate, givenImmediateListAndPrivatePerDispatchEnabledWhenAllocatingPrivateMemoryThenAllocateIsCalled, IsAtLeastSkl) { + auto 
commandList = std::make_unique>>(); + commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true; + MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield()); + commandList->csr = &mockCommandStreamReceiver; + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + mockModule.allocatePrivateMemoryPerDispatch = true; + commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, 0x1000); + EXPECT_EQ(commandList->allocateOrReuseKernelPrivateMemoryCalled, 1u); +} + +HWTEST2_F(CommandListCreate, givenImmediateListAndPrivatePerDispatchEnabledWhenAllocatingPrivateMemoryThenCsrMapIsPassed, IsAtLeastSkl) { + auto commandList = std::make_unique>>(); + commandList->allocateOrReuseKernelPrivateMemoryIfNeededCallBase = true; + MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield()); + commandList->csr = &mockCommandStreamReceiver; + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + mockModule.allocatePrivateMemoryPerDispatch = true; + commandList->allocateOrReuseKernelPrivateMemoryIfNeeded(&mockKernel, 0x1000); + EXPECT_EQ(commandList->passedContainer, &mockCommandStreamReceiver.getOwnedPrivateAllocations()); +} + +HWTEST2_F(CommandListCreate, givenCmdListWhenAllocateOrReuseCalledForSizeThatIsStoredInMapThenItsReused, IsAtLeastSkl) { + auto commandList = std::make_unique>(); + commandList->allocateOrReuseKernelPrivateMemoryCallBase = true; + commandList->device = this->device; + uint32_t sizePerHwThread = 0x1000; + auto mockMem = std::make_unique(0x1000); + Mock mockModule(this->device, nullptr); + Mock mockKernel; + const_cast(mockKernel.kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) = 0x1000; + mockKernel.module = &mockModule; + MockGraphicsAllocation 
mockGA(mockMem.get(), 2 * sizePerHwThread * this->neoDevice->getDeviceInfo().computeUnitsUsedForScratch); + std::unordered_map mapForReuse; + mapForReuse[sizePerHwThread] = &mockGA; + commandList->allocateOrReuseKernelPrivateMemory(&mockKernel, sizePerHwThread, mapForReuse); + EXPECT_EQ(mockKernel.residencyContainer[0], &mockGA); +} + +HWTEST2_F(CommandListCreate, givenNewSizeDifferentThanSizesInMapWhenAllocatingPrivateMemoryThenNewAllocationIsCreated, IsAtLeastSkl) { + auto commandList = std::make_unique>(); + commandList->allocateOrReuseKernelPrivateMemoryCallBase = true; + commandList->device = this->device; + uint32_t sizePerHwThread = 0x1000; + auto mockMem = std::make_unique(0x1000); + Mock mockModule(this->device, nullptr); + Mock mockKernel; + const_cast(mockKernel.kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) = 0x1000; + mockKernel.module = &mockModule; + MockGraphicsAllocation mockGA(mockMem.get(), sizePerHwThread * this->neoDevice->getDeviceInfo().computeUnitsUsedForScratch / 2); + std::unordered_map mapForReuse; + mapForReuse[sizePerHwThread] = &mockGA; + commandList->allocateOrReuseKernelPrivateMemory(&mockKernel, sizePerHwThread / 2, mapForReuse); + EXPECT_NE(mockKernel.residencyContainer[0], &mockGA); + neoDevice->getMemoryManager()->freeGraphicsMemory(mockKernel.residencyContainer[0]); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index 12b27c1093..06a7f3121f 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -1386,7 +1386,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichTogethe auto &kernelImmDatas = proxyModuleImpl->getKernelImmDatas(); for (size_t i = 0; i < 
kernelsNb; i++) { auto &kernelDesc = const_cast(kernelImmDatas[i]->getDescriptor()); - kernelDesc.kernelAttributes.perHwThreadPrivateMemorySize = overAllocMinSize; + kernelDesc.kernelAttributes.perHwThreadPrivateMemorySize = overAllocMinSize + static_cast(i * MemoryConstants::cacheLineSize); kernelDesc.kernelAttributes.flags.usesPrintf = false; kernelDesc.kernelMetadata.kernelName = kernelNames[i]; } @@ -1405,8 +1405,8 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichTogethe EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), i); kernels.push_back(this->createKernelWithName(kernelNames[i])); // This function is called by appendLaunchKernelWithParams - pCommandList->allocateKernelPrivateMemoryIfNeeded(kernels[i].get(), - kernels[i]->getKernelDescriptor().kernelAttributes.perHwThreadPrivateMemorySize); + pCommandList->allocateOrReuseKernelPrivateMemoryIfNeeded(kernels[i].get(), + kernels[i]->getKernelDescriptor().kernelAttributes.perHwThreadPrivateMemorySize); EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), i + 1); } } @@ -1442,8 +1442,8 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichDontExc EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 0u); kernels.push_back(this->createKernelWithName(kernelNames[i])); // This function is called by appendLaunchKernelWithParams - pCommandList->allocateKernelPrivateMemoryIfNeeded(kernels[i].get(), - kernels[i]->getKernelDescriptor().kernelAttributes.perHwThreadPrivateMemorySize); + pCommandList->allocateOrReuseKernelPrivateMemoryIfNeeded(kernels[i].get(), + kernels[i]->getKernelDescriptor().kernelAttributes.perHwThreadPrivateMemorySize); EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 0u); } } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp index 854c325d11..995be83fa7 100644 --- 
a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp @@ -22,7 +22,7 @@ namespace ult { using AppendMemoryCopy = Test; HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionCalledThenTwoNewAllocationAreAddedToHostMapPtr, IsAtLeastSkl) { - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); void *srcPtr = reinterpret_cast(0x1234); void *dstPtr = reinterpret_cast(0x2345); @@ -33,7 +33,7 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionC } HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenMemoryCopyRegion2DCalledThenSrcDstPointersArePageAligned, IsAtLeastSkl) { - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); void *srcPtr = reinterpret_cast(0x1234); void *dstPtr = reinterpret_cast(0x2345); @@ -46,7 +46,7 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenMemoryCo } HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenMemoryCopyRegion3DCalledThenSrcDstPointersArePageAligned, IsAtLeastSkl) { - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); void *srcPtr = reinterpret_cast(0x1234); void *dstPtr = reinterpret_cast(0x2345); @@ -59,7 +59,7 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenMemoryCo } HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemoryCopyRegion2DCalledThenSrcDstNotZeroOffsetsArePassed, IsAtLeastSkl) { - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u); void *srcPtr = reinterpret_cast(0x1233); void *dstPtr = reinterpret_cast(0x2345); @@ -71,7 
+71,7 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemo } HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemoryCopyRegion3DCalledThenSrcDstNotZeroOffsetsArePassed, IsAtLeastSkl) { - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u); void *srcPtr = reinterpret_cast(0x1233); void *dstPtr = reinterpret_cast(0x2345); @@ -83,7 +83,7 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemo } HWTEST2_F(AppendMemoryCopy, givenCommandListAndAlignedHostPointersWhenBlitMemoryCopyRegion3DCalledThenSrcDstZeroOffsetsArePassed, IsAtLeastSkl) { - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u); void *srcPtr = alignDown(reinterpret_cast(0x1233), NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignment()); void *dstPtr = alignDown(reinterpret_cast(0x2345), NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignment()); @@ -98,7 +98,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndDestinationPtrOffsetWhenMemor using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT; - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u); constexpr size_t allocSize = 4096; @@ -132,7 +132,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndSourcePtrOffsetWhenMemoryCopy using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT; - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u); constexpr size_t allocSize = 4096; @@ -166,7 +166,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndDestinationPtrOffsetWhenMemor using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using XY_COPY_BLT = typename 
GfxFamily::XY_COPY_BLT; - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u); constexpr size_t allocSize = 4096; @@ -201,7 +201,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndSourcePtrOffsetWhenMemoryCopy using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT; - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u); constexpr size_t allocSize = 4096; @@ -236,7 +236,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndDestinationPtrOffsetWhenMemor using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT; - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u); constexpr size_t allocSize = 4096; @@ -270,7 +270,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndSourcePtrOffsetWhenMemoryCopy using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT; - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::Copy, 0u); constexpr size_t allocSize = 4096; @@ -303,7 +303,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListAndSourcePtrOffsetWhenMemoryCopy HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionCalledThenPipeControlWithDcFlushAdded, IsAtLeastSkl) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); void *srcPtr = reinterpret_cast(0x1234); void *dstPtr = reinterpret_cast(0x2345); @@ -610,7 +610,7 @@ HWTEST2_F(AppendMemoryCopy, givenSyncModeImmediateCommandListWhenAppendingMemory HWTEST2_F(AppendMemoryCopy, 
givenCommandListAndHostPointersWhenMemoryCopyCalledThenPipeControlWithDcFlushAdded, IsAtLeastSkl) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; - MockAppendMemoryCopy cmdList; + MockCommandListCoreFamily cmdList; cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); void *srcPtr = reinterpret_cast(0x1234); void *dstPtr = reinterpret_cast(0x2345); @@ -646,7 +646,7 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListWhenTimestampPassedToMemoryCopyT using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; using MI_FLUSH_DW = typename GfxFamily::MI_FLUSH_DW; - MockAppendMemoryCopy commandList; + MockCommandListCoreFamily commandList; commandList.initialize(device, NEO::EngineGroupType::Copy, 0u); void *srcPtr = reinterpret_cast(0x1234); void *dstPtr = reinterpret_cast(0x2345); @@ -701,7 +701,7 @@ HWTEST2_F(AppendMemoryCopy, using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; - MockAppendMemoryCopy commandList; + MockCommandListCoreFamily commandList; commandList.appendMemoryCopyKernelWithGACallBase = true; commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); @@ -759,7 +759,7 @@ HWTEST2_F(AppendMemoryCopy, using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER; - MockAppendMemoryCopy commandList; + MockCommandListCoreFamily commandList; commandList.appendMemoryCopyKernelWithGACallBase = true; commandList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp index 5fc66fc51a..f71e945618 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_copy_event_xehp_and_later.cpp @@ 
-90,7 +90,7 @@ void testSingleTileAppendMemoryCopyThreeKernels(CopyTestInput &input, TestExpect using OPERATION = typename POSTSYNC_DATA::OPERATION; using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; - MockAppendMemoryCopy commandList; + MockCommandListCoreFamily commandList; commandList.appendMemoryCopyKernelWithGACallBase = true; commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u); @@ -175,7 +175,7 @@ void testSingleTileAppendMemoryCopyThreeKernelsAndL3Flush(CopyTestInput &input, using OPERATION = typename POSTSYNC_DATA::OPERATION; using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; - MockAppendMemoryCopy commandList; + MockCommandListCoreFamily commandList; commandList.appendMemoryCopyKernelWithGACallBase = true; commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u); @@ -283,7 +283,7 @@ void testSingleTileAppendMemoryCopySingleKernel(CopyTestInput &input, TestExpect using OPERATION = typename POSTSYNC_DATA::OPERATION; using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; - MockAppendMemoryCopy commandList; + MockCommandListCoreFamily commandList; commandList.appendMemoryCopyKernelWithGACallBase = true; commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u); @@ -355,7 +355,7 @@ void testSingleTileAppendMemoryCopySingleKernelAndL3Flush(CopyTestInput &input, using OPERATION = typename POSTSYNC_DATA::OPERATION; using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; - MockAppendMemoryCopy commandList; + MockCommandListCoreFamily commandList; commandList.appendMemoryCopyKernelWithGACallBase = true; commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u); @@ -509,7 +509,7 @@ void testMultiTileAppendMemoryCopyThreeKernels(CopyTestInput &input, TestExpecte using OPERATION = typename POSTSYNC_DATA::OPERATION; using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; - MockAppendMemoryCopy commandList; + 
MockCommandListCoreFamily commandList; commandList.appendMemoryCopyKernelWithGACallBase = true; commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u); @@ -600,7 +600,7 @@ void testMultiTileAppendMemoryCopyThreeKernelsAndL3Flush(CopyTestInput &input, T using OPERATION = typename POSTSYNC_DATA::OPERATION; using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; - MockAppendMemoryCopy commandList; + MockCommandListCoreFamily commandList; commandList.appendMemoryCopyKernelWithGACallBase = true; commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u); @@ -726,7 +726,7 @@ void testMultiTileAppendMemoryCopySingleKernel(CopyTestInput &input, TestExpecte using OPERATION = typename POSTSYNC_DATA::OPERATION; using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; - MockAppendMemoryCopy commandList; + MockCommandListCoreFamily commandList; commandList.appendMemoryCopyKernelWithGACallBase = true; commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u); @@ -802,7 +802,7 @@ void testMultiTileAppendMemoryCopySingleKernelAndL3Flush(CopyTestInput &input, T using OPERATION = typename POSTSYNC_DATA::OPERATION; using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; - MockAppendMemoryCopy commandList; + MockCommandListCoreFamily commandList; commandList.appendMemoryCopyKernelWithGACallBase = true; commandList.initialize(input.device, NEO::EngineGroupType::RenderCompute, 0u); diff --git a/shared/source/command_stream/command_stream_receiver.cpp b/shared/source/command_stream/command_stream_receiver.cpp index 49ce0d7730..6b1a9eb7c7 100644 --- a/shared/source/command_stream/command_stream_receiver.cpp +++ b/shared/source/command_stream/command_stream_receiver.cpp @@ -389,6 +389,10 @@ void CommandStreamReceiver::cleanupResources() { getMemoryManager()->freeGraphicsMemory(globalStatelessHeapAllocation); globalStatelessHeapAllocation = nullptr; } + for (auto &alloc : ownedPrivateAllocations) { + 
getMemoryManager()->freeGraphicsMemory(alloc.second); + } + ownedPrivateAllocations.clear(); } WaitStatus CommandStreamReceiver::waitForCompletionWithTimeout(const WaitParams &params, TaskCountType taskCountToWait) { @@ -567,6 +571,9 @@ ResidencyContainer &CommandStreamReceiver::getResidencyAllocations() { ResidencyContainer &CommandStreamReceiver::getEvictionAllocations() { return this->evictionAllocations; } +std::unordered_map &CommandStreamReceiver::getOwnedPrivateAllocations() { + return this->ownedPrivateAllocations; +} AubSubCaptureStatus CommandStreamReceiver::checkAndActivateAubSubCapture(const std::string &kernelName) { return {false, false}; } diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 22f09d1a55..6dfc2ec890 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -118,6 +118,7 @@ class CommandStreamReceiver { ResidencyContainer &getResidencyAllocations(); ResidencyContainer &getEvictionAllocations(); + std::unordered_map &getOwnedPrivateAllocations(); virtual GmmPageTableMngr *createPageTableManager() { return nullptr; } bool needsPageTableManager() const; @@ -460,6 +461,8 @@ class CommandStreamReceiver { ResidencyContainer residencyAllocations; ResidencyContainer evictionAllocations; + std::unordered_map ownedPrivateAllocations; + MutexType ownershipMutex; MutexType hostPtrSurfaceCreationMutex; ExecutionEnvironment &executionEnvironment; diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index 4d476481d8..005dc4d5df 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -35,6 +35,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { using CommandStreamReceiver::baseWaitFunction; using CommandStreamReceiver::checkForNewResources; 
using CommandStreamReceiver::checkImplicitFlushForGpuIdle; + using CommandStreamReceiver::cleanupResources; using CommandStreamReceiver::CommandStreamReceiver; using CommandStreamReceiver::globalFenceAllocation; using CommandStreamReceiver::gpuHangCheckPeriod; diff --git a/shared/test/common/test_macros/mock_method_macros.h b/shared/test/common/test_macros/mock_method_macros.h index da1da5b5cf..5d71f14c29 100644 --- a/shared/test/common/test_macros/mock_method_macros.h +++ b/shared/test/common/test_macros/mock_method_macros.h @@ -69,3 +69,13 @@ } \ return funcName##Result; \ } + +#define ADDMETHOD_VOIDRETURN(funcName, callBase, funcParams, invokeParams) \ + bool funcName##CallBase = callBase; \ + uint32_t funcName##Called = 0u; \ + void funcName funcParams override { \ + funcName##Called++; \ + if (funcName##CallBase) { \ + BaseClass::funcName invokeParams; \ + } \ + } diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 74a0e75952..0dd4a43566 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -4485,3 +4485,13 @@ HWTEST2_F(CommandStreamReceiverHwTest, EXPECT_EQ(nullptr, frontEndCmd); EXPECT_FALSE(commandStreamReceiver.getMediaVFEStateDirty()); } + +HWTEST_F(CommandStreamReceiverTest, givenCsrWhenCleanUpResourcesThenOwnedPrivateAllocationsAreFreed) { + auto &csr = pDevice->getUltCommandStreamReceiver(); + auto mockGA = std::make_unique(); + + auto mapForReuse = &csr.getOwnedPrivateAllocations(); + mapForReuse->insert({0x100, mockGA.release()}); + csr.cleanupResources(); + EXPECT_EQ(mapForReuse->size(), 0u); +}