From f553d9f76bfec7f8846bed2715777b8b7c0328e2 Mon Sep 17 00:00:00 2001 From: Maciej Bielski Date: Tue, 3 Oct 2023 23:09:30 +0000 Subject: [PATCH] fix: one transfer per kernel ISA allocation(s) page If several kernel heaps are sharing the same page then use a temporary buffer to collect all of them and transfer to memory in one shot. Previously there were several transfers performed (one per kernel) and, observably, they happened not to be immediately effective at times. Related-To: NEO-7788 Signed-off-by: Maciej Bielski --- level_zero/core/source/kernel/kernel_imp.cpp | 2 +- level_zero/core/source/module/module_imp.cpp | 112 +++++++++++------- level_zero/core/source/module/module_imp.h | 2 + .../unit_tests/sources/kernel/test_kernel.cpp | 31 ++++- .../unit_tests/sources/module/test_module.cpp | 14 ++- 5 files changed, 112 insertions(+), 49 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 51fe42fafb..a0592fb593 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -955,7 +955,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { } UNRECOVERABLE_IF(!this->kernelImmData->getKernelInfo()->heapInfo.pKernelHeap); - if (isaAllocation->getAllocationType() == NEO::AllocationType::KERNEL_ISA_INTERNAL) { + if (isaAllocation->getAllocationType() == NEO::AllocationType::KERNEL_ISA_INTERNAL && this->kernelImmData->getIsaParentAllocation() == nullptr) { isaAllocation->setTbxWritable(true, std::numeric_limits::max()); isaAllocation->setAubWritable(true, std::numeric_limits::max()); NEO::MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(neoDevice->getRootDeviceEnvironment(), *isaAllocation), diff --git a/level_zero/core/source/module/module_imp.cpp b/level_zero/core/source/module/module_imp.cpp index fe19498603..fccec33121 100644 --- a/level_zero/core/source/module/module_imp.cpp +++ 
b/level_zero/core/source/module/module_imp.cpp @@ -48,6 +48,7 @@ #include "program_debug_data.h" +#include #include #include #include @@ -555,25 +556,8 @@ ze_result_t ModuleImp::initialize(const ze_module_desc_t *desc, NEO::Device *neo linkageSuccessful &= populateHostGlobalSymbolsMap(this->translationUnit->programInfo.globalsDeviceToHostNameMap); this->updateBuildLog(neoDevice); - const auto &productHelper = neoDevice->getProductHelper(); - auto &rootDeviceEnvironment = neoDevice->getRootDeviceEnvironment(); - - if (this->isFullyLinked && this->type == ModuleType::User) { - for (auto &ki : kernelImmDatas) { - - if (!ki->isIsaCopiedToAllocation()) { - ki->getIsaGraphicsAllocation()->setTbxWritable(true, std::numeric_limits::max()); - ki->getIsaGraphicsAllocation()->setAubWritable(true, std::numeric_limits::max()); - NEO::MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *ki->getIsaGraphicsAllocation()), - *neoDevice, - ki->getIsaGraphicsAllocation(), - ki->getIsaOffsetInParentAllocation(), - ki->getKernelInfo()->heapInfo.pKernelHeap, - static_cast(ki->getKernelInfo()->heapInfo.kernelHeapSize)); - - ki->setIsaCopiedToAllocation(); - } - } + if ((this->isFullyLinked && this->type == ModuleType::User) || (this->kernelsIsaParentRegion && this->type == ModuleType::Builtin)) { + this->transferIsaSegmentsToAllocation(neoDevice, nullptr); if (device->getL0Debugger()) { auto allocs = getModuleAllocations(); @@ -587,6 +571,69 @@ ze_result_t ModuleImp::initialize(const ze_module_desc_t *desc, NEO::Device *neo return result; } +void ModuleImp::transferIsaSegmentsToAllocation(NEO::Device *neoDevice, const NEO::Linker::PatchableSegments *isaSegmentsForPatching) { + const auto &productHelper = neoDevice->getProductHelper(); + auto &rootDeviceEnvironment = neoDevice->getRootDeviceEnvironment(); + + if (this->kernelsIsaParentRegion && this->kernelImmDatas.size() && 
!this->kernelImmDatas[0]->isIsaCopiedToAllocation()) { + const auto isaBufferSize = this->kernelsIsaParentRegion->getUnderlyingBufferSize(); + DEBUG_BREAK_IF(isaBufferSize == 0); + auto isaBuffer = std::vector(isaBufferSize); + std::memset(isaBuffer.data(), 0x0, isaBufferSize); + + for (auto &kernelImmData : this->kernelImmDatas) { + DEBUG_BREAK_IF(kernelImmData->isIsaCopiedToAllocation()); + kernelImmData->getIsaGraphicsAllocation()->setAubWritable(true, std::numeric_limits::max()); + kernelImmData->getIsaGraphicsAllocation()->setTbxWritable(true, std::numeric_limits::max()); + + auto [kernelHeapPtr, kernelHeapSize] = this->getKernelHeapPointerAndSize(kernelImmData, isaSegmentsForPatching); + auto offset = kernelImmData->getIsaOffsetInParentAllocation(); + memcpy_s(isaBuffer.data() + offset, isaBufferSize - offset, kernelHeapPtr, kernelHeapSize); + } + NEO::MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *this->kernelsIsaParentRegion), + *neoDevice, + this->kernelsIsaParentRegion.get(), + 0u, + isaBuffer.data(), + isaBuffer.size()); + for (auto &kernelImmData : kernelImmDatas) { + kernelImmData->setIsaCopiedToAllocation(); + } + } else { + for (auto &kernelImmData : kernelImmDatas) { + if (nullptr == kernelImmData->getIsaGraphicsAllocation()) { + continue; + } + DEBUG_BREAK_IF(kernelImmData->isIsaCopiedToAllocation()); + kernelImmData->getIsaGraphicsAllocation()->setAubWritable(true, std::numeric_limits::max()); + kernelImmData->getIsaGraphicsAllocation()->setTbxWritable(true, std::numeric_limits::max()); + + if (!kernelImmData->isIsaCopiedToAllocation()) { + auto [kernelHeapPtr, kernelHeapSize] = this->getKernelHeapPointerAndSize(kernelImmData, isaSegmentsForPatching); + NEO::MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *kernelImmData->getIsaGraphicsAllocation()), + *neoDevice, + 
kernelImmData->getIsaGraphicsAllocation(), + 0u, + kernelHeapPtr, + kernelHeapSize); + kernelImmData->setIsaCopiedToAllocation(); + } + } + } +} + +std::pair ModuleImp::getKernelHeapPointerAndSize(const std::unique_ptr &kernelImmData, + const NEO::Linker::PatchableSegments *isaSegmentsForPatching) { + if (isaSegmentsForPatching) { + auto &segments = *isaSegmentsForPatching; + auto segmentId = &kernelImmData - &this->kernelImmDatas[0]; + return {segments[segmentId].hostPointer, segments[segmentId].segmentSize}; + } else { + return {kernelImmData->getKernelInfo()->heapInfo.pKernelHeap, + static_cast(kernelImmData->getKernelInfo()->heapInfo.kernelHeapSize)}; + } +} + inline ze_result_t ModuleImp::initializeTranslationUnit(const ze_module_desc_t *desc, NEO::Device *neoDevice) { std::string buildOptions; std::string internalBuildOptions; @@ -952,34 +999,17 @@ ze_result_t ModuleImp::getDebugInfo(size_t *pDebugDataSize, uint8_t *pDebugData) void ModuleImp::copyPatchedSegments(const NEO::Linker::PatchableSegments &isaSegmentsForPatching) { if (this->translationUnit->programInfo.linkerInput && this->translationUnit->programInfo.linkerInput->getTraits().requiresPatchingOfInstructionSegments) { - auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment(); - const auto &productHelper = this->device->getProductHelper(); + auto neoDevice = this->device->getNEODevice(); + auto &rootDeviceEnvironment = neoDevice->getRootDeviceEnvironment(); + + this->transferIsaSegmentsToAllocation(neoDevice, &isaSegmentsForPatching); for (auto &kernelImmData : this->kernelImmDatas) { - if (nullptr == kernelImmData->getIsaGraphicsAllocation()) { - continue; - } - - UNRECOVERABLE_IF(kernelImmData->isIsaCopiedToAllocation()); - - kernelImmData->getIsaGraphicsAllocation()->setTbxWritable(true, std::numeric_limits::max()); - kernelImmData->getIsaGraphicsAllocation()->setAubWritable(true, std::numeric_limits::max()); - auto segmentId = &kernelImmData - &this->kernelImmDatas[0]; - 
- NEO::MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *kernelImmData->getIsaGraphicsAllocation()), - *device->getNEODevice(), - kernelImmData->getIsaGraphicsAllocation(), - kernelImmData->getIsaOffsetInParentAllocation(), - isaSegmentsForPatching[segmentId].hostPointer, - isaSegmentsForPatching[segmentId].segmentSize); - - kernelImmData->setIsaCopiedToAllocation(); - if (device->getL0Debugger()) { NEO::MemoryOperationsHandler *memoryOperationsIface = rootDeviceEnvironment.memoryOperationsInterface.get(); auto allocation = kernelImmData->getIsaGraphicsAllocation(); if (memoryOperationsIface) { - memoryOperationsIface->makeResident(device->getNEODevice(), ArrayRef(&allocation, 1)); + memoryOperationsIface->makeResident(neoDevice, ArrayRef(&allocation, 1)); } } } diff --git a/level_zero/core/source/module/module_imp.h b/level_zero/core/source/module/module_imp.h index 32f21849cc..0c5bf9e581 100644 --- a/level_zero/core/source/module/module_imp.h +++ b/level_zero/core/source/module/module_imp.h @@ -170,6 +170,8 @@ struct ModuleImp : public Module { void notifyModuleDestroy(); bool populateHostGlobalSymbolsMap(std::unordered_map &devToHostNameMapping); ze_result_t setIsaGraphicsAllocations(); + void transferIsaSegmentsToAllocation(NEO::Device *neoDevice, const NEO::Linker::PatchableSegments *isaSegmentsForPatching); + std::pair getKernelHeapPointerAndSize(const std::unique_ptr &kernelImmData, const NEO::Linker::PatchableSegments *isaSegmentsForPatching); MOCKABLE_VIRTUAL size_t computeKernelIsaAllocationAlignedSizeWithPadding(size_t isaSize); MOCKABLE_VIRTUAL NEO::GraphicsAllocation *allocateKernelsIsaMemory(size_t size); StackVec getModuleAllocations(); diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index 18fb1b50b4..1269f835dd 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ 
b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -820,7 +820,23 @@ TEST_F(KernelImmutableDataTests, givenInternalModuleWhenKernelIsCreatedThenIsaIs mockMemoryManager->copyMemoryToAllocationCalledTimes); } -TEST_F(KernelImmutableDataTests, givenInternalModuleWhenKernelIsCreatedIsaIsNotCopiedDuringLinking) { +struct KernelIsaCopyingMomentTest : public ModuleImmutableDataFixture, public ::testing::TestWithParam> { + void SetUp() override { + ModuleImmutableDataFixture::setUp(); + } + + void TearDown() override { + ModuleImmutableDataFixture::tearDown(); + } +}; +std::pair kernelIsaCopyingPairs[] = { + {1, 1}, + {static_cast(MemoryConstants::pageSize64k + 1), 0}}; // pageSize64k is a common upper-bound for both system and local memory + +INSTANTIATE_TEST_CASE_P(, KernelIsaCopyingMomentTest, testing::ValuesIn(kernelIsaCopyingPairs)); + +TEST_P(KernelIsaCopyingMomentTest, givenInternalModuleWhenKernelIsCreatedThenIsaCopiedDuringLinkingOnlyIfCanFitInACommonParentPage) { + auto [testKernelHeapSize, numberOfCopiesToAllocationAtModuleInitialization] = GetParam(); auto cip = new NEO::MockCompilerInterfaceCaptureBuildOptions(); neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]->compilerInterface.reset(cip); @@ -845,7 +861,7 @@ TEST_F(KernelImmutableDataTests, givenInternalModuleWhenKernelIsCreatedIsaIsNotC uint32_t kernelHeap = 0; auto kernelInfo = new KernelInfo(); - kernelInfo->heapInfo.kernelHeapSize = 1; + kernelInfo->heapInfo.kernelHeapSize = testKernelHeapSize; kernelInfo->heapInfo.pKernelHeap = &kernelHeap; Mock<::L0::KernelImp> kernelMock; @@ -864,15 +880,20 @@ TEST_F(KernelImmutableDataTests, givenInternalModuleWhenKernelIsCreatedIsaIsNotC result = moduleMock->initialize(&moduleDesc, neoDevice); EXPECT_EQ(result, ZE_RESULT_SUCCESS); EXPECT_EQ(mockTranslationUnit->processUnpackedBinaryCalled, 1u); - size_t expectedPreviouscopyMemoryToAllocationCalledTimes = previouscopyMemoryToAllocationCalledTimes; + size_t 
expectedPreviouscopyMemoryToAllocationCalledTimes = previouscopyMemoryToAllocationCalledTimes + + numberOfCopiesToAllocationAtModuleInitialization; EXPECT_EQ(expectedPreviouscopyMemoryToAllocationCalledTimes, mockMemoryManager->copyMemoryToAllocationCalledTimes); for (auto &ki : moduleMock->kernelImmDatas) { - EXPECT_FALSE(ki->isIsaCopiedToAllocation()); + bool isaExpectedToBeCopied = (numberOfCopiesToAllocationAtModuleInitialization != 0u); + EXPECT_EQ(isaExpectedToBeCopied, ki->isIsaCopiedToAllocation()); } - expectedPreviouscopyMemoryToAllocationCalledTimes++; + if (numberOfCopiesToAllocationAtModuleInitialization == 0) { + // For large builtin kernels copying is not optimized and done at kernel initialization + expectedPreviouscopyMemoryToAllocationCalledTimes++; + } ze_kernel_desc_t desc = {}; desc.pKernelName = ""; diff --git a/level_zero/core/test/unit_tests/sources/module/test_module.cpp b/level_zero/core/test/unit_tests/sources/module/test_module.cpp index 119a4f85f9..e83a4333f0 100644 --- a/level_zero/core/test/unit_tests/sources/module/test_module.cpp +++ b/level_zero/core/test/unit_tests/sources/module/test_module.cpp @@ -222,7 +222,11 @@ HWTEST_F(ModuleTest, givenBlitterAvailableWhenCopyingPatchedSegmentsThenIsaIsTra auto &productHelper = device.getProductHelper(); auto &rootDeviceEnvironment = device.getNEODevice()->getRootDeviceEnvironment(); if (productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *module->getKernelImmutableDataVector()[0]->getIsaGraphicsAllocation())) { - EXPECT_EQ(zebinData->numOfKernels, blitterCalled); + if (module->getKernelsIsaParentAllocation()) { + EXPECT_EQ(1u, blitterCalled); + } else { + EXPECT_EQ(zebinData->numOfKernels, blitterCalled); + } } else { EXPECT_EQ(0u, blitterCalled); } @@ -3788,7 +3792,13 @@ TEST_F(ModuleInitializeTest, whenModuleInitializeIsCalledThenCorrectResultIsRetu class MyMockModuleTU : public MockModuleTU { public: using MockModuleTU::MockModuleTU; - ze_result_t 
createFromNativeBinary(const char *input, size_t inputSize) override { return ZE_RESULT_SUCCESS; } + ze_result_t createFromNativeBinary(const char *input, size_t inputSize) override { + programInfo.kernelInfos[0]->heapInfo.pKernelHeap = &mockKernelHeap; + programInfo.kernelInfos[0]->heapInfo.kernelHeapSize = 4; + return ZE_RESULT_SUCCESS; + } + + uint32_t mockKernelHeap = 0xDEAD; }; const auto &compilerProductHelper = neoDevice->getRootDeviceEnvironment().getHelper();