From 7c6c45f5b501593106313e576cd735dae64ad97e Mon Sep 17 00:00:00 2001
From: Jaroslaw Chodor
Date: Fri, 23 Jul 2021 19:23:42 +0200
Subject: [PATCH] Add option to allocate private mem per dispatch

Signed-off-by: Jaroslaw Chodor
Signed-off-by: Krystian Chmielewski
---
 level_zero/core/source/cmdlist/cmdlist.h      |  2 +
 level_zero/core/source/cmdlist/cmdlist_hw.inl |  9 ++++
 .../core/source/cmdlist/cmdlist_hw_base.inl   |  9 ++++
 level_zero/core/source/kernel/kernel.h        |  5 ++
 level_zero/core/source/kernel/kernel_imp.cpp  | 52 ++++++++++++-------
 level_zero/core/source/kernel/kernel_imp.h    |  7 +++
 level_zero/core/source/module/module.h        |  2 +
 level_zero/core/source/module/module_imp.cpp  | 21 ++++++++
 level_zero/core/source/module/module_imp.h    |  6 +++
 .../test/unit_tests/fixtures/module_fixture.h |  6 +++
 .../unit_tests/sources/kernel/test_kernel.cpp | 35 +++++++++++++
 shared/source/device/device.h                 |  3 +-
 12 files changed, 137 insertions(+), 20 deletions(-)

diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h
index 46a3429106..b72db1332c 100644
--- a/level_zero/core/source/cmdlist/cmdlist.h
+++ b/level_zero/core/source/cmdlist/cmdlist.h
@@ -259,6 +259,8 @@ struct CommandList : _ze_command_list_handle_t {
     NEO::StreamProperties requiredStreamState{};
     NEO::StreamProperties finalStreamState{};
     CommandsToPatch commandsToPatch{};
+
+    std::vector ownedPrivateAllocations;
 };
 
 using CommandListAllocatorFn = CommandList *(*)(uint32_t);
diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl
index 605bf173b4..e2461d9243 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl
@@ -58,6 +58,10 @@ inline ze_result_t parseErrorCode(NEO::ErrorCode returnValue) {
 template
 CommandListCoreFamily::~CommandListCoreFamily() {
     clearCommandsToPatch();
+    for (auto alloc : this->ownedPrivateAllocations) {
+        device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
+    }
+    this->ownedPrivateAllocations.clear();
 }
 
 template
@@ -98,6 +102,11 @@ ze_result_t CommandListCoreFamily::reset() {
         programThreadArbitrationPolicy(device);
     }
 
+    for (auto alloc : this->ownedPrivateAllocations) {
+        device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
+    }
+    this->ownedPrivateAllocations.clear();
+
     return ZE_RESULT_SUCCESS;
 }
 
diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl
index 6ebedb378a..55559198bf 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl
@@ -41,6 +41,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z
                                                                                bool isPredicate,
                                                                                bool isCooperative) {
     const auto kernel = Kernel::fromHandle(hKernel);
     UNRECOVERABLE_IF(kernel == nullptr);
+    const auto &kernelDescriptor = kernel->getKernelDescriptor(); // read only after the null check above
     appendEventForProfiling(hEvent, true);
     const auto functionImmutableData = kernel->getImmutableData();
@@ -56,6 +57,14 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z
 
     kernel->patchGlobalOffset();
 
+    // Kernel has no private surface of its own: allocate one for this launch; this list frees it in reset() and its destructor.
+    if (kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize != 0U && nullptr == kernel->getPrivateMemoryGraphicsAllocation()) {
+        auto privateMemoryGraphicsAllocation = kernel->allocatePrivateMemoryGraphicsAllocation();
+        kernel->patchCrossthreadDataWithPrivateAllocation(privateMemoryGraphicsAllocation);
+        this->commandContainer.addToResidencyContainer(privateMemoryGraphicsAllocation);
+        this->ownedPrivateAllocations.push_back(privateMemoryGraphicsAllocation);
+    }
+
     if (!isIndirect) {
         kernel->setGroupCount(pThreadGroupDimensions->groupCountX,
                               pThreadGroupDimensions->groupCountY,
diff --git a/level_zero/core/source/kernel/kernel.h
index aff4c9b83a..a83003a503 100644
--- a/level_zero/core/source/kernel/kernel.h
+++ 
b/level_zero/core/source/kernel/kernel.h
@@ -137,6 +137,11 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
     virtual bool usesSyncBuffer() = 0;
     virtual void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) = 0;
 
+    virtual NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() = 0;
+    virtual void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) = 0;
+
+    virtual NEO::GraphicsAllocation *getPrivateMemoryGraphicsAllocation() = 0;
+
     Kernel() = default;
     Kernel(const Kernel &) = delete;
     Kernel(Kernel &&) = delete;
diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp
index 40e44b730f..2c7e3b1b14 100644
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -686,6 +686,35 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties)
     return ZE_RESULT_SUCCESS;
 }
 
+// Allocates a PRIVATE_SURFACE for this kernel and returns it without storing it; the caller keeps and frees it.
+NEO::GraphicsAllocation *KernelImp::allocatePrivateMemoryGraphicsAllocation() {
+    auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
+    auto neoDevice = module->getDevice()->getNEODevice();
+    auto privateSurfaceSize = NEO::KernelHelper::getPrivateSurfaceSize(kernelAttributes.perHwThreadPrivateMemorySize,
+                                                                       neoDevice->getDeviceInfo().computeUnitsUsedForScratch);
+
+    UNRECOVERABLE_IF(privateSurfaceSize == 0);
+    auto privateMemoryGraphicsAllocation = neoDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(
+        {neoDevice->getRootDeviceIndex(), privateSurfaceSize, NEO::GraphicsAllocation::AllocationType::PRIVATE_SURFACE, neoDevice->getDeviceBitfield()});
+
+    UNRECOVERABLE_IF(privateMemoryGraphicsAllocation == nullptr);
+    return privateMemoryGraphicsAllocation;
+}
+
+// Patches cross-thread data and surface state with privateAllocation (must not be null); residency is handled by the caller.
+void KernelImp::patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) {
+    auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
+    auto neoDevice = module->getDevice()->getNEODevice();
+
+    ArrayRef crossThredDataArrayRef = ArrayRef(this->crossThreadData.get(), this->crossThreadDataSize);
+    ArrayRef surfaceStateHeapArrayRef = ArrayRef(this->surfaceStateHeapData.get(), this->surfaceStateHeapDataSize);
+
+    patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
+                             static_cast(privateAllocation->getGpuAddressToPatch()),
+                             *privateAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
+                             *neoDevice, kernelAttributes.flags.useGlobalAtomics);
+}
+
 ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
     this->kernelImmData = module->getKernelImmutableData(desc->pKernelName);
     if (this->kernelImmData == nullptr) {
@@ -776,25 +805,10 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
 
     auto &kernelAttributes = kernelImmData->getDescriptor().kernelAttributes;
     auto neoDevice = module->getDevice()->getNEODevice();
-    if (kernelAttributes.perHwThreadPrivateMemorySize != 0) {
-        auto privateSurfaceSize = NEO::KernelHelper::getPrivateSurfaceSize(kernelAttributes.perHwThreadPrivateMemorySize,
-                                                                           neoDevice->getDeviceInfo().computeUnitsUsedForScratch);
-
-        UNRECOVERABLE_IF(privateSurfaceSize == 0);
-        this->privateMemoryGraphicsAllocation = neoDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(
-            {neoDevice->getRootDeviceIndex(), privateSurfaceSize, NEO::GraphicsAllocation::AllocationType::PRIVATE_SURFACE, neoDevice->getDeviceBitfield()});
-
-        UNRECOVERABLE_IF(this->privateMemoryGraphicsAllocation == nullptr);
-
-        ArrayRef crossThredDataArrayRef = ArrayRef(this->crossThreadData.get(), this->crossThreadDataSize);
-        ArrayRef surfaceStateHeapArrayRef = ArrayRef(this->surfaceStateHeapData.get(), this->surfaceStateHeapDataSize);
-
-        patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
-                                 static_cast(privateMemoryGraphicsAllocation->getGpuAddressToPatch()),
-                                 *privateMemoryGraphicsAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
-                                 *neoDevice, kernelAttributes.flags.useGlobalAtomics);
-
-        this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation);
+    if ((kernelAttributes.perHwThreadPrivateMemorySize != 0U) && (false == module->shouldAllocatePrivateMemoryPerDispatch())) {
+        this->privateMemoryGraphicsAllocation = allocatePrivateMemoryGraphicsAllocation();
+        this->patchCrossthreadDataWithPrivateAllocation(this->privateMemoryGraphicsAllocation);
+        this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation);
     }
 
     this->createPrintfBuffer();
diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h
index a2786d7f0c..e7255b0478 100644
--- a/level_zero/core/source/kernel/kernel_imp.h
+++ b/level_zero/core/source/kernel/kernel_imp.h
@@ -143,6 +143,13 @@ struct KernelImp : Kernel {
         return kernelHasIndirectAccess;
     }
 
+    NEO::GraphicsAllocation *allocatePrivateMemoryGraphicsAllocation() override;
+    void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) override;
+
+    NEO::GraphicsAllocation *getPrivateMemoryGraphicsAllocation() override {
+        return privateMemoryGraphicsAllocation;
+    }
+
   protected:
     KernelImp() = default;
 
diff --git a/level_zero/core/source/module/module.h b/level_zero/core/source/module/module.h
index 0baa9c5280..11b0a438c0 100644
--- a/level_zero/core/source/module/module.h
+++ b/level_zero/core/source/module/module.h
@@ -50,6 +50,8 @@ struct Module : _ze_module_handle_t {
     virtual const std::vector> &getKernelImmutableDataVector() const = 0;
     virtual uint32_t getMaxGroupSize() const = 0;
     virtual bool isDebugEnabled() const = 0;
+    virtual bool shouldAllocatePrivateMemoryPerDispatch() const = 0;
+    virtual void checkIfPrivateMemoryPerDispatchIsNeeded() = 0;
 
     Module() = default;
     Module(const Module &) = delete;
diff --git a/level_zero/core/source/module/module_imp.cpp 
b/level_zero/core/source/module/module_imp.cpp
index 5d94ded48d..faff39d92a 100644
--- a/level_zero/core/source/module/module_imp.cpp
+++ b/level_zero/core/source/module/module_imp.cpp
@@ -13,6 +13,7 @@
 #include "shared/source/device_binary_format/device_binary_formats.h"
 #include "shared/source/helpers/api_specific_config.h"
 #include "shared/source/helpers/constants.h"
+#include "shared/source/helpers/kernel_helpers.h"
 #include "shared/source/helpers/string.h"
 #include "shared/source/memory_manager/memory_manager.h"
 #include "shared/source/memory_manager/unified_memory_manager.h"
@@ -383,6 +384,8 @@ bool ModuleImp::initialize(const ze_module_desc_t *desc, NEO::Device *neoDevice)
     }
     this->maxGroupSize = static_cast(this->translationUnit->device->getNEODevice()->getDeviceInfo().maxWorkGroupSize);
 
+    checkIfPrivateMemoryPerDispatchIsNeeded();
+
     if (debugEnabled) {
         if (device->getSourceLevelDebugger()) {
             for (auto kernelInfo : this->translationUnit->programInfo.kernelInfos) {
@@ -642,6 +645,24 @@ void ModuleImp::verifyDebugCapabilities() {
     debugEnabled = debugCapabilities;
 }
 
+// Sets allocatePrivateMemoryPerDispatch when the private surfaces of all kernels together exceed the device's global memory size.
+void ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded() {
+    auto neoDevice = this->device->getNEODevice();
+    uint64_t modulePrivateMemorySize = 0; // same width as the getGlobalMemorySize() result it is compared with
+    for (auto &kernelImmData : this->kernelImmDatas) {
+        if (0 == kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) {
+            continue;
+        }
+        modulePrivateMemorySize += NEO::KernelHelper::getPrivateSurfaceSize(kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize, neoDevice->getDeviceInfo().computeUnitsUsedForScratch);
+    }
+
+    this->allocatePrivateMemoryPerDispatch = false;
+    if (modulePrivateMemorySize > 0U) {
+        auto globalMemorySize = neoDevice->getRootDevice()->getGlobalMemorySize(static_cast(neoDevice->getDeviceBitfield().to_ulong()));
+        this->allocatePrivateMemoryPerDispatch = modulePrivateMemorySize > globalMemorySize;
+    }
+}
+
 ze_result_t ModuleImp::getProperties(ze_module_properties_t *pModuleProperties) {
     pModuleProperties->flags = 0;
 
diff --git a/level_zero/core/source/module/module_imp.h b/level_zero/core/source/module/module_imp.h
index 0663a83706..710df206a7 100644
--- a/level_zero/core/source/module/module_imp.h
+++ b/level_zero/core/source/module/module_imp.h
@@ -114,6 +114,10 @@ struct ModuleImp : public Module {
 
     bool isDebugEnabled() const override;
 
+    bool shouldAllocatePrivateMemoryPerDispatch() const override {
+        return allocatePrivateMemoryPerDispatch;
+    }
+
     ModuleTranslationUnit *getTranslationUnit() {
         return this->translationUnit.get();
     }
@@ -121,6 +125,7 @@ struct ModuleImp : public Module {
   protected:
     void copyPatchedSegments(const NEO::Linker::PatchableSegments &isaSegmentsForPatching);
     void verifyDebugCapabilities();
+    void checkIfPrivateMemoryPerDispatchIsNeeded() override;
 
     Device *device = nullptr;
     PRODUCT_FAMILY productFamily{};
@@ -132,6 +137,7 @@ struct ModuleImp : public Module {
     NEO::Linker::RelocatedSymbolsMap symbols;
     bool debugEnabled = false;
     bool isFullyLinked = false;
+    bool allocatePrivateMemoryPerDispatch = true;
     ModuleType type;
     NEO::Linker::UnresolvedExternals unresolvedExternalsInfo{};
     std::set importedSymbolAllocations{};
diff --git a/level_zero/core/test/unit_tests/fixtures/module_fixture.h b/level_zero/core/test/unit_tests/fixtures/module_fixture.h
index 0f7864f950..e51e951704 100644
--- a/level_zero/core/test/unit_tests/fixtures/module_fixture.h
+++ b/level_zero/core/test/unit_tests/fixtures/module_fixture.h
@@ -102,6 +102,12 @@ struct ModuleImmutableDataFixture : public DeviceFixture {
         const KernelImmutableData *getKernelImmutableData(const char *functionName) const override {
             return mockKernelImmData;
         }
+
+        void checkIfPrivateMemoryPerDispatchIsNeeded() override {
+            const_cast(kernelImmDatas[0]->getDescriptor()).kernelAttributes.perHwThreadPrivateMemorySize = mockKernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize;
+            ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded();
+        }
+
         MockImmutableData *mockKernelImmData = nullptr;
     };
 
diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp
index 18a8f63d3b..2131a9c300 100644
--- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp
+++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp
@@ -571,6 +571,41 @@ HWTEST_F(KernelImmutableDataTests, givenKernelInitializedWithPrivateMemoryThenCo
     EXPECT_EQ(sizeContainerWithoutPrivateMemory + 1u, sizeContainerWithPrivateMemory);
 }
 
+HWTEST_F(KernelImmutableDataTests, givenKernelWithPrivateMemoryBiggerThanGlobalMemoryThenPrivateMemoryIsNotAllocated) {
+    std::string testFile;
+    retrieveBinaryKernelFilenameNoRevision(testFile, binaryFilename + "_", ".bin");
+
+    size_t size = 0;
+    auto src = loadDataFromFile(
+        testFile.c_str(),
+        size);
+    ASSERT_NE(0u, size);
+    ASSERT_NE(nullptr, src);
+
+    ze_module_desc_t moduleDesc = {};
+    moduleDesc.format = ZE_MODULE_FORMAT_NATIVE;
+    moduleDesc.pInputModule = reinterpret_cast(src.get());
+    moduleDesc.inputSize = size;
+    ModuleBuildLog *moduleBuildLog = nullptr;
+
+    uint32_t perHwThreadPrivateMemorySizeRequested = std::numeric_limits::max();
+    std::unique_ptr mockKernelImmData = std::make_unique(perHwThreadPrivateMemorySizeRequested);
+    std::unique_ptr module = std::make_unique(device,
+                                              moduleBuildLog,
+                                              ModuleType::User,
+                                              perHwThreadPrivateMemorySizeRequested,
+                                              mockKernelImmData.get());
+    bool result = module->initialize(&moduleDesc, device->getNEODevice());
+    EXPECT_TRUE(result);
+    EXPECT_TRUE(module->shouldAllocatePrivateMemoryPerDispatch());
+
+    std::unique_ptr kernel;
+    kernel = std::make_unique(module.get());
+
+    createKernel(kernel.get());
+    EXPECT_EQ(nullptr, kernel->getPrivateMemoryGraphicsAllocation());
+}
+
 class KernelDescriptorRTCallsTrue : public NEO::KernelDescriptor {
     bool hasRTCalls() const override {
         return true;
diff --git 
a/shared/source/device/device.h b/shared/source/device/device.h index e4a5345993..2edeef0e82 100644 --- a/shared/source/device/device.h +++ b/shared/source/device/device.h @@ -121,6 +121,8 @@ class Device : public ReferenceTrackedObject { void initializeRayTracing(); void reduceMaxMemAllocSize(); + virtual uint64_t getGlobalMemorySize(uint32_t deviceBitfield) const; + protected: Device() = delete; Device(ExecutionEnvironment *executionEnvironment); @@ -145,7 +147,6 @@ class Device : public ReferenceTrackedObject { MOCKABLE_VIRTUAL std::unique_ptr createCommandStreamReceiver() const; MOCKABLE_VIRTUAL SubDevice *createSubDevice(uint32_t subDeviceIndex); MOCKABLE_VIRTUAL SubDevice *createEngineInstancedSubDevice(uint32_t subDeviceIndex, aub_stream::EngineType engineType); - virtual uint64_t getGlobalMemorySize(uint32_t deviceBitfield) const; double getPercentOfGlobalMemoryAvailable() const; virtual void createBindlessHeapsHelper() {} bool createSubDevices();