fix: count active modules for enabling per-dispatch private memory

Related-To: NEO-13086

Signed-off-by: Wenbin Lu <wenbin.lu@intel.com>
This commit is contained in:
Wenbin Lu
2024-12-23 21:02:01 +00:00
committed by Compute-Runtime-Automation
parent fece6956c6
commit a483b361f9
8 changed files with 82 additions and 15 deletions

View File

@@ -12,7 +12,6 @@
#include "shared/source/compiler_interface/compiler_options_extra.h"
#include "shared/source/compiler_interface/compiler_warnings/compiler_warnings.h"
#include "shared/source/compiler_interface/external_functions.h"
#include "shared/source/compiler_interface/intermediate_representations.h"
#include "shared/source/compiler_interface/linker.h"
#include "shared/source/debugger/debugger_l0.h"
#include "shared/source/device/device.h"
@@ -53,9 +52,9 @@
#include "program_debug_data.h"
#include <algorithm>
#include <list>
#include <memory>
#include <unordered_map>
namespace L0 {
namespace BuildOptions {
@@ -1280,7 +1279,6 @@ ze_result_t ModuleImp::getKernelNames(uint32_t *pCount, const char **pNames) {
}
void ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded() {
size_t modulePrivateMemorySize = 0;
auto neoDevice = this->device->getNEODevice();
for (auto &kernelImmData : this->kernelImmDatas) {
if (0 == kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) {
@@ -1288,17 +1286,26 @@ void ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded() {
}
auto kernelPrivateMemorySize = NEO::KernelHelper::getPrivateSurfaceSize(kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize,
neoDevice->getDeviceInfo().computeUnitsUsedForScratch);
modulePrivateMemorySize += kernelPrivateMemorySize;
this->privateMemorySize += kernelPrivateMemorySize;
}
this->allocatePrivateMemoryPerDispatch = false;
if (modulePrivateMemorySize > 0U) {
if (this->privateMemorySize > 0U) {
auto deviceBitfield = neoDevice->getDeviceBitfield();
auto globalMemorySize = neoDevice->getRootDevice()->getGlobalMemorySize(static_cast<uint32_t>(deviceBitfield.to_ulong()));
auto numSubDevices = deviceBitfield.count();
this->allocatePrivateMemoryPerDispatch = modulePrivateMemorySize * numSubDevices > globalMemorySize;
auto allSubDevicePrivateMemorySize = this->privateMemorySize * numSubDevices;
float maxPercentage = 0.25f;
if (NEO::debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() > 0) {
maxPercentage = NEO::debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() / 100.0f;
}
auto privateMemorySizeLock = neoDevice->getMemoryManager()->lockKernelManagedPrivateMemorySize();
this->allocatePrivateMemoryPerDispatch = (neoDevice->getMemoryManager()->getKernelManagedPrivateMemorySize() + allSubDevicePrivateMemorySize) > static_cast<size_t>(globalMemorySize * maxPercentage);
if (!this->allocatePrivateMemoryPerDispatch) {
neoDevice->getMemoryManager()->registerKernelManagedPrivateMemorySize(allSubDevicePrivateMemorySize);
}
PRINT_DEBUG_STRING(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Private Memory Per Dispatch %d for modulePrivateMemorySize %zu subDevices %zu globalMemorySize %" PRIu64 "\n",
this->allocatePrivateMemoryPerDispatch, modulePrivateMemorySize, numSubDevices, globalMemorySize);
this->allocatePrivateMemoryPerDispatch, this->privateMemorySize, numSubDevices, globalMemorySize);
}
}
@@ -1570,6 +1577,13 @@ ze_result_t ModuleImp::destroy() {
}
}
if (!this->allocatePrivateMemoryPerDispatch) {
auto neoDevice = this->device->getNEODevice();
auto allSubDevicePrivateMemorySize = neoDevice->getDeviceBitfield().count() * this->privateMemorySize;
auto privateMemorySizeLock = neoDevice->getMemoryManager()->lockKernelManagedPrivateMemorySize();
neoDevice->getMemoryManager()->unregisterKernelManagedPrivateMemorySize(allSubDevicePrivateMemorySize);
}
delete this;
if (tempDevice->getL0Debugger() && tempHandle != 0) {

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2024 Intel Corporation
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -16,7 +16,6 @@
#include "igfxfmid.h"
#include <list>
#include <memory>
#include <set>
#include <string>
@@ -216,6 +215,7 @@ struct ModuleImp : public Module {
uint32_t profileFlags = 0;
uint64_t moduleLoadAddress = std::numeric_limits<uint64_t>::max();
size_t isaAllocationPageSize = 0;
size_t privateMemorySize = 0;
NEO::Linker::PatchableSegments isaSegmentsForPatching;
std::vector<std::vector<char>> patchedIsaTempStorage;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2024 Intel Corporation
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -16,7 +16,6 @@
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
#include "shared/test/common/mocks/mock_compilers.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/test_macros/hw_test.h"
@@ -1305,12 +1304,14 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichTogethe
}
}
HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichDontExceedGlobalMemSizeWhenAppendLaunchKernelWithParamsIsCalledThenNoAllocationIsDone, MatchAny) {
HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichDontExceedGlobalMemSizePercentWhenAppendLaunchKernelWithParamsIsCalledThenNoAllocationIsDone, MatchAny) {
debugManager.flags.MaxKernelManagedPrivateMemoryPercent.set(33);
auto devInfo = device->getNEODevice()->getDeviceInfo();
auto kernelsNb = 2u;
uint32_t margin128KB = 131072u;
auto underAllocSize = static_cast<uint32_t>(devInfo.globalMemSize / kernelsNb / devInfo.computeUnitsUsedForScratch) - margin128KB;
auto maxModulePrivateMemorySize = static_cast<uint64_t>(devInfo.globalMemSize * (debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() / 100.0f));
auto underAllocSize = static_cast<uint32_t>(maxModulePrivateMemorySize / kernelsNb / devInfo.computeUnitsUsedForScratch) - margin128KB;
auto kernelNames = std::array<std::string, 2u>{"test1", "test2"};
auto &kernelImmDatas = this->module->kernelImmDatas;
@@ -1340,6 +1341,37 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichDontExc
EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 0u);
}
}
// Checks the "over the limit" path of ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded():
// with MaxKernelManagedPrivateMemoryPercent set to 80, a single kernel sized from 81% of
// global memory must flip the module to per-dispatch private memory, and the command list
// must then own exactly one private allocation after allocateOrReuseKernelPrivateMemoryIfNeeded().
HWTEST2_F(CommandListAppendLaunchKernel, givenKernelPrivateAllocWhichExceedGlobalMemSizePercentWhenAppendLaunchKernelWithParamsIsCalledThenAllocationIsDone, MatchAny) {
    // Scoped restorer (same pattern as the neighbouring debug-toggle test) so the flag
    // override below is not left set for tests that run afterwards.
    DebugManagerStateRestore restorer;
    debugManager.flags.MaxKernelManagedPrivateMemoryPercent.set(80);

    auto devInfo = device->getNEODevice()->getDeviceInfo();
    // Size the kernel's private memory from (limit + 1) percent of global memory so the
    // module total is expected to land above the configured cap.
    auto kernelPrivateMemorySize = static_cast<uint64_t>(devInfo.globalMemSize * ((debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() + 1) / 100.0f));
    // Per-HW-thread size that is written into the kernel descriptor below.
    auto overAllocSize = static_cast<uint32_t>(kernelPrivateMemorySize / devInfo.computeUnitsUsedForScratch);

    auto &kernelImmDatas = this->module->kernelImmDatas;
    // The descriptor is exposed as const; the test patches it in place.
    auto &kernelDesc = const_cast<KernelDescriptor &>(kernelImmDatas[0]->getDescriptor());
    kernelDesc.kernelAttributes.perHwThreadPrivateMemorySize = overAllocSize;
    kernelDesc.kernelAttributes.flags.usesPrintf = false;
    kernelDesc.kernelMetadata.kernelName = "test1";

    // The decision is only taken when the check is (re)run on the patched descriptor.
    EXPECT_FALSE(this->module->shouldAllocatePrivateMemoryPerDispatch());
    this->module->checkIfPrivateMemoryPerDispatchIsNeeded();
    EXPECT_TRUE(this->module->shouldAllocatePrivateMemoryPerDispatch());

    auto pCommandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
    pCommandList->device = this->module->getDevice();
    auto memoryMgr = static_cast<OsAgnosticMemoryManager *>(pCommandList->device->getNEODevice()->getExecutionEnvironment()->memoryManager.get());
    // NOTE(review): presumably lets the mock memory manager satisfy the oversized request
    // without really backing it - confirm against OsAgnosticMemoryManager.
    memoryMgr->turnOnFakingBigAllocations();

    auto kernels = std::vector<std::unique_ptr<WhiteBox<::L0::KernelImp>>>();
    EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 0u);
    kernels.push_back(this->createKernelWithName("test1"));
    pCommandList->allocateOrReuseKernelPrivateMemoryIfNeeded(kernels[0].get(),
                                                             kernels[0]->getKernelDescriptor().kernelAttributes.perHwThreadPrivateMemorySize);
    EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 1u);
}
HWTEST2_F(CommandListAppendLaunchKernel, GivenDebugToggleSetWhenUpdateStreamPropertiesIsCalledThenCorrectThreadArbitrationPolicyIsSet, MatchAny) {
DebugManagerStateRestore restorer;
debugManager.flags.ForceThreadArbitrationPolicyProgrammingWithScm.set(1);