Revert "fix: count active modules for enabling per-dispatch private memory"

This reverts commit a483b361f9.

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2025-01-15 11:00:40 +00:00
committed by Compute-Runtime-Automation
parent 621ceaf9ec
commit 2dd9940f60
8 changed files with 12 additions and 79 deletions

View File

@@ -12,6 +12,7 @@
#include "shared/source/compiler_interface/compiler_options_extra.h"
#include "shared/source/compiler_interface/compiler_warnings/compiler_warnings.h"
#include "shared/source/compiler_interface/external_functions.h"
#include "shared/source/compiler_interface/intermediate_representations.h"
#include "shared/source/compiler_interface/linker.h"
#include "shared/source/debugger/debugger_l0.h"
#include "shared/source/device/device.h"
@@ -52,9 +53,9 @@
#include "program_debug_data.h"
#include <algorithm>
#include <list>
#include <memory>
#include <unordered_map>
namespace L0 {
namespace BuildOptions {
@@ -1279,6 +1280,7 @@ ze_result_t ModuleImp::getKernelNames(uint32_t *pCount, const char **pNames) {
}
void ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded() {
size_t modulePrivateMemorySize = 0;
auto neoDevice = this->device->getNEODevice();
for (auto &kernelImmData : this->kernelImmDatas) {
if (0 == kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) {
@@ -1286,26 +1288,17 @@ void ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded() {
}
auto kernelPrivateMemorySize = NEO::KernelHelper::getPrivateSurfaceSize(kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize,
neoDevice->getDeviceInfo().computeUnitsUsedForScratch);
this->privateMemorySize += kernelPrivateMemorySize;
modulePrivateMemorySize += kernelPrivateMemorySize;
}
this->allocatePrivateMemoryPerDispatch = false;
if (this->privateMemorySize > 0U) {
if (modulePrivateMemorySize > 0U) {
auto deviceBitfield = neoDevice->getDeviceBitfield();
auto globalMemorySize = neoDevice->getRootDevice()->getGlobalMemorySize(static_cast<uint32_t>(deviceBitfield.to_ulong()));
auto numSubDevices = deviceBitfield.count();
auto allSubDevicePrivateMemorySize = this->privateMemorySize * numSubDevices;
float maxPercentage = 0.25f;
if (NEO::debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() > 0) {
maxPercentage = NEO::debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() / 100.0f;
}
auto privateMemorySizeLock = neoDevice->getMemoryManager()->lockKernelManagedPrivateMemorySize();
this->allocatePrivateMemoryPerDispatch = (neoDevice->getMemoryManager()->getKernelManagedPrivateMemorySize() + allSubDevicePrivateMemorySize) > static_cast<size_t>(globalMemorySize * maxPercentage);
if (!this->allocatePrivateMemoryPerDispatch) {
neoDevice->getMemoryManager()->registerKernelManagedPrivateMemorySize(allSubDevicePrivateMemorySize);
}
this->allocatePrivateMemoryPerDispatch = modulePrivateMemorySize * numSubDevices > globalMemorySize;
PRINT_DEBUG_STRING(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Private Memory Per Dispatch %d for modulePrivateMemorySize %zu subDevices %zu globalMemorySize %" PRIu64 "\n",
this->allocatePrivateMemoryPerDispatch, this->privateMemorySize, numSubDevices, globalMemorySize);
this->allocatePrivateMemoryPerDispatch, modulePrivateMemorySize, numSubDevices, globalMemorySize);
}
}
@@ -1577,13 +1570,6 @@ ze_result_t ModuleImp::destroy() {
}
}
if (!this->allocatePrivateMemoryPerDispatch) {
auto neoDevice = this->device->getNEODevice();
auto allSubDevicePrivateMemorySize = neoDevice->getDeviceBitfield().count() * this->privateMemorySize;
auto privateMemorySizeLock = neoDevice->getMemoryManager()->lockKernelManagedPrivateMemorySize();
neoDevice->getMemoryManager()->unregisterKernelManagedPrivateMemorySize(allSubDevicePrivateMemorySize);
}
delete this;
if (tempDevice->getL0Debugger() && tempHandle != 0) {

View File

@@ -16,6 +16,7 @@
#include "igfxfmid.h"
#include <list>
#include <memory>
#include <set>
#include <string>
@@ -215,7 +216,6 @@ struct ModuleImp : public Module {
uint32_t profileFlags = 0;
uint64_t moduleLoadAddress = std::numeric_limits<uint64_t>::max();
size_t isaAllocationPageSize = 0;
size_t privateMemorySize = 0;
NEO::Linker::PatchableSegments isaSegmentsForPatching;
std::vector<std::vector<char>> patchedIsaTempStorage;

View File

@@ -16,6 +16,7 @@
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
#include "shared/test/common/mocks/mock_compilers.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/test_macros/hw_test.h"
@@ -1304,14 +1305,12 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichTogethe
}
}
HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichDontExceedGlobalMemSizePercentWhenAppendLaunchKernelWithParamsIsCalledThenNoAllocationIsDone, MatchAny) {
HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichDontExceedGlobalMemSizeWhenAppendLaunchKernelWithParamsIsCalledThenNoAllocationIsDone, MatchAny) {
debugManager.flags.MaxKernelManagedPrivateMemoryPercent.set(33);
auto devInfo = device->getNEODevice()->getDeviceInfo();
auto kernelsNb = 2u;
uint32_t margin128KB = 131072u;
auto maxModulePrivateMemorySize = static_cast<uint64_t>(devInfo.globalMemSize * (debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() / 100.0f));
auto underAllocSize = static_cast<uint32_t>(maxModulePrivateMemorySize / kernelsNb / devInfo.computeUnitsUsedForScratch) - margin128KB;
auto underAllocSize = static_cast<uint32_t>(devInfo.globalMemSize / kernelsNb / devInfo.computeUnitsUsedForScratch) - margin128KB;
auto kernelNames = std::array<std::string, 2u>{"test1", "test2"};
auto &kernelImmDatas = this->module->kernelImmDatas;
@@ -1341,37 +1340,6 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichDontExc
EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 0u);
}
}
// Scenario: a single kernel whose private memory requirement is sized one percentage
// point above the MaxKernelManagedPrivateMemoryPercent limit. The test expects the module
// to switch to per-dispatch private memory and the command list to end up owning exactly
// one private allocation for that kernel.
// NOTE(review): no DebugManagerStateRestore is visible in this test body - confirm the
// fixture restores MaxKernelManagedPrivateMemoryPercent after the test.
HWTEST2_F(CommandListAppendLaunchKernel, givenKernelPrivateAllocWhichExceedGlobalMemSizePercentWhenAppendLaunchKernelWithParamsIsCalledThenAllocationIsDone, MatchAny) {
// Cap kernel-managed private memory at 80% of global memory for this test.
debugManager.flags.MaxKernelManagedPrivateMemoryPercent.set(80);
auto devInfo = device->getNEODevice()->getDeviceInfo();
// Target total size: (limit + 1)% of global memory, i.e. just over the configured cap.
auto kernelPrivateMemorySize = static_cast<uint64_t>(devInfo.globalMemSize * ((debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() + 1) / 100.0f));
// Per-HW-thread size derived from the over-limit total; despite the variable's name this
// value is meant to exceed the limit once scaled back up by computeUnitsUsedForScratch
// (presumably what getPrivateSurfaceSize does - confirm).
auto underAllocSize = static_cast<uint32_t>(kernelPrivateMemorySize / devInfo.computeUnitsUsedForScratch);
auto &kernelImmDatas = this->module->kernelImmDatas;
// The descriptor is exposed as const; the test casts constness away to patch kernel 0 in place.
auto &kernelDesc = const_cast<KernelDescriptor &>(kernelImmDatas[0]->getDescriptor());
kernelDesc.kernelAttributes.perHwThreadPrivateMemorySize = underAllocSize;
kernelDesc.kernelAttributes.flags.usesPrintf = false;
kernelDesc.kernelMetadata.kernelName = "test1";
// The flag is false before the check and must flip to true once the module re-evaluates its kernels.
EXPECT_FALSE(this->module->shouldAllocatePrivateMemoryPerDispatch());
this->module->checkIfPrivateMemoryPerDispatchIsNeeded();
EXPECT_TRUE(this->module->shouldAllocatePrivateMemoryPerDispatch());
auto pCommandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
pCommandList->device = this->module->getDevice();
auto memoryMgr = static_cast<OsAgnosticMemoryManager *>(pCommandList->device->getNEODevice()->getExecutionEnvironment()->memoryManager.get());
// Presumably lets the memory manager satisfy the oversized request without real backing memory - confirm.
memoryMgr->turnOnFakingBigAllocations();
auto kernels = std::vector<std::unique_ptr<WhiteBox<::L0::KernelImp>>>();
// No private allocations are owned by the command list before the call under test.
EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 0u);
kernels.push_back(this->createKernelWithName("test1"));
pCommandList->allocateOrReuseKernelPrivateMemoryIfNeeded(kernels[0].get(),
kernels[0]->getKernelDescriptor().kernelAttributes.perHwThreadPrivateMemorySize);
// Exactly one private allocation is now owned by the command list.
EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 1u);
}
HWTEST2_F(CommandListAppendLaunchKernel, GivenDebugToggleSetWhenUpdateStreamPropertiesIsCalledThenCorrectThreadArbitrationPolicyIsSet, MatchAny) {
DebugManagerStateRestore restorer;
debugManager.flags.ForceThreadArbitrationPolicyProgrammingWithScm.set(1);

View File

@@ -385,7 +385,6 @@ DECLARE_DEBUG_VARIABLE(int32_t, AllowZeroCopyWithoutCoherency, -1, "Use cachelin
DECLARE_DEBUG_VARIABLE(int32_t, EnableHostPtrTracking, -1, "Enable host ptr tracking: -1 - default platform setting, 0 - disabled, 1 - enabled")
DECLARE_DEBUG_VARIABLE(int32_t, MaxHwThreadsPercent, 0, "If not zero then maximum number of used HW threads is capped to max * MaxHwThreadsPercent / 100")
DECLARE_DEBUG_VARIABLE(int32_t, MinHwThreadsUnoccupied, 0, "If not zero then maximum number of used HW threads is reduced by MinHwThreadsUnoccupied")
DECLARE_DEBUG_VARIABLE(int32_t, MaxKernelManagedPrivateMemoryPercent, 0, "If not zero then maximum amount of kernel-managed private memory is capped to MaxGlobalMemory * MaxKernelManagedPrivateMemoryPercent / 100")
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushEveryEnqueueCount, -1, "If greater than 0, driver performs implicit flush every N submissions.")
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForNewResource, -1, "-1: platform specific, 0: force disable, 1: force enable")
DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForIdleGpu, -1, "-1: platform specific, 0: force disable, 1: force enable")

View File

@@ -64,7 +64,6 @@ MemoryManager::MemoryManager(ExecutionEnvironment &executionEnvironment) : execu
secondaryEngines.resize(rootEnvCount + 1);
localMemAllocsSize = std::make_unique<std::atomic<size_t>[]>(rootEnvCount);
sysMemAllocsSize.store(0u);
kernelManagedPrivateMemorySize = 0u;
for (uint32_t rootDeviceIndex = 0; rootDeviceIndex < rootEnvCount; ++rootDeviceIndex) {
auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[rootDeviceIndex];

View File

@@ -272,9 +272,6 @@ class MemoryManager {
virtual AllocationStatus registerSysMemAlloc(GraphicsAllocation *allocation);
virtual AllocationStatus registerLocalMemAlloc(GraphicsAllocation *allocation, uint32_t rootDeviceIndex);
// Adds `size` to the running kernel-managed private memory total. Performs no locking itself;
// the caller visible in this diff takes lockKernelManagedPrivateMemorySize() before calling.
void registerKernelManagedPrivateMemorySize(size_t size) { this->kernelManagedPrivateMemorySize += size; };
// Subtracts `size` from the running kernel-managed private memory total. Performs no locking and
// no underflow check; the total is an unsigned size_t, so callers must only unregister what was registered.
void unregisterKernelManagedPrivateMemorySize(size_t size) { this->kernelManagedPrivateMemorySize -= size; };
virtual bool setMemAdvise(GraphicsAllocation *gfxAllocation, MemAdviseFlags flags, uint32_t rootDeviceIndex) { return true; }
virtual bool setMemPrefetch(GraphicsAllocation *gfxAllocation, SubDeviceIdsVec &subDeviceIds, uint32_t rootDeviceIndex) { return true; }
virtual bool setAtomicAccess(GraphicsAllocation *gfxAllocation, size_t size, AtomicAccessMode mode, uint32_t rootDeviceIndex) { return true; }
@@ -335,8 +332,6 @@ class MemoryManager {
size_t getUsedLocalMemorySize(uint32_t rootDeviceIndex) const { return localMemAllocsSize[rootDeviceIndex]; }
size_t getUsedSystemMemorySize() const { return sysMemAllocsSize; }
// Returns the current kernel-managed private memory total. Reads the plain size_t member without
// taking the mutex, so callers that need a consistent read-modify-write must hold the lock themselves.
size_t getKernelManagedPrivateMemorySize() const { return kernelManagedPrivateMemorySize; }
// Locks kernelManagedPrivateMemorySizeMutex and returns the owning std::unique_lock; the mutex is
// released when the returned object is destroyed, hence [[nodiscard]] to flag a discarded result.
[[nodiscard]] std::unique_lock<std::mutex> lockKernelManagedPrivateMemorySize() { return std::unique_lock<std::mutex>(this->kernelManagedPrivateMemorySizeMutex); };
uint32_t getFirstContextIdForRootDevice(uint32_t rootDeviceIndex);
virtual void getExtraDeviceProperties(uint32_t rootDeviceIndex, uint32_t *moduleId, uint16_t *serverType) { return; }
@@ -433,8 +428,6 @@ class MemoryManager {
std::mutex physicalMemoryAllocationMapMutex;
std::unique_ptr<std::atomic<size_t>[]> localMemAllocsSize;
std::atomic<size_t> sysMemAllocsSize;
size_t kernelManagedPrivateMemorySize;
std::mutex kernelManagedPrivateMemorySizeMutex;
size_t hostAllocationsSavedForReuseSize = 0u;
mutable std::mutex hostAllocationsReuseMtx;
std::map<std::pair<AllocationType, bool>, CustomHeapAllocatorConfig> customHeapAllocators;

View File

@@ -221,7 +221,6 @@ ReturnRawGpuTimestamps = 0
EnableDeviceBasedTimestamps = 1
MaxHwThreadsPercent = 0
MinHwThreadsUnoccupied = 0
MaxKernelManagedPrivateMemoryPercent = 0
LimitBlitterMaxWidth = -1
LimitBlitterMaxHeight = -1
PostBlitCommand = -1

View File

@@ -5,6 +5,7 @@
*
*/
#include "shared/source/compiler_interface/external_functions.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/blit_helper.h"
#include "shared/source/helpers/surface_format_info.h"
@@ -228,18 +229,6 @@ TEST(MemoryManagerTest, givenFailureOnRegisterLocalMemoryAllocationWhenAllocatin
EXPECT_EQ(nullptr, memoryManager.allocateGraphicsMemoryWithProperties(properties));
}
// Verifies the kernel-managed private memory bookkeeping on the memory manager: the total starts
// at zero, grows by the registered size, and shrinks by each unregistered size back to zero.
TEST(MemoryManagerTest, givenDifferentSizesWhenRegisteringAndUnregisteringModulePrivateMemorySizesThenCorrectValuesAreReturned) {
MockMemoryManager memoryManager(true, true);
// Hold the bookkeeping lock for the whole test, as the register/unregister calls do not lock themselves.
auto privateMemorySizeLock = memoryManager.lockKernelManagedPrivateMemorySize();
EXPECT_EQ(0u, memoryManager.getKernelManagedPrivateMemorySize());
memoryManager.registerKernelManagedPrivateMemorySize(1234u);
EXPECT_EQ(1234u, memoryManager.getKernelManagedPrivateMemorySize());
// Partial unregister: 1234 - 1000 leaves 234.
memoryManager.unregisterKernelManagedPrivateMemorySize(1000u);
EXPECT_EQ(234u, memoryManager.getKernelManagedPrivateMemorySize());
// Unregistering the remainder returns the total to zero.
memoryManager.unregisterKernelManagedPrivateMemorySize(234u);
EXPECT_EQ(0u, memoryManager.getKernelManagedPrivateMemorySize());
}
using MemoryhManagerMultiContextResourceTests = ::testing::Test;
HWTEST_F(MemoryhManagerMultiContextResourceTests, givenAllocationUsedByManyOsContextsWhenCheckingUsageBeforeDestroyThenMultiContextDestructorIsUsedForWaitingForAllOsContexts) {
auto executionEnvironment = new MockExecutionEnvironment(defaultHwInfo.get(), true, 2);