Revert "fix: count active modules for enabling per-dispatch private memory"

This reverts commit a483b361f9.

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
2025-12-19 06:24:51 +08:00 · 2025-01-15 11:00:40 +00:00
parent 621ceaf9ec
commit 2dd9940f60
8 changed files with 12 additions and 79 deletions
--- a/level_zero/core/source/module/module_imp.cpp
+++ b/level_zero/core/source/module/module_imp.cpp
@@ -12,6 +12,7 @@
 #include "shared/source/compiler_interface/compiler_options_extra.h"
 #include "shared/source/compiler_interface/compiler_warnings/compiler_warnings.h"
 #include "shared/source/compiler_interface/external_functions.h"
+#include "shared/source/compiler_interface/intermediate_representations.h"
 #include "shared/source/compiler_interface/linker.h"
 #include "shared/source/debugger/debugger_l0.h"
 #include "shared/source/device/device.h"
@@ -52,9 +53,9 @@
 #include "program_debug_data.h"

 #include <algorithm>
+#include <list>
 #include <memory>
 #include <unordered_map>
-
 namespace L0 {

 namespace BuildOptions {
@@ -1279,6 +1280,7 @@ ze_result_t ModuleImp::getKernelNames(uint32_t *pCount, const char **pNames) {
 }

 void ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded() {
+    size_t modulePrivateMemorySize = 0;
    auto neoDevice = this->device->getNEODevice();
    for (auto &kernelImmData : this->kernelImmDatas) {
        if (0 == kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) {
@@ -1286,26 +1288,17 @@ void ModuleImp::checkIfPrivateMemoryPerDispatchIsNeeded() {
        }
        auto kernelPrivateMemorySize = NEO::KernelHelper::getPrivateSurfaceSize(kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize,
                                                                                neoDevice->getDeviceInfo().computeUnitsUsedForScratch);
-        this->privateMemorySize += kernelPrivateMemorySize;
+        modulePrivateMemorySize += kernelPrivateMemorySize;
    }

    this->allocatePrivateMemoryPerDispatch = false;
-    if (this->privateMemorySize > 0U) {
+    if (modulePrivateMemorySize > 0U) {
        auto deviceBitfield = neoDevice->getDeviceBitfield();
        auto globalMemorySize = neoDevice->getRootDevice()->getGlobalMemorySize(static_cast<uint32_t>(deviceBitfield.to_ulong()));
        auto numSubDevices = deviceBitfield.count();
-        auto allSubDevicePrivateMemorySize = this->privateMemorySize * numSubDevices;
-        float maxPercentage = 0.25f;
-        if (NEO::debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() > 0) {
-            maxPercentage = NEO::debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() / 100.0f;
-        }
-        auto privateMemorySizeLock = neoDevice->getMemoryManager()->lockKernelManagedPrivateMemorySize();
-        this->allocatePrivateMemoryPerDispatch = (neoDevice->getMemoryManager()->getKernelManagedPrivateMemorySize() + allSubDevicePrivateMemorySize) > static_cast<size_t>(globalMemorySize * maxPercentage);
-        if (!this->allocatePrivateMemoryPerDispatch) {
-            neoDevice->getMemoryManager()->registerKernelManagedPrivateMemorySize(allSubDevicePrivateMemorySize);
-        }
+        this->allocatePrivateMemoryPerDispatch = modulePrivateMemorySize * numSubDevices > globalMemorySize;
        PRINT_DEBUG_STRING(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Private Memory Per Dispatch %d for modulePrivateMemorySize %zu subDevices %zu globalMemorySize %" PRIu64 "\n",
-                           this->allocatePrivateMemoryPerDispatch, this->privateMemorySize, numSubDevices, globalMemorySize);
+                           this->allocatePrivateMemoryPerDispatch, modulePrivateMemorySize, numSubDevices, globalMemorySize);
    }
 }

@@ -1577,13 +1570,6 @@ ze_result_t ModuleImp::destroy() {
        }
    }

-    if (!this->allocatePrivateMemoryPerDispatch) {
-        auto neoDevice = this->device->getNEODevice();
-        auto allSubDevicePrivateMemorySize = neoDevice->getDeviceBitfield().count() * this->privateMemorySize;
-        auto privateMemorySizeLock = neoDevice->getMemoryManager()->lockKernelManagedPrivateMemorySize();
-        neoDevice->getMemoryManager()->unregisterKernelManagedPrivateMemorySize(allSubDevicePrivateMemorySize);
-    }
-
    delete this;

    if (tempDevice->getL0Debugger() && tempHandle != 0) {
--- a/level_zero/core/source/module/module_imp.h
+++ b/level_zero/core/source/module/module_imp.h
@@ -16,6 +16,7 @@

 #include "igfxfmid.h"

+#include <list>
 #include <memory>
 #include <set>
 #include <string>
@@ -215,7 +216,6 @@ struct ModuleImp : public Module {
    uint32_t profileFlags = 0;
    uint64_t moduleLoadAddress = std::numeric_limits<uint64_t>::max();
    size_t isaAllocationPageSize = 0;
-    size_t privateMemorySize = 0;

    NEO::Linker::PatchableSegments isaSegmentsForPatching;
    std::vector<std::vector<char>> patchedIsaTempStorage;
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
@@ -16,6 +16,7 @@
 #include "shared/test/common/cmd_parse/gen_cmd_parse.h"
 #include "shared/test/common/helpers/unit_test_helper.h"
 #include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
+#include "shared/test/common/mocks/mock_compilers.h"
 #include "shared/test/common/mocks/mock_device.h"
 #include "shared/test/common/test_macros/hw_test.h"

@@ -1304,14 +1305,12 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichTogethe
    }
 }

-HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichDontExceedGlobalMemSizePercentWhenAppendLaunchKernelWithParamsIsCalledThenNoAllocationIsDone, MatchAny) {
+HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichDontExceedGlobalMemSizeWhenAppendLaunchKernelWithParamsIsCalledThenNoAllocationIsDone, MatchAny) {

-    debugManager.flags.MaxKernelManagedPrivateMemoryPercent.set(33);
    auto devInfo = device->getNEODevice()->getDeviceInfo();
    auto kernelsNb = 2u;
    uint32_t margin128KB = 131072u;
-    auto maxModulePrivateMemorySize = static_cast<uint64_t>(devInfo.globalMemSize * (debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() / 100.0f));
-    auto underAllocSize = static_cast<uint32_t>(maxModulePrivateMemorySize / kernelsNb / devInfo.computeUnitsUsedForScratch) - margin128KB;
+    auto underAllocSize = static_cast<uint32_t>(devInfo.globalMemSize / kernelsNb / devInfo.computeUnitsUsedForScratch) - margin128KB;
    auto kernelNames = std::array<std::string, 2u>{"test1", "test2"};

    auto &kernelImmDatas = this->module->kernelImmDatas;
@@ -1341,37 +1340,6 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTwoKernelPrivateAllocsWhichDontExc
        EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 0u);
    }
 }
-
-HWTEST2_F(CommandListAppendLaunchKernel, givenKernelPrivateAllocWhichExceedGlobalMemSizePercentWhenAppendLaunchKernelWithParamsIsCalledThenAllocationIsDone, MatchAny) {
-
-    debugManager.flags.MaxKernelManagedPrivateMemoryPercent.set(80);
-    auto devInfo = device->getNEODevice()->getDeviceInfo();
-    auto kernelPrivateMemorySize = static_cast<uint64_t>(devInfo.globalMemSize * ((debugManager.flags.MaxKernelManagedPrivateMemoryPercent.get() + 1) / 100.0f));
-    auto underAllocSize = static_cast<uint32_t>(kernelPrivateMemorySize / devInfo.computeUnitsUsedForScratch);
-
-    auto &kernelImmDatas = this->module->kernelImmDatas;
-    auto &kernelDesc = const_cast<KernelDescriptor &>(kernelImmDatas[0]->getDescriptor());
-    kernelDesc.kernelAttributes.perHwThreadPrivateMemorySize = underAllocSize;
-    kernelDesc.kernelAttributes.flags.usesPrintf = false;
-    kernelDesc.kernelMetadata.kernelName = "test1";
-
-    EXPECT_FALSE(this->module->shouldAllocatePrivateMemoryPerDispatch());
-    this->module->checkIfPrivateMemoryPerDispatchIsNeeded();
-    EXPECT_TRUE(this->module->shouldAllocatePrivateMemoryPerDispatch());
-
-    auto pCommandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
-    pCommandList->device = this->module->getDevice();
-    auto memoryMgr = static_cast<OsAgnosticMemoryManager *>(pCommandList->device->getNEODevice()->getExecutionEnvironment()->memoryManager.get());
-    memoryMgr->turnOnFakingBigAllocations();
-
-    auto kernels = std::vector<std::unique_ptr<WhiteBox<::L0::KernelImp>>>();
-    EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 0u);
-    kernels.push_back(this->createKernelWithName("test1"));
-    pCommandList->allocateOrReuseKernelPrivateMemoryIfNeeded(kernels[0].get(),
-                                                             kernels[0]->getKernelDescriptor().kernelAttributes.perHwThreadPrivateMemorySize);
-    EXPECT_EQ(pCommandList->getOwnedPrivateAllocationsSize(), 1u);
-}
-
 HWTEST2_F(CommandListAppendLaunchKernel, GivenDebugToggleSetWhenUpdateStreamPropertiesIsCalledThenCorrectThreadArbitrationPolicyIsSet, MatchAny) {
    DebugManagerStateRestore restorer;
    debugManager.flags.ForceThreadArbitrationPolicyProgrammingWithScm.set(1);
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -385,7 +385,6 @@ DECLARE_DEBUG_VARIABLE(int32_t, AllowZeroCopyWithoutCoherency, -1, "Use cachelin
 DECLARE_DEBUG_VARIABLE(int32_t, EnableHostPtrTracking, -1, "Enable host ptr tracking: -1 - default platform setting, 0 - disabled, 1 - enabled")
 DECLARE_DEBUG_VARIABLE(int32_t, MaxHwThreadsPercent, 0, "If not zero then maximum number of used HW threads is capped to max * MaxHwThreadsPercent / 100")
 DECLARE_DEBUG_VARIABLE(int32_t, MinHwThreadsUnoccupied, 0, "If not zero then maximum number of used HW threads is reduced by MinHwThreadsUnoccupied")
-DECLARE_DEBUG_VARIABLE(int32_t, MaxKernelManagedPrivateMemoryPercent, 0, "If not zero then maximum amount of kernel-managed private memory is capped to MaxGlobalMemory * MaxKernelManagedPrivateMemoryPercent / 100")
 DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushEveryEnqueueCount, -1, "If greater than 0, driver performs implicit flush every N submissions.")
 DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForNewResource, -1, "-1: platform specific, 0: force disable, 1: force enable")
 DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForIdleGpu, -1, "-1: platform specific, 0: force disable, 1: force enable")
--- a/shared/source/memory_manager/memory_manager.cpp
+++ b/shared/source/memory_manager/memory_manager.cpp
@@ -64,7 +64,6 @@ MemoryManager::MemoryManager(ExecutionEnvironment &executionEnvironment) : execu
    secondaryEngines.resize(rootEnvCount + 1);
    localMemAllocsSize = std::make_unique<std::atomic<size_t>[]>(rootEnvCount);
    sysMemAllocsSize.store(0u);
-    kernelManagedPrivateMemorySize = 0u;

    for (uint32_t rootDeviceIndex = 0; rootDeviceIndex < rootEnvCount; ++rootDeviceIndex) {
        auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[rootDeviceIndex];
--- a/shared/source/memory_manager/memory_manager.h
+++ b/shared/source/memory_manager/memory_manager.h
@@ -272,9 +272,6 @@ class MemoryManager {
    virtual AllocationStatus registerSysMemAlloc(GraphicsAllocation *allocation);
    virtual AllocationStatus registerLocalMemAlloc(GraphicsAllocation *allocation, uint32_t rootDeviceIndex);

-    void registerKernelManagedPrivateMemorySize(size_t size) { this->kernelManagedPrivateMemorySize += size; };
-    void unregisterKernelManagedPrivateMemorySize(size_t size) { this->kernelManagedPrivateMemorySize -= size; };
-
    virtual bool setMemAdvise(GraphicsAllocation *gfxAllocation, MemAdviseFlags flags, uint32_t rootDeviceIndex) { return true; }
    virtual bool setMemPrefetch(GraphicsAllocation *gfxAllocation, SubDeviceIdsVec &subDeviceIds, uint32_t rootDeviceIndex) { return true; }
    virtual bool setAtomicAccess(GraphicsAllocation *gfxAllocation, size_t size, AtomicAccessMode mode, uint32_t rootDeviceIndex) { return true; }
@@ -335,8 +332,6 @@ class MemoryManager {

    size_t getUsedLocalMemorySize(uint32_t rootDeviceIndex) const { return localMemAllocsSize[rootDeviceIndex]; }
    size_t getUsedSystemMemorySize() const { return sysMemAllocsSize; }
-    size_t getKernelManagedPrivateMemorySize() const { return kernelManagedPrivateMemorySize; }
-    [[nodiscard]] std::unique_lock<std::mutex> lockKernelManagedPrivateMemorySize() { return std::unique_lock<std::mutex>(this->kernelManagedPrivateMemorySizeMutex); };
    uint32_t getFirstContextIdForRootDevice(uint32_t rootDeviceIndex);

    virtual void getExtraDeviceProperties(uint32_t rootDeviceIndex, uint32_t *moduleId, uint16_t *serverType) { return; }
@@ -433,8 +428,6 @@ class MemoryManager {
    std::mutex physicalMemoryAllocationMapMutex;
    std::unique_ptr<std::atomic<size_t>[]> localMemAllocsSize;
    std::atomic<size_t> sysMemAllocsSize;
-    size_t kernelManagedPrivateMemorySize;
-    std::mutex kernelManagedPrivateMemorySizeMutex;
    size_t hostAllocationsSavedForReuseSize = 0u;
    mutable std::mutex hostAllocationsReuseMtx;
    std::map<std::pair<AllocationType, bool>, CustomHeapAllocatorConfig> customHeapAllocators;
--- a/shared/test/common/test_files/igdrcl.config
+++ b/shared/test/common/test_files/igdrcl.config
@@ -221,7 +221,6 @@ ReturnRawGpuTimestamps = 0
 EnableDeviceBasedTimestamps = 1
 MaxHwThreadsPercent = 0
 MinHwThreadsUnoccupied = 0
-MaxKernelManagedPrivateMemoryPercent = 0
 LimitBlitterMaxWidth = -1
 LimitBlitterMaxHeight = -1
 PostBlitCommand = -1
--- a/shared/test/unit_test/memory_manager/memory_manager_tests.cpp
+++ b/shared/test/unit_test/memory_manager/memory_manager_tests.cpp
@@ -5,6 +5,7 @@
 *
 */

+#include "shared/source/compiler_interface/external_functions.h"
 #include "shared/source/gmm_helper/gmm_helper.h"
 #include "shared/source/helpers/blit_helper.h"
 #include "shared/source/helpers/surface_format_info.h"
@@ -228,18 +229,6 @@ TEST(MemoryManagerTest, givenFailureOnRegisterLocalMemoryAllocationWhenAllocatin
    EXPECT_EQ(nullptr, memoryManager.allocateGraphicsMemoryWithProperties(properties));
 }

-TEST(MemoryManagerTest, givenDifferentSizesWhenRegisteringAndUnregisteringModulePrivateMemorySizesThenCorrectValuesAreReturned) {
-    MockMemoryManager memoryManager(true, true);
-    auto privateMemorySizeLock = memoryManager.lockKernelManagedPrivateMemorySize();
-    EXPECT_EQ(0u, memoryManager.getKernelManagedPrivateMemorySize());
-    memoryManager.registerKernelManagedPrivateMemorySize(1234u);
-    EXPECT_EQ(1234u, memoryManager.getKernelManagedPrivateMemorySize());
-    memoryManager.unregisterKernelManagedPrivateMemorySize(1000u);
-    EXPECT_EQ(234u, memoryManager.getKernelManagedPrivateMemorySize());
-    memoryManager.unregisterKernelManagedPrivateMemorySize(234u);
-    EXPECT_EQ(0u, memoryManager.getKernelManagedPrivateMemorySize());
-}
-
 using MemoryhManagerMultiContextResourceTests = ::testing::Test;
 HWTEST_F(MemoryhManagerMultiContextResourceTests, givenAllocationUsedByManyOsContextsWhenCheckingUsageBeforeDestroyThenMultiContextDestructorIsUsedForWaitingForAllOsContexts) {
    auto executionEnvironment = new MockExecutionEnvironment(defaultHwInfo.get(), true, 2);