performance: Use vector for private allocs to reuse

Related-To: HSD-18033105655, HSD-18033153203
Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
2023-09-04 09:42:02 +00:00 · 2023-09-04 09:42:02 +00:00 · 3b3e17e738
parent 91b26277a4
commit 3b3e17e738
9 changed files with 30 additions and 19 deletions
--- a/level_zero/core/source/cmdlist/cmdlist.h
+++ b/level_zero/core/source/cmdlist/cmdlist.h
@@ -11,6 +11,7 @@
 #include "shared/source/command_stream/preemption_mode.h"
 #include "shared/source/command_stream/stream_properties.h"
 #include "shared/source/helpers/cache_policy.h"
+#include "shared/source/helpers/common_types.h"
 #include "shared/source/helpers/definitions/command_encoder_args.h"
 #include "shared/source/helpers/heap_base_address_model.h"
 #include "shared/source/memory_manager/prefetch_manager.h"
@@ -356,7 +357,7 @@ struct CommandList : _ze_command_list_handle_t {
    MOCKABLE_VIRTUAL void synchronizeEventList(uint32_t numWaitEvents, ze_event_handle_t *waitEventList);

    std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
-    std::unordered_map<uint32_t, NEO::GraphicsAllocation *> ownedPrivateAllocations;
+    NEO::PrivateAllocsToReuseContainer ownedPrivateAllocations;
    std::vector<NEO::GraphicsAllocation *> patternAllocations;
    std::vector<Kernel *> printfKernelContainer;

--- a/level_zero/core/source/cmdlist/cmdlist_hw.h
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.h
@@ -309,7 +309,7 @@ struct CommandListCoreFamily : CommandListImp {
        return (this->pipeControlMultiKernelEventSync && splitKernel) ||
               compactL3FlushEvent(dcFlush);
    }
-    MOCKABLE_VIRTUAL void allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, std::unordered_map<uint32_t, NEO::GraphicsAllocation *> &privateAllocsToReuse);
+    MOCKABLE_VIRTUAL void allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, NEO::PrivateAllocsToReuseContainer &privateAllocsToReuse);
    virtual void allocateOrReuseKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread);
    CmdListEventOperation estimateEventPostSync(Event *event, uint32_t operations);
    void dispatchPostSyncCopy(uint64_t gpuAddress, uint32_t value, bool workloadPartition);
--- a/level_zero/core/source/cmdlist/cmdlist_hw.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl
@@ -3182,15 +3182,22 @@ void CommandListCoreFamily<gfxCoreFamily>::allocateOrReuseKernelPrivateMemoryIfN
 }

 template <GFXCORE_FAMILY gfxCoreFamily>
-void CommandListCoreFamily<gfxCoreFamily>::allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, std::unordered_map<uint32_t, NEO::GraphicsAllocation *> &privateAllocsToReuse) {
+void CommandListCoreFamily<gfxCoreFamily>::allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, NEO::PrivateAllocsToReuseContainer &privateAllocsToReuse) {
    L0::KernelImp *kernelImp = static_cast<KernelImp *>(kernel);
    NEO::GraphicsAllocation *privateAlloc = nullptr;

-    if (privateAllocsToReuse[sizePerHwThread] != nullptr) {
-        privateAlloc = privateAllocsToReuse[sizePerHwThread];
-    } else {
+    bool allocToReuseFound = false;
+
+    for (auto &alloc : privateAllocsToReuse) {
+        if (sizePerHwThread == alloc.first) {
+            privateAlloc = alloc.second;
+            allocToReuseFound = true;
+            break;
+        }
+    }
+    if (!allocToReuseFound) {
        privateAlloc = kernelImp->allocatePrivateMemoryGraphicsAllocation();
-        privateAllocsToReuse[sizePerHwThread] = privateAlloc;
+        privateAllocsToReuse.push_back({sizePerHwThread, privateAlloc});
    }
    kernelImp->patchAndMoveToResidencyContainerPrivateSurface(privateAlloc);
 }
--- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h
+++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h
@@ -534,7 +534,7 @@ class MockCommandListCoreFamily : public CommandListCoreFamily<gfxCoreFamily> {
                         false,
                         (L0::Kernel * kernel,
                          uint32_t sizePerHwThread,
-                          std::unordered_map<uint32_t, NEO::GraphicsAllocation *> &privateAllocsToReuse),
+                          NEO::PrivateAllocsToReuseContainer &privateAllocsToReuse),
                         (kernel, sizePerHwThread, privateAllocsToReuse));

    ADDMETHOD_VOIDRETURN(allocateOrReuseKernelPrivateMemoryIfNeeded,
@@ -631,7 +631,7 @@ class MockCommandListImmediateHw : public WhiteBox<::L0::CommandListCoreFamilyIm
                         false,
                         (L0::Kernel * kernel,
                          uint32_t sizePerHwThread,
-                          std::unordered_map<uint32_t, NEO::GraphicsAllocation *> &privateAllocsToReuse),
+                          NEO::PrivateAllocsToReuseContainer &privateAllocsToReuse),
                         (kernel, sizePerHwThread, privateAllocsToReuse));

    ADDMETHOD_VOIDRETURN(allocateOrReuseKernelPrivateMemoryIfNeeded,
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp
@@ -3193,11 +3193,11 @@ HWTEST2_F(CommandListMappedTimestampTest, givenEventIsAddedToMappedEventListWhen
 template <GFXCORE_FAMILY gfxCoreFamily, typename BaseMock>
 class MockCommandListCoreFamilyIfPrivateNeeded : public BaseMock {
  public:
-    void allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, std::unordered_map<uint32_t, GraphicsAllocation *> &privateAllocsToReuse) override {
+    void allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, PrivateAllocsToReuseContainer &privateAllocsToReuse) override {
        passedContainer = &privateAllocsToReuse;
        BaseMock::allocateOrReuseKernelPrivateMemory(kernel, sizePerHwThread, privateAllocsToReuse);
    }
-    std::unordered_map<uint32_t, GraphicsAllocation *> *passedContainer;
+    PrivateAllocsToReuseContainer *passedContainer;
 };

 HWTEST2_F(CommandListCreate, givenPrivatePerDispatchDisabledWhenAllocatingPrivateMemoryThenAllocateIsNotCalled, IsAtLeastSkl) {
@@ -3281,8 +3281,8 @@ HWTEST2_F(CommandListCreate, givenCmdListWhenAllocateOrReuseCalledForSizeThatIsS
    const_cast<uint32_t &>(mockKernel.kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) = 0x1000;
    mockKernel.module = &mockModule;
    MockGraphicsAllocation mockGA(mockMem.get(), 2 * sizePerHwThread * this->neoDevice->getDeviceInfo().computeUnitsUsedForScratch);
-    std::unordered_map<uint32_t, GraphicsAllocation *> mapForReuse;
-    mapForReuse[sizePerHwThread] = &mockGA;
+    PrivateAllocsToReuseContainer mapForReuse;
+    mapForReuse.push_back({sizePerHwThread, &mockGA});
    commandList->allocateOrReuseKernelPrivateMemory(&mockKernel, sizePerHwThread, mapForReuse);
    EXPECT_EQ(mockKernel.residencyContainer[0], &mockGA);
 }
@@ -3298,8 +3298,8 @@ HWTEST2_F(CommandListCreate, givenNewSizeDifferentThanSizesInMapWhenAllocatingPr
    const_cast<uint32_t &>(mockKernel.kernelImmData->getDescriptor().kernelAttributes.perHwThreadPrivateMemorySize) = 0x1000;
    mockKernel.module = &mockModule;
    MockGraphicsAllocation mockGA(mockMem.get(), sizePerHwThread * this->neoDevice->getDeviceInfo().computeUnitsUsedForScratch / 2);
-    std::unordered_map<uint32_t, GraphicsAllocation *> mapForReuse;
-    mapForReuse[sizePerHwThread] = &mockGA;
+    PrivateAllocsToReuseContainer mapForReuse;
+    mapForReuse.push_back({sizePerHwThread, &mockGA});
    commandList->allocateOrReuseKernelPrivateMemory(&mockKernel, sizePerHwThread / 2, mapForReuse);
    EXPECT_NE(mockKernel.residencyContainer[0], &mockGA);
    neoDevice->getMemoryManager()->freeGraphicsMemory(mockKernel.residencyContainer[0]);
--- a/shared/source/command_stream/command_stream_receiver.cpp
+++ b/shared/source/command_stream/command_stream_receiver.cpp
@@ -571,7 +571,7 @@ ResidencyContainer &CommandStreamReceiver::getResidencyAllocations() {
 ResidencyContainer &CommandStreamReceiver::getEvictionAllocations() {
    return this->evictionAllocations;
 }
-std::unordered_map<uint32_t, GraphicsAllocation *> &CommandStreamReceiver::getOwnedPrivateAllocations() {
+PrivateAllocsToReuseContainer &CommandStreamReceiver::getOwnedPrivateAllocations() {
    return this->ownedPrivateAllocations;
 }

--- a/shared/source/command_stream/command_stream_receiver.h
+++ b/shared/source/command_stream/command_stream_receiver.h
@@ -11,6 +11,7 @@
 #include "shared/source/command_stream/stream_properties.h"
 #include "shared/source/helpers/blit_properties_container.h"
 #include "shared/source/helpers/cache_policy.h"
+#include "shared/source/helpers/common_types.h"
 #include "shared/source/helpers/completion_stamp.h"
 #include "shared/source/helpers/options.h"
 #include "shared/source/utilities/spinlock.h"
@@ -118,7 +119,7 @@ class CommandStreamReceiver {

    ResidencyContainer &getResidencyAllocations();
    ResidencyContainer &getEvictionAllocations();
-    std::unordered_map<uint32_t, GraphicsAllocation *> &getOwnedPrivateAllocations();
+    PrivateAllocsToReuseContainer &getOwnedPrivateAllocations();

    virtual GmmPageTableMngr *createPageTableManager() { return nullptr; }
    bool needsPageTableManager() const;
@@ -461,7 +462,7 @@ class CommandStreamReceiver {

    ResidencyContainer residencyAllocations;
    ResidencyContainer evictionAllocations;
-    std::unordered_map<uint32_t, GraphicsAllocation *> ownedPrivateAllocations;
+    PrivateAllocsToReuseContainer ownedPrivateAllocations;

    MutexType ownershipMutex;
    MutexType hostPtrSurfaceCreationMutex;
--- a/shared/source/helpers/common_types.h
+++ b/shared/source/helpers/common_types.h
@@ -13,11 +13,13 @@
 #include <vector>

 namespace NEO {
+class GraphicsAllocation;
 struct EngineControl;
 using EngineControlContainer = std::vector<EngineControl>;
 using MultiDeviceEngineControlContainer = StackVec<EngineControlContainer, 6u>;
 class Device;
 using DeviceVector = std::vector<std::unique_ptr<Device>>;
+using PrivateAllocsToReuseContainer = StackVec<std::pair<uint32_t, GraphicsAllocation *>, 8>;

 enum class DebugPauseState : uint32_t {
    disabled,
--- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp
+++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp
@@ -4491,7 +4491,7 @@ HWTEST_F(CommandStreamReceiverTest, givenCsrWhenCleanUpResourcesThenOwnedPrivate
    auto mockGA = std::make_unique<MockGraphicsAllocation>();

    auto mapForReuse = &csr.getOwnedPrivateAllocations();
-    mapForReuse->insert({0x100, mockGA.release()});
+    mapForReuse->push_back({0x100, mockGA.release()});
    csr.cleanupResources();
    EXPECT_EQ(mapForReuse->size(), 0u);
 }