performance: Don't wait for taskCount for indirect allocs

Related-To: GSD-9385

In case of indirect allocations, we don't really know their task count, because we can't track their true usage on the GPU. Therefore, in case of a non-blocking free, don't wait for latestSentTaskCount.

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
2024-07-10 13:22:44 +00:00 · 2024-07-10 13:22:44 +00:00 · 35cbbfe43a
parent 19b6f5a258
commit 35cbbfe43a
4 changed files with 20 additions and 6 deletions
--- a/level_zero/core/test/unit_tests/sources/memory/test_memory.cpp
+++ b/level_zero/core/test/unit_tests/sources/memory/test_memory.cpp
@ -3969,7 +3969,7 @@ HWTEST2_F(MultipleDevicePeerAllocationTest,

    auto allocationData1 = svmManager->getSVMAlloc(ptr1);
    TaskCountType prevPeekTaskCount1 = allocationData1->gpuAllocations.getGraphicsAllocation(1u)->getTaskCount(csr0->getOsContext().getContextId());
-    svmManager->prepareIndirectAllocationForDestruction(allocationData1);
+    svmManager->prepareIndirectAllocationForDestruction(allocationData1, false);
    TaskCountType postPeekTaskCount1 = allocationData1->gpuAllocations.getGraphicsAllocation(1u)->getTaskCount(csr0->getOsContext().getContextId());

    EXPECT_EQ(postPeekTaskCount1, prevPeekTaskCount1);
--- a/opencl/test/unit_test/memory_manager/unified_memory_manager_tests.cpp
+++ b/opencl/test/unit_test/memory_manager/unified_memory_manager_tests.cpp
@ -722,9 +722,16 @@ TEST(UnifiedMemoryTest, givenInternalAllocationsWhenTheyArePreparedForFreeingThe

    auto allocationData = unifiedMemoryManager->getSVMAlloc(ptr);

-    unifiedMemoryManager->prepareIndirectAllocationForDestruction(allocationData);
+    unifiedMemoryManager->prepareIndirectAllocationForDestruction(allocationData, false);
    EXPECT_EQ(124u, graphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation()->getTaskCount(commandStreamReceiver.getOsContext().getContextId()));
    EXPECT_EQ(124u, graphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation()->getResidencyTaskCount(commandStreamReceiver.getOsContext().getContextId()));
+
+    graphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation()->updateTaskCount(1u, commandStreamReceiver.getOsContext().getContextId());
+    graphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation()->updateResidencyTaskCount(GraphicsAllocation::objectAlwaysResident, commandStreamReceiver.getOsContext().getContextId());
+    unifiedMemoryManager->prepareIndirectAllocationForDestruction(allocationData, true);
+    EXPECT_EQ(GraphicsAllocation::objectNotUsed, graphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation()->getTaskCount(commandStreamReceiver.getOsContext().getContextId()));
+    EXPECT_EQ(GraphicsAllocation::objectNotResident, graphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation()->getResidencyTaskCount(commandStreamReceiver.getOsContext().getContextId()));
+
    unifiedMemoryManager->freeSVMAlloc(ptr);
 }

--- a/shared/source/memory_manager/unified_memory_manager.cpp
+++ b/shared/source/memory_manager/unified_memory_manager.cpp
@ -496,7 +496,8 @@ bool SVMAllocsManager::freeSVMAllocDefer(void *ptr) {
 }

 void SVMAllocsManager::freeSVMAllocImpl(void *ptr, FreePolicyType policy, SvmAllocationData *svmData) {
-    this->prepareIndirectAllocationForDestruction(svmData);
+    auto allowNonBlockingFree = policy == FreePolicyType::none;
+    this->prepareIndirectAllocationForDestruction(svmData, allowNonBlockingFree);

    if (policy == FreePolicyType::blocking) {
        if (svmData->cpuAllocation) {
@ -769,7 +770,7 @@ void SVMAllocsManager::makeIndirectAllocationsResident(CommandStreamReceiver &co
    }
 }

-void SVMAllocsManager::prepareIndirectAllocationForDestruction(SvmAllocationData *allocationData) {
+void SVMAllocsManager::prepareIndirectAllocationForDestruction(SvmAllocationData *allocationData, bool isNonBlockingFree) {
    std::unique_lock<std::shared_mutex> lock(mtx);
    if (this->indirectAllocationsResidency.size() > 0u) {
        for (auto &internalAllocationsHandling : this->indirectAllocationsResidency) {
@ -778,7 +779,13 @@ void SVMAllocsManager::prepareIndirectAllocationForDestruction(SvmAllocationData
            if (gpuAllocation == nullptr) {
                continue;
            }
-            auto desiredTaskCount = std::max(internalAllocationsHandling.second.latestSentTaskCount, gpuAllocation->getTaskCount(commandStreamReceiver->getOsContext().getContextId()));
+
+            // Marking gpuAllocation task count as objectNotUsed means we will not wait for GPU completion.
+            // However, if this is blocking free, we must select "safest" task count to wait for.
+            TaskCountType desiredTaskCount = std::max(internalAllocationsHandling.second.latestSentTaskCount, gpuAllocation->getTaskCount(commandStreamReceiver->getOsContext().getContextId()));
+            if (isNonBlockingFree) {
+                desiredTaskCount = GraphicsAllocation::objectNotUsed;
+            }
            if (gpuAllocation->isAlwaysResident(commandStreamReceiver->getOsContext().getContextId())) {
                gpuAllocation->updateResidencyTaskCount(GraphicsAllocation::objectNotResident, commandStreamReceiver->getOsContext().getContextId());
                gpuAllocation->updateResidencyTaskCount(desiredTaskCount, commandStreamReceiver->getOsContext().getContextId());
--- a/shared/source/memory_manager/unified_memory_manager.h
+++ b/shared/source/memory_manager/unified_memory_manager.h
@ -229,7 +229,7 @@ class SVMAllocsManager {
    bool hasHostAllocations();
    std::atomic<uint32_t> allocationsCounter = 0;
    MOCKABLE_VIRTUAL void makeIndirectAllocationsResident(CommandStreamReceiver &commandStreamReceiver, TaskCountType taskCount);
-    void prepareIndirectAllocationForDestruction(SvmAllocationData *);
+    void prepareIndirectAllocationForDestruction(SvmAllocationData *allocationData, bool isNonBlockingFree);
    MOCKABLE_VIRTUAL void prefetchMemory(Device &device, CommandStreamReceiver &commandStreamReceiver, SvmAllocationData &svmData);
    void prefetchSVMAllocs(Device &device, CommandStreamReceiver &commandStreamReceiver);
    std::unique_lock<std::mutex> obtainOwnership();