fix: add missing lock in bcs split path

Related-To: NEO-14557

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
Bartosz Dunajski
2025-10-03 15:38:56 +00:00
committed by Compute-Runtime-Automation
parent 6c0c229c82
commit 8ea8e78471
6 changed files with 88 additions and 34 deletions

View File

@@ -1327,11 +1327,17 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
     const auto copyOffloadModeForOperation = getCopyOffloadModeForOperation(copyOffloadSubmission);
     auto queue = getCmdQImmediate(copyOffloadModeForOperation);
     this->latestFlushIsDualCopyOffload = (copyOffloadModeForOperation == CopyOffloadModes::dualStream);
+    this->latestFlushIsHostVisible = !this->dcFlushSupport;
 
     if (NEO::debugManager.flags.DeferStateInitSubmissionToFirstRegularUsage.get() == 1) {
         static_cast<CommandQueueImp *>(queue)->getCsr()->ensurePrimaryCsrInitialized(*this->device->getNEODevice());
     }
 
+    if (signalEvent) {
+        signalEvent->setCsr(static_cast<CommandQueueImp *>(queue)->getCsr(), isInOrderExecutionEnabled());
+        this->latestFlushIsHostVisible |= signalEvent->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST) && !this->latestFlushIsDualCopyOffload;
+    }
+
     if (inputRet == ZE_RESULT_SUCCESS) {
         if (signalEvent && (NEO::debugManager.flags.TrackNumCsrClientsOnSyncPoints.get() != 0)) {
             signalEvent->setLatestUsedCmdQueue(queue);
@@ -1340,13 +1346,6 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
                                                   outerLock, outerLockForIndirect);
     }
 
-    this->latestFlushIsHostVisible = !this->dcFlushSupport;
-    if (signalEvent) {
-        signalEvent->setCsr(static_cast<CommandQueueImp *>(queue)->getCsr(), isInOrderExecutionEnabled());
-        this->latestFlushIsHostVisible |= signalEvent->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST) && !this->latestFlushIsDualCopyOffload;
-    }
-
     return inputRet;
 }
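A note on the two flushImmediate() hunks above: the host-visibility and signal-event bookkeeping moves in front of the submission, because once the flush is observable another thread may wait on the signal event and resolve its CSR through it, so that state must be published first. Below is a minimal, compilable sketch of this publish-before-submit ordering; MockEvent, MockCsr, and the atomic submitted flag are illustrative stand-ins, not NEO's actual types.

#include <atomic>
#include <cassert>
#include <thread>

struct MockCsr {};

struct MockEvent {
    std::atomic<MockCsr *> csr{nullptr};
    std::atomic<bool> submitted{false}; // stands in for the flush becoming visible
};

int main() {
    MockEvent event;
    MockCsr csr;

    std::thread waiter([&] {
        while (!event.submitted.load(std::memory_order_acquire)) {
            std::this_thread::yield(); // wait until the "flush" is observable
        }
        // Once the submission is visible, the CSR must already be set.
        assert(event.csr.load(std::memory_order_relaxed) != nullptr);
    });

    event.csr.store(&csr, std::memory_order_relaxed);       // bookkeeping first...
    event.submitted.store(true, std::memory_order_release); // ...then "submit"

    waiter.join();
    return 0;
}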

View File

@@ -121,6 +121,8 @@ struct BcsSplit {
         for (size_t i = 0; i < cmdListsForSplit.size(); i++) {
             auto subCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(cmdListsForSplit[i]);
+            auto lock = subCmdList->getCsr(false)->obtainUniqueOwnership();
+
             subCmdList->checkAvailableSpace(numWaitEvents, hasRelaxedOrderingDependencies, estimatedCmdBufferSize, false);
 
             if (barrierRequired) {
@@ -145,7 +147,7 @@ struct BcsSplit {
                 auto copyEventIndex = aggregatedEventsMode ? markerEventIndex : subcopyEventIndex + i;
                 auto eventHandle = useSignalEventForSubcopy ? signalEvent : this->events.subcopy[copyEventIndex]->toHandle();
                 result = appendCall(subCmdList, localDstPtr, localSrcPtr, localSize, eventHandle, aggregatedEventIncrementVal);
-                subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, nullptr, nullptr);
+                subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, &lock, nullptr);
 
                 if ((aggregatedEventsMode && i == 0) || !aggregatedEventsMode) {
                     eventHandles.push_back(eventHandle);
@@ -179,6 +181,7 @@ struct BcsSplit {
         cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);
 
         if (aggregatedEventsMode && !useSignalEventForSubcopy) {
+            std::lock_guard<std::mutex> lock(events.mtx);
            cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
        }
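These BcsSplit hunks are the fix the commit title names: each sub-command list now takes exclusive ownership of its CSR before checking space and recording, and the same lock is passed into flushImmediate() so recording and submission form one critical section (the aggregated-events path additionally guards event assignment with events.mtx). A compilable sketch of the lock-passing pattern; MockCsr, recordSubCopy, and flushImmediate here are simplified stand-ins, not the actual NEO interfaces.

#include <mutex>

struct MockCsr {
    std::mutex ownershipMutex;
    std::unique_lock<std::mutex> obtainUniqueOwnership() {
        return std::unique_lock<std::mutex>(ownershipMutex);
    }
};

void recordSubCopy(MockCsr &) {
    // append copy commands to the sub-command list
}

void flushImmediate(MockCsr &csr, std::unique_lock<std::mutex> *lock) {
    // Reuse the caller's lock when provided, mirroring how the hunk
    // passes &lock instead of letting the flush re-acquire ownership.
    std::unique_lock<std::mutex> local;
    if (lock == nullptr) {
        local = csr.obtainUniqueOwnership();
        lock = &local;
    }
    // submit while *lock is held
}

void splitCopy(MockCsr &csr) {
    auto lock = csr.obtainUniqueOwnership(); // the fix: lock before recording
    recordSubCopy(csr);
    flushImmediate(csr, &lock); // the same lock covers the submission
}

int main() {
    MockCsr csr;
    splitCopy(csr);
    return 0;
}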

View File

@@ -9,11 +9,14 @@
 #include "shared/test/common/test_macros/hw_test.h"
 
 #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
+#include "level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_kernel.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_module.h"
+
+#include <array>
 
 namespace L0 {
 namespace ult {
@@ -147,5 +150,59 @@ HWTEST_F(DeviceMtTest, givenMultiThreadsExecutingCmdListAndSynchronizingDeviceWh
     }
     commandQueue->destroy();
 }
+
+using AggregatedBcsSplitMtTests = AggregatedBcsSplitTests;
+
+HWTEST2_F(AggregatedBcsSplitMtTests, givenBcsSplitEnabledWhenMultipleThreadsAccessingThenInternalResourcesUsedCorrectly, IsAtLeastXeHpcCore) {
+    constexpr uint32_t numThreads = 8;
+    constexpr uint32_t iterationCount = 5;
+
+    std::array<DestroyableZeUniquePtr<L0::CommandList>, numThreads> cmdLists = {};
+    std::array<std::thread, numThreads> threads = {};
+    std::array<void *, numThreads> hostPtrs = {};
+    std::vector<TaskCountType> initialTaskCounts;
+
+    for (uint32_t i = 0; i < numThreads; i++) {
+        cmdLists[i] = createCmdList(true);
+        hostPtrs[i] = allocHostMem();
+        cmdLists[i]->appendMemoryCopy(hostPtrs[i], hostPtrs[i], copySize, nullptr, 0, nullptr, copyParams);
+    }
+
+    for (auto &cmdList : bcsSplit->cmdLists) {
+        initialTaskCounts.push_back(cmdList->getCsr(false)->peekTaskCount());
+    }
+
+    std::atomic_bool started = false;
+
+    auto threadBody = [&](uint32_t cmdListId) {
+        while (!started.load()) {
+            std::this_thread::yield();
+        }
+
+        auto localCopyParams = copyParams;
+        for (uint32_t i = 1; i < iterationCount; i++) {
+            cmdLists[cmdListId]->appendMemoryCopy(hostPtrs[cmdListId], hostPtrs[cmdListId], copySize, nullptr, 0, nullptr, localCopyParams);
+        }
+    };
+
+    for (uint32_t i = 0; i < numThreads; ++i) {
+        threads[i] = std::thread(threadBody, i);
+    }
+
+    started = true;
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    for (size_t i = 0; i < bcsSplit->cmdLists.size(); i++) {
+        EXPECT_TRUE(bcsSplit->cmdLists[i]->getCsr(false)->peekTaskCount() > initialTaskCounts[i]);
+    }
+
+    for (auto &ptr : hostPtrs) {
+        context->freeMem(ptr);
+    }
+}
+
 } // namespace ult
 } // namespace L0
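The new test releases all worker threads through an atomic start flag so they enter the BCS split path at roughly the same moment, maximizing contention on the shared sub-copy CSRs. A stand-alone sketch of that start-latch pattern, independent of the NEO fixtures:

#include <atomic>
#include <thread>
#include <vector>

int main() {
    std::atomic_bool started{false};
    std::vector<std::thread> threads;

    for (int i = 0; i < 8; ++i) {
        threads.emplace_back([&] {
            while (!started.load()) {
                std::this_thread::yield(); // spin until the release signal
            }
            // contended work goes here
        });
    }

    started = true; // release every spinner at once
    for (auto &t : threads) {
        t.join();
    }
    return 0;
}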

View File

@@ -8,6 +8,7 @@
 #pragma once
 #include "shared/source/helpers/common_types.h"
+#include "shared/source/helpers/mt_helpers.h"
 #include "shared/source/helpers/non_copyable_or_moveable.h"
 #include "shared/source/helpers/ptr_math.h"
 #include "shared/source/memory_manager/allocation_type.h"
@@ -102,7 +103,7 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
     bool isExternalMemoryExecInfo() const { return deviceCounterNode == nullptr; }
 
     void setLastWaitedCounterValue(uint64_t value) {
         if (!isExternalMemoryExecInfo()) {
-            lastWaitedCounterValue = std::max(value, lastWaitedCounterValue);
+            NEO::MultiThreadHelpers::interlockedMax(lastWaitedCounterValue, value);
         }
     }
@@ -127,9 +128,9 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
     std::vector<std::pair<NEO::TagNodeBase *, uint64_t>> tempTimestampNodes;
     std::mutex mutex;
+    std::atomic<uint64_t> lastWaitedCounterValue = 0;
     uint64_t counterValue = 0;
-    uint64_t lastWaitedCounterValue = 0;
     uint64_t regularCmdListSubmissionCounter = 0;
     uint64_t deviceAddress = 0;
     uint64_t *hostAddress = nullptr;
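Here lastWaitedCounterValue becomes std::atomic<uint64_t>, and the plain "store the max" assignment, a read-modify-write race when two waiters complete at once, is replaced with NEO::MultiThreadHelpers::interlockedMax. A sketch of how such an atomic-max helper is typically built from a compare-exchange loop; this is an assumed implementation for illustration, not the actual NEO helper:

#include <atomic>
#include <cstdint>

// Retry until the stored value is at least `value`; compare_exchange_weak
// reloads `current` on failure, so the loop re-checks whether an update
// is still needed.
inline void atomicMax(std::atomic<uint64_t> &target, uint64_t value) {
    uint64_t current = target.load(std::memory_order_relaxed);
    while (current < value && !target.compare_exchange_weak(current, value)) {
    }
}

int main() {
    std::atomic<uint64_t> lastWaited{5};
    atomicMax(lastWaited, 9);
    atomicMax(lastWaited, 3); // no effect, 9 is already larger
    return lastWaited.load() == 9 ? 0 : 1;
}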

View File

@@ -71,9 +71,9 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryWithAlignment
     if (fakeBigAllocations && sizeAligned > bigAllocation) {
         memoryAllocation = createMemoryAllocation(
-            allocationData.type, nullptr, reinterpret_cast<void *>(dummyAddress), dummyAddress, sizeAligned, counter,
+            allocationData.type, nullptr, reinterpret_cast<void *>(dummyAddress), dummyAddress, sizeAligned, counter.fetch_add(1),
             MemoryPool::system4KBPages, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
-        counter++;
         return memoryAllocation;
     }
@@ -91,7 +91,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryWithAlignment
     auto ptr = allocateSystemMemory(cpuAllocationSize, alignment);
     if (ptr != nullptr) {
         memoryAllocation = createMemoryAllocation(allocationData.type, ptr, ptr, reinterpret_cast<uint64_t>(ptr), sizeAligned,
-                                                  counter, MemoryPool::system4KBPages, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
+                                                  counter.load(), MemoryPool::system4KBPages, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
 
         if (allocationData.type == AllocationType::svmCpu) {
             // add padding in case mapPtr is not aligned
@@ -169,12 +169,11 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryForNonSvmHost
     auto offsetInPage = ptrDiff(allocationData.hostPtr, alignedPtr);
     auto memoryAllocation = createMemoryAllocation(allocationData.type, nullptr, const_cast<void *>(allocationData.hostPtr),
-                                                   reinterpret_cast<uint64_t>(alignedPtr), allocationData.size, counter,
+                                                   reinterpret_cast<uint64_t>(alignedPtr), allocationData.size, counter.fetch_add(1),
                                                    MemoryPool::system4KBPages, allocationData.rootDeviceIndex, false, allocationData.flags.flushL3, false);
 
     memoryAllocation->setAllocationOffset(offsetInPage);
-    counter++;
     return memoryAllocation;
 }
@@ -234,13 +233,12 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocate32BitGraphicsMemoryImpl(con
     MemoryAllocation *memAlloc = new MemoryAllocation(
         allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, nullptr, const_cast<void *>(allocationData.hostPtr),
         canonizedGpuAddress, allocationData.size,
-        counter, MemoryPool::system4KBPagesWith32BitGpuAddressing, false, false, maxOsContextCount);
+        counter.fetch_add(1), MemoryPool::system4KBPagesWith32BitGpuAddressing, false, false, maxOsContextCount);
 
     memAlloc->set32BitAllocation(true);
     memAlloc->setGpuBaseAddress(gmmHelper->canonize(gfxPartition->getHeapBase(heap)));
     memAlloc->sizeToFree = allocationSize;
-    counter++;
     return memAlloc;
 }
@@ -261,7 +259,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocate32BitGraphicsMemoryImpl(con
     auto canonizedGpuAddress = gmmHelper->canonize(gpuAddress);
     memoryAllocation = new MemoryAllocation(allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, ptrAlloc, ptrAlloc,
                                             canonizedGpuAddress,
-                                            allocationData.size, counter, MemoryPool::system4KBPagesWith32BitGpuAddressing,
+                                            allocationData.size, counter.load(), MemoryPool::system4KBPagesWith32BitGpuAddressing,
                                             false, allocationData.flags.flushL3, maxOsContextCount);
 
     memoryAllocation->set32BitAllocation(true);
@@ -377,7 +375,7 @@ uint64_t OsAgnosticMemoryManager::getSystemSharedMemory(uint32_t rootDeviceIndex
 GraphicsAllocation *OsAgnosticMemoryManager::createGraphicsAllocation(OsHandleStorage &handleStorage, const AllocationData &allocationData) {
     auto allocation = createMemoryAllocation(allocationData.type, nullptr, const_cast<void *>(allocationData.hostPtr),
-                                             reinterpret_cast<uint64_t>(allocationData.hostPtr), allocationData.size, counter++,
+                                             reinterpret_cast<uint64_t>(allocationData.hostPtr), allocationData.size, counter.fetch_add(1),
                                              MemoryPool::system4KBPages, allocationData.rootDeviceIndex, false, allocationData.flags.flushL3, false);
 
     allocation->fragmentsStorage = handleStorage;
@@ -474,9 +472,9 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocatePhysicalLocalDeviceMemory(c
     if (systemMemory) {
         auto sizeOfHeapChunk = sizeAligned64k;
         allocation = new MemoryAllocation(allocationData.rootDeviceIndex, numHandles, allocationData.type, systemMemory, systemMemory,
-                                          0u, sizeAligned64k, counter,
+                                          0u, sizeAligned64k, counter.fetch_add(1),
                                           MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
-        counter++;
         allocation->setDefaultGmm(gmm.release());
         allocation->sizeToFree = sizeOfHeapChunk;
     }
@@ -507,8 +505,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocatePhysicalDeviceMemory(const
     auto ptr = allocateSystemMemory(alignUp(allocationData.size, MemoryConstants::pageSize), MemoryConstants::pageSize);
     if (ptr != nullptr) {
         alloc = new MemoryAllocation(allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, ptr, ptr, 0u, allocationData.size,
-                                     counter, MemoryPool::systemCpuInaccessible, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
-        counter++;
+                                     counter.fetch_add(1), MemoryPool::systemCpuInaccessible, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
     }
 
     if (alloc) {
@@ -535,8 +532,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocatePhysicalHostMemory(const Al
     auto ptr = allocateSystemMemory(alignUp(allocationData.size, MemoryConstants::pageSize), MemoryConstants::pageSize2M);
     if (ptr != nullptr) {
         alloc = new MemoryAllocation(allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, ptr, ptr, 0u, allocationData.size,
-                                     counter, MemoryPool::system4KBPages, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
-        counter++;
+                                     counter.fetch_add(1), MemoryPool::system4KBPages, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
     }
 
     if (alloc) {
@@ -564,8 +560,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateMemoryByKMD(const Allocatio
     auto ptr = allocateSystemMemory(alignUp(allocationData.size, alignment), alignment);
     if (ptr != nullptr) {
         alloc = createMemoryAllocation(allocationData.type, ptr, ptr, reinterpret_cast<uint64_t>(ptr), allocationData.size,
-                                       counter, MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
-        counter++;
+                                       counter.fetch_add(1), MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
    }
 
     if (alloc) {
@@ -589,8 +584,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryForImageImpl(
     auto ptr = allocateSystemMemory(alignUp(allocationData.imgInfo->size, MemoryConstants::pageSize), MemoryConstants::pageSize);
     if (ptr != nullptr) {
         alloc = createMemoryAllocation(allocationData.type, ptr, ptr, reinterpret_cast<uint64_t>(ptr), allocationData.imgInfo->size,
-                                       counter, MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
-        counter++;
+                                       counter.fetch_add(1), MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
     }
 
     if (alloc) {
@@ -696,8 +690,8 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryInDevicePool(
     auto storage = allocateSystemMemory(allocationData.size, MemoryConstants::pageSize2M);
     auto canonizedGpuAddress = gmmHelper->canonize(reinterpret_cast<uint64_t>(allocationData.hostPtr));
     allocation = new MemoryAllocation(allocationData.rootDeviceIndex, numHandles, allocationData.type, storage, storage, canonizedGpuAddress,
-                                      allocationData.size, counter, MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
-    counter++;
+                                      allocationData.size, counter.fetch_add(1), MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
+
     if (allocationData.flags.preferCompressed) {
         auto &productHelper = executionEnvironment.rootDeviceEnvironments[allocationData.rootDeviceIndex]->getHelper<ProductHelper>();
         GmmRequirements gmmRequirements{};
@@ -762,9 +756,9 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryInDevicePool(
         canonizedGpuAddress = MemoryManager::adjustToggleBitFlagForGpuVa(allocationData.type, canonizedGpuAddress);
     }
     allocation = new MemoryAllocation(allocationData.rootDeviceIndex, numHandles, allocationData.type, systemMemory, systemMemory,
-                                      canonizedGpuAddress, sizeAligned64k, counter,
+                                      canonizedGpuAddress, sizeAligned64k, counter.fetch_add(1),
                                       MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
-    counter++;
     allocation->setDefaultGmm(gmm.release());
     allocation->sizeToFree = sizeOfHeapChunk;
 
     if (use32Allocator) {
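Throughout this file, the two-step pattern "pass counter, then counter++" collapses into counter.fetch_add(1): with the member now atomic, the read and the increment are one indivisible operation, so two threads allocating concurrently can no longer observe the same allocation id. A minimal sketch of the idea; AllocationIdSource is an illustrative type, not a NEO class:

#include <atomic>
#include <cstdint>

struct AllocationIdSource {
    std::atomic<uint64_t> counter{0};

    uint64_t nextId() {
        // Returns the pre-increment value and bumps the counter atomically,
        // unlike "use counter, then counter++", where two racing threads
        // can read the same value before either increment lands.
        return counter.fetch_add(1);
    }
};

int main() {
    AllocationIdSource source;
    uint64_t first = source.nextId();  // 0
    uint64_t second = source.nextId(); // 1, never a duplicate
    return (first != second) ? 0 : 1;
}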

View File

@@ -79,7 +79,7 @@ class OsAgnosticMemoryManager : public MemoryManager {
     bool fakeBigAllocations = false;
 
   private:
-    unsigned long long counter = 0;
+    std::atomic<uint64_t> counter = 0;
 };
 
 } // namespace NEO