fix: add missing lock in bcs split path

Related-To: NEO-14557

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
Bartosz Dunajski
2025-10-03 15:38:56 +00:00
committed by Compute-Runtime-Automation
parent 6c0c229c82
commit 8ea8e78471
6 changed files with 88 additions and 34 deletions

View File

@@ -1327,11 +1327,17 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
     const auto copyOffloadModeForOperation = getCopyOffloadModeForOperation(copyOffloadSubmission);
     auto queue = getCmdQImmediate(copyOffloadModeForOperation);
     this->latestFlushIsDualCopyOffload = (copyOffloadModeForOperation == CopyOffloadModes::dualStream);
+    this->latestFlushIsHostVisible = !this->dcFlushSupport;
 
     if (NEO::debugManager.flags.DeferStateInitSubmissionToFirstRegularUsage.get() == 1) {
         static_cast<CommandQueueImp *>(queue)->getCsr()->ensurePrimaryCsrInitialized(*this->device->getNEODevice());
     }
 
+    if (signalEvent) {
+        signalEvent->setCsr(static_cast<CommandQueueImp *>(queue)->getCsr(), isInOrderExecutionEnabled());
+        this->latestFlushIsHostVisible |= signalEvent->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST) && !this->latestFlushIsDualCopyOffload;
+    }
+
     if (inputRet == ZE_RESULT_SUCCESS) {
         if (signalEvent && (NEO::debugManager.flags.TrackNumCsrClientsOnSyncPoints.get() != 0)) {
             signalEvent->setLatestUsedCmdQueue(queue);
@@ -1340,13 +1346,6 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
                                                   outerLock, outerLockForIndirect);
     }
 
-    this->latestFlushIsHostVisible = !this->dcFlushSupport;
-    if (signalEvent) {
-        signalEvent->setCsr(static_cast<CommandQueueImp *>(queue)->getCsr(), isInOrderExecutionEnabled());
-        this->latestFlushIsHostVisible |= signalEvent->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST) && !this->latestFlushIsDualCopyOffload;
-    }
-
     return inputRet;
 }
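A note on the two flushImmediate() hunks above: the host-visibility and signal-event bookkeeping moves in front of the submission, because once the flush is observable another thread may wait on the signal event and resolve its CSR through it, so that state must be published first. Below is a minimal, compilable sketch of this publish-before-submit ordering; MockEvent, MockCsr, and the atomic submitted flag are illustrative stand-ins, not NEO's actual types.

#include <atomic>
#include <cassert>
#include <thread>

struct MockCsr {};

struct MockEvent {
    std::atomic<MockCsr *> csr{nullptr};
    std::atomic<bool> submitted{false}; // stands in for the flush becoming visible
};

int main() {
    MockEvent event;
    MockCsr csr;

    std::thread waiter([&] {
        while (!event.submitted.load(std::memory_order_acquire)) {
            std::this_thread::yield(); // wait until the "flush" is observable
        }
        // Once the submission is visible, the CSR must already be set.
        assert(event.csr.load(std::memory_order_relaxed) != nullptr);
    });

    event.csr.store(&csr, std::memory_order_relaxed);       // bookkeeping first...
    event.submitted.store(true, std::memory_order_release); // ...then "submit"

    waiter.join();
    return 0;
}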

View File

@@ -121,6 +121,8 @@ struct BcsSplit {
         for (size_t i = 0; i < cmdListsForSplit.size(); i++) {
             auto subCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(cmdListsForSplit[i]);
+            auto lock = subCmdList->getCsr(false)->obtainUniqueOwnership();
+
             subCmdList->checkAvailableSpace(numWaitEvents, hasRelaxedOrderingDependencies, estimatedCmdBufferSize, false);
 
             if (barrierRequired) {
@@ -145,7 +147,7 @@ struct BcsSplit {
                 auto copyEventIndex = aggregatedEventsMode ? markerEventIndex : subcopyEventIndex + i;
                 auto eventHandle = useSignalEventForSubcopy ? signalEvent : this->events.subcopy[copyEventIndex]->toHandle();
                 result = appendCall(subCmdList, localDstPtr, localSrcPtr, localSize, eventHandle, aggregatedEventIncrementVal);
-                subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, nullptr, nullptr);
+                subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, &lock, nullptr);
 
                 if ((aggregatedEventsMode && i == 0) || !aggregatedEventsMode) {
                     eventHandles.push_back(eventHandle);
@@ -179,6 +181,7 @@ struct BcsSplit {
         cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);
 
         if (aggregatedEventsMode && !useSignalEventForSubcopy) {
+            std::lock_guard<std::mutex> lock(events.mtx);
            cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
        }
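These BcsSplit hunks are the fix the commit title names: each sub-command list now takes exclusive ownership of its CSR before checking space and recording, and the same lock is passed into flushImmediate() so recording and submission form one critical section (the aggregated-events path additionally guards event assignment with events.mtx). A compilable sketch of the lock-passing pattern; MockCsr, recordSubCopy, and flushImmediate here are simplified stand-ins, not the actual NEO interfaces.

#include <mutex>

struct MockCsr {
    std::mutex ownershipMutex;
    std::unique_lock<std::mutex> obtainUniqueOwnership() {
        return std::unique_lock<std::mutex>(ownershipMutex);
    }
};

void recordSubCopy(MockCsr &) {
    // append copy commands to the sub-command list
}

void flushImmediate(MockCsr &csr, std::unique_lock<std::mutex> *lock) {
    // Reuse the caller's lock when provided, mirroring how the hunk
    // passes &lock instead of letting the flush re-acquire ownership.
    std::unique_lock<std::mutex> local;
    if (lock == nullptr) {
        local = csr.obtainUniqueOwnership();
        lock = &local;
    }
    // submit while *lock is held
}

void splitCopy(MockCsr &csr) {
    auto lock = csr.obtainUniqueOwnership(); // the fix: lock before recording
    recordSubCopy(csr);
    flushImmediate(csr, &lock); // the same lock covers the submission
}

int main() {
    MockCsr csr;
    splitCopy(csr);
    return 0;
}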

View File

@@ -9,11 +9,14 @@
 #include "shared/test/common/test_macros/hw_test.h"
 
 #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
+#include "level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_kernel.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_module.h"
+
+#include <array>
 
 namespace L0 {
 namespace ult {
@@ -147,5 +150,59 @@ HWTEST_F(DeviceMtTest, givenMultiThreadsExecutingCmdListAndSynchronizingDeviceWh
     }
     commandQueue->destroy();
 }
+
+using AggregatedBcsSplitMtTests = AggregatedBcsSplitTests;
+
+HWTEST2_F(AggregatedBcsSplitMtTests, givenBcsSplitEnabledWhenMultipleThreadsAccessingThenInternalResourcesUsedCorrectly, IsAtLeastXeHpcCore) {
+    constexpr uint32_t numThreads = 8;
+    constexpr uint32_t iterationCount = 5;
+
+    std::array<DestroyableZeUniquePtr<L0::CommandList>, numThreads> cmdLists = {};
+    std::array<std::thread, numThreads> threads = {};
+    std::array<void *, numThreads> hostPtrs = {};
+    std::vector<TaskCountType> initialTaskCounts;
+
+    for (uint32_t i = 0; i < numThreads; i++) {
+        cmdLists[i] = createCmdList(true);
+        hostPtrs[i] = allocHostMem();
+        cmdLists[i]->appendMemoryCopy(hostPtrs[i], hostPtrs[i], copySize, nullptr, 0, nullptr, copyParams);
+    }
+
+    for (auto &cmdList : bcsSplit->cmdLists) {
+        initialTaskCounts.push_back(cmdList->getCsr(false)->peekTaskCount());
+    }
+
+    std::atomic_bool started = false;
+
+    auto threadBody = [&](uint32_t cmdListId) {
+        while (!started.load()) {
+            std::this_thread::yield();
+        }
+
+        auto localCopyParams = copyParams;
+        for (uint32_t i = 1; i < iterationCount; i++) {
+            cmdLists[cmdListId]->appendMemoryCopy(hostPtrs[cmdListId], hostPtrs[cmdListId], copySize, nullptr, 0, nullptr, localCopyParams);
+        }
+    };
+
+    for (uint32_t i = 0; i < numThreads; ++i) {
+        threads[i] = std::thread(threadBody, i);
+    }
+
+    started = true;
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    for (size_t i = 0; i < bcsSplit->cmdLists.size(); i++) {
+        EXPECT_TRUE(bcsSplit->cmdLists[i]->getCsr(false)->peekTaskCount() > initialTaskCounts[i]);
+    }
+
+    for (auto &ptr : hostPtrs) {
+        context->freeMem(ptr);
+    }
+}
+
 } // namespace ult
 } // namespace L0
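The new test releases all worker threads through an atomic start flag so they enter the BCS split path at roughly the same moment, maximizing contention on the shared sub-copy CSRs. A stand-alone sketch of that start-latch pattern, independent of the NEO fixtures:

#include <atomic>
#include <thread>
#include <vector>

int main() {
    std::atomic_bool started{false};
    std::vector<std::thread> threads;

    for (int i = 0; i < 8; ++i) {
        threads.emplace_back([&] {
            while (!started.load()) {
                std::this_thread::yield(); // spin until the release signal
            }
            // contended work goes here
        });
    }

    started = true; // release every spinner at once
    for (auto &t : threads) {
        t.join();
    }
    return 0;
}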

View File

@@ -8,6 +8,7 @@
 #pragma once
 #include "shared/source/helpers/common_types.h"
+#include "shared/source/helpers/mt_helpers.h"
 #include "shared/source/helpers/non_copyable_or_moveable.h"
 #include "shared/source/helpers/ptr_math.h"
 #include "shared/source/memory_manager/allocation_type.h"
@@ -102,7 +103,7 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
     bool isExternalMemoryExecInfo() const { return deviceCounterNode == nullptr; }
 
     void setLastWaitedCounterValue(uint64_t value) {
         if (!isExternalMemoryExecInfo()) {
-            lastWaitedCounterValue = std::max(value, lastWaitedCounterValue);
+            NEO::MultiThreadHelpers::interlockedMax(lastWaitedCounterValue, value);
         }
     }
@@ -127,9 +128,9 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
     std::vector<std::pair<NEO::TagNodeBase *, uint64_t>> tempTimestampNodes;
     std::mutex mutex;
+    std::atomic<uint64_t> lastWaitedCounterValue = 0;
     uint64_t counterValue = 0;
-    uint64_t lastWaitedCounterValue = 0;
     uint64_t regularCmdListSubmissionCounter = 0;
     uint64_t deviceAddress = 0;
     uint64_t *hostAddress = nullptr;
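Here lastWaitedCounterValue becomes std::atomic<uint64_t>, and the plain "store the max" assignment, a read-modify-write race when two waiters complete at once, is replaced with NEO::MultiThreadHelpers::interlockedMax. A sketch of how such an atomic-max helper is typically built from a compare-exchange loop; this is an assumed implementation for illustration, not the actual NEO helper:

#include <atomic>
#include <cstdint>

// Retry until the stored value is at least `value`; compare_exchange_weak
// reloads `current` on failure, so the loop re-checks whether an update
// is still needed.
inline void atomicMax(std::atomic<uint64_t> &target, uint64_t value) {
    uint64_t current = target.load(std::memory_order_relaxed);
    while (current < value && !target.compare_exchange_weak(current, value)) {
    }
}

int main() {
    std::atomic<uint64_t> lastWaited{5};
    atomicMax(lastWaited, 9);
    atomicMax(lastWaited, 3); // no effect, 9 is already larger
    return lastWaited.load() == 9 ? 0 : 1;
}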

View File

@@ -71,9 +71,9 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryWithAlignment
     if (fakeBigAllocations && sizeAligned > bigAllocation) {
         memoryAllocation = createMemoryAllocation(
-            allocationData.type, nullptr, reinterpret_cast<void *>(dummyAddress), dummyAddress, sizeAligned, counter,
+            allocationData.type, nullptr, reinterpret_cast<void *>(dummyAddress), dummyAddress, sizeAligned, counter.fetch_add(1),
             MemoryPool::system4KBPages, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
-        counter++;
         return memoryAllocation;
     }
@@ -91,7 +91,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryWithAlignment
     auto ptr = allocateSystemMemory(cpuAllocationSize, alignment);
     if (ptr != nullptr) {
         memoryAllocation = createMemoryAllocation(allocationData.type, ptr, ptr, reinterpret_cast<uint64_t>(ptr), sizeAligned,
-                                                  counter, MemoryPool::system4KBPages, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
+                                                  counter.load(), MemoryPool::system4KBPages, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
 
         if (allocationData.type == AllocationType::svmCpu) {
             // add padding in case mapPtr is not aligned
@@ -169,12 +169,11 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryForNonSvmHost
     auto offsetInPage = ptrDiff(allocationData.hostPtr, alignedPtr);
     auto memoryAllocation = createMemoryAllocation(allocationData.type, nullptr, const_cast<void *>(allocationData.hostPtr),
-                                                   reinterpret_cast<uint64_t>(alignedPtr), allocationData.size, counter,
+                                                   reinterpret_cast<uint64_t>(alignedPtr), allocationData.size, counter.fetch_add(1),
                                                    MemoryPool::system4KBPages, allocationData.rootDeviceIndex, false, allocationData.flags.flushL3, false);
 
     memoryAllocation->setAllocationOffset(offsetInPage);
-    counter++;
     return memoryAllocation;
 }
@@ -234,13 +233,12 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocate32BitGraphicsMemoryImpl(con
     MemoryAllocation *memAlloc = new MemoryAllocation(
         allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, nullptr, const_cast<void *>(allocationData.hostPtr),
         canonizedGpuAddress, allocationData.size,
-        counter, MemoryPool::system4KBPagesWith32BitGpuAddressing, false, false, maxOsContextCount);
+        counter.fetch_add(1), MemoryPool::system4KBPagesWith32BitGpuAddressing, false, false, maxOsContextCount);
 
     memAlloc->set32BitAllocation(true);
     memAlloc->setGpuBaseAddress(gmmHelper->canonize(gfxPartition->getHeapBase(heap)));
     memAlloc->sizeToFree = allocationSize;
-    counter++;
     return memAlloc;
 }
@@ -261,7 +259,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocate32BitGraphicsMemoryImpl(con
     auto canonizedGpuAddress = gmmHelper->canonize(gpuAddress);
     memoryAllocation = new MemoryAllocation(allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, ptrAlloc, ptrAlloc,
                                             canonizedGpuAddress,
-                                            allocationData.size, counter, MemoryPool::system4KBPagesWith32BitGpuAddressing,
+                                            allocationData.size, counter.load(), MemoryPool::system4KBPagesWith32BitGpuAddressing,
                                             false, allocationData.flags.flushL3, maxOsContextCount);
 
     memoryAllocation->set32BitAllocation(true);
@@ -377,7 +375,7 @@ uint64_t OsAgnosticMemoryManager::getSystemSharedMemory(uint32_t rootDeviceIndex
 GraphicsAllocation *OsAgnosticMemoryManager::createGraphicsAllocation(OsHandleStorage &handleStorage, const AllocationData &allocationData) {
     auto allocation = createMemoryAllocation(allocationData.type, nullptr, const_cast<void *>(allocationData.hostPtr),
-                                             reinterpret_cast<uint64_t>(allocationData.hostPtr), allocationData.size, counter++,
+                                             reinterpret_cast<uint64_t>(allocationData.hostPtr), allocationData.size, counter.fetch_add(1),
                                              MemoryPool::system4KBPages, allocationData.rootDeviceIndex, false, allocationData.flags.flushL3, false);
 
     allocation->fragmentsStorage = handleStorage;
@@ -474,9 +472,9 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocatePhysicalLocalDeviceMemory(c
     if (systemMemory) {
         auto sizeOfHeapChunk = sizeAligned64k;
         allocation = new MemoryAllocation(allocationData.rootDeviceIndex, numHandles, allocationData.type, systemMemory, systemMemory,
-                                          0u, sizeAligned64k, counter,
+                                          0u, sizeAligned64k, counter.fetch_add(1),
                                           MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
-        counter++;
         allocation->setDefaultGmm(gmm.release());
         allocation->sizeToFree = sizeOfHeapChunk;
     }
@@ -507,8 +505,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocatePhysicalDeviceMemory(const
     auto ptr = allocateSystemMemory(alignUp(allocationData.size, MemoryConstants::pageSize), MemoryConstants::pageSize);
     if (ptr != nullptr) {
         alloc = new MemoryAllocation(allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, ptr, ptr, 0u, allocationData.size,
-                                     counter, MemoryPool::systemCpuInaccessible, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
-        counter++;
+                                     counter.fetch_add(1), MemoryPool::systemCpuInaccessible, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
     }
 
     if (alloc) {
@@ -535,8 +532,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocatePhysicalHostMemory(const Al
     auto ptr = allocateSystemMemory(alignUp(allocationData.size, MemoryConstants::pageSize), MemoryConstants::pageSize2M);
     if (ptr != nullptr) {
         alloc = new MemoryAllocation(allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, ptr, ptr, 0u, allocationData.size,
-                                     counter, MemoryPool::system4KBPages, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
-        counter++;
+                                     counter.fetch_add(1), MemoryPool::system4KBPages, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
     }
 
     if (alloc) {
@@ -564,8 +560,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateMemoryByKMD(const Allocatio
     auto ptr = allocateSystemMemory(alignUp(allocationData.size, alignment), alignment);
     if (ptr != nullptr) {
         alloc = createMemoryAllocation(allocationData.type, ptr, ptr, reinterpret_cast<uint64_t>(ptr), allocationData.size,
-                                       counter, MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
-        counter++;
+                                       counter.fetch_add(1), MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
    }
 
     if (alloc) {
@@ -589,8 +584,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryForImageImpl(
     auto ptr = allocateSystemMemory(alignUp(allocationData.imgInfo->size, MemoryConstants::pageSize), MemoryConstants::pageSize);
     if (ptr != nullptr) {
         alloc = createMemoryAllocation(allocationData.type, ptr, ptr, reinterpret_cast<uint64_t>(ptr), allocationData.imgInfo->size,
-                                       counter, MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
-        counter++;
+                                       counter.fetch_add(1), MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
     }
 
     if (alloc) {
@@ -696,8 +690,8 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryInDevicePool(
     auto storage = allocateSystemMemory(allocationData.size, MemoryConstants::pageSize2M);
     auto canonizedGpuAddress = gmmHelper->canonize(reinterpret_cast<uint64_t>(allocationData.hostPtr));
     allocation = new MemoryAllocation(allocationData.rootDeviceIndex, numHandles, allocationData.type, storage, storage, canonizedGpuAddress,
-                                      allocationData.size, counter, MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
-    counter++;
+                                      allocationData.size, counter.fetch_add(1), MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
+
     if (allocationData.flags.preferCompressed) {
         auto &productHelper = executionEnvironment.rootDeviceEnvironments[allocationData.rootDeviceIndex]->getHelper<ProductHelper>();
         GmmRequirements gmmRequirements{};
@@ -762,9 +756,9 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryInDevicePool(
         canonizedGpuAddress = MemoryManager::adjustToggleBitFlagForGpuVa(allocationData.type, canonizedGpuAddress);
     }
     allocation = new MemoryAllocation(allocationData.rootDeviceIndex, numHandles, allocationData.type, systemMemory, systemMemory,
-                                      canonizedGpuAddress, sizeAligned64k, counter,
+                                      canonizedGpuAddress, sizeAligned64k, counter.fetch_add(1),
                                       MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
-    counter++;
     allocation->setDefaultGmm(gmm.release());
     allocation->sizeToFree = sizeOfHeapChunk;
 
     if (use32Allocator) {
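Throughout this file, the two-step pattern "pass counter, then counter++" collapses into counter.fetch_add(1): with the member now atomic, the read and the increment are one indivisible operation, so two threads allocating concurrently can no longer observe the same allocation id. A minimal sketch of the idea; AllocationIdSource is an illustrative type, not a NEO class:

#include <atomic>
#include <cstdint>

struct AllocationIdSource {
    std::atomic<uint64_t> counter{0};

    uint64_t nextId() {
        // Returns the pre-increment value and bumps the counter atomically,
        // unlike "use counter, then counter++", where two racing threads
        // can read the same value before either increment lands.
        return counter.fetch_add(1);
    }
};

int main() {
    AllocationIdSource source;
    uint64_t first = source.nextId();  // 0
    uint64_t second = source.nextId(); // 1, never a duplicate
    return (first != second) ? 0 : 1;
}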

View File

@@ -79,7 +79,7 @@ class OsAgnosticMemoryManager : public MemoryManager {
     bool fakeBigAllocations = false;
 
   private:
-    unsigned long long counter = 0;
+    std::atomic<uint64_t> counter = 0;
 };
 
 } // namespace NEO