mirror of https://github.com/intel/compute-runtime.git
synced 2025-12-29 17:13:29 +08:00

fix: add missing lock in bcs split path

Related-To: NEO-14557
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>

committed by Compute-Runtime-Automation
parent 6c0c229c82
commit 8ea8e78471
@@ -1327,11 +1327,17 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
     const auto copyOffloadModeForOperation = getCopyOffloadModeForOperation(copyOffloadSubmission);
     auto queue = getCmdQImmediate(copyOffloadModeForOperation);
     this->latestFlushIsDualCopyOffload = (copyOffloadModeForOperation == CopyOffloadModes::dualStream);
+    this->latestFlushIsHostVisible = !this->dcFlushSupport;
 
     if (NEO::debugManager.flags.DeferStateInitSubmissionToFirstRegularUsage.get() == 1) {
         static_cast<CommandQueueImp *>(queue)->getCsr()->ensurePrimaryCsrInitialized(*this->device->getNEODevice());
     }
 
+    if (signalEvent) {
+        signalEvent->setCsr(static_cast<CommandQueueImp *>(queue)->getCsr(), isInOrderExecutionEnabled());
+        this->latestFlushIsHostVisible |= signalEvent->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST) && !this->latestFlushIsDualCopyOffload;
+    }
+
     if (inputRet == ZE_RESULT_SUCCESS) {
         if (signalEvent && (NEO::debugManager.flags.TrackNumCsrClientsOnSyncPoints.get() != 0)) {
             signalEvent->setLatestUsedCmdQueue(queue);
@@ -1340,13 +1346,6 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
                                   outerLock, outerLockForIndirect);
     }
 
-    this->latestFlushIsHostVisible = !this->dcFlushSupport;
-
-    if (signalEvent) {
-        signalEvent->setCsr(static_cast<CommandQueueImp *>(queue)->getCsr(), isInOrderExecutionEnabled());
-        this->latestFlushIsHostVisible |= signalEvent->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST) && !this->latestFlushIsDualCopyOffload;
-    }
-
     return inputRet;
 }
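Note: taken together, these two hunks move the host-visibility bookkeeping and the event-to-CSR binding from after the submission to before it. A minimal sketch of why that ordering matters, using hypothetical stand-in types (the real L0 classes differ):

#include <atomic>

struct CommandStreamReceiver {};
void submit(CommandStreamReceiver &) {}

struct Event {
    std::atomic<CommandStreamReceiver *> csr{nullptr};
};

// Bind the event to the queue's CSR before the flush: once submit() returns,
// a concurrent waiter may start synchronizing on the event, and it resolves
// that wait through whatever CSR pointer it observes here.
void flushImmediate(Event *signalEvent, CommandStreamReceiver &queueCsr) {
    if (signalEvent) {
        signalEvent->csr.store(&queueCsr);
    }
    submit(queueCsr);
}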
@@ -121,6 +121,8 @@ struct BcsSplit {
     for (size_t i = 0; i < cmdListsForSplit.size(); i++) {
         auto subCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(cmdListsForSplit[i]);
 
+        auto lock = subCmdList->getCsr(false)->obtainUniqueOwnership();
+
         subCmdList->checkAvailableSpace(numWaitEvents, hasRelaxedOrderingDependencies, estimatedCmdBufferSize, false);
 
         if (barrierRequired) {
@@ -145,7 +147,7 @@ struct BcsSplit {
         auto copyEventIndex = aggregatedEventsMode ? markerEventIndex : subcopyEventIndex + i;
         auto eventHandle = useSignalEventForSubcopy ? signalEvent : this->events.subcopy[copyEventIndex]->toHandle();
         result = appendCall(subCmdList, localDstPtr, localSrcPtr, localSize, eventHandle, aggregatedEventIncrementVal);
-        subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, nullptr, nullptr);
+        subCmdList->flushImmediate(result, true, !hasRelaxedOrderingDependencies, hasRelaxedOrderingDependencies, NEO::AppendOperations::nonKernel, false, nullptr, true, &lock, nullptr);
 
         if ((aggregatedEventsMode && i == 0) || !aggregatedEventsMode) {
             eventHandles.push_back(eventHandle);
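Note: this is the fix the commit title names. Each sub-copy command list's CSR is locked via obtainUniqueOwnership() before the space check and append, and the same lock is handed to flushImmediate(), so one split iteration forms a single critical section. A sketch of the pattern under assumed types (obtainUniqueOwnership() in NEO returns a unique lock; the stand-ins below are hypothetical):

#include <mutex>

struct MockCsr {
    std::recursive_mutex mtx;
    std::unique_lock<std::recursive_mutex> obtainUniqueOwnership() {
        return std::unique_lock<std::recursive_mutex>(mtx);
    }
};

// One lock spans the space check, the append and the flush, so no other
// thread can interleave its own submission into this CSR mid-iteration.
void splitIteration(MockCsr &csr) {
    auto lock = csr.obtainUniqueOwnership();
    // checkAvailableSpace(...);
    // appendCall(...);
    // flushImmediate(..., &lock, ...); // the flush adopts the already-held lock
}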
@@ -179,6 +181,7 @@ struct BcsSplit {
     cmdList->handleInOrderDependencyCounter(signalEvent, false, dualStreamCopyOffload);
 
     if (aggregatedEventsMode && !useSignalEventForSubcopy) {
+        std::lock_guard<std::mutex> lock(events.mtx);
         cmdList->assignInOrderExecInfoToEvent(this->events.marker[markerEventIndex]);
     }
 
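Note: the marker-event pool is shared across the split's command lists, so the exec-info assignment now happens under events.mtx. A toy version of the pattern, with hypothetical stand-in types:

#include <cstddef>
#include <mutex>
#include <vector>

// Hypothetical stand-in for the split's shared event bookkeeping;
// the real NEO types differ.
struct Events {
    std::mutex mtx;
    std::vector<int> marker; // stands in for the shared marker-event pool
};

void assignMarker(Events &events, std::size_t markerEventIndex) {
    // Several immediate command lists share this pool, so touch the
    // entry only while holding events.mtx.
    std::lock_guard<std::mutex> lock(events.mtx);
    events.marker[markerEventIndex] += 1;
}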
@@ -9,11 +9,14 @@
 #include "shared/test/common/test_macros/hw_test.h"
 
 #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
+#include "level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_kernel.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_module.h"
 
+#include <array>
+
 namespace L0 {
 namespace ult {
@@ -147,5 +150,59 @@ HWTEST_F(DeviceMtTest, givenMultiThreadsExecutingCmdListAndSynchronizingDeviceWh
     }
     commandQueue->destroy();
 }
+
+using AggregatedBcsSplitMtTests = AggregatedBcsSplitTests;
+
+HWTEST2_F(AggregatedBcsSplitMtTests, givenBcsSplitEnabledWhenMultipleThreadsAccessingThenInternalResourcesUsedCorrectly, IsAtLeastXeHpcCore) {
+    constexpr uint32_t numThreads = 8;
+    constexpr uint32_t iterationCount = 5;
+
+    std::array<DestroyableZeUniquePtr<L0::CommandList>, numThreads> cmdLists = {};
+    std::array<std::thread, numThreads> threads = {};
+    std::array<void *, numThreads> hostPtrs = {};
+    std::vector<TaskCountType> initialTaskCounts;
+
+    for (uint32_t i = 0; i < numThreads; i++) {
+        cmdLists[i] = createCmdList(true);
+        hostPtrs[i] = allocHostMem();
+        cmdLists[i]->appendMemoryCopy(hostPtrs[i], hostPtrs[i], copySize, nullptr, 0, nullptr, copyParams);
+    }
+
+    for (auto &cmdList : bcsSplit->cmdLists) {
+        initialTaskCounts.push_back(cmdList->getCsr(false)->peekTaskCount());
+    }
+
+    std::atomic_bool started = false;
+
+    auto threadBody = [&](uint32_t cmdListId) {
+        while (!started.load()) {
+            std::this_thread::yield();
+        }
+
+        auto localCopyParams = copyParams;
+
+        for (uint32_t i = 1; i < iterationCount; i++) {
+            cmdLists[cmdListId]->appendMemoryCopy(hostPtrs[cmdListId], hostPtrs[cmdListId], copySize, nullptr, 0, nullptr, localCopyParams);
+        }
+    };
+
+    for (uint32_t i = 0; i < numThreads; ++i) {
+        threads[i] = std::thread(threadBody, i);
+    }
+
+    started = true;
+
+    for (auto &thread : threads) {
+        thread.join();
+    }
+
+    for (size_t i = 0; i < bcsSplit->cmdLists.size(); i++) {
+        EXPECT_TRUE(bcsSplit->cmdLists[i]->getCsr(false)->peekTaskCount() > initialTaskCounts[i]);
+    }
+
+    for (auto &ptr : hostPtrs) {
+        context->freeMem(ptr);
+    }
+}
 } // namespace ult
 } // namespace L0
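Note: the test releases every worker at once via an atomic flag so the threads contend on the split path simultaneously rather than starting staggered. A standalone reduction of that start-barrier pattern:

#include <atomic>
#include <thread>
#include <vector>

int main() {
    std::atomic_bool started{false};
    std::vector<std::thread> threads;

    for (int i = 0; i < 8; ++i) {
        threads.emplace_back([&] {
            while (!started.load()) {
                std::this_thread::yield(); // spin until the main thread flips the flag
            }
            // ... contended work under test goes here ...
        });
    }

    started = true; // all workers start hammering the shared path together

    for (auto &t : threads) {
        t.join();
    }
}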
@@ -8,6 +8,7 @@
 #pragma once
 
 #include "shared/source/helpers/common_types.h"
+#include "shared/source/helpers/mt_helpers.h"
 #include "shared/source/helpers/non_copyable_or_moveable.h"
 #include "shared/source/helpers/ptr_math.h"
 #include "shared/source/memory_manager/allocation_type.h"
@@ -102,7 +103,7 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
     bool isExternalMemoryExecInfo() const { return deviceCounterNode == nullptr; }
     void setLastWaitedCounterValue(uint64_t value) {
         if (!isExternalMemoryExecInfo()) {
-            lastWaitedCounterValue = std::max(value, lastWaitedCounterValue);
+            NEO::MultiThreadHelpers::interlockedMax(lastWaitedCounterValue, value);
         }
     }
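Note: the plain `std::max` read-modify-write is racy: two threads can each read a stale value and the larger update can be lost. NEO::MultiThreadHelpers::interlockedMax presumably performs an atomic maximum; a typical CAS-loop formulation of that idea (a sketch, not the actual NEO implementation):

#include <atomic>
#include <cstdint>

void interlockedMax(std::atomic<uint64_t> &dst, uint64_t value) {
    uint64_t current = dst.load();
    // Stop when dst already holds something >= value, or when we successfully
    // publish value. On failure, compare_exchange_weak reloads `current`,
    // so the loop re-checks against whatever another thread just stored.
    while (current < value && !dst.compare_exchange_weak(current, value)) {
    }
}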
@@ -127,9 +128,9 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
     std::vector<std::pair<NEO::TagNodeBase *, uint64_t>> tempTimestampNodes;
 
     std::mutex mutex;
+    std::atomic<uint64_t> lastWaitedCounterValue = 0;
 
     uint64_t counterValue = 0;
-    uint64_t lastWaitedCounterValue = 0;
     uint64_t regularCmdListSubmissionCounter = 0;
     uint64_t deviceAddress = 0;
     uint64_t *hostAddress = nullptr;
@@ -71,9 +71,9 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryWithAlignment
 
     if (fakeBigAllocations && sizeAligned > bigAllocation) {
         memoryAllocation = createMemoryAllocation(
-            allocationData.type, nullptr, reinterpret_cast<void *>(dummyAddress), dummyAddress, sizeAligned, counter,
+            allocationData.type, nullptr, reinterpret_cast<void *>(dummyAddress), dummyAddress, sizeAligned, counter.fetch_add(1),
             MemoryPool::system4KBPages, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
-        counter++;
 
         return memoryAllocation;
     }
@@ -91,7 +91,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryWithAlignment
     auto ptr = allocateSystemMemory(cpuAllocationSize, alignment);
     if (ptr != nullptr) {
         memoryAllocation = createMemoryAllocation(allocationData.type, ptr, ptr, reinterpret_cast<uint64_t>(ptr), sizeAligned,
-                                                  counter, MemoryPool::system4KBPages, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
+                                                  counter.load(), MemoryPool::system4KBPages, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
 
         if (allocationData.type == AllocationType::svmCpu) {
             // add padding in case mapPtr is not aligned
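Note: a few call sites (here and in allocate32BitGraphicsMemoryImpl below) switch to `counter.load()` rather than `counter.fetch_add(1)`, and no `counter++` is removed in the visible context, which suggests the increment for those paths lives outside the hunk; only the read is made atomic. A sketch of the difference, with hypothetical names:

#include <atomic>
#include <cstdint>

std::atomic<uint64_t> counter{0};

// Read now, increment elsewhere: two separate atomic operations, so another
// thread can slip in between; safe only when the call sites don't race.
uint64_t readId() { return counter.load(); }

// Read and increment in one atomic step: no window for a duplicate ID.
uint64_t takeId() { return counter.fetch_add(1); }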
@@ -169,12 +169,11 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryForNonSvmHost
     auto offsetInPage = ptrDiff(allocationData.hostPtr, alignedPtr);
 
     auto memoryAllocation = createMemoryAllocation(allocationData.type, nullptr, const_cast<void *>(allocationData.hostPtr),
-                                                   reinterpret_cast<uint64_t>(alignedPtr), allocationData.size, counter,
+                                                   reinterpret_cast<uint64_t>(alignedPtr), allocationData.size, counter.fetch_add(1),
                                                    MemoryPool::system4KBPages, allocationData.rootDeviceIndex, false, allocationData.flags.flushL3, false);
 
     memoryAllocation->setAllocationOffset(offsetInPage);
 
-    counter++;
     return memoryAllocation;
 }
@@ -234,13 +233,12 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocate32BitGraphicsMemoryImpl(con
     MemoryAllocation *memAlloc = new MemoryAllocation(
         allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, nullptr, const_cast<void *>(allocationData.hostPtr),
         canonizedGpuAddress, allocationData.size,
-        counter, MemoryPool::system4KBPagesWith32BitGpuAddressing, false, false, maxOsContextCount);
+        counter.fetch_add(1), MemoryPool::system4KBPagesWith32BitGpuAddressing, false, false, maxOsContextCount);
 
     memAlloc->set32BitAllocation(true);
     memAlloc->setGpuBaseAddress(gmmHelper->canonize(gfxPartition->getHeapBase(heap)));
     memAlloc->sizeToFree = allocationSize;
 
-    counter++;
     return memAlloc;
 }
@@ -261,7 +259,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocate32BitGraphicsMemoryImpl(con
     auto canonizedGpuAddress = gmmHelper->canonize(gpuAddress);
     memoryAllocation = new MemoryAllocation(allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, ptrAlloc, ptrAlloc,
                                             canonizedGpuAddress,
-                                            allocationData.size, counter, MemoryPool::system4KBPagesWith32BitGpuAddressing,
+                                            allocationData.size, counter.load(), MemoryPool::system4KBPagesWith32BitGpuAddressing,
                                             false, allocationData.flags.flushL3, maxOsContextCount);
 
     memoryAllocation->set32BitAllocation(true);
@@ -377,7 +375,7 @@ uint64_t OsAgnosticMemoryManager::getSystemSharedMemory(uint32_t rootDeviceIndex
 
 GraphicsAllocation *OsAgnosticMemoryManager::createGraphicsAllocation(OsHandleStorage &handleStorage, const AllocationData &allocationData) {
     auto allocation = createMemoryAllocation(allocationData.type, nullptr, const_cast<void *>(allocationData.hostPtr),
-                                             reinterpret_cast<uint64_t>(allocationData.hostPtr), allocationData.size, counter++,
+                                             reinterpret_cast<uint64_t>(allocationData.hostPtr), allocationData.size, counter.fetch_add(1),
                                              MemoryPool::system4KBPages, allocationData.rootDeviceIndex, false, allocationData.flags.flushL3, false);
 
     allocation->fragmentsStorage = handleStorage;
@@ -474,9 +472,9 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocatePhysicalLocalDeviceMemory(c
     if (systemMemory) {
         auto sizeOfHeapChunk = sizeAligned64k;
         allocation = new MemoryAllocation(allocationData.rootDeviceIndex, numHandles, allocationData.type, systemMemory, systemMemory,
-                                          0u, sizeAligned64k, counter,
+                                          0u, sizeAligned64k, counter.fetch_add(1),
                                           MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
-        counter++;
 
         allocation->setDefaultGmm(gmm.release());
         allocation->sizeToFree = sizeOfHeapChunk;
     }
@@ -507,8 +505,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocatePhysicalDeviceMemory(const
     auto ptr = allocateSystemMemory(alignUp(allocationData.size, MemoryConstants::pageSize), MemoryConstants::pageSize);
     if (ptr != nullptr) {
         alloc = new MemoryAllocation(allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, ptr, ptr, 0u, allocationData.size,
-                                     counter, MemoryPool::systemCpuInaccessible, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
-        counter++;
+                                     counter.fetch_add(1), MemoryPool::systemCpuInaccessible, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
     }
 
     if (alloc) {
@@ -535,8 +532,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocatePhysicalHostMemory(const Al
     auto ptr = allocateSystemMemory(alignUp(allocationData.size, MemoryConstants::pageSize), MemoryConstants::pageSize2M);
     if (ptr != nullptr) {
         alloc = new MemoryAllocation(allocationData.rootDeviceIndex, 1u /*num gmms*/, allocationData.type, ptr, ptr, 0u, allocationData.size,
-                                     counter, MemoryPool::system4KBPages, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
-        counter++;
+                                     counter.fetch_add(1), MemoryPool::system4KBPages, allocationData.flags.uncacheable, allocationData.flags.flushL3, maxOsContextCount);
     }
 
     if (alloc) {
@@ -564,8 +560,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateMemoryByKMD(const Allocatio
     auto ptr = allocateSystemMemory(alignUp(allocationData.size, alignment), alignment);
     if (ptr != nullptr) {
         alloc = createMemoryAllocation(allocationData.type, ptr, ptr, reinterpret_cast<uint64_t>(ptr), allocationData.size,
-                                       counter, MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
-        counter++;
+                                       counter.fetch_add(1), MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
     }
 
     if (alloc) {
@@ -589,8 +584,7 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryForImageImpl(
     auto ptr = allocateSystemMemory(alignUp(allocationData.imgInfo->size, MemoryConstants::pageSize), MemoryConstants::pageSize);
     if (ptr != nullptr) {
         alloc = createMemoryAllocation(allocationData.type, ptr, ptr, reinterpret_cast<uint64_t>(ptr), allocationData.imgInfo->size,
-                                       counter, MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
-        counter++;
+                                       counter.fetch_add(1), MemoryPool::systemCpuInaccessible, allocationData.rootDeviceIndex, allocationData.flags.uncacheable, allocationData.flags.flushL3, false);
     }
 
     if (alloc) {
@@ -696,8 +690,8 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryInDevicePool(
     auto storage = allocateSystemMemory(allocationData.size, MemoryConstants::pageSize2M);
     auto canonizedGpuAddress = gmmHelper->canonize(reinterpret_cast<uint64_t>(allocationData.hostPtr));
     allocation = new MemoryAllocation(allocationData.rootDeviceIndex, numHandles, allocationData.type, storage, storage, canonizedGpuAddress,
-                                      allocationData.size, counter, MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
-    counter++;
+                                      allocationData.size, counter.fetch_add(1), MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
+
     if (allocationData.flags.preferCompressed) {
         auto &productHelper = executionEnvironment.rootDeviceEnvironments[allocationData.rootDeviceIndex]->getHelper<ProductHelper>();
         GmmRequirements gmmRequirements{};
@@ -762,9 +756,9 @@ GraphicsAllocation *OsAgnosticMemoryManager::allocateGraphicsMemoryInDevicePool(
         canonizedGpuAddress = MemoryManager::adjustToggleBitFlagForGpuVa(allocationData.type, canonizedGpuAddress);
     }
     allocation = new MemoryAllocation(allocationData.rootDeviceIndex, numHandles, allocationData.type, systemMemory, systemMemory,
-                                      canonizedGpuAddress, sizeAligned64k, counter,
+                                      canonizedGpuAddress, sizeAligned64k, counter.fetch_add(1),
                                       MemoryPool::localMemory, false, allocationData.flags.flushL3, maxOsContextCount);
-    counter++;
 
     allocation->setDefaultGmm(gmm.release());
     allocation->sizeToFree = sizeOfHeapChunk;
     if (use32Allocator) {
@@ -79,7 +79,7 @@ class OsAgnosticMemoryManager : public MemoryManager {
     bool fakeBigAllocations = false;
 
   private:
-    unsigned long long counter = 0;
+    std::atomic<uint64_t> counter = 0;
 };
 
 } // namespace NEO
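Note: the memory-manager changes all follow one shape: the allocation-ID counter becomes std::atomic<uint64_t>, and the "pass counter, then counter++" pair collapses into a single counter.fetch_add(1). With the plain unsigned long long, two threads allocating concurrently could read the same ID, and the unsynchronized increment itself was a data race. A self-contained illustration of the fixed pattern:

#include <atomic>
#include <cstdint>
#include <cstdio>

std::atomic<uint64_t> counter{0};

// fetch_add returns the value *before* the increment in one atomic step,
// matching the old "use counter, then counter++" intent without the race.
uint64_t nextAllocationId() {
    return counter.fetch_add(1);
}

int main() {
    std::printf("%llu\n", static_cast<unsigned long long>(nextAllocationId())); // 0
    std::printf("%llu\n", static_cast<unsigned long long>(nextAllocationId())); // 1
}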