fix: l3 flush after post sync logic in OCL

Related-To: NEO-13163
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
Kamil Kopryk
2025-06-18 16:11:06 +00:00
committed by Compute-Runtime-Automation
parent 77e25f90d7
commit f0f89836e0
10 changed files with 63 additions and 9 deletions

View File

@@ -135,8 +135,8 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
         this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled(hwInfo);
         this->heaplessStateInitEnabled = compilerProductHelper.isHeaplessStateInitEnabled(this->heaplessModeEnabled);
         this->isForceStateless = compilerProductHelper.isForceToStatelessRequired();
+        this->l3FlushAfterPostSyncEnabled = productHelper.isL3FlushAfterPostSyncRequired(this->heaplessModeEnabled);
     }
 }

View File

@@ -527,6 +527,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
     bool heaplessModeEnabled = false;
     bool heaplessStateInitEnabled = false;
     bool isForceStateless = false;
+    bool l3FlushedAfterCpuRead = true;
+    bool l3FlushAfterPostSyncEnabled = false;
 };
 static_assert(NEO::NonCopyableAndNonMovable<CommandQueue>);

View File

@@ -555,7 +555,8 @@ class CommandQueueHw : public CommandQueue {
                                   CsrDependencies &csrDeps,
                                   KernelOperation *blockedCommandsData,
                                   TimestampPacketDependencies &timestampPacketDependencies,
-                                  bool relaxedOrderingEnabled);
+                                  bool relaxedOrderingEnabled,
+                                  bool blocking);
     MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency, bool textureCacheFlushRequired) const;
     void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType);

View File

@@ -282,9 +282,10 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
             clearLastBcsPackets();
             setStallingCommandsOnNextFlush(false);
         }
         processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(),
                                                hwTimeStamps, blockQueue, csrDeps, blockedCommandsData.get(),
-                                               timestampPacketDependencies, relaxedOrderingEnabled);
+                                               timestampPacketDependencies, relaxedOrderingEnabled, blocking);
     } else if (isCacheFlushCommand(commandType)) {
         processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
     } else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
@@ -520,7 +521,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
                                                           CsrDependencies &csrDeps,
                                                           KernelOperation *blockedCommandsData,
                                                           TimestampPacketDependencies &timestampPacketDependencies,
-                                                          bool relaxedOrderingEnabled) {
+                                                          bool relaxedOrderingEnabled, bool blocking) {
     TagNodeBase *hwPerfCounter = nullptr;
     getClFileLogger().dumpKernelArgs(&multiDispatchInfo);
@@ -556,6 +557,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
     dispatchWalkerArgs.commandType = commandType;
     dispatchWalkerArgs.event = event;
     dispatchWalkerArgs.relaxedOrderingEnabled = relaxedOrderingEnabled;
+    dispatchWalkerArgs.blocking = blocking;
     getGpgpuCommandStreamReceiver().setRequiredScratchSizes(multiDispatchInfo.getRequiredScratchSize(0u), multiDispatchInfo.getRequiredScratchSize(1u));

View File

@@ -76,6 +76,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBufferImpl(
     if (isCpuCopyAllowed) {
         if (isMemTransferNeeded) {
+            this->l3FlushedAfterCpuRead = false;
             return enqueueReadWriteBufferOnCpuWithMemoryTransfer(cmdType, buffer, offset, size, ptr,
                                                                  numEventsInWaitList, eventWaitList, event);
         } else {

View File

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -15,11 +15,19 @@ namespace NEO {
 template <typename GfxFamily>
 cl_int CommandQueueHw<GfxFamily>::finish() {
-    auto result = getGpgpuCommandStreamReceiver().flushBatchedSubmissions();
+    auto &csr = getGpgpuCommandStreamReceiver();
+    auto result = csr.flushBatchedSubmissions();
     if (!result) {
         return CL_OUT_OF_RESOURCES;
     }
+    if (!l3FlushedAfterCpuRead && l3FlushAfterPostSyncEnabled) {
+        csr.flushTagUpdate();
+        this->l3FlushedAfterCpuRead = true;
+    }
     // Stall until HW reaches taskCount on all its engines
     const auto waitStatus = waitForAllEngines(true, nullptr);
     if (waitStatus == WaitStatus::gpuHang) {

View File

@@ -53,6 +53,7 @@ struct HardwareInterfaceWalkerArgs {
     uint32_t interfaceDescriptorIndex = 0;
     bool isMainKernel = false;
     bool relaxedOrderingEnabled = false;
+    bool blocking = false;
 };
 struct HardwareInterfaceHelper {

View File

@@ -106,7 +106,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
     if constexpr (heaplessModeEnabled) {
         auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
-        bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation || kernel.isAnyKernelArgumentUsingZeroCopyMemory();
+        bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation;
         bool flushL3AfterPostSyncForExternalAllocation = kernel.isUsingSharedObjArgs();
         if (debugManager.flags.RedirectFlushL3HostUsmToExternal.get() && flushL3AfterPostSyncForHostUsm) {
@@ -114,7 +114,9 @@
             flushL3AfterPostSyncForExternalAllocation = true;
         }
-        GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacketFlushL3<WalkerType>(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation);
+        if (walkerArgs.event != nullptr || walkerArgs.blocking) {
+            GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacketFlushL3<WalkerType>(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation);
+        }
     }
 }

View File

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -87,6 +87,7 @@ HWTEST_F(FinishTest, WhenFinishIsCalledThenPipeControlIsNotAddedToCqCommandStrea
     auto itorCmd = reverseFind<PIPE_CONTROL *>(cmdList.rbegin(), cmdList.rend());
     EXPECT_EQ(cmdList.rend(), itorCmd);
 }
+
 HWTEST_F(FinishTest, givenFreshQueueWhenFinishIsCalledThenCommandStreamIsNotAllocated) {
     MockContext contextWithMockCmdQ(pClDevice, true);
     MockCommandQueueHw<FamilyType> cmdQ(&contextWithMockCmdQ, pClDevice, 0);
@@ -96,3 +97,37 @@ HWTEST_F(FinishTest, givenFreshQueueWhenFinishIsCalledThenCommandStreamIsNotAllo
     EXPECT_EQ(nullptr, cmdQ.peekCommandStream());
 }
+
+HWTEST_F(FinishTest, givenL3FlushAfterPostSyncEnabledWhenFlushTagUpdateIsCalledThenPipeControlIsAddedWithDcFlushEnabled) {
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+
+    DebugManagerStateRestore dbgRestorer;
+    debugManager.flags.EnableL3FlushAfterPostSync.set(true);
+
+    auto &productHelper = pClDevice->getDevice().getProductHelper();
+    if (!productHelper.isL3FlushAfterPostSyncRequired(true)) {
+        GTEST_SKIP();
+    }
+
+    MockContext contextWithMockCmdQ(pClDevice, true);
+    MockCommandQueueHw<FamilyType> cmdQ(&contextWithMockCmdQ, pClDevice, 0);
+
+    cmdQ.l3FlushedAfterCpuRead = false;
+    cmdQ.l3FlushAfterPostSyncEnabled = true;
+
+    auto &csr = cmdQ.getUltCommandStreamReceiver();
+    auto used = csr.commandStream.getUsed();
+
+    auto retVal = cmdQ.finish();
+    ASSERT_EQ(CL_SUCCESS, retVal);
+
+    HardwareParse hwParse;
+    hwParse.parseCommands<FamilyType>(csr.commandStream, used);
+
+    auto itorCmd = find<PIPE_CONTROL *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
+    EXPECT_NE(hwParse.cmdList.end(), itorCmd);
+
+    // Verify DC flush is enabled
+    auto pipeControl = genCmdCast<PIPE_CONTROL *>(*itorCmd);
+    ASSERT_NE(nullptr, pipeControl);
+    EXPECT_EQ(csr.dcFlushSupport, pipeControl->getDcFlushEnable());
+}

View File

@@ -309,6 +309,8 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
     using BaseClass::isCacheFlushOnNextBcsWriteRequired;
    using BaseClass::isCompleted;
    using BaseClass::isGpgpuSubmissionForBcsRequired;
+   using BaseClass::l3FlushAfterPostSyncEnabled;
+   using BaseClass::l3FlushedAfterCpuRead;
    using BaseClass::latestSentEnqueueType;
    using BaseClass::minimalSizeForBcsSplit;
    using BaseClass::obtainCommandStream;