mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-06 10:26:29 +08:00
fix: l3 flush after post sync logic in OCL
Related-To: NEO-13163 Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
77e25f90d7
commit
f0f89836e0
@@ -135,8 +135,8 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
|
|||||||
|
|
||||||
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled(hwInfo);
|
this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled(hwInfo);
|
||||||
this->heaplessStateInitEnabled = compilerProductHelper.isHeaplessStateInitEnabled(this->heaplessModeEnabled);
|
this->heaplessStateInitEnabled = compilerProductHelper.isHeaplessStateInitEnabled(this->heaplessModeEnabled);
|
||||||
|
|
||||||
this->isForceStateless = compilerProductHelper.isForceToStatelessRequired();
|
this->isForceStateless = compilerProductHelper.isForceToStatelessRequired();
|
||||||
|
this->l3FlushAfterPostSyncEnabled = productHelper.isL3FlushAfterPostSyncRequired(this->heaplessModeEnabled);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -527,6 +527,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
|||||||
bool heaplessModeEnabled = false;
|
bool heaplessModeEnabled = false;
|
||||||
bool heaplessStateInitEnabled = false;
|
bool heaplessStateInitEnabled = false;
|
||||||
bool isForceStateless = false;
|
bool isForceStateless = false;
|
||||||
|
bool l3FlushedAfterCpuRead = true;
|
||||||
|
bool l3FlushAfterPostSyncEnabled = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(NEO::NonCopyableAndNonMovable<CommandQueue>);
|
static_assert(NEO::NonCopyableAndNonMovable<CommandQueue>);
|
||||||
|
|||||||
@@ -555,7 +555,8 @@ class CommandQueueHw : public CommandQueue {
|
|||||||
CsrDependencies &csrDeps,
|
CsrDependencies &csrDeps,
|
||||||
KernelOperation *blockedCommandsData,
|
KernelOperation *blockedCommandsData,
|
||||||
TimestampPacketDependencies &timestampPacketDependencies,
|
TimestampPacketDependencies &timestampPacketDependencies,
|
||||||
bool relaxedOrderingEnabled);
|
bool relaxedOrderingEnabled,
|
||||||
|
bool blocking);
|
||||||
|
|
||||||
MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency, bool textureCacheFlushRequired) const;
|
MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency, bool textureCacheFlushRequired) const;
|
||||||
void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType);
|
void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType);
|
||||||
|
|||||||
@@ -282,9 +282,10 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
|||||||
clearLastBcsPackets();
|
clearLastBcsPackets();
|
||||||
setStallingCommandsOnNextFlush(false);
|
setStallingCommandsOnNextFlush(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(),
|
processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(),
|
||||||
hwTimeStamps, blockQueue, csrDeps, blockedCommandsData.get(),
|
hwTimeStamps, blockQueue, csrDeps, blockedCommandsData.get(),
|
||||||
timestampPacketDependencies, relaxedOrderingEnabled);
|
timestampPacketDependencies, relaxedOrderingEnabled, blocking);
|
||||||
} else if (isCacheFlushCommand(commandType)) {
|
} else if (isCacheFlushCommand(commandType)) {
|
||||||
processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
|
processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
|
||||||
} else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
|
} else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
|
||||||
@@ -520,7 +521,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
|
|||||||
CsrDependencies &csrDeps,
|
CsrDependencies &csrDeps,
|
||||||
KernelOperation *blockedCommandsData,
|
KernelOperation *blockedCommandsData,
|
||||||
TimestampPacketDependencies &timestampPacketDependencies,
|
TimestampPacketDependencies &timestampPacketDependencies,
|
||||||
bool relaxedOrderingEnabled) {
|
bool relaxedOrderingEnabled, bool blocking) {
|
||||||
TagNodeBase *hwPerfCounter = nullptr;
|
TagNodeBase *hwPerfCounter = nullptr;
|
||||||
getClFileLogger().dumpKernelArgs(&multiDispatchInfo);
|
getClFileLogger().dumpKernelArgs(&multiDispatchInfo);
|
||||||
|
|
||||||
@@ -556,6 +557,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
|
|||||||
dispatchWalkerArgs.commandType = commandType;
|
dispatchWalkerArgs.commandType = commandType;
|
||||||
dispatchWalkerArgs.event = event;
|
dispatchWalkerArgs.event = event;
|
||||||
dispatchWalkerArgs.relaxedOrderingEnabled = relaxedOrderingEnabled;
|
dispatchWalkerArgs.relaxedOrderingEnabled = relaxedOrderingEnabled;
|
||||||
|
dispatchWalkerArgs.blocking = blocking;
|
||||||
|
|
||||||
getGpgpuCommandStreamReceiver().setRequiredScratchSizes(multiDispatchInfo.getRequiredScratchSize(0u), multiDispatchInfo.getRequiredScratchSize(1u));
|
getGpgpuCommandStreamReceiver().setRequiredScratchSizes(multiDispatchInfo.getRequiredScratchSize(0u), multiDispatchInfo.getRequiredScratchSize(1u));
|
||||||
|
|
||||||
|
|||||||
@@ -76,6 +76,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBufferImpl(
|
|||||||
|
|
||||||
if (isCpuCopyAllowed) {
|
if (isCpuCopyAllowed) {
|
||||||
if (isMemTransferNeeded) {
|
if (isMemTransferNeeded) {
|
||||||
|
this->l3FlushedAfterCpuRead = false;
|
||||||
return enqueueReadWriteBufferOnCpuWithMemoryTransfer(cmdType, buffer, offset, size, ptr,
|
return enqueueReadWriteBufferOnCpuWithMemoryTransfer(cmdType, buffer, offset, size, ptr,
|
||||||
numEventsInWaitList, eventWaitList, event);
|
numEventsInWaitList, eventWaitList, event);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2023 Intel Corporation
|
* Copyright (C) 2018-2025 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
@@ -15,11 +15,19 @@ namespace NEO {
|
|||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
cl_int CommandQueueHw<GfxFamily>::finish() {
|
cl_int CommandQueueHw<GfxFamily>::finish() {
|
||||||
auto result = getGpgpuCommandStreamReceiver().flushBatchedSubmissions();
|
|
||||||
|
auto &csr = getGpgpuCommandStreamReceiver();
|
||||||
|
|
||||||
|
auto result = csr.flushBatchedSubmissions();
|
||||||
if (!result) {
|
if (!result) {
|
||||||
return CL_OUT_OF_RESOURCES;
|
return CL_OUT_OF_RESOURCES;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!l3FlushedAfterCpuRead && l3FlushAfterPostSyncEnabled) {
|
||||||
|
csr.flushTagUpdate();
|
||||||
|
this->l3FlushedAfterCpuRead = true;
|
||||||
|
}
|
||||||
|
|
||||||
// Stall until HW reaches taskCount on all its engines
|
// Stall until HW reaches taskCount on all its engines
|
||||||
const auto waitStatus = waitForAllEngines(true, nullptr);
|
const auto waitStatus = waitForAllEngines(true, nullptr);
|
||||||
if (waitStatus == WaitStatus::gpuHang) {
|
if (waitStatus == WaitStatus::gpuHang) {
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ struct HardwareInterfaceWalkerArgs {
|
|||||||
uint32_t interfaceDescriptorIndex = 0;
|
uint32_t interfaceDescriptorIndex = 0;
|
||||||
bool isMainKernel = false;
|
bool isMainKernel = false;
|
||||||
bool relaxedOrderingEnabled = false;
|
bool relaxedOrderingEnabled = false;
|
||||||
|
bool blocking = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct HardwareInterfaceHelper {
|
struct HardwareInterfaceHelper {
|
||||||
|
|||||||
@@ -106,7 +106,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
|||||||
|
|
||||||
if constexpr (heaplessModeEnabled) {
|
if constexpr (heaplessModeEnabled) {
|
||||||
auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
|
auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
|
||||||
bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation || kernel.isAnyKernelArgumentUsingZeroCopyMemory();
|
bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation;
|
||||||
bool flushL3AfterPostSyncForExternalAllocation = kernel.isUsingSharedObjArgs();
|
bool flushL3AfterPostSyncForExternalAllocation = kernel.isUsingSharedObjArgs();
|
||||||
|
|
||||||
if (debugManager.flags.RedirectFlushL3HostUsmToExternal.get() && flushL3AfterPostSyncForHostUsm) {
|
if (debugManager.flags.RedirectFlushL3HostUsmToExternal.get() && flushL3AfterPostSyncForHostUsm) {
|
||||||
@@ -114,7 +114,9 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
|||||||
flushL3AfterPostSyncForExternalAllocation = true;
|
flushL3AfterPostSyncForExternalAllocation = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacketFlushL3<WalkerType>(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation);
|
if (walkerArgs.event != nullptr || walkerArgs.blocking) {
|
||||||
|
GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacketFlushL3<WalkerType>(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2023 Intel Corporation
|
* Copyright (C) 2018-2025 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
@@ -87,6 +87,7 @@ HWTEST_F(FinishTest, WhenFinishIsCalledThenPipeControlIsNotAddedToCqCommandStrea
|
|||||||
auto itorCmd = reverseFind<PIPE_CONTROL *>(cmdList.rbegin(), cmdList.rend());
|
auto itorCmd = reverseFind<PIPE_CONTROL *>(cmdList.rbegin(), cmdList.rend());
|
||||||
EXPECT_EQ(cmdList.rend(), itorCmd);
|
EXPECT_EQ(cmdList.rend(), itorCmd);
|
||||||
}
|
}
|
||||||
|
|
||||||
HWTEST_F(FinishTest, givenFreshQueueWhenFinishIsCalledThenCommandStreamIsNotAllocated) {
|
HWTEST_F(FinishTest, givenFreshQueueWhenFinishIsCalledThenCommandStreamIsNotAllocated) {
|
||||||
MockContext contextWithMockCmdQ(pClDevice, true);
|
MockContext contextWithMockCmdQ(pClDevice, true);
|
||||||
MockCommandQueueHw<FamilyType> cmdQ(&contextWithMockCmdQ, pClDevice, 0);
|
MockCommandQueueHw<FamilyType> cmdQ(&contextWithMockCmdQ, pClDevice, 0);
|
||||||
@@ -96,3 +97,37 @@ HWTEST_F(FinishTest, givenFreshQueueWhenFinishIsCalledThenCommandStreamIsNotAllo
|
|||||||
|
|
||||||
EXPECT_EQ(nullptr, cmdQ.peekCommandStream());
|
EXPECT_EQ(nullptr, cmdQ.peekCommandStream());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that finish() on a queue with a pending L3 flush after a CPU read
// (l3FlushedAfterCpuRead == false) and l3FlushAfterPostSyncEnabled == true
// leaves a PIPE_CONTROL in the CSR command stream whose DC-flush bit matches
// the CSR's dcFlushSupport. Skipped when the product helper reports that
// L3 flush after post sync is not required.
HWTEST_F(FinishTest, givenL3FlushAfterPostSyncEnabledWhenFlushTagUpdateIsCalledThenPipeControlIsAddedWithDcFlushEnabled) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    // NOTE(review): presumably restores debug flags when it goes out of scope - confirm.
    DebugManagerStateRestore dbgRestorer;
    debugManager.flags.EnableL3FlushAfterPostSync.set(true);

    auto &productHelper = pClDevice->getDevice().getProductHelper();
    if (!productHelper.isL3FlushAfterPostSyncRequired(true)) {
        GTEST_SKIP();
    }

    MockContext contextWithMockCmdQ(pClDevice, true);
    MockCommandQueueHw<FamilyType> cmdQ(&contextWithMockCmdQ, pClDevice, 0);

    // Put the queue into the state in which finish() takes the extra flush path.
    cmdQ.l3FlushedAfterCpuRead = false;
    cmdQ.l3FlushAfterPostSyncEnabled = true;

    auto &csr = cmdQ.getUltCommandStreamReceiver();
    // Remember the current offset so only commands emitted by finish() are parsed.
    auto used = csr.commandStream.getUsed();
    auto retVal = cmdQ.finish();
    ASSERT_EQ(CL_SUCCESS, retVal);

    HardwareParse hwParse;
    hwParse.parseCommands<FamilyType>(csr.commandStream, used);
    auto itorCmd = find<PIPE_CONTROL *>(hwParse.cmdList.begin(), hwParse.cmdList.end());

    // Must be fatal: the iterator is dereferenced right below, and dereferencing
    // end() when no PIPE_CONTROL was emitted would be undefined behaviour.
    ASSERT_NE(hwParse.cmdList.end(), itorCmd);

    // Verify DC flush is enabled
    auto pipeControl = genCmdCast<PIPE_CONTROL *>(*itorCmd);
    ASSERT_NE(nullptr, pipeControl);
    EXPECT_EQ(csr.dcFlushSupport, pipeControl->getDcFlushEnable());
}
|
||||||
|
|||||||
@@ -309,6 +309,8 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
|
|||||||
using BaseClass::isCacheFlushOnNextBcsWriteRequired;
|
using BaseClass::isCacheFlushOnNextBcsWriteRequired;
|
||||||
using BaseClass::isCompleted;
|
using BaseClass::isCompleted;
|
||||||
using BaseClass::isGpgpuSubmissionForBcsRequired;
|
using BaseClass::isGpgpuSubmissionForBcsRequired;
|
||||||
|
using BaseClass::l3FlushAfterPostSyncEnabled;
|
||||||
|
using BaseClass::l3FlushedAfterCpuRead;
|
||||||
using BaseClass::latestSentEnqueueType;
|
using BaseClass::latestSentEnqueueType;
|
||||||
using BaseClass::minimalSizeForBcsSplit;
|
using BaseClass::minimalSizeForBcsSplit;
|
||||||
using BaseClass::obtainCommandStream;
|
using BaseClass::obtainCommandStream;
|
||||||
|
|||||||
Reference in New Issue
Block a user