fix: add deferred l3 flush tag in kernels

Related-To: NEO-13163
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
Kamil Kopryk
2025-08-21 16:16:19 +00:00
committed by Compute-Runtime-Automation
parent 6e875f18ab
commit 2a3bd7cd03
10 changed files with 256 additions and 4 deletions

View File

@@ -435,6 +435,18 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
bool waitOnDestructionNeeded() const;
bool getL3FlushDeferredIfNeeded() const {
return l3FlushDeferredIfNeeded;
}
void setL3FlushDeferredIfNeeded(bool newValue) {
l3FlushDeferredIfNeeded = newValue;
}
void setCheckIfDeferredL3FlushIsNeeded(bool newValue) {
checkIfDeferredL3FlushIsNeeded = newValue;
}
protected:
void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet);
cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest);
@@ -550,6 +562,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
bool l3FlushAfterPostSyncEnabled = false;
bool isWalkerWithProfilingEnqueued = false;
bool shouldRegisterEnqueuedWalkerWithProfiling = false;
bool l3FlushDeferredIfNeeded = false;
bool checkIfDeferredL3FlushIsNeeded = false;
};
static_assert(NEO::NonCopyableAndNonMovable<CommandQueue>);

View File

@@ -110,7 +110,10 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
}
// wait for the completion of previous commands
if (transferProperties.finishRequired) {
this->setCheckIfDeferredL3FlushIsNeeded(true);
auto ret = finish();
this->setCheckIfDeferredL3FlushIsNeeded(false);
if (ret != CL_SUCCESS) {
err.set(ret);
return nullptr;

View File

@@ -410,6 +410,27 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
} else {
UNRECOVERABLE_IF(enqueueProperties.operation != EnqueueProperties::Operation::enqueueWithoutSubmission);
if (this->getL3FlushDeferredIfNeeded()) {
if (blocking) {
this->setCheckIfDeferredL3FlushIsNeeded(true);
this->finish();
this->setCheckIfDeferredL3FlushIsNeeded(false);
} else if (event) {
computeCommandStreamReceiver.flushBatchedSubmissions();
computeCommandStreamReceiver.flushTagUpdate();
CompletionStamp completionStamp = {
computeCommandStreamReceiver.peekTaskCount(),
std::max(taskLevel, computeCommandStreamReceiver.peekTaskLevel()),
computeCommandStreamReceiver.obtainCurrentFlushStamp()};
this->updateFromCompletionStamp(completionStamp, nullptr);
this->l3FlushDeferredIfNeeded = false;
eventBuilder.getEvent()->setWaitForTaskCountRequired(true);
}
}
auto maxTaskCountCurrentRootDevice = this->taskCount;
for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {

View File

@@ -25,6 +25,20 @@ cl_int CommandQueueHw<GfxFamily>::finish() {
bool waitForTaskCountRequired = false;
if (l3FlushAfterPostSyncEnabled && this->checkIfDeferredL3FlushIsNeeded && this->l3FlushDeferredIfNeeded) {
csr.flushTagUpdate();
CompletionStamp completionStamp = {
csr.peekTaskCount(),
std::max(this->taskLevel, csr.peekTaskLevel()),
csr.obtainCurrentFlushStamp()};
this->updateFromCompletionStamp(completionStamp, nullptr);
this->l3FlushDeferredIfNeeded = false;
waitForTaskCountRequired = true;
}
// Stall until HW reaches taskCount on all its engines
const auto waitStatus = waitForAllEngines(true, nullptr, waitForTaskCountRequired);
if (waitStatus == WaitStatus::gpuHang) {
@@ -33,4 +47,5 @@ cl_int CommandQueueHw<GfxFamily>::finish() {
return CL_SUCCESS;
}
} // namespace NEO

View File

@@ -107,9 +107,13 @@ inline void HardwareInterface<GfxFamily>::programWalker(
if constexpr (heaplessModeEnabled) {
auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
auto containsPrintBuffer = kernel.hasPrintfOutput();
bool l3FlushDeferredIfNeeded = false;
bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation || containsPrintBuffer;
bool flushL3AfterPostSyncForExternalAllocation = kernel.isUsingSharedObjArgs();
l3FlushDeferredIfNeeded = flushL3AfterPostSyncForHostUsm || flushL3AfterPostSyncForExternalAllocation;
if (debugManager.flags.RedirectFlushL3HostUsmToExternal.get() && flushL3AfterPostSyncForHostUsm) {
flushL3AfterPostSyncForHostUsm = false;
flushL3AfterPostSyncForExternalAllocation = true;
@@ -127,6 +131,11 @@ inline void HardwareInterface<GfxFamily>::programWalker(
if (walkerArgs.event != nullptr || walkerArgs.blocking || containsPrintBuffer || forceFlushL3) {
GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacketFlushL3<WalkerType>(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation);
l3FlushDeferredIfNeeded = false;
}
if (l3FlushDeferredIfNeeded) {
commandQueue.setL3FlushDeferredIfNeeded(true);
}
}
}