mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-29 09:03:14 +08:00
fix: add deferred l3 flush tag in kernels
Related-To: NEO-13163 Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
6e875f18ab
commit
2a3bd7cd03
@@ -435,6 +435,18 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
|
||||
bool waitOnDestructionNeeded() const;
|
||||
|
||||
// Returns the current value of the l3FlushDeferredIfNeeded flag.
// Within this change the flag is set from programWalker() when a kernel wants an L3 flush
// but none was programmed into the walker, and it is cleared again by finish() /
// enqueueHandler() once they call flushTagUpdate() on the command stream receiver.
bool getL3FlushDeferredIfNeeded() const {
|
||||
return l3FlushDeferredIfNeeded;
|
||||
}
|
||||
|
||||
// Stores newValue in the l3FlushDeferredIfNeeded flag.
// programWalker() passes true here when it decides an L3 flush is needed but does not
// set it up in the walker command; the flag is later read via getL3FlushDeferredIfNeeded()
// and in finish().
void setL3FlushDeferredIfNeeded(bool newValue) {
|
||||
l3FlushDeferredIfNeeded = newValue;
|
||||
}
|
||||
|
||||
// Stores newValue in the checkIfDeferredL3FlushIsNeeded flag.
// Callers in this change set it to true immediately before finish() and back to false
// right after, so finish() only handles the deferred L3 flush (it tests this flag together
// with l3FlushAfterPostSyncEnabled and l3FlushDeferredIfNeeded) for those calls.
// NOTE(review): the set/reset pair is not exception- or thread-guarded in the visible code;
// confirm that is acceptable for the callers.
void setCheckIfDeferredL3FlushIsNeeded(bool newValue) {
|
||||
checkIfDeferredL3FlushIsNeeded = newValue;
|
||||
}
|
||||
|
||||
protected:
|
||||
void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet);
|
||||
cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest);
|
||||
@@ -550,6 +562,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
bool l3FlushAfterPostSyncEnabled = false;
|
||||
bool isWalkerWithProfilingEnqueued = false;
|
||||
bool shouldRegisterEnqueuedWalkerWithProfiling = false;
|
||||
bool l3FlushDeferredIfNeeded = false;
|
||||
bool checkIfDeferredL3FlushIsNeeded = false;
|
||||
};
|
||||
|
||||
static_assert(NEO::NonCopyableAndNonMovable<CommandQueue>);
|
||||
|
||||
@@ -110,7 +110,10 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
|
||||
}
|
||||
// wait for the completion of previous commands
|
||||
if (transferProperties.finishRequired) {
|
||||
this->setCheckIfDeferredL3FlushIsNeeded(true);
|
||||
auto ret = finish();
|
||||
this->setCheckIfDeferredL3FlushIsNeeded(false);
|
||||
|
||||
if (ret != CL_SUCCESS) {
|
||||
err.set(ret);
|
||||
return nullptr;
|
||||
|
||||
@@ -410,6 +410,27 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
} else {
|
||||
UNRECOVERABLE_IF(enqueueProperties.operation != EnqueueProperties::Operation::enqueueWithoutSubmission);
|
||||
|
||||
if (this->getL3FlushDeferredIfNeeded()) {
|
||||
if (blocking) {
|
||||
this->setCheckIfDeferredL3FlushIsNeeded(true);
|
||||
this->finish();
|
||||
this->setCheckIfDeferredL3FlushIsNeeded(false);
|
||||
|
||||
} else if (event) {
|
||||
computeCommandStreamReceiver.flushBatchedSubmissions();
|
||||
computeCommandStreamReceiver.flushTagUpdate();
|
||||
|
||||
CompletionStamp completionStamp = {
|
||||
computeCommandStreamReceiver.peekTaskCount(),
|
||||
std::max(taskLevel, computeCommandStreamReceiver.peekTaskLevel()),
|
||||
computeCommandStreamReceiver.obtainCurrentFlushStamp()};
|
||||
|
||||
this->updateFromCompletionStamp(completionStamp, nullptr);
|
||||
this->l3FlushDeferredIfNeeded = false;
|
||||
eventBuilder.getEvent()->setWaitForTaskCountRequired(true);
|
||||
}
|
||||
}
|
||||
|
||||
auto maxTaskCountCurrentRootDevice = this->taskCount;
|
||||
|
||||
for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {
|
||||
|
||||
@@ -25,6 +25,20 @@ cl_int CommandQueueHw<GfxFamily>::finish() {
|
||||
|
||||
bool waitForTaskCountRequired = false;
|
||||
|
||||
if (l3FlushAfterPostSyncEnabled && this->checkIfDeferredL3FlushIsNeeded && this->l3FlushDeferredIfNeeded) {
|
||||
csr.flushTagUpdate();
|
||||
|
||||
CompletionStamp completionStamp = {
|
||||
csr.peekTaskCount(),
|
||||
std::max(this->taskLevel, csr.peekTaskLevel()),
|
||||
csr.obtainCurrentFlushStamp()};
|
||||
|
||||
this->updateFromCompletionStamp(completionStamp, nullptr);
|
||||
|
||||
this->l3FlushDeferredIfNeeded = false;
|
||||
waitForTaskCountRequired = true;
|
||||
}
|
||||
|
||||
// Stall until HW reaches taskCount on all its engines
|
||||
const auto waitStatus = waitForAllEngines(true, nullptr, waitForTaskCountRequired);
|
||||
if (waitStatus == WaitStatus::gpuHang) {
|
||||
@@ -33,4 +47,5 @@ cl_int CommandQueueHw<GfxFamily>::finish() {
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -107,9 +107,13 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
if constexpr (heaplessModeEnabled) {
|
||||
auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
|
||||
auto containsPrintBuffer = kernel.hasPrintfOutput();
|
||||
bool l3FlushDeferredIfNeeded = false;
|
||||
|
||||
bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation || containsPrintBuffer;
|
||||
bool flushL3AfterPostSyncForExternalAllocation = kernel.isUsingSharedObjArgs();
|
||||
|
||||
l3FlushDeferredIfNeeded = flushL3AfterPostSyncForHostUsm || flushL3AfterPostSyncForExternalAllocation;
|
||||
|
||||
if (debugManager.flags.RedirectFlushL3HostUsmToExternal.get() && flushL3AfterPostSyncForHostUsm) {
|
||||
flushL3AfterPostSyncForHostUsm = false;
|
||||
flushL3AfterPostSyncForExternalAllocation = true;
|
||||
@@ -127,6 +131,11 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
|
||||
if (walkerArgs.event != nullptr || walkerArgs.blocking || containsPrintBuffer || forceFlushL3) {
|
||||
GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacketFlushL3<WalkerType>(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation);
|
||||
l3FlushDeferredIfNeeded = false;
|
||||
}
|
||||
|
||||
if (l3FlushDeferredIfNeeded) {
|
||||
commandQueue.setL3FlushDeferredIfNeeded(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user