diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index cab1391b98..b5250ae969 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -435,6 +435,18 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool waitOnDestructionNeeded() const; + bool getL3FlushDeferredIfNeeded() const { + return l3FlushDeferredIfNeeded; + } + + void setL3FlushDeferredIfNeeded(bool newValue) { + l3FlushDeferredIfNeeded = newValue; + } + + void setCheckIfDeferredL3FlushIsNeeded(bool newValue) { + checkIfDeferredL3FlushIsNeeded = newValue; + } + protected: void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet); cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest); @@ -550,6 +562,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool l3FlushAfterPostSyncEnabled = false; bool isWalkerWithProfilingEnqueued = false; bool shouldRegisterEnqueuedWalkerWithProfiling = false; + bool l3FlushDeferredIfNeeded = false; + bool checkIfDeferredL3FlushIsNeeded = false; }; static_assert(NEO::NonCopyableAndNonMovable); diff --git a/opencl/source/command_queue/cpu_data_transfer_handler.cpp b/opencl/source/command_queue/cpu_data_transfer_handler.cpp index f2175eb6ca..0be35729da 100644 --- a/opencl/source/command_queue/cpu_data_transfer_handler.cpp +++ b/opencl/source/command_queue/cpu_data_transfer_handler.cpp @@ -110,7 +110,10 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie } // wait for the completness of previous commands if (transferProperties.finishRequired) { + this->setCheckIfDeferredL3FlushIsNeeded(true); auto ret = finish(); + this->setCheckIfDeferredL3FlushIsNeeded(false); + if (ret != CL_SUCCESS) { err.set(ret); return nullptr; diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index aa5a309b2b..65f3b09abf 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -410,6 +410,27 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } else { UNRECOVERABLE_IF(enqueueProperties.operation != EnqueueProperties::Operation::enqueueWithoutSubmission); + if (this->getL3FlushDeferredIfNeeded()) { + if (blocking) { + this->setCheckIfDeferredL3FlushIsNeeded(true); + this->finish(); + this->setCheckIfDeferredL3FlushIsNeeded(false); + + } else if (event) { + computeCommandStreamReceiver.flushBatchedSubmissions(); + computeCommandStreamReceiver.flushTagUpdate(); + + CompletionStamp completionStamp = { + computeCommandStreamReceiver.peekTaskCount(), + std::max(taskLevel, computeCommandStreamReceiver.peekTaskLevel()), + computeCommandStreamReceiver.obtainCurrentFlushStamp()}; + + this->updateFromCompletionStamp(completionStamp, nullptr); + this->l3FlushDeferredIfNeeded = false; + eventBuilder.getEvent()->setWaitForTaskCountRequired(true); + } + } + auto maxTaskCountCurrentRootDevice = this->taskCount; for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) { diff --git a/opencl/source/command_queue/finish.h b/opencl/source/command_queue/finish.h index 7315db59cd..bf6ad97bc3 100644 --- a/opencl/source/command_queue/finish.h +++ b/opencl/source/command_queue/finish.h @@ -25,6 +25,20 @@ cl_int CommandQueueHw::finish() { bool waitForTaskCountRequired = false; + if (l3FlushAfterPostSyncEnabled && this->checkIfDeferredL3FlushIsNeeded && this->l3FlushDeferredIfNeeded) { + csr.flushTagUpdate(); + + CompletionStamp completionStamp = { + csr.peekTaskCount(), + std::max(this->taskLevel, csr.peekTaskLevel()), + csr.obtainCurrentFlushStamp()}; + + this->updateFromCompletionStamp(completionStamp, nullptr); + + this->l3FlushDeferredIfNeeded = false; + waitForTaskCountRequired = true; + } + // Stall until HW reaches taskCount on all its engines const auto waitStatus = waitForAllEngines(true, nullptr, waitForTaskCountRequired); if (waitStatus == WaitStatus::gpuHang) { @@ -33,4 +47,5 @@ cl_int CommandQueueHw::finish() { return CL_SUCCESS; } + } // namespace NEO diff --git a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl index bf0cfe5aea..6ac2e23078 100644 --- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl +++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl @@ -107,9 +107,13 @@ inline void HardwareInterface::programWalker( if constexpr (heaplessModeEnabled) { auto &productHelper = rootDeviceEnvironment.getHelper(); auto containsPrintBuffer = kernel.hasPrintfOutput(); + bool l3FlushDeferredIfNeeded = false; + bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation || containsPrintBuffer; bool flushL3AfterPostSyncForExternalAllocation = kernel.isUsingSharedObjArgs(); + l3FlushDeferredIfNeeded = flushL3AfterPostSyncForHostUsm || flushL3AfterPostSyncForExternalAllocation; + if (debugManager.flags.RedirectFlushL3HostUsmToExternal.get() && flushL3AfterPostSyncForHostUsm) { flushL3AfterPostSyncForHostUsm = false; flushL3AfterPostSyncForExternalAllocation = true; @@ -127,6 +131,11 @@ inline void HardwareInterface::programWalker( if (walkerArgs.event != nullptr || walkerArgs.blocking || containsPrintBuffer || forceFlushL3) { GpgpuWalkerHelper::template setupTimestampPacketFlushL3(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation); + l3FlushDeferredIfNeeded = false; + } + + if (l3FlushDeferredIfNeeded) { + commandQueue.setL3FlushDeferredIfNeeded(true); } } } diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index be441b7b96..05ebf95a8f 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -485,8 +485,14 @@ inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) { std::span states{&bcsState, bcsState.isValid() ? 1u : 0u}; auto waitStatus = WaitStatus::notReady; - auto waitedOnTimestamps = cmdQueue->waitForTimestamps(states, waitStatus, this->timestampPacketContainer.get(), nullptr); - waitStatus = cmdQueue->waitUntilComplete(taskCount.load(), states, flushStamp->peekStamp(), useQuickKmdSleep, true, waitedOnTimestamps); + auto skipWaitOnTaskCount = cmdQueue->waitForTimestamps(states, waitStatus, this->timestampPacketContainer.get(), nullptr); + + if (this->getWaitForTaskCountRequired()) { + skipWaitOnTaskCount = false; + this->setWaitForTaskCountRequired(false); + } + + waitStatus = cmdQueue->waitUntilComplete(taskCount.load(), states, flushStamp->peekStamp(), useQuickKmdSleep, true, skipWaitOnTaskCount); if (waitStatus == WaitStatus::gpuHang) { return WaitStatus::gpuHang; } @@ -500,7 +506,7 @@ inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) { { TakeOwnershipWrapper queueOwnership(*cmdQueue); - bool checkQueueCompletionForPostSyncOperations = !(waitedOnTimestamps && !cmdQueue->isOOQEnabled() && + bool checkQueueCompletionForPostSyncOperations = !(skipWaitOnTaskCount && !cmdQueue->isOOQEnabled() && (this->timestampPacketContainer->peekNodes() == cmdQueue->getTimestampPacketContainer()->peekNodes())); cmdQueue->handlePostCompletionOperations(checkQueueCompletionForPostSyncOperations); diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h index a2fc1a3072..b6093c49af 100644 --- a/opencl/source/event/event.h +++ b/opencl/source/event/event.h @@ -312,6 +312,14 @@ class Event : public BaseObject<_cl_event>, public IDNode { void copyTimestamps(Event &srcEvent); + void setWaitForTaskCountRequired(bool newValue) { + waitForTaskCountRequired = newValue; + } + + bool getWaitForTaskCountRequired() const { + return waitForTaskCountRequired; + } + protected: Event(Context *ctx, CommandQueue *cmdQueue, cl_command_type cmdType, TaskCountType taskLevel, TaskCountType taskCount); @@ -367,6 +375,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { std::atomic cmdToSubmit{nullptr}; std::atomic submittedCmd{nullptr}; bool eventWithoutCommand = true; + bool waitForTaskCountRequired = false; Context *ctx = nullptr; CommandQueue *cmdQueue = nullptr; diff --git a/opencl/source/sharings/gl/cl_gl_api.cpp b/opencl/source/sharings/gl/cl_gl_api.cpp index 399e51fcd1..696e9401dc 100644 --- a/opencl/source/sharings/gl/cl_gl_api.cpp +++ b/opencl/source/sharings/gl/cl_gl_api.cpp @@ -275,8 +275,10 @@ cl_int CL_API_CALL clEnqueueReleaseGLObjects(cl_command_queue commandQueue, cl_u TRACING_EXIT(ClEnqueueReleaseGlObjects, &retVal); return retVal; } - + pCommandQueue->setCheckIfDeferredL3FlushIsNeeded(true); pCommandQueue->finish(); + pCommandQueue->setCheckIfDeferredL3FlushIsNeeded(false); + retVal = pCommandQueue->enqueueReleaseSharedObjects(numObjects, memObjects, numEventsInWaitList, eventWaitList, event, CL_COMMAND_RELEASE_GL_OBJECTS); } diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp index 097e9027b0..e860bb7a38 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp @@ -575,6 +575,75 @@ HWTEST_F(EnqueueHandlerTest, GivenCommandStreamWithoutKernelAndZeroSurfacesWhenE EXPECT_EQ(mockCmdQ->getCS(0).getUsed(), requiredCmdStreamSize); } +HWTEST_F(EnqueueHandlerTest, givenEnableL3FlushAfterPostSyncWithSignalingEventWhenEnqueueWithoutKernelIsCalledThenEventIsSetToWaitForTaskCount) { + + DebugManagerStateRestore dbgRestorer; + debugManager.flags.EnableL3FlushAfterPostSync.set(1); + + auto &productHelper = pClDevice->getDevice().getProductHelper(); + if (!productHelper.isL3FlushAfterPostSyncRequired(true)) { + GTEST_SKIP(); + } + + auto &csr = pDevice->getUltCommandStreamReceiver(); + csr.timestampPacketWriteEnabled = true; + auto mockTagAllocator = new MockTagAllocator<>(csr.rootDeviceIndex, pDevice->getMemoryManager()); + csr.timestampPacketAllocator.reset(mockTagAllocator); + auto mockCmdQ = std::make_unique>(context, pClDevice, nullptr); + + auto event = std::make_unique>(context, nullptr, 0, 0, 0); + cl_event clEvent = event.get(); + + mockCmdQ->setL3FlushDeferredIfNeeded(true); + + MultiDispatchInfo multiDispatch; + const auto enqueueResult = mockCmdQ->template enqueueHandler(nullptr, 0, false, multiDispatch, 0, nullptr, &clEvent); + EXPECT_EQ(CL_SUCCESS, enqueueResult); + + auto eventObj = castToObject(clEvent); + + EXPECT_TRUE(eventObj->getWaitForTaskCountRequired()); + + clReleaseEvent(clEvent); +} + +HWTEST_F(EnqueueHandlerTest, givenL3FlushDeferredIfNeededWhenEnqueueWithoutKernelBlockingIsCalledThenPipeControlWithL3FlushIsProgrammed) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + DebugManagerStateRestore dbgRestorer; + debugManager.flags.EnableL3FlushAfterPostSync.set(1); + + auto &productHelper = pClDevice->getDevice().getProductHelper(); + if (!productHelper.isL3FlushAfterPostSyncRequired(true)) { + GTEST_SKIP(); + } + + auto &csr = pDevice->getUltCommandStreamReceiver(); + csr.timestampPacketWriteEnabled = true; + auto mockTagAllocator = new MockTagAllocator<>(csr.rootDeviceIndex, pDevice->getMemoryManager()); + csr.timestampPacketAllocator.reset(mockTagAllocator); + auto mockCmdQ = std::make_unique>(context, pClDevice, nullptr); + + mockCmdQ->setL3FlushDeferredIfNeeded(true); + + MultiDispatchInfo multiDispatch; + auto finishCalledCountBefore = mockCmdQ->finishCalledCount; + auto taskCountBeforeFinish = csr.taskCount.load(); + + const auto enqueueResult = mockCmdQ->template enqueueHandler(nullptr, 0, true, multiDispatch, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, enqueueResult); + + HardwareParse hwParser; + hwParser.parseCommands(csr.commandStream, 0); + auto pipeControls = findAll(hwParser.cmdList.begin(), hwParser.cmdList.end()); + auto pipeControlCmd = genCmdCast(*pipeControls.back()); + EXPECT_TRUE(pipeControlCmd->getDcFlushEnable()); + + EXPECT_TRUE(csr.flushTagUpdateCalled); + EXPECT_EQ(finishCalledCountBefore + 1, mockCmdQ->finishCalledCount); + EXPECT_EQ(taskCountBeforeFinish + 1, mockCmdQ->latestTaskCountWaited); +} + HWTEST_F(EnqueueHandlerTest, givenTimestampPacketWriteEnabledAndCommandWithCacheFlushWhenEnqueueingHandlerThenObtainNewStamp) { auto &csr = pDevice->getUltCommandStreamReceiver(); csr.timestampPacketWriteEnabled = true; diff --git a/opencl/test/unit_test/command_queue/finish_tests.cpp b/opencl/test/unit_test/command_queue/finish_tests.cpp index 049521e238..089d1d8225 100644 --- a/opencl/test/unit_test/command_queue/finish_tests.cpp +++ b/opencl/test/unit_test/command_queue/finish_tests.cpp @@ -7,11 +7,13 @@ #include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/cmd_parse/hw_parse.h" +#include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/common/test_macros/hw_test.h" #include "opencl/test/unit_test/command_queue/command_queue_fixture.h" #include "opencl/test/unit_test/command_stream/command_stream_fixture.h" #include "opencl/test/unit_test/fixtures/cl_device_fixture.h" +#include "opencl/test/unit_test/mocks/mock_buffer.h" #include "opencl/test/unit_test/mocks/mock_cl_device.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" @@ -96,3 +98,105 @@ HWTEST_F(FinishTest, givenFreshQueueWhenFinishIsCalledThenCommandStreamIsNotAllo EXPECT_EQ(nullptr, cmdQ.peekCommandStream()); } + +HWTEST_F(FinishTest, givenL3FlushAfterPostSyncEnabledWhenFlushTagUpdateIsCalledThenPipeControlIsAddedWithDcFlushAndTaskCountIsUpdated) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + DebugManagerStateRestore dbgRestorer; + debugManager.flags.EnableL3FlushAfterPostSync.set(true); + + auto &productHelper = pClDevice->getDevice().getProductHelper(); + if (!productHelper.isL3FlushAfterPostSyncRequired(true)) { + GTEST_SKIP(); + } + + MockContext contextWithMockCmdQ(pClDevice, true); + MockCommandQueueHw cmdQ(&contextWithMockCmdQ, pClDevice, 0); + + cmdQ.setL3FlushDeferredIfNeeded(true); + cmdQ.l3FlushAfterPostSyncEnabled = true; + cmdQ.setCheckIfDeferredL3FlushIsNeeded(true); + + auto &csr = cmdQ.getUltCommandStreamReceiver(); + auto used = csr.commandStream.getUsed(); + + auto taskCountBeforeFinish = csr.taskCount.load(); + auto beforeWaitForAllEnginesCalledCount = cmdQ.waitForAllEnginesCalledCount; + auto retVal = cmdQ.finish(); + ASSERT_EQ(CL_SUCCESS, retVal); + + EXPECT_EQ(taskCountBeforeFinish + 1, cmdQ.latestTaskCountWaited); + EXPECT_FALSE(cmdQ.recordedSkipWait); + EXPECT_EQ(beforeWaitForAllEnginesCalledCount + 1, cmdQ.waitForAllEnginesCalledCount); + EXPECT_EQ(taskCountBeforeFinish + 1, csr.taskCount.load()); + EXPECT_EQ(taskCountBeforeFinish + 1, cmdQ.taskCount); + + HardwareParse hwParse; + hwParse.parseCommands(csr.commandStream, used); + auto itorCmd = find(hwParse.cmdList.begin(), hwParse.cmdList.end()); + + EXPECT_NE(hwParse.cmdList.end(), itorCmd); + + // Verify DC flush is enabled + auto pipeControl = genCmdCast(*itorCmd); + ASSERT_NE(nullptr, pipeControl); + EXPECT_TRUE(pipeControl->getDcFlushEnable()); + EXPECT_EQ(taskCountBeforeFinish + 1, pipeControl->getImmediateData()); +} + +HWTEST_F(FinishTest, givenL3FlushDeferredIfNeededAndL3FlushAfterPostSyncEnabledWhenCpuDataTransferHandlerCalledThenPipeControlWithDcFlushIsProgrammed) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + DebugManagerStateRestore dbgRestorer; + debugManager.flags.EnableL3FlushAfterPostSync.set(true); + + auto &productHelper = pClDevice->getDevice().getProductHelper(); + if (!productHelper.isL3FlushAfterPostSyncRequired(true)) { + GTEST_SKIP(); + } + + MockContext contextWithMockCmdQ(pClDevice, true); + MockCommandQueueHw cmdQ(&contextWithMockCmdQ, pClDevice, 0); + cmdQ.setL3FlushDeferredIfNeeded(true); + cmdQ.l3FlushAfterPostSyncEnabled = true; + + size_t offset = 0; + size_t size = 16; + MockGraphicsAllocation alloc{}; + auto buffer = std::make_unique(&contextWithMockCmdQ, alloc); + auto mem = std::make_unique(size); + buffer->hostPtr = mem.get(); + buffer->memoryStorage = mem.get(); + auto dstPtr = std::make_unique(size); + auto &csr = cmdQ.getUltCommandStreamReceiver(); + auto usedBefore = csr.commandStream.getUsed(); + + TransferProperties transferProperties(buffer.get(), CL_COMMAND_READ_BUFFER, 0, true, &offset, &size, dstPtr.get(), true, pDevice->getRootDeviceIndex()); + cl_event returnEvent = nullptr; + EventsRequest eventsRequest(0, nullptr, &returnEvent); + cl_int retVal = CL_SUCCESS; + + auto taskCountBeforeFinish = csr.taskCount.load(); + auto beforeWaitForAllEnginesCalledCount = cmdQ.waitForAllEnginesCalledCount; + + cmdQ.cpuDataTransferHandler(transferProperties, eventsRequest, retVal); + ASSERT_EQ(CL_SUCCESS, retVal); + + EXPECT_EQ(taskCountBeforeFinish + 1, cmdQ.latestTaskCountWaited); + EXPECT_FALSE(cmdQ.recordedSkipWait); + EXPECT_EQ(beforeWaitForAllEnginesCalledCount + 1, cmdQ.waitForAllEnginesCalledCount); + EXPECT_EQ(taskCountBeforeFinish + 1, csr.taskCount.load()); + EXPECT_EQ(taskCountBeforeFinish + 1, cmdQ.taskCount); + + HardwareParse hwParse; + hwParse.parseCommands(csr.commandStream, usedBefore); + auto itPc = find(hwParse.cmdList.begin(), hwParse.cmdList.end()); + EXPECT_NE(hwParse.cmdList.end(), itPc); + + auto pipeControl = genCmdCast(*itPc); + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(true, pipeControl->getDcFlushEnable()); + EXPECT_EQ(taskCountBeforeFinish + 1, pipeControl->getImmediateData()); + + clReleaseEvent(returnEvent); +} \ No newline at end of file