diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index 47e086dc7a..56c40ad193 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -135,8 +135,8 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled(hwInfo); this->heaplessStateInitEnabled = compilerProductHelper.isHeaplessStateInitEnabled(this->heaplessModeEnabled); - this->isForceStateless = compilerProductHelper.isForceToStatelessRequired(); + this->l3FlushAfterPostSyncEnabled = productHelper.isL3FlushAfterPostSyncRequired(this->heaplessModeEnabled); } } diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 5b23b2cf89..6f23741b21 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -527,6 +527,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool heaplessModeEnabled = false; bool heaplessStateInitEnabled = false; bool isForceStateless = false; + bool l3FlushedAfterCpuRead = true; + bool l3FlushAfterPostSyncEnabled = false; }; static_assert(NEO::NonCopyableAndNonMovable); diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h index f970596384..cba3be9a46 100644 --- a/opencl/source/command_queue/command_queue_hw.h +++ b/opencl/source/command_queue/command_queue_hw.h @@ -555,7 +555,8 @@ class CommandQueueHw : public CommandQueue { CsrDependencies &csrDeps, KernelOperation *blockedCommandsData, TimestampPacketDependencies &timestampPacketDependencies, - bool relaxedOrderingEnabled); + bool relaxedOrderingEnabled, + bool blocking); MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency, bool textureCacheFlushRequired) 
const; void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType); diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index e1c3308036..d3af3adc86 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -282,9 +282,10 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, clearLastBcsPackets(); setStallingCommandsOnNextFlush(false); } + processDispatchForKernels(multiDispatchInfo, printfHandler, eventBuilder.getEvent(), hwTimeStamps, blockQueue, csrDeps, blockedCommandsData.get(), - timestampPacketDependencies, relaxedOrderingEnabled); + timestampPacketDependencies, relaxedOrderingEnabled, blocking); } else if (isCacheFlushCommand(commandType)) { processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps); } else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) { @@ -520,7 +521,7 @@ void CommandQueueHw::processDispatchForKernels(const MultiDispatchInf CsrDependencies &csrDeps, KernelOperation *blockedCommandsData, TimestampPacketDependencies &timestampPacketDependencies, - bool relaxedOrderingEnabled) { + bool relaxedOrderingEnabled, bool blocking) { TagNodeBase *hwPerfCounter = nullptr; getClFileLogger().dumpKernelArgs(&multiDispatchInfo); @@ -556,6 +557,7 @@ void CommandQueueHw::processDispatchForKernels(const MultiDispatchInf dispatchWalkerArgs.commandType = commandType; dispatchWalkerArgs.event = event; dispatchWalkerArgs.relaxedOrderingEnabled = relaxedOrderingEnabled; + dispatchWalkerArgs.blocking = blocking; getGpgpuCommandStreamReceiver().setRequiredScratchSizes(multiDispatchInfo.getRequiredScratchSize(0u), multiDispatchInfo.getRequiredScratchSize(1u)); diff --git a/opencl/source/command_queue/enqueue_read_buffer.h b/opencl/source/command_queue/enqueue_read_buffer.h index 95e0ecd120..cfcac3f791 100644 --- a/opencl/source/command_queue/enqueue_read_buffer.h 
+++ b/opencl/source/command_queue/enqueue_read_buffer.h @@ -76,6 +76,7 @@ cl_int CommandQueueHw::enqueueReadBufferImpl( if (isCpuCopyAllowed) { if (isMemTransferNeeded) { + this->l3FlushedAfterCpuRead = false; return enqueueReadWriteBufferOnCpuWithMemoryTransfer(cmdType, buffer, offset, size, ptr, numEventsInWaitList, eventWaitList, event); } else { diff --git a/opencl/source/command_queue/finish.h b/opencl/source/command_queue/finish.h index 5e18034ce1..fe4e5f51a0 100644 --- a/opencl/source/command_queue/finish.h +++ b/opencl/source/command_queue/finish.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -15,11 +15,19 @@ namespace NEO { template cl_int CommandQueueHw::finish() { - auto result = getGpgpuCommandStreamReceiver().flushBatchedSubmissions(); + + auto &csr = getGpgpuCommandStreamReceiver(); + + auto result = csr.flushBatchedSubmissions(); if (!result) { return CL_OUT_OF_RESOURCES; } + if (!l3FlushedAfterCpuRead && l3FlushAfterPostSyncEnabled) { + csr.flushTagUpdate(); + this->l3FlushedAfterCpuRead = true; + } + // Stall until HW reaches taskCount on all its engines const auto waitStatus = waitForAllEngines(true, nullptr); if (waitStatus == WaitStatus::gpuHang) { diff --git a/opencl/source/command_queue/hardware_interface.h b/opencl/source/command_queue/hardware_interface.h index ecb7b3dce8..73c46a7eca 100644 --- a/opencl/source/command_queue/hardware_interface.h +++ b/opencl/source/command_queue/hardware_interface.h @@ -53,6 +53,7 @@ struct HardwareInterfaceWalkerArgs { uint32_t interfaceDescriptorIndex = 0; bool isMainKernel = false; bool relaxedOrderingEnabled = false; + bool blocking = false; }; struct HardwareInterfaceHelper { diff --git a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl index b2a044f947..12034e662d 100644 --- 
a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl +++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl @@ -106,7 +106,7 @@ inline void HardwareInterface::programWalker( if constexpr (heaplessModeEnabled) { auto &productHelper = rootDeviceEnvironment.getHelper(); - bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation || kernel.isAnyKernelArgumentUsingZeroCopyMemory(); + bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation; bool flushL3AfterPostSyncForExternalAllocation = kernel.isUsingSharedObjArgs(); if (debugManager.flags.RedirectFlushL3HostUsmToExternal.get() && flushL3AfterPostSyncForHostUsm) { @@ -114,7 +114,9 @@ inline void HardwareInterface::programWalker( flushL3AfterPostSyncForExternalAllocation = true; } - GpgpuWalkerHelper::template setupTimestampPacketFlushL3(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation); + if (walkerArgs.event != nullptr || walkerArgs.blocking) { + GpgpuWalkerHelper::template setupTimestampPacketFlushL3(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation); + } } } diff --git a/opencl/test/unit_test/command_queue/finish_tests.cpp b/opencl/test/unit_test/command_queue/finish_tests.cpp index 35543ae830..27be1c2ba2 100644 --- a/opencl/test/unit_test/command_queue/finish_tests.cpp +++ b/opencl/test/unit_test/command_queue/finish_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -87,6 +87,7 @@ HWTEST_F(FinishTest, WhenFinishIsCalledThenPipeControlIsNotAddedToCqCommandStrea auto itorCmd = reverseFind(cmdList.rbegin(), cmdList.rend()); EXPECT_EQ(cmdList.rend(), itorCmd); } + HWTEST_F(FinishTest, givenFreshQueueWhenFinishIsCalledThenCommandStreamIsNotAllocated) { MockContext contextWithMockCmdQ(pClDevice, true); MockCommandQueueHw cmdQ(&contextWithMockCmdQ, pClDevice, 0); @@ -96,3 
+97,37 @@ HWTEST_F(FinishTest, givenFreshQueueWhenFinishIsCalledThenCommandStreamIsNotAllo EXPECT_EQ(nullptr, cmdQ.peekCommandStream()); } + +HWTEST_F(FinishTest, givenL3FlushAfterPostSyncEnabledWhenFlushTagUpdateIsCalledThenPipeControlIsAddedWithDcFlushEnabled) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + DebugManagerStateRestore dbgRestorer; + debugManager.flags.EnableL3FlushAfterPostSync.set(true); + + auto &productHelper = pClDevice->getDevice().getProductHelper(); + if (!productHelper.isL3FlushAfterPostSyncRequired(true)) { + GTEST_SKIP(); + } + + MockContext contextWithMockCmdQ(pClDevice, true); + MockCommandQueueHw cmdQ(&contextWithMockCmdQ, pClDevice, 0); + + cmdQ.l3FlushedAfterCpuRead = false; + cmdQ.l3FlushAfterPostSyncEnabled = true; + + auto &csr = cmdQ.getUltCommandStreamReceiver(); + auto used = csr.commandStream.getUsed(); + auto retVal = cmdQ.finish(); + ASSERT_EQ(CL_SUCCESS, retVal); + + HardwareParse hwParse; + hwParse.parseCommands(csr.commandStream, used); + auto itorCmd = find(hwParse.cmdList.begin(), hwParse.cmdList.end()); + + EXPECT_NE(hwParse.cmdList.end(), itorCmd); + + // Verify DC flush is enabled + auto pipeControl = genCmdCast(*itorCmd); + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(csr.dcFlushSupport, pipeControl->getDcFlushEnable()); +} diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index c610fbd225..82a172aa62 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -309,6 +309,8 @@ class MockCommandQueueHw : public CommandQueueHw { using BaseClass::isCacheFlushOnNextBcsWriteRequired; using BaseClass::isCompleted; using BaseClass::isGpgpuSubmissionForBcsRequired; + using BaseClass::l3FlushAfterPostSyncEnabled; + using BaseClass::l3FlushedAfterCpuRead; using BaseClass::latestSentEnqueueType; using BaseClass::minimalSizeForBcsSplit; using BaseClass::obtainCommandStream;