diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp
index 47e086dc7a..56c40ad193 100644
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@@ -135,8 +135,8 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
 
         this->heaplessModeEnabled = compilerProductHelper.isHeaplessModeEnabled(hwInfo);
         this->heaplessStateInitEnabled = compilerProductHelper.isHeaplessStateInitEnabled(this->heaplessModeEnabled);
-
         this->isForceStateless = compilerProductHelper.isForceToStatelessRequired();
+        this->l3FlushAfterPostSyncEnabled = productHelper.isL3FlushAfterPostSyncRequired(this->heaplessModeEnabled);
     }
 }
 
diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h
index 5b23b2cf89..6f23741b21 100644
--- a/opencl/source/command_queue/command_queue.h
+++ b/opencl/source/command_queue/command_queue.h
@@ -527,6 +527,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
     bool heaplessModeEnabled = false;
     bool heaplessStateInitEnabled = false;
     bool isForceStateless = false;
+    bool l3FlushedAfterCpuRead = true;
+    bool l3FlushAfterPostSyncEnabled = false;
 };
 
 static_assert(NEO::NonCopyableAndNonMovable<CommandQueue>);
diff --git a/opencl/source/command_queue/command_queue_hw.h b/opencl/source/command_queue/command_queue_hw.h
index f970596384..cba3be9a46 100644
--- a/opencl/source/command_queue/command_queue_hw.h
+++ b/opencl/source/command_queue/command_queue_hw.h
@@ -555,7 +555,8 @@ class CommandQueueHw : public CommandQueue {
                                    CsrDependencies &csrDeps,
                                    KernelOperation *blockedCommandsData,
                                    TimestampPacketDependencies &timestampPacketDependencies,
-                                   bool relaxedOrderingEnabled);
+                                   bool relaxedOrderingEnabled,
+                                   bool blocking);
 
     MOCKABLE_VIRTUAL bool isGpgpuSubmissionForBcsRequired(bool queueBlocked, TimestampPacketDependencies &timestampPacketDependencies, bool containsCrossEngineDependency, bool textureCacheFlushRequired) const;
     void setupEvent(EventBuilder &eventBuilder, cl_event *outEvent, uint32_t cmdType);
diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h
index e1c3308036..d3af3adc86 100644
--- a/opencl/source/command_queue/enqueue_common.h
+++ b/opencl/source/command_queue/enqueue_common.h
@@ -282,9 +282,10 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
             clearLastBcsPackets();
             setStallingCommandsOnNextFlush(false);
         }
+
         processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(),
                                                hwTimeStamps, blockQueue, csrDeps, blockedCommandsData.get(),
-                                               timestampPacketDependencies, relaxedOrderingEnabled);
+                                               timestampPacketDependencies, relaxedOrderingEnabled, blocking);
     } else if (isCacheFlushCommand(commandType)) {
         processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
     } else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
@@ -520,7 +521,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
                                                           CsrDependencies &csrDeps,
                                                           KernelOperation *blockedCommandsData,
                                                           TimestampPacketDependencies &timestampPacketDependencies,
-                                                          bool relaxedOrderingEnabled) {
+                                                          bool relaxedOrderingEnabled, bool blocking) {
     TagNodeBase *hwPerfCounter = nullptr;
     getClFileLogger().dumpKernelArgs(&multiDispatchInfo);
 
@@ -556,6 +557,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
     dispatchWalkerArgs.commandType = commandType;
     dispatchWalkerArgs.event = event;
     dispatchWalkerArgs.relaxedOrderingEnabled = relaxedOrderingEnabled;
+    dispatchWalkerArgs.blocking = blocking;
 
     getGpgpuCommandStreamReceiver().setRequiredScratchSizes(multiDispatchInfo.getRequiredScratchSize(0u), multiDispatchInfo.getRequiredScratchSize(1u));
 
diff --git a/opencl/source/command_queue/enqueue_read_buffer.h b/opencl/source/command_queue/enqueue_read_buffer.h
index 95e0ecd120..cfcac3f791 100644
--- a/opencl/source/command_queue/enqueue_read_buffer.h
+++ b/opencl/source/command_queue/enqueue_read_buffer.h
@@ -76,6 +76,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueReadBufferImpl(
 
     if (isCpuCopyAllowed) {
         if (isMemTransferNeeded) {
+            this->l3FlushedAfterCpuRead = false;
             return enqueueReadWriteBufferOnCpuWithMemoryTransfer(cmdType, buffer, offset, size, ptr,
                                                                  numEventsInWaitList, eventWaitList, event);
         } else {
diff --git a/opencl/source/command_queue/finish.h b/opencl/source/command_queue/finish.h
index 5e18034ce1..fe4e5f51a0 100644
--- a/opencl/source/command_queue/finish.h
+++ b/opencl/source/command_queue/finish.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -15,11 +15,19 @@ namespace NEO {
 
 template <typename GfxFamily>
 cl_int CommandQueueHw<GfxFamily>::finish() {
-    auto result = getGpgpuCommandStreamReceiver().flushBatchedSubmissions();
+
+    auto &csr = getGpgpuCommandStreamReceiver();
+
+    auto result = csr.flushBatchedSubmissions();
     if (!result) {
         return CL_OUT_OF_RESOURCES;
     }
 
+    if (!l3FlushedAfterCpuRead && l3FlushAfterPostSyncEnabled) {
+        csr.flushTagUpdate();
+        this->l3FlushedAfterCpuRead = true;
+    }
+
     // Stall until HW reaches taskCount on all its engines
     const auto waitStatus = waitForAllEngines(true, nullptr);
     if (waitStatus == WaitStatus::gpuHang) {
diff --git a/opencl/source/command_queue/hardware_interface.h b/opencl/source/command_queue/hardware_interface.h
index ecb7b3dce8..73c46a7eca 100644
--- a/opencl/source/command_queue/hardware_interface.h
+++ b/opencl/source/command_queue/hardware_interface.h
@@ -53,6 +53,7 @@ struct HardwareInterfaceWalkerArgs {
     uint32_t interfaceDescriptorIndex = 0;
     bool isMainKernel = false;
     bool relaxedOrderingEnabled = false;
+    bool blocking = false;
 };
 
 struct HardwareInterfaceHelper {
diff --git a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
index b2a044f947..12034e662d 100644
--- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
+++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
@@ -106,7 +106,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
 
         if constexpr (heaplessModeEnabled) {
             auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
-            bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation || kernel.isAnyKernelArgumentUsingZeroCopyMemory();
+            bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation;
             bool flushL3AfterPostSyncForExternalAllocation = kernel.isUsingSharedObjArgs();
 
             if (debugManager.flags.RedirectFlushL3HostUsmToExternal.get() && flushL3AfterPostSyncForHostUsm) {
@@ -114,7 +114,9 @@ inline void HardwareInterface<GfxFamily>::programWalker(
                 flushL3AfterPostSyncForExternalAllocation = true;
             }
 
-            GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacketFlushL3<WalkerType>(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation);
+            if (walkerArgs.event != nullptr || walkerArgs.blocking) {
+                GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacketFlushL3<WalkerType>(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation);
+            }
         }
     }
 
diff --git a/opencl/test/unit_test/command_queue/finish_tests.cpp b/opencl/test/unit_test/command_queue/finish_tests.cpp
index 35543ae830..27be1c2ba2 100644
--- a/opencl/test/unit_test/command_queue/finish_tests.cpp
+++ b/opencl/test/unit_test/command_queue/finish_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -87,6 +87,7 @@ HWTEST_F(FinishTest, WhenFinishIsCalledThenPipeControlIsNotAddedToCqCommandStrea
     auto itorCmd = reverseFind<PIPE_CONTROL *>(cmdList.rbegin(), cmdList.rend());
     EXPECT_EQ(cmdList.rend(), itorCmd);
 }
+
 HWTEST_F(FinishTest, givenFreshQueueWhenFinishIsCalledThenCommandStreamIsNotAllocated) {
     MockContext contextWithMockCmdQ(pClDevice, true);
     MockCommandQueueHw<FamilyType> cmdQ(&contextWithMockCmdQ, pClDevice, 0);
@@ -96,3 +97,37 @@ HWTEST_F(FinishTest, givenFreshQueueWhenFinishIsCalledThenCommandStreamIsNotAllo
 
     EXPECT_EQ(nullptr, cmdQ.peekCommandStream());
 }
+
+HWTEST_F(FinishTest, givenL3FlushAfterPostSyncEnabledWhenFlushTagUpdateIsCalledThenPipeControlIsAddedWithDcFlushEnabled) {
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+
+    DebugManagerStateRestore dbgRestorer; // declared before the flag is overridden below
+    debugManager.flags.EnableL3FlushAfterPostSync.set(true);
+
+    auto &productHelper = pClDevice->getDevice().getProductHelper();
+    if (!productHelper.isL3FlushAfterPostSyncRequired(true)) {
+        GTEST_SKIP(); // product does not report the L3-flush-after-post-sync requirement
+    }
+
+    MockContext contextWithMockCmdQ(pClDevice, true);
+    MockCommandQueueHw<FamilyType> cmdQ(&contextWithMockCmdQ, pClDevice, 0);
+
+    // finish() calls csr.flushTagUpdate() only when both conditions below hold.
+    cmdQ.l3FlushedAfterCpuRead = false; // same state the CPU-copy read-buffer path leaves behind
+    cmdQ.l3FlushAfterPostSyncEnabled = true;
+
+    auto &csr = cmdQ.getUltCommandStreamReceiver();
+    auto used = csr.commandStream.getUsed(); // CSR stream offset captured before finish()
+    auto retVal = cmdQ.finish();
+    ASSERT_EQ(CL_SUCCESS, retVal);
+
+    HardwareParse hwParse;
+    hwParse.parseCommands<FamilyType>(csr.commandStream, used);
+    auto itorCmd = find<PIPE_CONTROL *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
+
+    ASSERT_NE(hwParse.cmdList.end(), itorCmd); // fatal: itorCmd is dereferenced below
+    // The DC flush bit of the first PIPE_CONTROL found must match the CSR's
+    // dcFlushSupport value (it is not expected to be set unconditionally).
+    auto pipeControl = genCmdCast<PIPE_CONTROL *>(*itorCmd);
+    ASSERT_NE(nullptr, pipeControl);
+    EXPECT_EQ(csr.dcFlushSupport, pipeControl->getDcFlushEnable());
+}
diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h
index c610fbd225..82a172aa62 100644
--- a/opencl/test/unit_test/mocks/mock_command_queue.h
+++ b/opencl/test/unit_test/mocks/mock_command_queue.h
@@ -309,6 +309,8 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
     using BaseClass::isCacheFlushOnNextBcsWriteRequired;
     using BaseClass::isCompleted;
     using BaseClass::isGpgpuSubmissionForBcsRequired;
+    using BaseClass::l3FlushAfterPostSyncEnabled;
+    using BaseClass::l3FlushedAfterCpuRead;
     using BaseClass::latestSentEnqueueType;
     using BaseClass::minimalSizeForBcsSplit;
     using BaseClass::obtainCommandStream;