performance(ocl): program barrier pc in taskStream

Program the barrier into the task stream before the next kernel enqueue. This reduces the number of batch buffer starts for sequences of enqueue, barrier, enqueue, and so on. Related-To: NEO-8147 Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
2025-12-24 21:18:24 +08:00 · 2023-09-12 14:17:52 +00:00
parent e08d46085b
commit 1b7e178b25
23 changed files with 224 additions and 98 deletions
--- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp
@@ -811,7 +811,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlitEnq
    }
 }

-HWTEST_TEMPLATED_F(BcsBufferTests, givenBarrierWithEmptyWaitlistWhenReleasingMultipleBlockedEnqueuesThenProgramBarrierOnce) {
+HWTEST_TEMPLATED_F(BcsBufferTests, givenStallingCommandsOnNextFlushWhenReleasingMultipleBlockedEnqueuesThenProgramBarrierOnce) {
    DebugManager.flags.OptimizeIoqBarriersHandling.set(0);

    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
@@ -827,7 +827,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBarrierWithEmptyWaitlistWhenReleasingMul
    cl_event waitlist0[] = {&userEvent0};
    cl_event waitlist1[] = {&userEvent1};

-    cmdQ->enqueueBarrierWithWaitList(0, nullptr, nullptr);
+    cmdQ->setStallingCommandsOnNextFlush(true);
    cmdQ->enqueueWriteBuffer(buffer.get(), false, 0, 1, hostPtr, nullptr, 1, waitlist0, nullptr);
    cmdQ->enqueueWriteBuffer(buffer.get(), false, 0, 1, hostPtr, nullptr, 1, waitlist1, nullptr);