From 59233d9597397c3ea2f85534d45b51a1410a06c2 Mon Sep 17 00:00:00 2001
From: "Dunajski, Bartosz" <bartosz.dunajski@intel.com>
Date: Thu, 17 Aug 2023 13:15:00 +0000
Subject: [PATCH] performance: skip queue state check when waiting for latest
 IOQ TSP

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
---
 opencl/source/event/event.cpp                 |  5 +-
 .../api/cl_enqueue_wait_for_events_tests.inl  | 55 +++++++++++++++++++
 .../test/unit_test/mocks/mock_command_queue.h |  9 ++-
 3 files changed, 67 insertions(+), 2 deletions(-)
diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp
index 18cb83da5a..c5171031fd 100644
--- a/opencl/source/event/event.cpp
+++ b/opencl/source/event/event.cpp
@@ -454,7 +454,10 @@ inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) {
 
     DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
 
-    cmdQueue->handlePostCompletionOperations(true);
+    bool checkQueueCompletionForPostSyncOperations = !(waitedOnTimestamps && !cmdQueue->isOOQEnabled() &&
+                                                       (this->timestampPacketContainer->peekNodes() == cmdQueue->getTimestampPacketContainer()->peekNodes()));
+
+    cmdQueue->handlePostCompletionOperations(checkQueueCompletionForPostSyncOperations);
 
     auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage();
     allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION);
diff --git a/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl b/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl
index 30ad103d42..1f4f98593b 100644
--- a/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl
+++ b/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl
@@ -15,6 +15,7 @@
 #include "opencl/source/event/user_event.h"
 #include "opencl/source/gtpin/gtpin_defs.h"
 #include "opencl/test/unit_test/mocks/mock_event.h"
+#include "opencl/test/unit_test/mocks/mock_kernel.h"
 
 #include "cl_api_tests.h"
 
@@ -154,6 +155,60 @@ HWTEST_F(clEnqueueWaitForEventsTests, givenOoqWhenWaitingForEventThenCallWaitFor
     EXPECT_TRUE(commandQueueHw.latestWaitForTimestampsStatus);
 }
 
+struct clEnqueueWaitForTimestampsTests : public clEnqueueWaitForEventsTests { // wait-for-events fixture with timestamp-related debug flags overridden
+    void SetUp() override {
+        DebugManager.flags.EnableTimestampWaitForQueues.set(4);
+        DebugManager.flags.EnableTimestampWaitForEvents.set(4);
+        DebugManager.flags.EnableTimestampPacket.set(1);
+        // The flags above are overridden before the base fixture is set up.
+        clEnqueueWaitForEventsTests::SetUp();
+    }
+
+    DebugManagerStateRestore restore; // NOTE(review): presumably restores the overridden debug flags on destruction - confirm
+};
+
+HWTEST_F(clEnqueueWaitForTimestampsTests, givenIoqWhenWaitingForLatestEventThenDontCheckQueueCompletion) {
+    MockCommandQueueHw<FamilyType> commandQueueHw(pContext, pDevice, nullptr);
+
+    MockKernelWithInternals kernel(*pDevice);
+
+    cl_event event0, event1;
+    // Enqueue the same kernel twice on one queue, requesting an output event from each enqueue.
+    const size_t gws[] = {1, 1, 1};
+    commandQueueHw.enqueueKernel(kernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &event0);
+    commandQueueHw.enqueueKernel(kernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &event1);
+
+    auto eventObj0 = castToObjectOrAbort<Event>(event0);
+    auto eventObj1 = castToObjectOrAbort<Event>(event1);
+    // Take the first timestamp packet node attached to each event.
+    auto node0 = eventObj0->getTimestampPacketNodes()->peekNodes()[0];
+    auto node1 = eventObj1->getTimestampPacketNodes()->peekNodes()[0];
+    // Address of each node's context-end field: CPU base pointer plus the context-end offset.
+    auto contextEnd0 = ptrOffset(node0->getCpuBase(), node0->getContextEndOffset());
+    auto contextEnd1 = ptrOffset(node1->getCpuBase(), node1->getContextEndOffset());
+    // Write 0 into both context-end fields. NOTE(review): presumably marks the packets as completed so the waits return - confirm.
+    *reinterpret_cast<typename FamilyType::TimestampPacketType *>(contextEnd0) = 0;
+    *reinterpret_cast<typename FamilyType::TimestampPacketType *>(contextEnd1) = 0;
+
+    EXPECT_EQ(0u, commandQueueHw.isCompletedCalled);
+    // Waiting on the event from the first enqueue: isCompleted is expected to be called once.
+    EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event0));
+    EXPECT_EQ(1u, commandQueueHw.isCompletedCalled);
+    // Waiting on the event from the most recent enqueue: the call counter is expected to stay at 1.
+    EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event1));
+    EXPECT_EQ(1u, commandQueueHw.isCompletedCalled);
+    // After switching the queue to out-of-order mode, every wait is expected to bump the counter.
+    commandQueueHw.setOoqEnabled();
+    EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event0));
+    EXPECT_EQ(2u, commandQueueHw.isCompletedCalled);
+
+    EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event1));
+    EXPECT_EQ(3u, commandQueueHw.isCompletedCalled);
+    // Release the events returned by the two enqueues.
+    clReleaseEvent(event0);
+    clReleaseEvent(event1);
+}
+
 HWTEST_F(clEnqueueWaitForEventsTests, givenAlreadyCompletedEventWhenWaitForCompletionThenCheckGpuStateOnce) {
     auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
     auto csrTagAddress = ultCsr.getTagAddress();
diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h
index 05d75e977b..b6c5c4ebf4 100644
--- a/opencl/test/unit_test/mocks/mock_command_queue.h
+++ b/opencl/test/unit_test/mocks/mock_command_queue.h
@@ -433,7 +433,13 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
         latestWaitForTimestampsStatus = BaseClass::waitForTimestamps(copyEnginesToWait, status, mainContainer, deferredContainer);
 
         return latestWaitForTimestampsStatus;
-    };
+    }
+
+    bool isCompleted(TaskCountType gpgpuTaskCount, const Range<CopyEngineState> &bcsStates) override {
+        isCompletedCalled++;
+        // Record the call for test assertions, then delegate through BaseClass, as waitForTimestamps does.
+        return BaseClass::isCompleted(gpgpuTaskCount, bcsStates);
+    }
 
     unsigned int lastCommandType;
     std::vector<Kernel *> lastEnqueuedKernels;
@@ -459,6 +465,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
     } overrideIsCacheFlushForBcsRequired;
     BuiltinOpParams kernelParams;
     std::atomic<TaskCountType> latestTaskCountWaited{std::numeric_limits<uint32_t>::max()};
+    std::atomic<uint32_t> isCompletedCalled{0}; // number of isCompleted() calls made on this mock
     bool flushCalled = false;
     std::optional<WaitStatus> waitForAllEnginesReturnValue{};
     std::optional<WaitStatus> waitUntilCompleteReturnValue{};