From 59233d9597397c3ea2f85534d45b51a1410a06c2 Mon Sep 17 00:00:00 2001 From: "Dunajski, Bartosz" Date: Thu, 17 Aug 2023 13:15:00 +0000 Subject: [PATCH] performance: skip queue state check when waiting for latest IOQ TSP Signed-off-by: Dunajski, Bartosz --- opencl/source/event/event.cpp | 5 +- .../api/cl_enqueue_wait_for_events_tests.inl | 55 +++++++++++++++++++ .../test/unit_test/mocks/mock_command_queue.h | 9 ++- 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index 18cb83da5a..c5171031fd 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -454,7 +454,10 @@ inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) { DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0); - cmdQueue->handlePostCompletionOperations(true); +/* The flag handed to handlePostCompletionOperations becomes false only when waitedOnTimestamps is true, the queue is not out-of-order, and this event's timestamp nodes compare equal to the nodes of the queue's timestamp packet container; in every other case it stays true, which is what was passed unconditionally before this change. */ bool checkQueueCompletionForPostSyncOperations = !(waitedOnTimestamps && !cmdQueue->isOOQEnabled() && + (this->timestampPacketContainer->peekNodes() == cmdQueue->getTimestampPacketContainer()->peekNodes())); + + cmdQueue->handlePostCompletionOperations(checkQueueCompletionForPostSyncOperations); auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage(); allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION); diff --git a/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl b/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl index 30ad103d42..1f4f98593b 100644 --- a/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl +++ b/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl @@ -15,6 +15,7 @@ #include "opencl/source/event/user_event.h" #include "opencl/source/gtpin/gtpin_defs.h" #include "opencl/test/unit_test/mocks/mock_event.h" +#include "opencl/test/unit_test/mocks/mock_kernel.h" #include "cl_api_tests.h" @@ -154,6 +155,60 @@ HWTEST_F(clEnqueueWaitForEventsTests, 
givenOoqWhenWaitingForEventThenCallWaitFor EXPECT_TRUE(commandQueueHw.latestWaitForTimestampsStatus); } +struct clEnqueueWaitForTimestampsTests : public clEnqueueWaitForEventsTests { +/* Sets EnableTimestampWaitForQueues=4, EnableTimestampWaitForEvents=4 and EnableTimestampPacket=1, then runs the base fixture's SetUp. */ void SetUp() override { + DebugManager.flags.EnableTimestampWaitForQueues.set(4); + DebugManager.flags.EnableTimestampWaitForEvents.set(4); + DebugManager.flags.EnableTimestampPacket.set(1); + + clEnqueueWaitForEventsTests::SetUp(); + } + +/* NOTE(review): presumably restores the debug flags changed in SetUp when the fixture is destroyed - confirm against DebugManagerStateRestore. */ DebugManagerStateRestore restore; +}; + +HWTEST_F(clEnqueueWaitForTimestampsTests, givenIoqWhenWaitingForLatestEventThenDontCheckQueueCompletion) { +/* Uses the mock's isCompletedCalled counter to observe how many isCompleted calls happen while waiting on two kernel events, first with the queue as created and then after setOoqEnabled(). */ MockCommandQueueHw commandQueueHw(pContext, pDevice, nullptr); + + MockKernelWithInternals kernel(*pDevice); + + cl_event event0, event1; + + const size_t gws[] = {1, 1, 1}; + commandQueueHw.enqueueKernel(kernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &event0); + commandQueueHw.enqueueKernel(kernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &event1); + + auto eventObj0 = castToObjectOrAbort(event0); + auto eventObj1 = castToObjectOrAbort(event1); + + auto node0 = eventObj0->getTimestampPacketNodes()->peekNodes()[0]; + auto node1 = eventObj1->getTimestampPacketNodes()->peekNodes()[0]; + + auto contextEnd0 = ptrOffset(node0->getCpuBase(), node0->getContextEndOffset()); + auto contextEnd1 = ptrOffset(node1->getCpuBase(), node1->getContextEndOffset()); + +/* Writes 0 at the context-end offset of each event's first timestamp node; presumably this makes both packets read as finished - confirm. */ *reinterpret_cast(contextEnd0) = 0; + *reinterpret_cast(contextEnd1) = 0; + + EXPECT_EQ(0u, commandQueueHw.isCompletedCalled); + +/* Waiting on event0 (the first of the two enqueues): exactly one isCompleted call is expected. */ EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event0)); + EXPECT_EQ(1u, commandQueueHw.isCompletedCalled); + +/* Waiting on event1 (the most recent enqueue): the counter is expected to stay at 1. */ EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event1)); + EXPECT_EQ(1u, commandQueueHw.isCompletedCalled); + +/* After setOoqEnabled(), each of the two waits below is expected to add one isCompleted call (2, then 3). */ commandQueueHw.setOoqEnabled(); + EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event0)); + EXPECT_EQ(2u, commandQueueHw.isCompletedCalled); + + EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event1)); + 
EXPECT_EQ(3u, commandQueueHw.isCompletedCalled); + + clReleaseEvent(event0); + clReleaseEvent(event1); +} + HWTEST_F(clEnqueueWaitForEventsTests, givenAlreadyCompletedEventWhenWaitForCompletionThenCheckGpuStateOnce) { auto &ultCsr = pDevice->getUltCommandStreamReceiver(); auto csrTagAddress = ultCsr.getTagAddress(); diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 05d75e977b..b6c5c4ebf4 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -433,7 +433,13 @@ class MockCommandQueueHw : public CommandQueueHw { latestWaitForTimestampsStatus = BaseClass::waitForTimestamps(copyEnginesToWait, status, mainContainer, deferredContainer); return latestWaitForTimestampsStatus; - }; + } + +/* Test hook: increments isCompletedCalled, then returns the result of CommandQueue::isCompleted called with the same arguments. */ bool isCompleted(TaskCountType gpgpuTaskCount, const Range &bcsStates) override { + isCompletedCalled++; + + return CommandQueue::isCompleted(gpgpuTaskCount, bcsStates); + } unsigned int lastCommandType; std::vector lastEnqueuedKernels; @@ -459,6 +465,7 @@ class MockCommandQueueHw : public CommandQueueHw { } overrideIsCacheFlushForBcsRequired; BuiltinOpParams kernelParams; std::atomic latestTaskCountWaited{std::numeric_limits::max()}; +/* Number of times the isCompleted override of this mock has been entered; starts at 0. */ std::atomic isCompletedCalled = 0; bool flushCalled = false; std::optional waitForAllEnginesReturnValue{}; std::optional waitUntilCompleteReturnValue{};