From 712e059acefe9f85f1b215fa09c00a628cb21503 Mon Sep 17 00:00:00 2001 From: "Dunajski, Bartosz" Date: Thu, 13 Jul 2023 11:32:39 +0000 Subject: [PATCH] performance: check completion alloc only once when waiting for Event Signed-off-by: Dunajski, Bartosz --- opencl/source/command_queue/command_queue.h | 2 +- opencl/source/event/event.cpp | 13 ++++- opencl/source/event/event.h | 1 + .../api/cl_enqueue_wait_for_events_tests.inl | 51 +++++++++++++++++++ .../test/unit_test/mocks/mock_command_queue.h | 7 +++ 5 files changed, 72 insertions(+), 2 deletions(-) diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 2a50487d50..0cd1135119 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -214,7 +214,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { volatile TagAddressType *getHwTagAddress() const; - bool isCompleted(TaskCountType gpgpuTaskCount, CopyEngineState bcsState); + MOCKABLE_VIRTUAL bool isCompleted(TaskCountType gpgpuTaskCount, CopyEngineState bcsState); bool isWaitForTimestampsEnabled() const; virtual bool waitForTimestamps(Range copyEnginesToWait, TaskCountType taskCount, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) = 0; diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index 44827758c1..7161503dc3 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -447,6 +447,9 @@ inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) { if (waitStatus == WaitStatus::GpuHang) { return WaitStatus::GpuHang; } + + this->gpuStateWaited = true; + updateExecutionStatus(); DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0); @@ -704,7 +707,15 @@ inline void Event::setExecutionStatusToAbortedDueToGpuHang(cl_event *first, cl_e } bool Event::isCompleted() { - return 
cmdQueue->isCompleted(getCompletionStamp(), this->bcsState) || this->areTimestampsCompleted(); + if (gpuStateWaited) { + return true; + } + + if (cmdQueue->isCompleted(getCompletionStamp(), this->bcsState) || this->areTimestampsCompleted()) { + gpuStateWaited = true; + } + + return gpuStateWaited; } bool Event::isWaitForTimestampsEnabled() const { diff --git a/opencl/source/event/event.h b/opencl/source/event/event.h index 06498ee8b9..fcebbc3841 100644 --- a/opencl/source/event/event.h +++ b/opencl/source/event/event.h @@ -393,6 +393,7 @@ class Event : public BaseObject<_cl_event>, public IDNode { // number of events this event depends on std::unique_ptr multiRootDeviceTimestampPacketContainer; std::atomic parentCount; + std::atomic gpuStateWaited = false; // event parents std::vector parentEvents; diff --git a/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl b/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl index 9a95928c77..89aac4987a 100644 --- a/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl +++ b/opencl/test/unit_test/api/cl_enqueue_wait_for_events_tests.inl @@ -129,6 +129,57 @@ TEST_F(clEnqueueWaitForEventsTests, GivenInvalidEventWhenClEnqueueWaitForEventsI ASSERT_EQ(CL_SUCCESS, retVal); } +HWTEST_F(clEnqueueWaitForEventsTests, givenAlreadyCompletedEventWhenWaitForCompletionThenCheckGpuStateOnce) { + auto &ultCsr = pDevice->getUltCommandStreamReceiver(); + auto csrTagAddress = ultCsr.getTagAddress(); + + TaskCountType eventTaskCount = 5; + + *csrTagAddress = eventTaskCount - 1; + + MockEvent event1(pCommandQueue, CL_COMMAND_READ_BUFFER, 0, eventTaskCount); + MockEvent event2(pCommandQueue, CL_COMMAND_READ_BUFFER, 0, eventTaskCount); + cl_event hEvent1 = &event1; + cl_event hEvent2 = &event2; + + EXPECT_EQ(0u, pCommandQueue->isCompletedCalled); + + // Event 1 + event1.updateExecutionStatus(); + EXPECT_EQ(1u, pCommandQueue->isCompletedCalled); + + event1.updateExecutionStatus(); + EXPECT_EQ(2u, 
pCommandQueue->isCompletedCalled); + + *csrTagAddress = eventTaskCount; + + event1.updateExecutionStatus(); + EXPECT_EQ(3u, pCommandQueue->isCompletedCalled); + + event1.updateExecutionStatus(); + EXPECT_EQ(3u, pCommandQueue->isCompletedCalled); + + auto retVal = clEnqueueWaitForEvents(pCommandQueue, 1, &hEvent1); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_EQ(3u, pCommandQueue->isCompletedCalled); + + // Event 2 + retVal = clEnqueueWaitForEvents(pCommandQueue, 1, &hEvent2); + EXPECT_EQ(CL_SUCCESS, retVal); + + // clEnqueueWaitForEvents marks the event as waited before isCompleted() is queried, so isCompletedCalled does not increase + EXPECT_EQ(3u, pCommandQueue->isCompletedCalled); + + retVal = clEnqueueWaitForEvents(pCommandQueue, 1, &hEvent2); + EXPECT_EQ(CL_SUCCESS, retVal); + + EXPECT_EQ(3u, pCommandQueue->isCompletedCalled); + + event2.updateExecutionStatus(); + EXPECT_EQ(3u, pCommandQueue->isCompletedCalled); +} + struct GTPinMockCommandQueue : MockCommandQueue { GTPinMockCommandQueue(Context *context, MockClDevice *device) : MockCommandQueue(context, device, nullptr, false) {} WaitStatus waitUntilComplete(TaskCountType gpgpuTaskCountToWait, Range copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) override { diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 96a26ab9a7..32f0261f81 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -224,9 +224,16 @@ class MockCommandQueue : public CommandQueue { return false; }; + bool isCompleted(TaskCountType gpgpuTaskCount, CopyEngineState bcsState) override { + isCompletedCalled++; + + return CommandQueue::isCompleted(gpgpuTaskCount, bcsState); + } + bool releaseIndirectHeapCalled = false; bool waitForTimestampsCalled = false; cl_int writeBufferRetValue = CL_SUCCESS; + uint32_t isCompletedCalled = 0; uint32_t writeBufferCounter = 0; bool writeBufferBlocking = false; size_t writeBufferOffset = 0;