performance: check completion alloc only once when waiting for Event

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz 2023-07-13 11:32:39 +00:00 committed by Compute-Runtime-Automation
parent ed972bb21c
commit 712e059ace
5 changed files with 72 additions and 2 deletions

View File

@ -214,7 +214,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
volatile TagAddressType *getHwTagAddress() const;
// Single declaration of the completion query. It is overridden by
// MockCommandQueue (which counts the calls), so it has to be the
// MOCKABLE_VIRTUAL form; a second, plain declaration of the same member
// would be an ill-formed redeclaration.
MOCKABLE_VIRTUAL bool isCompleted(TaskCountType gpgpuTaskCount, CopyEngineState bcsState);
bool isWaitForTimestampsEnabled() const;
virtual bool waitForTimestamps(Range<CopyEngineState> copyEnginesToWait, TaskCountType taskCount, WaitStatus &status, TimestampPacketContainer *mainContainer, TimestampPacketContainer *deferredContainer) = 0;

View File

@ -447,6 +447,9 @@ inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) {
if (waitStatus == WaitStatus::GpuHang) {
return WaitStatus::GpuHang;
}
this->gpuStateWaited = true;
updateExecutionStatus();
DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
@ -704,7 +707,15 @@ inline void Event::setExecutionStatusToAbortedDueToGpuHang(cl_event *first, cl_e
}
// Reports whether this event has been observed as completed.
//
// The positive answer is latched in gpuStateWaited: once it is true, the
// command queue and the timestamp packets are not queried again. A negative
// answer is never cached, so every call re-checks until completion is seen.
//
// Returns true when the flag is already set, or when either
// cmdQueue->isCompleted(...) or areTimestampsCompleted() reports completion.
bool Event::isCompleted() {
    // Fast path: completion was already observed earlier.
    if (gpuStateWaited) {
        return true;
    }

    if (cmdQueue->isCompleted(getCompletionStamp(), this->bcsState) || this->areTimestampsCompleted()) {
        gpuStateWaited = true;
    }

    return gpuStateWaited;
}
bool Event::isWaitForTimestampsEnabled() const {

View File

@ -393,6 +393,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
// number of events this event depends on
std::unique_ptr<TimestampPacketContainer> multiRootDeviceTimestampPacketContainer;
std::atomic<int> parentCount;
// Cached "completion already observed" flag. It is set to true in
// Event::wait() once the wait did not end in a GPU hang, and in
// Event::isCompleted() when completion is detected.
std::atomic<bool> gpuStateWaited = false;
// event parents
std::vector<Event *> parentEvents;

View File

@ -129,6 +129,57 @@ TEST_F(clEnqueueWaitForEventsTests, GivenInvalidEventWhenClEnqueueWaitForEventsI
ASSERT_EQ(CL_SUCCESS, retVal);
}
// Verifies that once an event has been seen as completed, later status
// updates and waits no longer call CommandQueue::isCompleted. The number of
// calls is tracked through pCommandQueue->isCompletedCalled.
HWTEST_F(clEnqueueWaitForEventsTests, givenAlreadyCompletedEventWhenWaitForCompletionThenCheckGpuStateOnce) {
auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
auto csrTagAddress = ultCsr.getTagAddress();
TaskCountType eventTaskCount = 5;
// Start with the tag one below the events' task count.
// NOTE(review): presumably this makes the events read as "not completed yet" - confirm against CommandQueue::isCompleted.
*csrTagAddress = eventTaskCount - 1;
MockEvent<Event> event1(pCommandQueue, CL_COMMAND_READ_BUFFER, 0, eventTaskCount);
MockEvent<Event> event2(pCommandQueue, CL_COMMAND_READ_BUFFER, 0, eventTaskCount);
cl_event hEvent1 = &event1;
cl_event hEvent2 = &event2;
// Creating the events must not query the queue.
EXPECT_EQ(0u, pCommandQueue->isCompletedCalled);
// Event 1
// While the tag is below the task count, every status update is expected to query the queue again.
event1.updateExecutionStatus();
EXPECT_EQ(1u, pCommandQueue->isCompletedCalled);
event1.updateExecutionStatus();
EXPECT_EQ(2u, pCommandQueue->isCompletedCalled);
// Tag now equals the events' task count: one more query is expected ...
*csrTagAddress = eventTaskCount;
event1.updateExecutionStatus();
EXPECT_EQ(3u, pCommandQueue->isCompletedCalled);
// ... and after that the counter must stay unchanged on further updates.
event1.updateExecutionStatus();
EXPECT_EQ(3u, pCommandQueue->isCompletedCalled);
// Waiting on event1 afterwards must not add a query either.
auto retVal = clEnqueueWaitForEvents(pCommandQueue, 1, &hEvent1);
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(3u, pCommandQueue->isCompletedCalled);
// Event 2
// event2 has not been updated before; its first contact is the wait itself.
retVal = clEnqueueWaitForEvents(pCommandQueue, 1, &hEvent2);
EXPECT_EQ(CL_SUCCESS, retVal);
// clEnqueueWaitForEvents signals completion before isCompletedCalled()
EXPECT_EQ(3u, pCommandQueue->isCompletedCalled);
// Repeated waits and a later status update on event2 must leave the counter unchanged as well.
retVal = clEnqueueWaitForEvents(pCommandQueue, 1, &hEvent2);
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(3u, pCommandQueue->isCompletedCalled);
event2.updateExecutionStatus();
EXPECT_EQ(3u, pCommandQueue->isCompletedCalled);
}
struct GTPinMockCommandQueue : MockCommandQueue {
GTPinMockCommandQueue(Context *context, MockClDevice *device) : MockCommandQueue(context, device, nullptr, false) {}
WaitStatus waitUntilComplete(TaskCountType gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) override {

View File

@ -224,9 +224,16 @@ class MockCommandQueue : public CommandQueue {
return false;
};
// Test hook: records that the completion query was issued, then forwards
// to the real CommandQueue implementation and returns its answer unchanged.
bool isCompleted(TaskCountType gpgpuTaskCount, CopyEngineState bcsState) override {
    ++isCompletedCalled;
    const bool completed = CommandQueue::isCompleted(gpgpuTaskCount, bcsState);
    return completed;
}
bool releaseIndirectHeapCalled = false;
bool waitForTimestampsCalled = false;
cl_int writeBufferRetValue = CL_SUCCESS;
// Number of times the isCompleted() override has been invoked; incremented on every call.
uint32_t isCompletedCalled = 0;
uint32_t writeBufferCounter = 0;
bool writeBufferBlocking = false;
size_t writeBufferOffset = 0;