performance: skip queue state check when waiting for latest IOQ TSP

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-08-17 13:15:00 +00:00
committed by Compute-Runtime-Automation
parent 73ffc56938
commit 59233d9597
3 changed files with 67 additions and 2 deletions

View File

@@ -454,7 +454,10 @@ inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) {
DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0); DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
cmdQueue->handlePostCompletionOperations(true); bool checkQueueCompletionForPostSyncOperations = !(waitedOnTimestamps && !cmdQueue->isOOQEnabled() &&
(this->timestampPacketContainer->peekNodes() == cmdQueue->getTimestampPacketContainer()->peekNodes()));
cmdQueue->handlePostCompletionOperations(checkQueueCompletionForPostSyncOperations);
auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage(); auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage();
allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION); allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION);

View File

@@ -15,6 +15,7 @@
#include "opencl/source/event/user_event.h" #include "opencl/source/event/user_event.h"
#include "opencl/source/gtpin/gtpin_defs.h" #include "opencl/source/gtpin/gtpin_defs.h"
#include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_event.h"
#include "opencl/test/unit_test/mocks/mock_kernel.h"
#include "cl_api_tests.h" #include "cl_api_tests.h"
@@ -154,6 +155,60 @@ HWTEST_F(clEnqueueWaitForEventsTests, givenOoqWhenWaitingForEventThenCallWaitFor
EXPECT_TRUE(commandQueueHw.latestWaitForTimestampsStatus); EXPECT_TRUE(commandQueueHw.latestWaitForTimestampsStatus);
} }
// Fixture derived from clEnqueueWaitForEventsTests that sets the timestamp-wait
// related debug flags before the base fixture is set up.
struct clEnqueueWaitForTimestampsTests : public clEnqueueWaitForEventsTests {
    // Member object, so it is constructed before SetUp() touches any flag.
    // NOTE(review): presumably restores the debug flags when the fixture is destroyed - confirm.
    DebugManagerStateRestore restore;

    void SetUp() override {
        // The flags are written first; the base SetUp() runs afterwards.
        DebugManager.flags.EnableTimestampWaitForQueues.set(4);
        DebugManager.flags.EnableTimestampWaitForEvents.set(4);
        DebugManager.flags.EnableTimestampPacket.set(1);

        clEnqueueWaitForEventsTests::SetUp();
    }
};
// Verifies how often the queue completion check (isCompleted) is invoked while waiting on events:
//  - in-order queue, event that is not the most recently enqueued one -> the check is performed,
//  - in-order queue, most recently enqueued event                     -> the check is skipped,
//  - out-of-order queue                                               -> the check is performed for every wait.
HWTEST_F(clEnqueueWaitForTimestampsTests, givenIoqWhenWaitingForLatestEventThenDontCheckQueueCompletion) {
    MockCommandQueueHw<FamilyType> commandQueueHw(pContext, pDevice, nullptr);
    MockKernelWithInternals kernel(*pDevice);

    // Handles start as nullptr so a failed enqueue can never leave them indeterminate.
    cl_event event0 = nullptr;
    cl_event event1 = nullptr;
    const size_t gws[] = {1, 1, 1};

    // Stop right away if an enqueue fails; the handles below would otherwise be invalid.
    ASSERT_EQ(CL_SUCCESS, commandQueueHw.enqueueKernel(kernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &event0));
    ASSERT_EQ(CL_SUCCESS, commandQueueHw.enqueueKernel(kernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &event1));

    // Writes 0 into the context-end field of the event's first timestamp packet node.
    // NOTE(review): presumably this makes the timestamp wait treat the node as completed - confirm.
    auto writeZeroToContextEnd = [](cl_event event) {
        auto eventObj = castToObjectOrAbort<Event>(event);
        auto node = eventObj->getTimestampPacketNodes()->peekNodes()[0];
        auto contextEnd = ptrOffset(node->getCpuBase(), node->getContextEndOffset());
        *reinterpret_cast<typename FamilyType::TimestampPacketType *>(contextEnd) = 0;
    };
    writeZeroToContextEnd(event0);
    writeZeroToContextEnd(event1);

    EXPECT_EQ(0u, commandQueueHw.isCompletedCalled);

    // In-order queue: event0 is not the latest enqueue, so one completion check is expected.
    EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event0));
    EXPECT_EQ(1u, commandQueueHw.isCompletedCalled);

    // In-order queue: event1 is the latest enqueue, so the counter must not change.
    EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event1));
    EXPECT_EQ(1u, commandQueueHw.isCompletedCalled);

    // Out-of-order queue: every wait is expected to perform the completion check.
    commandQueueHw.setOoqEnabled();

    EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event0));
    EXPECT_EQ(2u, commandQueueHw.isCompletedCalled);

    EXPECT_EQ(CL_SUCCESS, clEnqueueWaitForEvents(&commandQueueHw, 1, &event1));
    EXPECT_EQ(3u, commandQueueHw.isCompletedCalled);

    clReleaseEvent(event0);
    clReleaseEvent(event1);
}
HWTEST_F(clEnqueueWaitForEventsTests, givenAlreadyCompletedEventWhenWaitForCompletionThenCheckGpuStateOnce) { HWTEST_F(clEnqueueWaitForEventsTests, givenAlreadyCompletedEventWhenWaitForCompletionThenCheckGpuStateOnce) {
auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>(); auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
auto csrTagAddress = ultCsr.getTagAddress(); auto csrTagAddress = ultCsr.getTagAddress();

View File

@@ -433,7 +433,13 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
latestWaitForTimestampsStatus = BaseClass::waitForTimestamps(copyEnginesToWait, status, mainContainer, deferredContainer); latestWaitForTimestampsStatus = BaseClass::waitForTimestamps(copyEnginesToWait, status, mainContainer, deferredContainer);
return latestWaitForTimestampsStatus; return latestWaitForTimestampsStatus;
}; }
// Records each completion query in isCompletedCalled, then forwards the
// arguments unchanged to CommandQueue::isCompleted and returns its result.
bool isCompleted(TaskCountType gpgpuTaskCount, const Range<CopyEngineState> &bcsStates) override {
    isCompletedCalled.fetch_add(1);
    const bool completed = CommandQueue::isCompleted(gpgpuTaskCount, bcsStates);
    return completed;
}
unsigned int lastCommandType; unsigned int lastCommandType;
std::vector<Kernel *> lastEnqueuedKernels; std::vector<Kernel *> lastEnqueuedKernels;
@@ -459,6 +465,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
} overrideIsCacheFlushForBcsRequired; } overrideIsCacheFlushForBcsRequired;
BuiltinOpParams kernelParams; BuiltinOpParams kernelParams;
std::atomic<TaskCountType> latestTaskCountWaited{std::numeric_limits<uint32_t>::max()}; std::atomic<TaskCountType> latestTaskCountWaited{std::numeric_limits<uint32_t>::max()};
std::atomic<uint32_t> isCompletedCalled = 0;
bool flushCalled = false; bool flushCalled = false;
std::optional<WaitStatus> waitForAllEnginesReturnValue{}; std::optional<WaitStatus> waitForAllEnginesReturnValue{};
std::optional<WaitStatus> waitUntilCompleteReturnValue{}; std::optional<WaitStatus> waitUntilCompleteReturnValue{};