performance: Reuse GPU timestamp instead of KMD escape

This can be enabled only when the
related debug flag is set.

Related-To: NEO-10615

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2024-04-30 10:59:04 +00:00
committed by Compute-Runtime-Automation
parent c1004b77bf
commit 83e8ae4a20
17 changed files with 601 additions and 74 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2023 Intel Corporation
* Copyright (C) 2018-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -889,11 +889,12 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
auto osTime = pClDevice->getDevice().getOSTime();
HwTimeStamps timestamp{};
timestamp.globalStartTS = 3;
event.queueTimeStamp.gpuTimeStamp = 2;
event.submitTimeStamp.gpuTimeStamp = 4;
event.submitTimeStamp.gpuTimeStamp = osTime->getTimestampRefreshTimeout() + 4;
event.submitTimeStamp.gpuTimeInNs = static_cast<uint64_t>(4 * resolution);
TagNode<HwTimeStamps> timestampNode{};
timestampNode.tagForCpuAccess = &timestamp;
@@ -909,6 +910,76 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
event.timeStampNode = nullptr;
}
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitWhenCalculateStartTimestampThenAdjustTimestmaps) {
DebugManagerStateRestore dbgRestore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
MockContext context{};
auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution;
HwTimeStamps timestamp{};
timestamp.globalStartTS = 3;
event.queueTimeStamp.gpuTimeStamp = 2;
event.submitTimeStamp.gpuTimeStamp = 4;
event.submitTimeStamp.gpuTimeInNs = static_cast<uint64_t>(4 * resolution);
TagNode<HwTimeStamps> timestampNode{};
timestampNode.tagForCpuAccess = &timestamp;
event.timeStampNode = &timestampNode;
uint64_t start = 0u;
uint64_t submit = 0u;
uint64_t queue = 0u;
event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);
event.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, nullptr);
event.getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queue, nullptr);
EXPECT_EQ(start, static_cast<uint64_t>(timestamp.globalStartTS * resolution));
EXPECT_EQ(submit, static_cast<uint64_t>((timestamp.globalStartTS - 1) * resolution));
EXPECT_EQ(queue, static_cast<uint64_t>((timestamp.globalStartTS - 2) * resolution));
event.timeStampNode = nullptr;
}
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitAndStartTSBelowOneWhenCalculateStartTimestampThenAdjustTimestmaps) {
DebugManagerStateRestore dbgRestore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
MockContext context{};
auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution;
HwTimeStamps timestamp{};
timestamp.globalStartTS = 2;
event.queueTimeStamp.gpuTimeStamp = 2;
event.submitTimeStamp.gpuTimeStamp = 4;
event.submitTimeStamp.gpuTimeInNs = static_cast<uint64_t>(4 * resolution);
TagNode<HwTimeStamps> timestampNode{};
timestampNode.tagForCpuAccess = &timestamp;
event.timeStampNode = &timestampNode;
uint64_t start = 0u;
uint64_t submit = 0u;
uint64_t queue = 0u;
event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);
event.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, nullptr);
event.getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queue, nullptr);
EXPECT_EQ(start, static_cast<uint64_t>(timestamp.globalStartTS * resolution));
EXPECT_EQ(submit, 0ul);
EXPECT_EQ(queue, 0ul);
event.timeStampNode = nullptr;
}
TEST_F(InternalsEventTest, givenGpuHangWhenEventWaitReportsHangThenWaititingIsAbortedAndUnfinishedEventsHaveExecutionStatusEqualsToAbortedDueToGpuHang) {
MockCommandQueue cmdQ(mockContext, pClDevice, nullptr, false);