performance: Reuse GPU timestamp instead of KMD escape

This can be enabled only if the related debug flag is set. Related-To: NEO-10615 Signed-off-by: Szymon Morek <szymon.morek@intel.com>
2026-01-10 23:24:53 +08:00 · 2024-04-30 10:59:04 +00:00
parent c1004b77bf
commit 83e8ae4a20
17 changed files with 601 additions and 74 deletions
--- a/opencl/test/unit_test/event/event_tests.cpp
+++ b/opencl/test/unit_test/event/event_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -889,11 +889,12 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
+    auto osTime = pClDevice->getDevice().getOSTime();

    HwTimeStamps timestamp{};
    timestamp.globalStartTS = 3;
    event.queueTimeStamp.gpuTimeStamp = 2;
-    event.submitTimeStamp.gpuTimeStamp = 4;
+    event.submitTimeStamp.gpuTimeStamp = osTime->getTimestampRefreshTimeout() + 4;
    event.submitTimeStamp.gpuTimeInNs = static_cast<uint64_t>(4 * resolution);
    TagNode<HwTimeStamps> timestampNode{};
    timestampNode.tagForCpuAccess = &timestamp;
@@ -909,6 +910,76 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
    event.timeStampNode = nullptr;
 }

+TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitWhenCalculateStartTimestampThenAdjustTimestmaps) { // With EnableReusingGpuTimestamps set: globalStartTS (3) lies between the queue (2) and submit (4) GPU ticks; checks the reported START/SUBMIT/QUEUED values.
+    DebugManagerStateRestore dbgRestore; // presumably restores debug flags when the test scope ends - confirm against the helper
+    debugManager.flags.EnableReusingGpuTimestamps.set(true); // debug flag this scenario depends on
+
+    MockContext context{};
+    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get())); // fresh device created after the flag is set; NOTE(review): presumably so the flag is observed at device creation - confirm
+
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; // request a profiling-enabled queue
+    MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
+    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
+    auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution; // factor used below to turn tick counts into expected profiling values
+
+    HwTimeStamps timestamp{};
+    timestamp.globalStartTS = 3; // start tick: above the queue tick (2), below the submit tick (4)
+    event.queueTimeStamp.gpuTimeStamp = 2;
+    event.submitTimeStamp.gpuTimeStamp = 4;
+    event.submitTimeStamp.gpuTimeInNs = static_cast<uint64_t>(4 * resolution); // submit time expressed as 4 ticks times the resolution
+    TagNode<HwTimeStamps> timestampNode{};
+    timestampNode.tagForCpuAccess = &timestamp;
+    event.timeStampNode = &timestampNode; // points the event at a stack-allocated node; cleared again at the end of the test
+
+    uint64_t start = 0u;
+    uint64_t submit = 0u;
+    uint64_t queue = 0u;
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, nullptr);
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queue, nullptr);
+
+    EXPECT_EQ(start, static_cast<uint64_t>(timestamp.globalStartTS * resolution)); // start expected at globalStartTS ticks
+    EXPECT_EQ(submit, static_cast<uint64_t>((timestamp.globalStartTS - 1) * resolution)); // submit expected one tick before start
+    EXPECT_EQ(queue, static_cast<uint64_t>((timestamp.globalStartTS - 2) * resolution)); // queued expected two ticks before start
+
+    event.timeStampNode = nullptr; // detach the stack node before the event goes out of scope
+}
+
+TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitAndStartTSBelowOneWhenCalculateStartTimestampThenAdjustTimestmaps) { // With EnableReusingGpuTimestamps set: globalStartTS (2) equals the queue tick and is below the submit tick (4); expects SUBMIT and QUEUED to be reported as 0.
+    DebugManagerStateRestore dbgRestore; // presumably restores debug flags when the test scope ends - confirm against the helper
+    debugManager.flags.EnableReusingGpuTimestamps.set(true); // debug flag this scenario depends on
+
+    MockContext context{};
+    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get())); // fresh device created after the flag is set; NOTE(review): presumably so the flag is observed at device creation - confirm
+
+    const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; // request a profiling-enabled queue
+    MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
+    MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
+    auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution; // factor used below to turn tick counts into expected profiling values
+
+    HwTimeStamps timestamp{};
+    timestamp.globalStartTS = 2; // NOTE(review): test name says "StartTSBelowOne" but the value used is 2 - confirm intent
+    event.queueTimeStamp.gpuTimeStamp = 2;
+    event.submitTimeStamp.gpuTimeStamp = 4;
+    event.submitTimeStamp.gpuTimeInNs = static_cast<uint64_t>(4 * resolution); // submit time expressed as 4 ticks times the resolution
+    TagNode<HwTimeStamps> timestampNode{};
+    timestampNode.tagForCpuAccess = &timestamp;
+    event.timeStampNode = &timestampNode; // points the event at a stack-allocated node; cleared again at the end of the test
+
+    uint64_t start = 0u;
+    uint64_t submit = 0u;
+    uint64_t queue = 0u;
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr);
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit, nullptr);
+    event.getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queue, nullptr);
+
+    EXPECT_EQ(start, static_cast<uint64_t>(timestamp.globalStartTS * resolution)); // start expected at globalStartTS ticks
+    EXPECT_EQ(submit, 0ul); // submit expected to be reported as 0 in this scenario
+    EXPECT_EQ(queue, 0ul); // queued expected to be reported as 0 in this scenario
+
+    event.timeStampNode = nullptr; // detach the stack node before the event goes out of scope
+}
+
 TEST_F(InternalsEventTest, givenGpuHangWhenEventWaitReportsHangThenWaititingIsAbortedAndUnfinishedEventsHaveExecutionStatusEqualsToAbortedDueToGpuHang) {
    MockCommandQueue cmdQ(mockContext, pClDevice, nullptr, false);