From ad3aeb6eeac7e1311dac762c5926db968bd5ae41 Mon Sep 17 00:00:00 2001 From: Mateusz Jablonski Date: Fri, 20 Oct 2023 11:10:54 +0000 Subject: [PATCH] fix: use submit timestamp in kernel start time calculation Related-To: NEO-8394 Signed-off-by: Mateusz Jablonski --- opencl/source/event/event.cpp | 8 ++-- opencl/test/unit_test/event/event_tests.cpp | 11 +++-- .../unit_test/profiling/profiling_tests.cpp | 44 +++++++++++++++++++ 3 files changed, 56 insertions(+), 7 deletions(-) diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index 759d1cdc3a..da7106a243 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -370,17 +370,17 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con auto &device = this->cmdQueue->getDevice(); auto &gfxCoreHelper = device.getGfxCoreHelper(); auto frequency = device.getDeviceInfo().profilingTimerResolution; - auto gpuQueueTimeStamp = gfxCoreHelper.getGpuTimeStampInNS(queueTimeStamp.gpuTimeStamp, frequency); + auto gpuSubmitTimeStamp = gfxCoreHelper.getGpuTimeStampInNS(submitTimeStamp.gpuTimeStamp, frequency); if (DebugManager.flags.EnableDeviceBasedTimestamps.get()) { startTimeStamp = static_cast(globalStartTS * frequency); - while (startTimeStamp < gpuQueueTimeStamp) { + while (startTimeStamp < gpuSubmitTimeStamp) { startTimeStamp += static_cast((1ULL << gfxCoreHelper.getGlobalTimeStampBits()) * frequency); } } else { - int64_t c0 = queueTimeStamp.cpuTimeinNS - gpuQueueTimeStamp; + int64_t c0 = submitTimeStamp.cpuTimeinNS - gpuSubmitTimeStamp; startTimeStamp = static_cast(globalStartTS * frequency) + c0; - if (startTimeStamp < queueTimeStamp.cpuTimeinNS) { + if (startTimeStamp < submitTimeStamp.cpuTimeinNS) { c0 += static_cast((1ULL << (gfxCoreHelper.getGlobalTimeStampBits())) * frequency); startTimeStamp = static_cast(globalStartTS * frequency) + c0; } diff --git a/opencl/test/unit_test/event/event_tests.cpp b/opencl/test/unit_test/event/event_tests.cpp index 2adbcc760b..b76f575570 100644 --- a/opencl/test/unit_test/event/event_tests.cpp +++ b/opencl/test/unit_test/event/event_tests.cpp @@ -825,6 +825,10 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseNotEnabledWhenCalculateStartT timestamp.globalStartTS = 2; event.queueTimeStamp.gpuTimeStamp = 1; event.queueTimeStamp.cpuTimeinNS = 100; + + event.queueTimeStamp.gpuTimeStamp = 2; + event.queueTimeStamp.cpuTimeinNS = 200; + TagNode timestampNode{}; timestampNode.tagForCpuAccess = ×tamp; event.timeStampNode = ×tampNode; @@ -834,7 +838,7 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseNotEnabledWhenCalculateStartT auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution; auto &gfxCoreHelper = pClDevice->getGfxCoreHelper(); - auto c0 = event.queueTimeStamp.cpuTimeinNS - gfxCoreHelper.getGpuTimeStampInNS(event.queueTimeStamp.gpuTimeStamp, resolution); + auto c0 = event.submitTimeStamp.cpuTimeinNS - gfxCoreHelper.getGpuTimeStampInNS(event.submitTimeStamp.gpuTimeStamp, resolution); EXPECT_EQ(start, static_cast(timestamp.globalStartTS * resolution) + c0); event.timeStampNode = nullptr; @@ -883,8 +887,9 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle MockEvent event(&cmdQ, CL_COMPLETE, 0, 0); HwTimeStamps timestamp{}; - timestamp.globalStartTS = 1; + timestamp.globalStartTS = 3; event.queueTimeStamp.gpuTimeStamp = 2; + event.submitTimeStamp.gpuTimeStamp = 4; TagNode timestampNode{}; timestampNode.tagForCpuAccess = ×tamp; event.timeStampNode = ×tampNode; @@ -894,7 +899,7 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle auto &gfxCoreHelper = pClDevice->getGfxCoreHelper(); auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution; - auto refStartTime = static_cast(timestamp.globalStartTS * resolution + (1ULL << gfxCoreHelper.getGlobalTimeStampBits()) * resolution); + auto refStartTime = static_cast(timestamp.globalStartTS * resolution) + static_cast((1ULL << gfxCoreHelper.getGlobalTimeStampBits()) * resolution); EXPECT_EQ(start, refStartTime); event.timeStampNode = nullptr; diff --git a/opencl/test/unit_test/profiling/profiling_tests.cpp b/opencl/test/unit_test/profiling/profiling_tests.cpp index 6780211a95..dfbbb993b9 100644 --- a/opencl/test/unit_test/profiling/profiling_tests.cpp +++ b/opencl/test/unit_test/profiling/profiling_tests.cpp @@ -625,6 +625,50 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EventProfilingTests, givenRawTimestampsDebugModeWhen event.timeStampNode = nullptr; } +TEST_F(EventProfilingTests, givenSubmitTimeMuchGreaterThanQueueTimeWhenCalculatingStartTimeThenItIsGreaterThanSubmitTime) { + auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(nullptr)); + MockContext context(device.get()); + MockCommandQueue cmdQ(&context, device.get(), nullptr, false); + cmdQ.setProfilingEnabled(); + + HwTimeStamps timestamp{}; + timestamp.globalStartTS = 10; + timestamp.contextStartTS = 20; + timestamp.globalEndTS = 80; + timestamp.contextEndTS = 56; + + MockTagNode timestampNode{}; + timestampNode.tagForCpuAccess = ×tamp; + + MockEvent event(&cmdQ, CL_COMPLETE, 0, 0); + cl_event clEvent = &event; + + event.queueTimeStamp.cpuTimeinNS = 1; + event.queueTimeStamp.gpuTimeStamp = 2; + + event.submitTimeStamp.cpuTimeinNS = (1ull << 33) + 3; + event.submitTimeStamp.gpuTimeStamp = (1ull << 33) + 4; + + event.timeStampNode = ×tampNode; + + cl_ulong queued = 0ul; + cl_ulong submited = 0ul; + cl_ulong start = 0ul; + cl_ulong end = 0ul; + + clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued, nullptr); + clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submited, nullptr); + clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr); + clGetEventProfilingInfo(clEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, nullptr); + + EXPECT_LT(0ull, queued); + EXPECT_LT(queued, submited); + EXPECT_LT(submited, start); + EXPECT_LT(start, end); + + event.timeStampNode = nullptr; +} + HWCMDTEST_F(IGFX_GEN8_CORE, EventProfilingTest, givenRawTimestampsDebugModeWhenStartTimeStampLTQueueTimeStampThenIncreaseStartTimeStamp) { DebugManagerStateRestore stateRestore; DebugManager.flags.ReturnRawGpuTimestamps.set(1);