performance: Reuse GPU timestamps by default

Related-To: NEO-10615

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
Author: Szymon Morek
Date: 2024-05-15 15:22:40 +00:00
Committed by: Compute-Runtime-Automation
Parent: 9989829487
Commit: 7aceed58ca

6 changed files with 94 additions and 86 deletions

View File

@@ -395,6 +395,8 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
     auto &gfxCoreHelper = device.getGfxCoreHelper();
     auto resolution = device.getDeviceInfo().profilingTimerResolution;
+    // Calculate startTimestamp only if it was not already set on CPU
+    if (startTimeStamp.cpuTimeInNs == 0) {
     startTimeStamp.gpuTimeStamp = globalStartTS;
     addOverflowToTimestamp(startTimeStamp.gpuTimeStamp, submitTimeStamp.gpuTimeStamp);
     if (startTimeStamp.gpuTimeStamp < submitTimeStamp.gpuTimeStamp) {
@@ -415,6 +417,8 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
             startTimeStamp.gpuTimeStamp += static_cast<uint64_t>(1ULL << gfxCoreHelper.getGlobalTimeStampBits());
         }
     }
+    }
+    UNRECOVERABLE_IF(startTimeStamp.gpuTimeStamp < submitTimeStamp.gpuTimeStamp);
     auto gpuTicksDiff = startTimeStamp.gpuTimeStamp - submitTimeStamp.gpuTimeStamp;
     auto timeDiff = static_cast<uint64_t>(gpuTicksDiff * resolution);
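The hunk above guards the start-timestamp recalculation behind `startTimeStamp.cpuTimeInNs == 0` and keeps the existing wrap-around correction for the GPU counter. As a minimal standalone sketch of that correction idea, assuming a free-running counter that is `timestampBits` wide (the function name and parameters are illustrative, not NEO's API):

```cpp
#include <cstdint>

// Illustrative sketch only: if the GPU start timestamp reads lower than the
// submit timestamp, the hardware counter wrapped between the two samples, so
// whole wrap spans are added back until ordering is restored. Assumes
// timestampBits < 64 (e.g. 36 or 32 on some hardware).
uint64_t adjustForWraparound(uint64_t startTs, uint64_t submitTs, uint32_t timestampBits) {
    const uint64_t wrapSpan = 1ULL << timestampBits;
    while (startTs < submitTs) {
        startTs += wrapSpan; // one full counter period elapsed
    }
    return startTs;
}
```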

View File

@@ -911,16 +911,10 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
 }
 TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitWhenCalculateStartTimestampThenAdjustTimestmaps) {
-    DebugManagerStateRestore dbgRestore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
-    MockContext context{};
-    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
     const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
-    MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
     MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
-    auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution;
+    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
     HwTimeStamps timestamp{};
     timestamp.globalStartTS = 3;
@@ -946,16 +940,10 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
 }
 TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitAndStartTSBelowOneWhenCalculateStartTimestampThenAdjustTimestmaps) {
-    DebugManagerStateRestore dbgRestore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
-    MockContext context{};
-    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
     const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
-    MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
     MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
-    auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution;
+    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
     HwTimeStamps timestamp{};
     timestamp.globalStartTS = 2;
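The updated tests drop their locally built `MockContext`/`MockClDevice` and reuse the fixture's shared `mockContext` and `pClDevice`, since the flag no longer needs to be forced per test. A hypothetical sketch of such a fixture, for orientation only (the real `InternalsEventTest` in the NEO tree may be wired differently):

```cpp
// Hypothetical fixture sketch; the member names follow the diff, everything
// else is an assumption about how such a fixture could be assembled.
class InternalsEventTestSketch : public ::testing::Test {
  protected:
    void SetUp() override {
        mockContext = new MockContext();
        pClDevice = mockContext->getDevice(0); // device owned by the context
    }
    void TearDown() override {
        mockContext->release();
    }
    MockContext *mockContext = nullptr;
    ClDevice *pClDevice = nullptr;
};
```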

View File

@@ -347,7 +347,7 @@ DECLARE_DEBUG_VARIABLE(bool, DisableStatelessToStatefulOptimization, false, "Dis
 DECLARE_DEBUG_VARIABLE(bool, DisableConcurrentBlockExecution, false, "disables concurrent block kernel execution")
 DECLARE_DEBUG_VARIABLE(bool, UseNoRingFlushesKmdMode, true, "Windows only, passes flag to KMD that informs KMD to not emit any ring buffer flushes.")
 DECLARE_DEBUG_VARIABLE(bool, DisableZeroCopyForUseHostPtr, false, "When active all buffer allocations created with CL_MEM_USE_HOST_PTR flag will not share memory with CPU.")
-DECLARE_DEBUG_VARIABLE(bool, EnableReusingGpuTimestamps, false, "When enabled, GPU timestamp will be reused for next device time requests")
+DECLARE_DEBUG_VARIABLE(bool, EnableReusingGpuTimestamps, true, "When enabled, GPU timestamp will be reused for next device time requests")
 DECLARE_DEBUG_VARIABLE(int32_t, AllowZeroCopyWithoutCoherency, -1, "Use cacheline flush instead of memory copy for map/unmap mem object")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableHostPtrTracking, -1, "Enable host ptr tracking: -1 - default platform setting, 0 - disabled, 1 - enabled")
 DECLARE_DEBUG_VARIABLE(int32_t, MaxHwThreadsPercent, 0, "If not zero then maximum number of used HW threads is capped to max * MaxHwThreadsPercent / 100")
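The only functional change here is the default flipping from `false` to `true`; the variable itself still works, so the old behavior stays reachable by overriding it. A generic sketch of how a compiled-in default plus a runtime override can be combined — this is an illustration of the pattern, not NEO's actual settings reader:

```cpp
#include <cstdlib>
#include <string>

// Illustration only: a boolean debug variable with a compiled-in default that
// an environment variable of the same name can override at runtime.
bool readBoolFlag(const char *name, bool compiledDefault) {
    if (const char *value = std::getenv(name)) {
        return std::string(value) != "0"; // "0" disables, anything else enables
    }
    return compiledDefault;
}

// With the new default, reuse stays enabled unless explicitly turned off:
// bool reuseTimestamps = readBoolFlag("EnableReusingGpuTimestamps", true);
```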

View File

@@ -601,5 +601,5 @@ ForceSynchronizedDispatchMode = -1
 DirectSubmissionControllerAdjustOnThrottleAndAcLineStatus = -1
 ReadOnlyAllocationsTypeMask = 0
 EnableLogLevel = 6
-EnableReusingGpuTimestamps = 0
+EnableReusingGpuTimestamps = 1
 # Please don't edit below this line

View File

@@ -126,27 +126,27 @@ TEST_F(DrmTimeTest, given36BitGpuTimeStampWhenGpuTimeStampOverflowThenGpuTimeDoe
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
     EXPECT_EQ(100ull, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 200ll;
+    actualTime = 200ll;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(200ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(actualTime, gpuCpuTime.gpuTimeStamp);
     osTime->maxGpuTimeStamp = 1ull << 36;
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 10ull; // read below initial value
-    EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
+    deviceTime->gpuCpuTimeValue = {10ull, 10ull}; // read from KMD below initial value
+    EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime, true));
     EXPECT_EQ(osTime->maxGpuTimeStamp + 10ull, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 30ull; // second read below initial value
+    actualTime = 30ull; // second read below initial value
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(osTime->maxGpuTimeStamp + 30ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(osTime->maxGpuTimeStamp + actualTime, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 110ull;
+    actualTime = 110ull;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(osTime->maxGpuTimeStamp + 110ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(osTime->maxGpuTimeStamp + actualTime, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 70ull; // second overflow
+    actualTime = 70ull; // second overflow
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(2ull * osTime->maxGpuTimeStamp + 70ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(2ull * osTime->maxGpuTimeStamp + actualTime, gpuCpuTime.gpuTimeStamp);
 }
 TEST_F(DrmTimeTest, given64BitGpuTimeStampWhenGpuTimeStampOverflowThenOverflowsAreNotDetected) {
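The 36-bit hunk above and the 64-bit hunk below exercise the same bookkeeping: remember the last raw counter value and add one wrap span whenever a fresh read comes back lower. A compact sketch of that logic, matching the arithmetic the assertions check (`GpuClockTracker` is an illustrative name, not a NEO type; `maxGpuTimeStamp == 0` models the 64-bit case where overflow is not detected):

```cpp
#include <cstdint>

// Sketch of the monotonic-timestamp bookkeeping these tests exercise.
struct GpuClockTracker {
    uint64_t maxGpuTimeStamp = 0; // wrap span, e.g. 1ull << 36; 0 => 64-bit counter
    uint64_t lastRaw = 0;
    uint64_t overflowOffset = 0;

    uint64_t toMonotonic(uint64_t raw) {
        if (maxGpuTimeStamp != 0 && raw < lastRaw) {
            overflowOffset += maxGpuTimeStamp; // counter wrapped since last read
        }
        lastRaw = raw;
        return overflowOffset + raw; // yields 100, 200, 2^36+10, 2^36+30, ... as asserted
    }
};
```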
@@ -157,27 +157,27 @@ TEST_F(DrmTimeTest, given64BitGpuTimeStampWhenGpuTimeStampOverflowThenOverflowsA
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
     EXPECT_EQ(100ull, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 200ull;
+    actualTime = 200ull;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(200ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(actualTime, gpuCpuTime.gpuTimeStamp);
     osTime->maxGpuTimeStamp = 0ull;
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 10ull; // read below initial value
-    EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
+    deviceTime->gpuCpuTimeValue = {10ull, 10ull}; // read from KMD below initial value
+    EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime, true));
     EXPECT_EQ(10ull, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 30ull;
+    actualTime = 30ull;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(30ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(actualTime, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 110ull;
+    actualTime = 110ull;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(110ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(actualTime, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 70ull;
+    actualTime = 70ull;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(70ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(actualTime, gpuCpuTime.gpuTimeStamp);
 }
 TEST_F(DrmTimeTest, GivenInvalidDrmWhenGettingGpuCpuTimeThenFails) {
@@ -287,17 +287,8 @@ TEST_F(DrmTimeTest, whenGettingMaxGpuTimeStampValueThenHwInfoBasedValueIsReturne
 }
 TEST_F(DrmTimeTest, whenGettingGpuTimeStampValueWithinIntervalThenReuseFromPreviousCall) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
-    // Recreate mock to apply debug flag
-    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
-    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
-    osTime = MockOSTimeLinux::create(*rootDeviceEnvironment.osInterface);
-    osTime->setResolutionFunc(resolutionFuncTrue);
-    osTime->setGetTimeFunc(getTimeFuncTrue);
-    osTime->setDeviceTimerResolution(*hwInfo);
     auto deviceTime = osTime->getDeviceTime();
     EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 0u);
     TimeStampData gpuCpuTime;
@@ -320,17 +311,6 @@ TEST_F(DrmTimeTest, whenGettingGpuTimeStampValueWithinIntervalThenReuseFromPrevi
 }
 TEST_F(DrmTimeTest, whenGettingGpuTimeStampValueAfterIntervalThenCallToKmdAndAdaptTimeout) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
-    // Recreate mock to apply debug flag
-    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
-    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
-    osTime = MockOSTimeLinux::create(*rootDeviceEnvironment.osInterface);
-    osTime->setResolutionFunc(resolutionFuncTrue);
-    osTime->setGetTimeFunc(getTimeFuncTrue);
-    osTime->setDeviceTimerResolution(*hwInfo);
     auto deviceTime = osTime->getDeviceTime();
     deviceTime->callBaseGetGpuCpuTimeImpl = false;
     EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 0u);
@@ -400,3 +380,20 @@ TEST_F(DrmTimeTest, givenReusingTimestampsDisabledWhenGetTimestampRefreshTimeout
     osTime->setGetTimeFunc(getTimeFuncTrue);
     EXPECT_EQ(0ul, osTime->getTimestampRefreshTimeout());
 }
+
+TEST_F(DrmTimeTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysCallKmd) {
+    DebugManagerStateRestore restore;
+    debugManager.flags.EnableReusingGpuTimestamps.set(0);
+    // Recreate mock to apply debug flag
+    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
+    osTime = MockOSTimeLinux::create(*rootDeviceEnvironment.osInterface);
+    osTime->setResolutionFunc(resolutionFuncTrue);
+    osTime->setGetTimeFunc(getTimeFuncTrue);
+    auto deviceTime = osTime->getDeviceTime();
+    TimeStampData gpuCpuTime;
+    osTime->getGpuCpuTime(&gpuCpuTime);
+    EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u);
+    osTime->getGpuCpuTime(&gpuCpuTime);
+    EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
+}
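Taken together, the refresh-timeout tests describe the core of the feature: within the timeout the runtime extrapolates from the cached CPU/GPU pair instead of calling the KMD, after the timeout the pair is refreshed (and the timeout adapted), and with the flag disabled the timeout is 0 so every call reaches the KMD. A rough sketch under those assumptions (`TimestampReuser`, `queryKmd`, and `cpuNow` are illustrative names, not the NEO `DeviceTime` interface):

```cpp
#include <cstdint>

struct TimestampPair {
    uint64_t gpuTimeStamp;
    uint64_t cpuTimeInNs;
};

// Illustrative reuse policy: extrapolate inside the refresh window, otherwise
// fall back to the expensive KMD query and cache the fresh pair.
struct TimestampReuser {
    TimestampPair cached{};
    uint64_t refreshTimeoutNs = 0; // 0 => reuse disabled, always query KMD
    double nsPerGpuTick = 1.0;     // device timer resolution

    template <typename QueryKmdFn, typename CpuNowFn>
    TimestampPair get(QueryKmdFn queryKmd, CpuNowFn cpuNow, bool forceKmdCall) {
        const uint64_t nowNs = cpuNow();
        const uint64_t ageNs = nowNs - cached.cpuTimeInNs;
        if (!forceKmdCall && refreshTimeoutNs != 0 && ageNs < refreshTimeoutNs) {
            // Within the interval: derive the GPU time from the cached sample.
            const auto ticks = static_cast<uint64_t>(ageNs / nsPerGpuTick);
            return {cached.gpuTimeStamp + ticks, nowNs};
        }
        cached = queryKmd(); // costly round trip, at most once per window
        return cached;
    }
};
```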

View File

@@ -53,6 +53,8 @@ struct OSTimeWinTest : public ::testing::Test {
         rootDeviceEnvironment.osInterface = std::make_unique<OSInterface>();
         rootDeviceEnvironment.osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
         osTime = std::unique_ptr<MockOSTimeWin>(new MockOSTimeWin(*rootDeviceEnvironment.osInterface));
+        auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
+        osTime->setDeviceTimerResolution(*hwInfo);
     }
     void TearDown() override {
@@ -62,8 +64,11 @@ struct OSTimeWinTest : public ::testing::Test {
 };
 TEST_F(OSTimeWinTest, given36BitGpuTimeStampWhenGpuTimeStampOverflowThenGpuTimeDoesNotDecrease) {
+    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
+    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
     auto deviceTime = new MockDeviceTimeWin();
     osTime->deviceTime.reset(deviceTime);
+    osTime->setDeviceTimerResolution(*hwInfo);
     TimeStampData gpuCpuTime = {0ull, 0ull};
@@ -95,8 +100,11 @@ TEST_F(OSTimeWinTest, given36BitGpuTimeStampWhenGpuTimeStampOverflowThenGpuTimeD
 }
 TEST_F(OSTimeWinTest, given64BitGpuTimeStampWhenGpuTimeStampOverflowThenOverflowsAreNotDetected) {
+    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
+    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
     auto deviceTime = new MockDeviceTimeWin();
     osTime->deviceTime.reset(deviceTime);
+    osTime->setDeviceTimerResolution(*hwInfo);
     TimeStampData gpuCpuTime = {0ull, 0ull};
@@ -183,9 +191,12 @@ TEST(OSTimeWinTests, givenOSInterfaceWhenGetGpuCpuTimeThenReturnsSuccess) {
     auto wddm = new WddmMock(rootDeviceEnvironment);
     TimeStampData gpuCpuTime01 = {};
     TimeStampData gpuCpuTime02 = {};
-    std::unique_ptr<OSInterface> osInterface(new OSInterface());
-    osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
-    auto osTime = OSTime::create(osInterface.get());
+    rootDeviceEnvironment.osInterface = std::make_unique<OSInterface>();
+    rootDeviceEnvironment.osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
+    wddm->init();
+    auto osTime = OSTime::create(rootDeviceEnvironment.osInterface.get());
+    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
+    osTime->setDeviceTimerResolution(*hwInfo);
     auto success = osTime->getGpuCpuTime(&gpuCpuTime01);
     EXPECT_TRUE(success);
     EXPECT_NE(0u, gpuCpuTime01.cpuTimeinNS);
@@ -208,8 +219,6 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueThenHwInfoBasedValueIsRetur
 }
 TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWithinIntervalThenReuseFromPreviousCall) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
     osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
     LARGE_INTEGER frequency = {};
     frequency.QuadPart = NSEC_PER_SEC;
@@ -245,8 +254,6 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWithinIntervalThenReuseFrom
 }
 TEST_F(OSTimeWinTest, whenGettingGpuTimeStampValueAfterIntervalThenCallToKmdAndAdaptTimeout) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
     osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
     LARGE_INTEGER frequency = {};
     frequency.QuadPart = NSEC_PER_SEC;
@@ -304,8 +311,6 @@ TEST_F(OSTimeWinTest, whenGetGpuCpuTimeFailedThenReturnFalse) {
 }
 TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueAfterFlagSetThenCallToKmd) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
     TimeStampData gpuCpuTime;
     auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
     auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
@@ -324,8 +329,6 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueAfterFlagSetThenCallToKmd)
 }
 TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWhenForceFlagSetThenCallToKmd) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
     osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
     LARGE_INTEGER frequency = {};
     frequency.QuadPart = NSEC_PER_SEC;
@@ -359,3 +362,19 @@ TEST_F(OSTimeWinTest, givenReusingTimestampsDisabledWhenGetTimestampRefreshTimeo
     osTime->setDeviceTimerResolution(*hwInfo);
     EXPECT_EQ(0ul, osTime->getTimestampRefreshTimeout());
 }
+
+TEST_F(OSTimeWinTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysCallKmd) {
+    DebugManagerStateRestore restore;
+    debugManager.flags.EnableReusingGpuTimestamps.set(0);
+    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
+    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
+    auto deviceTime = new MockDeviceTimeWin();
+    osTime->deviceTime.reset(deviceTime);
+    osTime->setDeviceTimerResolution(*hwInfo);
+    TimeStampData gpuCpuTime;
+    osTime->getGpuCpuTime(&gpuCpuTime);
+    EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u);
+    osTime->getGpuCpuTime(&gpuCpuTime);
+    EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
+}
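For completeness, here is how the disabled-flag expectation above maps onto the `TimestampReuser` sketch from the Linux section: with a zero refresh timeout both calls reach the KMD, mirroring `getGpuCpuTimeImplCalled` going from 1 to 2 (all names illustrative):

```cpp
// Usage example for the TimestampReuser sketch above (illustrative names).
void demoReuseDisabled() {
    int kmdCalls = 0;
    TimestampReuser reuser; // refreshTimeoutNs == 0 => reuse disabled
    auto queryKmd = [&] { ++kmdCalls; return TimestampPair{100ull, 5000ull}; };
    auto cpuNow = [] { return 5000ull; };

    reuser.get(queryKmd, cpuNow, false); // KMD call #1
    reuser.get(queryKmd, cpuNow, false); // KMD call #2, no reuse when disabled
    // kmdCalls == 2, matching the EXPECT_EQ(..., 2u) pattern in the tests
}
```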