Revert "performance: Reuse GPU timestamps by default on Windows"

This reverts commit bca3fecaa0.

Signed-off-by: Compute-Runtime-Validation <compute-runtime-validation@intel.com>
This commit is contained in:
Compute-Runtime-Validation 2024-05-25 07:39:33 +02:00 committed by Compute-Runtime-Automation
parent d7726c9d86
commit 0b2c9e92e7
11 changed files with 62 additions and 87 deletions

View File

@@ -395,8 +395,6 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
auto &gfxCoreHelper = device.getGfxCoreHelper();
auto resolution = device.getDeviceInfo().profilingTimerResolution;
// Calculate startTimestamp only if it was not already set on CPU
if (startTimeStamp.cpuTimeInNs == 0) {
startTimeStamp.gpuTimeStamp = globalStartTS;
addOverflowToTimestamp(startTimeStamp.gpuTimeStamp, submitTimeStamp.gpuTimeStamp);
if (startTimeStamp.gpuTimeStamp < submitTimeStamp.gpuTimeStamp) {
@@ -417,8 +415,6 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
startTimeStamp.gpuTimeStamp += static_cast<uint64_t>(1ULL << gfxCoreHelper.getGlobalTimeStampBits());
}
}
}
UNRECOVERABLE_IF(startTimeStamp.gpuTimeStamp < submitTimeStamp.gpuTimeStamp);
auto gpuTicksDiff = startTimeStamp.gpuTimeStamp - submitTimeStamp.gpuTimeStamp;
auto timeDiff = static_cast<uint64_t>(gpuTicksDiff * resolution);

View File

@@ -911,10 +911,16 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
}
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitWhenCalculateStartTimestampThenAdjustTimestmaps) {
DebugManagerStateRestore dbgRestore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
MockContext context{};
auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution;
HwTimeStamps timestamp{};
timestamp.globalStartTS = 3;
@@ -940,10 +946,16 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
}
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitAndStartTSBelowOneWhenCalculateStartTimestampThenAdjustTimestmaps) {
DebugManagerStateRestore dbgRestore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
MockContext context{};
auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution;
HwTimeStamps timestamp{};
timestamp.globalStartTS = 2;

View File

@@ -350,7 +350,7 @@ DECLARE_DEBUG_VARIABLE(bool, DisableStatelessToStatefulOptimization, false, "Dis
DECLARE_DEBUG_VARIABLE(bool, DisableConcurrentBlockExecution, false, "disables concurrent block kernel execution")
DECLARE_DEBUG_VARIABLE(bool, UseNoRingFlushesKmdMode, true, "Windows only, passes flag to KMD that informs KMD to not emit any ring buffer flushes.")
DECLARE_DEBUG_VARIABLE(bool, DisableZeroCopyForUseHostPtr, false, "When active all buffer allocations created with CL_MEM_USE_HOST_PTR flag will not share memory with CPU.")
DECLARE_DEBUG_VARIABLE(int32_t, EnableReusingGpuTimestamps, -1, "Reuse GPU timestamp for next device time requests. -1: os-specific, 0: disable, 1: enable")
DECLARE_DEBUG_VARIABLE(bool, EnableReusingGpuTimestamps, false, "When enabled, GPU timestamp will be reused for next device time requests")
DECLARE_DEBUG_VARIABLE(int32_t, AllowZeroCopyWithoutCoherency, -1, "Use cacheline flush instead of memory copy for map/unmap mem object")
DECLARE_DEBUG_VARIABLE(int32_t, EnableHostPtrTracking, -1, "Enable host ptr tracking: -1 - default platform setting, 0 - disabled, 1 - enabled")
DECLARE_DEBUG_VARIABLE(int32_t, MaxHwThreadsPercent, 0, "If not zero then maximum number of used HW threads is capped to max * MaxHwThreadsPercent / 100")

View File

@@ -7,7 +7,6 @@
#include "shared/source/os_interface/linux/device_time_drm.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/os_interface/linux/drm_neo.h"
#include "shared/source/os_interface/linux/drm_wrappers.h"
@@ -51,12 +50,4 @@ uint64_t DeviceTimeDrm::getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) c
return static_cast<uint64_t>(nanosecondsPerSecond / OSTime::getDeviceTimerResolution(hwInfo));
}
// Linux/DRM policy for GPU-timestamp reuse: disabled by default.
// The EnableReusingGpuTimestamps debug flag (-1 = keep this os-specific
// default) can explicitly override the decision in either direction.
bool DeviceTimeDrm::isTimestampsRefreshEnabled() const {
bool timestampsRefreshEnabled = false;
if (debugManager.flags.EnableReusingGpuTimestamps.get() != -1) {
timestampsRefreshEnabled = debugManager.flags.EnableReusingGpuTimestamps.get();
}
return timestampsRefreshEnabled;
}
} // namespace NEO

View File

@@ -17,7 +17,6 @@ class DeviceTimeDrm : public DeviceTime {
bool getGpuCpuTimeImpl(TimeStampData *pGpuCpuTime, OSTime *osTime) override;
double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override;
uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override;
bool isTimestampsRefreshEnabled() const override;
protected:
Drm *pDrm = nullptr;

View File

@@ -19,6 +19,13 @@ double OSTime::getDeviceTimerResolution(HardwareInfo const &hwInfo) {
return hwInfo.capabilityTable.defaultProfilingTimerResolution;
};
// Caches the EnableReusingGpuTimestamps debug flag once at construction;
// when reuse is enabled, arms the refresh timeout so a cached GPU timestamp
// is only reused for up to 100ms before a fresh query is forced.
DeviceTime::DeviceTime() {
reusingTimestampsEnabled = debugManager.flags.EnableReusingGpuTimestamps.get();
if (reusingTimestampsEnabled) {
timestampRefreshTimeoutNS = NSEC_PER_MSEC * 100; // 100ms
}
}
bool DeviceTime::getGpuCpuTimeImpl(TimeStampData *pGpuCpuTime, OSTime *osTime) {
pGpuCpuTime->cpuTimeinNS = 0;
pGpuCpuTime->gpuTimeStamp = 0;
@@ -40,14 +47,6 @@ void DeviceTime::setDeviceTimerResolution(HardwareInfo const &hwInfo) {
}
}
// Base (os-agnostic) policy for GPU-timestamp reuse: enabled by default.
// The EnableReusingGpuTimestamps debug flag (-1 = keep this default) can
// explicitly override the decision; subclasses (e.g. DRM) override this
// method to choose a different os-specific default.
bool DeviceTime::isTimestampsRefreshEnabled() const {
bool timestampsRefreshEnabled = true;
if (debugManager.flags.EnableReusingGpuTimestamps.get() != -1) {
timestampsRefreshEnabled = debugManager.flags.EnableReusingGpuTimestamps.get();
}
return timestampsRefreshEnabled;
}
/**
* @brief If this method is called within interval, GPU timestamp
* will be calculated based on CPU timestamp and previous GPU ticks
@@ -64,7 +63,7 @@ bool DeviceTime::getGpuCpuTimestamps(TimeStampData *timeStamp, OSTime *osTime, b
if (forceKmdCall || cpuTimeDiffInNS >= timestampRefreshTimeoutNS) {
refreshTimestamps = true;
}
bool reusingTimestampsEnabled = isTimestampsRefreshEnabled();
if (!reusingTimestampsEnabled || refreshTimestamps) {
if (!getGpuCpuTimeImpl(timeStamp, osTime)) {
return false;

View File

@@ -25,12 +25,12 @@ class OSTime;
class DeviceTime {
public:
DeviceTime();
virtual ~DeviceTime() = default;
bool getGpuCpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime, bool forceKmdCall);
virtual bool getGpuCpuTimeImpl(TimeStampData *pGpuCpuTime, OSTime *osTime);
virtual double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const;
virtual uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const;
virtual bool isTimestampsRefreshEnabled() const;
bool getGpuCpuTimestamps(TimeStampData *timeStamp, OSTime *osTime, bool forceKmdCall);
void setDeviceTimerResolution(HardwareInfo const &hwInfo);
void setRefreshTimestampsFlag() {
@@ -47,8 +47,9 @@ class DeviceTime {
double deviceTimerResolution = 0;
const uint64_t timestampRefreshMinTimeoutNS = NSEC_PER_MSEC; // 1ms
const uint64_t timestampRefreshMaxTimeoutNS = NSEC_PER_SEC; // 1s
uint64_t timestampRefreshTimeoutNS = NSEC_PER_MSEC * 100; // 100ms
uint64_t timestampRefreshTimeoutNS = 0;
bool refreshTimestamps = true;
bool reusingTimestampsEnabled = false;
TimeStampData fetchedTimestamps{};
};

View File

@@ -602,7 +602,7 @@ ForceSynchronizedDispatchMode = -1
DirectSubmissionControllerAdjustOnThrottleAndAcLineStatus = -1
ReadOnlyAllocationsTypeMask = 0
EnableLogLevel = 6
EnableReusingGpuTimestamps = -1
EnableReusingGpuTimestamps = 0
ForceCopyOperationOffloadForComputeCmdList = -1
SecondaryContextEngineTypeMask = -1
# Please don't edit below this line

View File

@@ -86,7 +86,6 @@ TEST(RootDeviceEnvironment, givenExecutionEnvironmentWhenInitializeAubCenterIsCa
}
TEST(RootDeviceEnvironment, whenCreatingRootDeviceEnvironmentThenCreateOsAgnosticOsTime) {
DebugManagerStateRestore dbgRestore;
MockExecutionEnvironment executionEnvironment;
executionEnvironment.rootDeviceEnvironments[0]->setHwInfoAndInitHelpers(defaultHwInfo.get());
auto profilingTimerResolution = defaultHwInfo->capabilityTable.defaultProfilingTimerResolution;
@@ -110,14 +109,6 @@ TEST(RootDeviceEnvironment, whenCreatingRootDeviceEnvironmentThenCreateOsAgnosti
EXPECT_EQ(profilingTimerResolution, rootDeviceEnvironment->osTime->getDynamicDeviceTimerResolution(*defaultHwInfo));
EXPECT_EQ(static_cast<uint64_t>(1000000000.0 / OSTime::getDeviceTimerResolution(*defaultHwInfo)), rootDeviceEnvironment->osTime->getDynamicDeviceTimerClock(*defaultHwInfo));
struct MockOSTime : public OSTime {
using OSTime::deviceTime;
};
auto deviceTime = static_cast<MockOSTime *>(rootDeviceEnvironment->osTime.get())->deviceTime.get();
EXPECT_TRUE(deviceTime->isTimestampsRefreshEnabled());
debugManager.flags.EnableReusingGpuTimestamps.set(0);
EXPECT_FALSE(deviceTime->isTimestampsRefreshEnabled());
}
TEST(RootDeviceEnvironment, givenUseAubStreamFalseWhenGetAubManagerIsCalledThenReturnNull) {

View File

@@ -390,7 +390,7 @@ TEST_F(DrmTimeTest, whenGettingMaxGpuTimeStampValueWhenForceFlagSetThenCallToKmd
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
}
TEST_F(DrmTimeTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysCallKmd) {
TEST_F(DrmTimeTest, givenReusingTimestampsDisabledWhenGetTimestampRefreshTimeoutThenReturnCorrectValue) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(0);
// Recreate mock to apply debug flag
@@ -398,11 +398,5 @@ TEST_F(DrmTimeTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysCal
osTime = MockOSTimeLinux::create(*rootDeviceEnvironment.osInterface);
osTime->setResolutionFunc(resolutionFuncTrue);
osTime->setGetTimeFunc(getTimeFuncTrue);
auto deviceTime = osTime->getDeviceTime();
TimeStampData gpuCpuTime;
osTime->getGpuCpuTime(&gpuCpuTime);
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u);
osTime->getGpuCpuTime(&gpuCpuTime);
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
EXPECT_EQ(0ul, osTime->getTimestampRefreshTimeout());
}

View File

@@ -53,8 +53,6 @@ struct OSTimeWinTest : public ::testing::Test {
rootDeviceEnvironment.osInterface = std::make_unique<OSInterface>();
rootDeviceEnvironment.osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
osTime = std::unique_ptr<MockOSTimeWin>(new MockOSTimeWin(*rootDeviceEnvironment.osInterface));
auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
osTime->setDeviceTimerResolution(*hwInfo);
}
void TearDown() override {
@@ -64,11 +62,8 @@
};
TEST_F(OSTimeWinTest, given36BitGpuTimeStampWhenGpuTimeStampOverflowThenGpuTimeDoesNotDecrease) {
auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
auto deviceTime = new MockDeviceTimeWin();
osTime->deviceTime.reset(deviceTime);
osTime->setDeviceTimerResolution(*hwInfo);
TimeStampData gpuCpuTime = {0ull, 0ull};
@@ -100,11 +95,8 @@ TEST_F(OSTimeWinTest, given36BitGpuTimeStampWhenGpuTimeStampOverflowThenGpuTimeD
}
TEST_F(OSTimeWinTest, given64BitGpuTimeStampWhenGpuTimeStampOverflowThenOverflowsAreNotDetected) {
auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
auto deviceTime = new MockDeviceTimeWin();
osTime->deviceTime.reset(deviceTime);
osTime->setDeviceTimerResolution(*hwInfo);
TimeStampData gpuCpuTime = {0ull, 0ull};
@@ -191,12 +183,9 @@ TEST(OSTimeWinTests, givenOSInterfaceWhenGetGpuCpuTimeThenReturnsSuccess) {
auto wddm = new WddmMock(rootDeviceEnvironment);
TimeStampData gpuCpuTime01 = {};
TimeStampData gpuCpuTime02 = {};
rootDeviceEnvironment.osInterface = std::make_unique<OSInterface>();
rootDeviceEnvironment.osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
wddm->init();
auto osTime = OSTime::create(rootDeviceEnvironment.osInterface.get());
auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
osTime->setDeviceTimerResolution(*hwInfo);
std::unique_ptr<OSInterface> osInterface(new OSInterface());
osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
auto osTime = OSTime::create(osInterface.get());
auto success = osTime->getGpuCpuTime(&gpuCpuTime01);
EXPECT_TRUE(success);
EXPECT_NE(0u, gpuCpuTime01.cpuTimeinNS);
@@ -219,6 +208,8 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueThenHwInfoBasedValueIsRetur
}
TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWithinIntervalThenReuseFromPreviousCall) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
LARGE_INTEGER frequency = {};
frequency.QuadPart = NSEC_PER_SEC;
@@ -254,6 +245,8 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWithinIntervalThenReuseFrom
}
TEST_F(OSTimeWinTest, whenGettingGpuTimeStampValueAfterIntervalThenCallToKmdAndAdaptTimeout) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
LARGE_INTEGER frequency = {};
frequency.QuadPart = NSEC_PER_SEC;
@@ -311,6 +304,8 @@ TEST_F(OSTimeWinTest, whenGetGpuCpuTimeFailedThenReturnFalse) {
}
TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueAfterFlagSetThenCallToKmd) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
TimeStampData gpuCpuTime;
auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
@@ -329,6 +324,8 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueAfterFlagSetThenCallToKmd)
}
TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWhenForceFlagSetThenCallToKmd) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
LARGE_INTEGER frequency = {};
frequency.QuadPart = NSEC_PER_SEC;
@@ -352,7 +349,7 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWhenForceFlagSetThenCallToK
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
}
TEST_F(OSTimeWinTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysCallKmd) {
TEST_F(OSTimeWinTest, givenReusingTimestampsDisabledWhenGetTimestampRefreshTimeoutThenReturnCorrectValue) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(0);
auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
@@ -360,10 +357,5 @@ TEST_F(OSTimeWinTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysC
auto deviceTime = new MockDeviceTimeWin();
osTime->deviceTime.reset(deviceTime);
osTime->setDeviceTimerResolution(*hwInfo);
TimeStampData gpuCpuTime;
osTime->getGpuCpuTime(&gpuCpuTime);
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u);
osTime->getGpuCpuTime(&gpuCpuTime);
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
EXPECT_EQ(0ul, osTime->getTimestampRefreshTimeout());
}