From 9ca2091725bc827edb98a9b070d7db44e4c2547c Mon Sep 17 00:00:00 2001 From: "Morek, Szymon" Date: Tue, 16 Apr 2024 08:50:16 +0000 Subject: [PATCH] performance: Reuse GPU timestamp instead of KMD escape Resolves: NEO-10615 Signed-off-by: Morek, Szymon --- .../unit_test/device/device_timers_tests.cpp | 6 +- .../debug_settings/debug_variables_base.inl | 1 + .../root_device_environment.cpp | 1 + shared/source/os_interface/os_time.cpp | 54 +++++++++++++++++- shared/source/os_interface/os_time.h | 14 ++++- .../common/mocks/linux/mock_os_time_linux.h | 15 ++++- shared/test/common/mocks/mock_device.cpp | 4 +- shared/test/common/test_files/igdrcl.config | 1 + .../os_interface/linux/os_time_test.cpp | 52 ++++++++++++++++- .../windows/os_time_win_tests.cpp | 56 ++++++++++++++++++- 10 files changed, 190 insertions(+), 14 deletions(-) diff --git a/opencl/test/unit_test/device/device_timers_tests.cpp b/opencl/test/unit_test/device/device_timers_tests.cpp index e8f03597a2..04d874c9fb 100644 --- a/opencl/test/unit_test/device/device_timers_tests.cpp +++ b/opencl/test/unit_test/device/device_timers_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -25,7 +25,9 @@ TEST(MockOSTime, WhenSleepingThenDeviceAndHostTimerAreIncreased) { cl_ulong hostTimestamp[2] = {0, 0}; auto mDev = MockDevice::createWithNewExecutionEnvironment(nullptr); - mDev->setOSTime(new MockOSTime()); + auto osTime = new MockOSTime(); + osTime->setDeviceTimerResolution(mDev->getHardwareInfo()); + mDev->setOSTime(osTime); mDev->getDeviceAndHostTimer( &deviceTimestamp[0], diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 6b58ad4139..9ee654d4a5 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -513,6 +513,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, 
EnableBcsSwControlWa, -1, "Enable BCS WA via BCS DECLARE_DEBUG_VARIABLE(bool, EnableHostAllocationMemPolicy, false, "Enables Memory Policy for host allocation") DECLARE_DEBUG_VARIABLE(int32_t, OverrideHostAllocationMemPolicyMode, -1, "Override Memory Policy mode for host allocation -1: default (use the system configuration), 0: MPOL_DEFAULT, 1: MPOL_PREFERRED, 2: MPOL_BIND, 3: MPOL_INTERLEAVED, 4: MPOL_LOCAL, 5: MPOL_PREFERRED_MANY") DECLARE_DEBUG_VARIABLE(int32_t, EnableFtrTile64Optimization, 0, "Control feature Tile64 Optimization flag passed to gmmlib. -1: pass as-is, 0: disable flag(default due to NEO-10623), 1: enable flag"); +DECLARE_DEBUG_VARIABLE(int32_t, GpuTimestampRefreshTimeout, -1, "Set timeout to refresh cached GPU timestamp, -1: default 100 ms, >=0: timeout in ms") /* IMPLICIT SCALING */ DECLARE_DEBUG_VARIABLE(int32_t, EnableWalkerPartition, -1, "-1: default, 0: disable, 1: enable, Enables Walker Partitioning via WPARID.") diff --git a/shared/source/execution_environment/root_device_environment.cpp b/shared/source/execution_environment/root_device_environment.cpp index a43689f729..b29408e4c1 100644 --- a/shared/source/execution_environment/root_device_environment.cpp +++ b/shared/source/execution_environment/root_device_environment.cpp @@ -124,6 +124,7 @@ void RootDeviceEnvironment::initGmm() { void RootDeviceEnvironment::initOsTime() { if (!osTime) { osTime = OSTime::create(osInterface.get()); + osTime->setDeviceTimerResolution(*hwInfo); } } diff --git a/shared/source/os_interface/os_time.cpp b/shared/source/os_interface/os_time.cpp index cbf07cd9d1..c5d6d15812 100644 --- a/shared/source/os_interface/os_time.cpp +++ b/shared/source/os_interface/os_time.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -7,6 +7,8 @@ #include "shared/source/os_interface/os_time.h" +#include "shared/source/debug_settings/debug_settings_manager.h" +#include
"shared/source/helpers/debug_helpers.h" #include "shared/source/helpers/hw_info.h" #include @@ -17,6 +19,12 @@ double OSTime::getDeviceTimerResolution(HardwareInfo const &hwInfo) { return hwInfo.capabilityTable.defaultProfilingTimerResolution; }; +DeviceTime::DeviceTime() { + if (debugManager.flags.GpuTimestampRefreshTimeout.get() != -1) { + timestampRefreshTimeoutMS = debugManager.flags.GpuTimestampRefreshTimeout.get(); + } +} + bool DeviceTime::getGpuCpuTimeImpl(TimeStampData *pGpuCpuTime, OSTime *osTime) { pGpuCpuTime->cpuTimeinNS = 0; pGpuCpuTime->gpuTimeStamp = 0; @@ -31,8 +39,50 @@ uint64_t DeviceTime::getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) cons return static_cast(1000000000.0 / OSTime::getDeviceTimerResolution(hwInfo)); } +void DeviceTime::setDeviceTimerResolution(HardwareInfo const &hwInfo) { + deviceTimerResolution = getDynamicDeviceTimerResolution(hwInfo); + if (debugManager.flags.OverrideProfilingTimerResolution.get() != -1) { + deviceTimerResolution = static_cast(debugManager.flags.OverrideProfilingTimerResolution.get()); + } +} + +/** + * @brief If this method is called within 100ms interval, GPU timestamp + * will be calculated based on CPU timestamp and previous GPU ticks + * to reduce amount of internal KMD calls. + * + * @return returns false if internal call to KMD failed. True otherwise. 
+ */ +bool DeviceTime::getGpuCpuTimestamps(TimeStampData *timeStamp, OSTime *osTime) { + bool refreshTimestamps = false; + + uint64_t cpuTimeinNS; + osTime->getCpuTime(&cpuTimeinNS); + auto cpuTimeDiffInNS = cpuTimeinNS - fetchedTimestamps.cpuTimeinNS; + if (cpuTimeDiffInNS >= (NSEC_PER_MSEC * timestampRefreshTimeoutMS)) { + refreshTimestamps = true; + } + + // Refresh on first call + if (!initialGpuTimeStamp) { + refreshTimestamps = true; + } + + if (refreshTimestamps) { + if (!getGpuCpuTimeImpl(timeStamp, osTime)) { + return false; + } + fetchedTimestamps = *timeStamp; + } else { + timeStamp->cpuTimeinNS = cpuTimeinNS; + UNRECOVERABLE_IF(deviceTimerResolution == 0); + timeStamp->gpuTimeStamp = fetchedTimestamps.gpuTimeStamp + static_cast(cpuTimeDiffInNS / deviceTimerResolution); + } + return true; +} + bool DeviceTime::getGpuCpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime) { - if (!getGpuCpuTimeImpl(pGpuCpuTime, osTime)) { + if (!getGpuCpuTimestamps(pGpuCpuTime, osTime)) { return false; } diff --git a/shared/source/os_interface/os_time.h b/shared/source/os_interface/os_time.h index e021a829bb..08ae37ef6f 100644 --- a/shared/source/os_interface/os_time.h +++ b/shared/source/os_interface/os_time.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -10,7 +10,7 @@ #include #define NSEC_PER_SEC (1000000000ULL) - +#define NSEC_PER_MSEC (NSEC_PER_SEC / 1000) namespace NEO { class OSInterface; @@ -25,15 +25,21 @@ class OSTime; class DeviceTime { public: + DeviceTime(); virtual ~DeviceTime() = default; bool getGpuCpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime); virtual bool getGpuCpuTimeImpl(TimeStampData *pGpuCpuTime, OSTime *osTime); virtual double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const; virtual uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const; + bool getGpuCpuTimestamps(TimeStampData *timeStamp, OSTime *osTime); + void 
setDeviceTimerResolution(HardwareInfo const &hwInfo); std::optional initialGpuTimeStamp{}; bool waitingForGpuTimeStampOverflow = false; uint64_t gpuTimeStampOverflowCounter = 0; + double deviceTimerResolution = 0; + uint32_t timestampRefreshTimeoutMS = 100u; + TimeStampData fetchedTimestamps{}; }; class OSTime { @@ -61,6 +67,10 @@ class OSTime { uint64_t getMaxGpuTimeStamp() const { return maxGpuTimeStamp; } + void setDeviceTimerResolution(HardwareInfo const &hwInfo) const { + deviceTime->setDeviceTimerResolution(hwInfo); + } + protected: OSTime() = default; OSInterface *osInterface = nullptr; diff --git a/shared/test/common/mocks/linux/mock_os_time_linux.h b/shared/test/common/mocks/linux/mock_os_time_linux.h index 9a99ed7c57..a73f51e3b0 100644 --- a/shared/test/common/mocks/linux/mock_os_time_linux.h +++ b/shared/test/common/mocks/linux/mock_os_time_linux.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -18,15 +18,28 @@ class MockDeviceTimeDrm : public DeviceTimeDrm { using DeviceTimeDrm::pDrm; bool getGpuCpuTimeImpl(TimeStampData *pGpuCpuTime, OSTime *osTime) override { + getGpuCpuTimeImplCalled++; if (callBaseGetGpuCpuTimeImpl) { return DeviceTimeDrm::getGpuCpuTimeImpl(pGpuCpuTime, osTime); } *pGpuCpuTime = gpuCpuTimeValue; return getGpuCpuTimeImplResult; } + + double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override { + if (callGetDynamicDeviceTimerResolution) { + return DeviceTimeDrm::getDynamicDeviceTimerResolution(hwInfo); + } + return dynamicDeviceTimerResolutionValue; + } + bool callBaseGetGpuCpuTimeImpl = true; bool getGpuCpuTimeImplResult = true; TimeStampData gpuCpuTimeValue{}; + uint32_t getGpuCpuTimeImplCalled = 0; + + bool callGetDynamicDeviceTimerResolution = false; + double dynamicDeviceTimerResolutionValue = 1.0; }; class MockOSTimeLinux : public OSTimeLinux { diff --git a/shared/test/common/mocks/mock_device.cpp 
b/shared/test/common/mocks/mock_device.cpp index ebf6d0f77c..be85053e9a 100644 --- a/shared/test/common/mocks/mock_device.cpp +++ b/shared/test/common/mocks/mock_device.cpp @@ -50,11 +50,11 @@ const char *MockDevice::getProductAbbrev() const { MockDevice::MockDevice(ExecutionEnvironment *executionEnvironment, uint32_t rootDeviceIndex) : RootDevice(executionEnvironment, rootDeviceIndex) { UltDeviceFactory::initializeMemoryManager(*executionEnvironment); - + auto &hwInfo = getHardwareInfo(); if (!getOSTime()) { getRootDeviceEnvironmentRef().osTime = MockOSTime::create(); + getRootDeviceEnvironmentRef().osTime->setDeviceTimerResolution(hwInfo); } - auto &hwInfo = getHardwareInfo(); executionEnvironment->rootDeviceEnvironments[rootDeviceIndex]->setHwInfoAndInitHelpers(&hwInfo); executionEnvironment->rootDeviceEnvironments[rootDeviceIndex]->initGmm(); if (!executionEnvironment->rootDeviceEnvironments[rootDeviceIndex]->memoryOperationsInterface) { diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index ee1d736758..a5e0f94fff 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -599,4 +599,5 @@ ForceTlbFlushWithTaskCountAfterCopy = -1 ForceSynchronizedDispatchMode = -1 DirectSubmissionControllerAdjustOnThrottleAndAcLineStatus = -1 ReadOnlyAllocationsTypeMask = 0 +GpuTimestampRefreshTimeout = -1 # Please don't edit below this line diff --git a/shared/test/unit_test/os_interface/linux/os_time_test.cpp b/shared/test/unit_test/os_interface/linux/os_time_test.cpp index 6e0cfe35b4..22b400b91d 100644 --- a/shared/test/unit_test/os_interface/linux/os_time_test.cpp +++ b/shared/test/unit_test/os_interface/linux/os_time_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -9,6 +9,7 @@ #include "shared/source/os_interface/linux/ioctl_helper.h" #include 
"shared/source/os_interface/linux/os_time_linux.h" #include "shared/source/os_interface/os_interface.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/mocks/linux/mock_os_time_linux.h" #include "shared/test/common/mocks/mock_execution_environment.h" #include "shared/test/common/os_interface/linux/device_command_stream_fixture.h" @@ -48,6 +49,8 @@ struct DrmTimeTest : public ::testing::Test { osTime = MockOSTimeLinux::create(*rootDeviceEnvironment.osInterface); osTime->setResolutionFunc(resolutionFuncTrue); osTime->setGetTimeFunc(getTimeFuncTrue); + auto hwInfo = rootDeviceEnvironment.getMutableHardwareInfo(); + osTime->setDeviceTimerResolution(*hwInfo); deviceTime = osTime->getDeviceTime(); } @@ -202,7 +205,7 @@ TEST_F(DrmTimeTest, givenGpuTimestampResolutionQueryWhenIoctlFailsThenDefaultRes drm->getParamRetValue = 0; drm->ioctlRes = -1; - + deviceTime->callGetDynamicDeviceTimerResolution = true; auto result = osTime->getDynamicDeviceTimerResolution(*defaultHwInfo); EXPECT_DOUBLE_EQ(result, defaultResolution); } @@ -239,7 +242,7 @@ TEST_F(DrmTimeTest, givenGpuTimestampResolutionQueryWhenIoctlSuccedsThenCorrectR // 19200000 is frequency yelding 52.083ns resolution drm->getParamRetValue = 19200000; drm->ioctlRes = 0; - + deviceTime->callGetDynamicDeviceTimerResolution = true; auto result = osTime->getDynamicDeviceTimerResolution(*defaultHwInfo); EXPECT_DOUBLE_EQ(result, 52.08333333333333); } @@ -282,3 +285,46 @@ TEST_F(DrmTimeTest, whenGettingMaxGpuTimeStampValueThenHwInfoBasedValueIsReturne EXPECT_EQ(0ull, osTime->getMaxGpuTimeStamp()); } } + +TEST_F(DrmTimeTest, whenGettingMaxGpuTimeStampValueWithinIntervalThenReuseFromPreviousCall) { + EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 0u); + TimeStampData gpuCpuTime; + osTime->getGpuCpuTime(&gpuCpuTime); + EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u); + + auto gpuTimestampBefore = gpuCpuTime.gpuTimeStamp; + auto cpuTimeBefore = actualTime; + + 
osTime->getGpuCpuTime(&gpuCpuTime); + EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u); + + auto gpuTimestampAfter = gpuCpuTime.gpuTimeStamp; + auto cpuTimeAfter = actualTime; + + auto cpuTimeDiff = cpuTimeAfter - cpuTimeBefore; + auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0]; + auto hwInfo = rootDeviceEnvironment.getHardwareInfo(); + auto deviceTimerResolution = deviceTime->getDynamicDeviceTimerResolution(*hwInfo); + auto gpuTimestampDiff = static_cast(cpuTimeDiff / deviceTimerResolution); + EXPECT_EQ(gpuTimestampAfter, gpuTimestampBefore + gpuTimestampDiff); +} + +TEST_F(DrmTimeTest, whenGettingMaxGpuTimeStampValueAfterIntervalThenCallToKmd) { + DebugManagerStateRestore restore; + debugManager.flags.GpuTimestampRefreshTimeout.set(0); + + // Recreate mock to apply debug flag + auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0]; + osTime = MockOSTimeLinux::create(*rootDeviceEnvironment.osInterface); + osTime->setResolutionFunc(resolutionFuncTrue); + osTime->setGetTimeFunc(getTimeFuncTrue); + auto deviceTime = osTime->getDeviceTime(); + EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 0u); + + TimeStampData gpuCpuTime; + osTime->getGpuCpuTime(&gpuCpuTime); + EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u); + + osTime->getGpuCpuTime(&gpuCpuTime); + EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u); +} diff --git a/shared/test/unit_test/os_interface/windows/os_time_win_tests.cpp b/shared/test/unit_test/os_interface/windows/os_time_win_tests.cpp index 47db94dbe7..36f9124af7 100644 --- a/shared/test/unit_test/os_interface/windows/os_time_win_tests.cpp +++ b/shared/test/unit_test/os_interface/windows/os_time_win_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,10 +29,19 @@ BOOL WINAPI queryPerformanceCounterMock( class MockDeviceTimeWin : public MockDeviceTime { public: bool 
getGpuCpuTimeImpl(TimeStampData *pGpuCpuTime, OSTime *osTime) override { + getGpuCpuTimeImplCalled++; *pGpuCpuTime = gpuCpuTimeValue; - return true; + return getGpuCpuTimeImplResult; } + + double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override { + return deviceTimerResolution; + } + + bool getGpuCpuTimeImplResult = true; TimeStampData gpuCpuTimeValue{}; + uint32_t getGpuCpuTimeImplCalled = 0; + double deviceTimerResolution = 1; }; struct OSTimeWinTest : public ::testing::Test { @@ -196,3 +205,46 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueThenHwInfoBasedValueIsRetur EXPECT_EQ(0ull, osTime->getMaxGpuTimeStamp()); } } + +TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWithinIntervalThenReuseFromPreviousCall) { + osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock); + LARGE_INTEGER frequency = {}; + frequency.QuadPart = NSEC_PER_SEC; + osTime->setFrequency(frequency); + + auto deviceTime = new MockDeviceTimeWin(); + osTime->deviceTime.reset(deviceTime); + auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0]; + auto hwInfo = rootDeviceEnvironment.getHardwareInfo(); + osTime->setDeviceTimerResolution(*hwInfo); + + EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 0u); + TimeStampData gpuCpuTime; + deviceTime->gpuCpuTimeValue = {1u, 1u}; + valueToSet.QuadPart = 1; + osTime->getGpuCpuTime(&gpuCpuTime); + EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u); + + auto gpuTimestampBefore = gpuCpuTime.gpuTimeStamp; + auto cpuTimeBefore = gpuCpuTime.cpuTimeinNS; + valueToSet.QuadPart = 5; + osTime->getGpuCpuTime(&gpuCpuTime); + EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u); + + auto gpuTimestampAfter = gpuCpuTime.gpuTimeStamp; + auto cpuTimeAfter = gpuCpuTime.cpuTimeinNS; + + auto cpuTimeDiff = cpuTimeAfter - cpuTimeBefore; + + auto deviceTimerResolution = deviceTime->getDynamicDeviceTimerResolution(*hwInfo); + auto gpuTimestampDiff = static_cast(cpuTimeDiff / deviceTimerResolution); + 
EXPECT_EQ(gpuTimestampAfter, gpuTimestampBefore + gpuTimestampDiff); +} + +TEST_F(OSTimeWinTest, whenGetGpuCpuTimeFailedThenReturnFalse) { + TimeStampData gpuCpuTime; + auto deviceTime = new MockDeviceTimeWin(); + osTime->deviceTime.reset(deviceTime); + deviceTime->getGpuCpuTimeImplResult = false; + EXPECT_FALSE(osTime->getGpuCpuTime(&gpuCpuTime)); +}