Revert "performance: Reuse GPU timestamps by default on Windows"

This reverts commit bca3fecaa0.

Signed-off-by: Compute-Runtime-Validation <compute-runtime-validation@intel.com>
This commit is contained in:
Compute-Runtime-Validation 2024-05-25 07:39:33 +02:00 committed by Compute-Runtime-Automation
parent d7726c9d86
commit 0b2c9e92e7
11 changed files with 62 additions and 87 deletions

View File

@@ -395,8 +395,6 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
auto &gfxCoreHelper = device.getGfxCoreHelper();
auto resolution = device.getDeviceInfo().profilingTimerResolution;
// Calculate startTimestamp only if it was not already set on CPU
if (startTimeStamp.cpuTimeInNs == 0) {
startTimeStamp.gpuTimeStamp = globalStartTS;
addOverflowToTimestamp(startTimeStamp.gpuTimeStamp, submitTimeStamp.gpuTimeStamp);
if (startTimeStamp.gpuTimeStamp < submitTimeStamp.gpuTimeStamp) {
@@ -417,8 +415,6 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
startTimeStamp.gpuTimeStamp += static_cast<uint64_t>(1ULL << gfxCoreHelper.getGlobalTimeStampBits());
}
}
}
UNRECOVERABLE_IF(startTimeStamp.gpuTimeStamp < submitTimeStamp.gpuTimeStamp);
auto gpuTicksDiff = startTimeStamp.gpuTimeStamp - submitTimeStamp.gpuTimeStamp;
auto timeDiff = static_cast<uint64_t>(gpuTicksDiff * resolution);

View File

@@ -911,10 +911,16 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
}
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitWhenCalculateStartTimestampThenAdjustTimestmaps) {
DebugManagerStateRestore dbgRestore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
MockContext context{};
auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution;
HwTimeStamps timestamp{};
timestamp.globalStartTS = 3;
@@ -940,10 +946,16 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
}
TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitAndStartTSBelowOneWhenCalculateStartTimestampThenAdjustTimestmaps) {
DebugManagerStateRestore dbgRestore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
MockContext context{};
auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution;
HwTimeStamps timestamp{};
timestamp.globalStartTS = 2;

View File

@@ -350,7 +350,7 @@ DECLARE_DEBUG_VARIABLE(bool, DisableStatelessToStatefulOptimization, false, "Dis
DECLARE_DEBUG_VARIABLE(bool, DisableConcurrentBlockExecution, false, "disables concurrent block kernel execution")
DECLARE_DEBUG_VARIABLE(bool, UseNoRingFlushesKmdMode, true, "Windows only, passes flag to KMD that informs KMD to not emit any ring buffer flushes.")
DECLARE_DEBUG_VARIABLE(bool, DisableZeroCopyForUseHostPtr, false, "When active all buffer allocations created with CL_MEM_USE_HOST_PTR flag will not share memory with CPU.")
DECLARE_DEBUG_VARIABLE(int32_t, EnableReusingGpuTimestamps, -1, "Reuse GPU timestamp for next device time requests. -1: os-specific, 0: disable, 1: enable")
DECLARE_DEBUG_VARIABLE(bool, EnableReusingGpuTimestamps, false, "When enabled, GPU timestamp will be reused for next device time requests")
DECLARE_DEBUG_VARIABLE(int32_t, AllowZeroCopyWithoutCoherency, -1, "Use cacheline flush instead of memory copy for map/unmap mem object")
DECLARE_DEBUG_VARIABLE(int32_t, EnableHostPtrTracking, -1, "Enable host ptr tracking: -1 - default platform setting, 0 - disabled, 1 - enabled")
DECLARE_DEBUG_VARIABLE(int32_t, MaxHwThreadsPercent, 0, "If not zero then maximum number of used HW threads is capped to max * MaxHwThreadsPercent / 100")

View File

@@ -7,7 +7,6 @@
#include "shared/source/os_interface/linux/device_time_drm.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/os_interface/linux/drm_neo.h"
#include "shared/source/os_interface/linux/drm_wrappers.h"
@@ -51,12 +50,4 @@ uint64_t DeviceTimeDrm::getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) c
return static_cast<uint64_t>(nanosecondsPerSecond / OSTime::getDeviceTimerResolution(hwInfo));
}
// Linux/DRM policy for GPU-timestamp reuse: disabled by default.
// The EnableReusingGpuTimestamps debug flag (-1 = keep this os-specific
// default) can explicitly override the decision in either direction.
bool DeviceTimeDrm::isTimestampsRefreshEnabled() const {
bool timestampsRefreshEnabled = false;
if (debugManager.flags.EnableReusingGpuTimestamps.get() != -1) {
timestampsRefreshEnabled = debugManager.flags.EnableReusingGpuTimestamps.get();
}
return timestampsRefreshEnabled;
}
} // namespace NEO

View File

@@ -17,7 +17,6 @@ class DeviceTimeDrm : public DeviceTime {
bool getGpuCpuTimeImpl(TimeStampData *pGpuCpuTime, OSTime *osTime) override;
double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const override;
uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const override;
bool isTimestampsRefreshEnabled() const override;
protected:
Drm *pDrm = nullptr;

View File

@@ -19,6 +19,13 @@ double OSTime::getDeviceTimerResolution(HardwareInfo const &hwInfo) {
return hwInfo.capabilityTable.defaultProfilingTimerResolution;
};
// Caches the EnableReusingGpuTimestamps debug flag once at construction;
// when reuse is enabled, arms the refresh timeout so a cached GPU timestamp
// is only reused for up to 100ms before a fresh query is forced.
DeviceTime::DeviceTime() {
reusingTimestampsEnabled = debugManager.flags.EnableReusingGpuTimestamps.get();
if (reusingTimestampsEnabled) {
timestampRefreshTimeoutNS = NSEC_PER_MSEC * 100; // 100ms
}
}
bool DeviceTime::getGpuCpuTimeImpl(TimeStampData *pGpuCpuTime, OSTime *osTime) {
pGpuCpuTime->cpuTimeinNS = 0;
pGpuCpuTime->gpuTimeStamp = 0;
@@ -40,14 +47,6 @@ void DeviceTime::setDeviceTimerResolution(HardwareInfo const &hwInfo) {
}
}
// Base (os-agnostic) policy for GPU-timestamp reuse: enabled by default.
// The EnableReusingGpuTimestamps debug flag (-1 = keep this default) can
// explicitly override the decision; subclasses (e.g. DRM) override this
// method to choose a different os-specific default.
bool DeviceTime::isTimestampsRefreshEnabled() const {
bool timestampsRefreshEnabled = true;
if (debugManager.flags.EnableReusingGpuTimestamps.get() != -1) {
timestampsRefreshEnabled = debugManager.flags.EnableReusingGpuTimestamps.get();
}
return timestampsRefreshEnabled;
}
/**
* @brief If this method is called within interval, GPU timestamp
* will be calculated based on CPU timestamp and previous GPU ticks
@@ -64,7 +63,7 @@ bool DeviceTime::getGpuCpuTimestamps(TimeStampData *timeStamp, OSTime *osTime, b
if (forceKmdCall || cpuTimeDiffInNS >= timestampRefreshTimeoutNS) {
refreshTimestamps = true;
}
bool reusingTimestampsEnabled = isTimestampsRefreshEnabled();
if (!reusingTimestampsEnabled || refreshTimestamps) {
if (!getGpuCpuTimeImpl(timeStamp, osTime)) {
return false;

View File

@@ -25,12 +25,12 @@ class OSTime;
class DeviceTime {
public:
DeviceTime();
virtual ~DeviceTime() = default;
bool getGpuCpuTime(TimeStampData *pGpuCpuTime, OSTime *osTime, bool forceKmdCall);
virtual bool getGpuCpuTimeImpl(TimeStampData *pGpuCpuTime, OSTime *osTime);
virtual double getDynamicDeviceTimerResolution(HardwareInfo const &hwInfo) const;
virtual uint64_t getDynamicDeviceTimerClock(HardwareInfo const &hwInfo) const;
virtual bool isTimestampsRefreshEnabled() const;
bool getGpuCpuTimestamps(TimeStampData *timeStamp, OSTime *osTime, bool forceKmdCall);
void setDeviceTimerResolution(HardwareInfo const &hwInfo);
void setRefreshTimestampsFlag() {
@@ -47,8 +47,9 @@ class DeviceTime {
double deviceTimerResolution = 0;
const uint64_t timestampRefreshMinTimeoutNS = NSEC_PER_MSEC; // 1ms
const uint64_t timestampRefreshMaxTimeoutNS = NSEC_PER_SEC; // 1s
uint64_t timestampRefreshTimeoutNS = NSEC_PER_MSEC * 100; // 100ms
uint64_t timestampRefreshTimeoutNS = 0;
bool refreshTimestamps = true;
bool reusingTimestampsEnabled = false;
TimeStampData fetchedTimestamps{};
};

View File

@@ -602,7 +602,7 @@ ForceSynchronizedDispatchMode = -1
DirectSubmissionControllerAdjustOnThrottleAndAcLineStatus = -1
ReadOnlyAllocationsTypeMask = 0
EnableLogLevel = 6
EnableReusingGpuTimestamps = -1
EnableReusingGpuTimestamps = 0
ForceCopyOperationOffloadForComputeCmdList = -1
SecondaryContextEngineTypeMask = -1
# Please don't edit below this line

View File

@@ -86,7 +86,6 @@ TEST(RootDeviceEnvironment, givenExecutionEnvironmentWhenInitializeAubCenterIsCa
}
TEST(RootDeviceEnvironment, whenCreatingRootDeviceEnvironmentThenCreateOsAgnosticOsTime) {
DebugManagerStateRestore dbgRestore;
MockExecutionEnvironment executionEnvironment;
executionEnvironment.rootDeviceEnvironments[0]->setHwInfoAndInitHelpers(defaultHwInfo.get());
auto profilingTimerResolution = defaultHwInfo->capabilityTable.defaultProfilingTimerResolution;
@@ -110,14 +109,6 @@ TEST(RootDeviceEnvironment, whenCreatingRootDeviceEnvironmentThenCreateOsAgnosti
EXPECT_EQ(profilingTimerResolution, rootDeviceEnvironment->osTime->getDynamicDeviceTimerResolution(*defaultHwInfo));
EXPECT_EQ(static_cast<uint64_t>(1000000000.0 / OSTime::getDeviceTimerResolution(*defaultHwInfo)), rootDeviceEnvironment->osTime->getDynamicDeviceTimerClock(*defaultHwInfo));
struct MockOSTime : public OSTime {
using OSTime::deviceTime;
};
auto deviceTime = static_cast<MockOSTime *>(rootDeviceEnvironment->osTime.get())->deviceTime.get();
EXPECT_TRUE(deviceTime->isTimestampsRefreshEnabled());
debugManager.flags.EnableReusingGpuTimestamps.set(0);
EXPECT_FALSE(deviceTime->isTimestampsRefreshEnabled());
}
TEST(RootDeviceEnvironment, givenUseAubStreamFalseWhenGetAubManagerIsCalledThenReturnNull) {

View File

@@ -390,7 +390,7 @@ TEST_F(DrmTimeTest, whenGettingMaxGpuTimeStampValueWhenForceFlagSetThenCallToKmd
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
}
TEST_F(DrmTimeTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysCallKmd) {
TEST_F(DrmTimeTest, givenReusingTimestampsDisabledWhenGetTimestampRefreshTimeoutThenReturnCorrectValue) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(0);
// Recreate mock to apply debug flag
@@ -398,11 +398,5 @@ TEST_F(DrmTimeTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysCal
osTime = MockOSTimeLinux::create(*rootDeviceEnvironment.osInterface);
osTime->setResolutionFunc(resolutionFuncTrue);
osTime->setGetTimeFunc(getTimeFuncTrue);
auto deviceTime = osTime->getDeviceTime();
TimeStampData gpuCpuTime;
osTime->getGpuCpuTime(&gpuCpuTime);
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u);
osTime->getGpuCpuTime(&gpuCpuTime);
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
EXPECT_EQ(0ul, osTime->getTimestampRefreshTimeout());
}

View File

@@ -53,8 +53,6 @@ struct OSTimeWinTest : public ::testing::Test {
rootDeviceEnvironment.osInterface = std::make_unique<OSInterface>();
rootDeviceEnvironment.osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
osTime = std::unique_ptr<MockOSTimeWin>(new MockOSTimeWin(*rootDeviceEnvironment.osInterface));
auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
osTime->setDeviceTimerResolution(*hwInfo);
}
void TearDown() override {
@@ -64,11 +62,8 @@
};
TEST_F(OSTimeWinTest, given36BitGpuTimeStampWhenGpuTimeStampOverflowThenGpuTimeDoesNotDecrease) {
auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
auto deviceTime = new MockDeviceTimeWin();
osTime->deviceTime.reset(deviceTime);
osTime->setDeviceTimerResolution(*hwInfo);
TimeStampData gpuCpuTime = {0ull, 0ull};
@@ -100,11 +95,8 @@ TEST_F(OSTimeWinTest, given36BitGpuTimeStampWhenGpuTimeStampOverflowThenGpuTimeD
}
TEST_F(OSTimeWinTest, given64BitGpuTimeStampWhenGpuTimeStampOverflowThenOverflowsAreNotDetected) {
auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
auto deviceTime = new MockDeviceTimeWin();
osTime->deviceTime.reset(deviceTime);
osTime->setDeviceTimerResolution(*hwInfo);
TimeStampData gpuCpuTime = {0ull, 0ull};
@@ -191,12 +183,9 @@ TEST(OSTimeWinTests, givenOSInterfaceWhenGetGpuCpuTimeThenReturnsSuccess) {
auto wddm = new WddmMock(rootDeviceEnvironment);
TimeStampData gpuCpuTime01 = {};
TimeStampData gpuCpuTime02 = {};
rootDeviceEnvironment.osInterface = std::make_unique<OSInterface>();
rootDeviceEnvironment.osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
wddm->init();
auto osTime = OSTime::create(rootDeviceEnvironment.osInterface.get());
auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
osTime->setDeviceTimerResolution(*hwInfo);
std::unique_ptr<OSInterface> osInterface(new OSInterface());
osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
auto osTime = OSTime::create(osInterface.get());
auto success = osTime->getGpuCpuTime(&gpuCpuTime01);
EXPECT_TRUE(success);
EXPECT_NE(0u, gpuCpuTime01.cpuTimeinNS);
@@ -219,6 +208,8 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueThenHwInfoBasedValueIsRetur
}
TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWithinIntervalThenReuseFromPreviousCall) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
LARGE_INTEGER frequency = {};
frequency.QuadPart = NSEC_PER_SEC;
@@ -254,6 +245,8 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWithinIntervalThenReuseFrom
}
TEST_F(OSTimeWinTest, whenGettingGpuTimeStampValueAfterIntervalThenCallToKmdAndAdaptTimeout) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
LARGE_INTEGER frequency = {};
frequency.QuadPart = NSEC_PER_SEC;
@@ -311,6 +304,8 @@ TEST_F(OSTimeWinTest, whenGetGpuCpuTimeFailedThenReturnFalse) {
}
TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueAfterFlagSetThenCallToKmd) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
TimeStampData gpuCpuTime;
auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
@@ -329,6 +324,8 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueAfterFlagSetThenCallToKmd)
}
TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWhenForceFlagSetThenCallToKmd) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(true);
osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
LARGE_INTEGER frequency = {};
frequency.QuadPart = NSEC_PER_SEC;
@@ -352,7 +349,7 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWhenForceFlagSetThenCallToK
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
}
TEST_F(OSTimeWinTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysCallKmd) {
TEST_F(OSTimeWinTest, givenReusingTimestampsDisabledWhenGetTimestampRefreshTimeoutThenReturnCorrectValue) {
DebugManagerStateRestore restore;
debugManager.flags.EnableReusingGpuTimestamps.set(0);
auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
@@ -360,10 +357,5 @@ TEST_F(OSTimeWinTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysC
auto deviceTime = new MockDeviceTimeWin();
osTime->deviceTime.reset(deviceTime);
osTime->setDeviceTimerResolution(*hwInfo);
TimeStampData gpuCpuTime;
osTime->getGpuCpuTime(&gpuCpuTime);
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u);
osTime->getGpuCpuTime(&gpuCpuTime);
EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
EXPECT_EQ(0ul, osTime->getTimestampRefreshTimeout());
}