performance: Reuse GPU timestamps by default

Related-To: NEO-10615

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
Author: Szymon Morek
Date: 2024-05-15 15:22:40 +00:00
Committed by: Compute-Runtime-Automation
Parent: 9989829487
Commit: 7aceed58ca

6 changed files with 94 additions and 86 deletions

View File

@@ -395,6 +395,8 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
     auto &gfxCoreHelper = device.getGfxCoreHelper();
     auto resolution = device.getDeviceInfo().profilingTimerResolution;
+    // Calculate startTimestamp only if it was not already set on CPU
+    if (startTimeStamp.cpuTimeInNs == 0) {
     startTimeStamp.gpuTimeStamp = globalStartTS;
     addOverflowToTimestamp(startTimeStamp.gpuTimeStamp, submitTimeStamp.gpuTimeStamp);
     if (startTimeStamp.gpuTimeStamp < submitTimeStamp.gpuTimeStamp) {
@@ -415,6 +417,8 @@ void Event::calculateProfilingDataInternal(uint64_t contextStartTS, uint64_t con
             startTimeStamp.gpuTimeStamp += static_cast<uint64_t>(1ULL << gfxCoreHelper.getGlobalTimeStampBits());
         }
     }
+    }
+    UNRECOVERABLE_IF(startTimeStamp.gpuTimeStamp < submitTimeStamp.gpuTimeStamp);
     auto gpuTicksDiff = startTimeStamp.gpuTimeStamp - submitTimeStamp.gpuTimeStamp;
     auto timeDiff = static_cast<uint64_t>(gpuTicksDiff * resolution);
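The hunk above guards the start-timestamp recalculation behind `startTimeStamp.cpuTimeInNs == 0` and keeps the existing wrap-around correction for the GPU counter. As a minimal standalone sketch of that correction idea, assuming a free-running counter that is `timestampBits` wide (the function name and parameters are illustrative, not NEO's API):

```cpp
#include <cstdint>

// Illustrative sketch only: if the GPU start timestamp reads lower than the
// submit timestamp, the hardware counter wrapped between the two samples, so
// whole wrap spans are added back until ordering is restored. Assumes
// timestampBits < 64 (e.g. 36 or 32 on some hardware).
uint64_t adjustForWraparound(uint64_t startTs, uint64_t submitTs, uint32_t timestampBits) {
    const uint64_t wrapSpan = 1ULL << timestampBits;
    while (startTs < submitTs) {
        startTs += wrapSpan; // one full counter period elapsed
    }
    return startTs;
}
```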

View File

@@ -911,16 +911,10 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
 }
 TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitWhenCalculateStartTimestampThenAdjustTimestmaps) {
-    DebugManagerStateRestore dbgRestore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
-    MockContext context{};
-    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
     const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
-    MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
     MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
-    auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution;
+    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
     HwTimeStamps timestamp{};
     timestamp.globalStartTS = 3;
@@ -946,16 +940,10 @@ TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmalle
 }
 TEST_F(InternalsEventTest, givenDeviceTimestampBaseEnabledAndGlobalStartTSSmallerThanQueueTSWithinRecalculationLimitAndStartTSBelowOneWhenCalculateStartTimestampThenAdjustTimestmaps) {
-    DebugManagerStateRestore dbgRestore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
-    MockContext context{};
-    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get()));
     const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
-    MockCommandQueue cmdQ(&context, mockDevice.get(), props, false);
+    MockCommandQueue cmdQ(mockContext, pClDevice, props, false);
     MockEvent<Event> event(&cmdQ, CL_COMPLETE, 0, 0);
-    auto resolution = mockDevice->getDevice().getDeviceInfo().profilingTimerResolution;
+    auto resolution = pClDevice->getDevice().getDeviceInfo().profilingTimerResolution;
     HwTimeStamps timestamp{};
     timestamp.globalStartTS = 2;
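The updated tests drop their locally built `MockContext`/`MockClDevice` and reuse the fixture's shared `mockContext` and `pClDevice`, since the flag no longer needs to be forced per test. A hypothetical sketch of such a fixture, for orientation only (the real `InternalsEventTest` in the NEO tree may be wired differently):

```cpp
// Hypothetical fixture sketch; the member names follow the diff, everything
// else is an assumption about how such a fixture could be assembled.
class InternalsEventTestSketch : public ::testing::Test {
  protected:
    void SetUp() override {
        mockContext = new MockContext();
        pClDevice = mockContext->getDevice(0); // device owned by the context
    }
    void TearDown() override {
        mockContext->release();
    }
    MockContext *mockContext = nullptr;
    ClDevice *pClDevice = nullptr;
};
```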

View File

@@ -347,7 +347,7 @@ DECLARE_DEBUG_VARIABLE(bool, DisableStatelessToStatefulOptimization, false, "Dis
 DECLARE_DEBUG_VARIABLE(bool, DisableConcurrentBlockExecution, false, "disables concurrent block kernel execution")
 DECLARE_DEBUG_VARIABLE(bool, UseNoRingFlushesKmdMode, true, "Windows only, passes flag to KMD that informs KMD to not emit any ring buffer flushes.")
 DECLARE_DEBUG_VARIABLE(bool, DisableZeroCopyForUseHostPtr, false, "When active all buffer allocations created with CL_MEM_USE_HOST_PTR flag will not share memory with CPU.")
-DECLARE_DEBUG_VARIABLE(bool, EnableReusingGpuTimestamps, false, "When enabled, GPU timestamp will be reused for next device time requests")
+DECLARE_DEBUG_VARIABLE(bool, EnableReusingGpuTimestamps, true, "When enabled, GPU timestamp will be reused for next device time requests")
 DECLARE_DEBUG_VARIABLE(int32_t, AllowZeroCopyWithoutCoherency, -1, "Use cacheline flush instead of memory copy for map/unmap mem object")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableHostPtrTracking, -1, "Enable host ptr tracking: -1 - default platform setting, 0 - disabled, 1 - enabled")
 DECLARE_DEBUG_VARIABLE(int32_t, MaxHwThreadsPercent, 0, "If not zero then maximum number of used HW threads is capped to max * MaxHwThreadsPercent / 100")
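The only functional change here is the default flipping from `false` to `true`; the variable itself still works, so the old behavior stays reachable by overriding it. A generic sketch of how a compiled-in default plus a runtime override can be combined — this is an illustration of the pattern, not NEO's actual settings reader:

```cpp
#include <cstdlib>
#include <string>

// Illustration only: a boolean debug variable with a compiled-in default that
// an environment variable of the same name can override at runtime.
bool readBoolFlag(const char *name, bool compiledDefault) {
    if (const char *value = std::getenv(name)) {
        return std::string(value) != "0"; // "0" disables, anything else enables
    }
    return compiledDefault;
}

// With the new default, reuse stays enabled unless explicitly turned off:
// bool reuseTimestamps = readBoolFlag("EnableReusingGpuTimestamps", true);
```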

View File

@@ -601,5 +601,5 @@ ForceSynchronizedDispatchMode = -1
 DirectSubmissionControllerAdjustOnThrottleAndAcLineStatus = -1
 ReadOnlyAllocationsTypeMask = 0
 EnableLogLevel = 6
-EnableReusingGpuTimestamps = 0
+EnableReusingGpuTimestamps = 1
 # Please don't edit below this line

View File

@@ -126,27 +126,27 @@ TEST_F(DrmTimeTest, given36BitGpuTimeStampWhenGpuTimeStampOverflowThenGpuTimeDoe
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
     EXPECT_EQ(100ull, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 200ll;
+    actualTime = 200ll;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(200ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(actualTime, gpuCpuTime.gpuTimeStamp);
     osTime->maxGpuTimeStamp = 1ull << 36;
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 10ull; // read below initial value
-    EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
+    deviceTime->gpuCpuTimeValue = {10ull, 10ull}; // read from KMD below initial value
+    EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime, true));
     EXPECT_EQ(osTime->maxGpuTimeStamp + 10ull, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 30ull; // second read below initial value
+    actualTime = 30ull; // second read below initial value
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(osTime->maxGpuTimeStamp + 30ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(osTime->maxGpuTimeStamp + actualTime, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 110ull;
+    actualTime = 110ull;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(osTime->maxGpuTimeStamp + 110ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(osTime->maxGpuTimeStamp + actualTime, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 70ull; // second overflow
+    actualTime = 70ull; // second overflow
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(2ull * osTime->maxGpuTimeStamp + 70ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(2ull * osTime->maxGpuTimeStamp + actualTime, gpuCpuTime.gpuTimeStamp);
 }
 TEST_F(DrmTimeTest, given64BitGpuTimeStampWhenGpuTimeStampOverflowThenOverflowsAreNotDetected) {
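The 36-bit hunk above and the 64-bit hunk below exercise the same bookkeeping: remember the last raw counter value and add one wrap span whenever a fresh read comes back lower. A compact sketch of that logic, matching the arithmetic the assertions check (`GpuClockTracker` is an illustrative name, not a NEO type; `maxGpuTimeStamp == 0` models the 64-bit case where overflow is not detected):

```cpp
#include <cstdint>

// Sketch of the monotonic-timestamp bookkeeping these tests exercise.
struct GpuClockTracker {
    uint64_t maxGpuTimeStamp = 0; // wrap span, e.g. 1ull << 36; 0 => 64-bit counter
    uint64_t lastRaw = 0;
    uint64_t overflowOffset = 0;

    uint64_t toMonotonic(uint64_t raw) {
        if (maxGpuTimeStamp != 0 && raw < lastRaw) {
            overflowOffset += maxGpuTimeStamp; // counter wrapped since last read
        }
        lastRaw = raw;
        return overflowOffset + raw; // yields 100, 200, 2^36+10, 2^36+30, ... as asserted
    }
};
```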
@@ -157,27 +157,27 @@ TEST_F(DrmTimeTest, given64BitGpuTimeStampWhenGpuTimeStampOverflowThenOverflowsA
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
     EXPECT_EQ(100ull, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 200ull;
+    actualTime = 200ull;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(200ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(actualTime, gpuCpuTime.gpuTimeStamp);
     osTime->maxGpuTimeStamp = 0ull;
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 10ull; // read below initial value
-    EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
+    deviceTime->gpuCpuTimeValue = {10ull, 10ull}; // read from KMD below initial value
+    EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime, true));
     EXPECT_EQ(10ull, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 30ull;
+    actualTime = 30ull;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(30ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(actualTime, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 110ull;
+    actualTime = 110ull;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(110ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(actualTime, gpuCpuTime.gpuTimeStamp);
-    deviceTime->gpuCpuTimeValue.gpuTimeStamp = 70ull;
+    actualTime = 70ull;
     EXPECT_TRUE(osTime->getGpuCpuTime(&gpuCpuTime));
-    EXPECT_EQ(70ull, gpuCpuTime.gpuTimeStamp);
+    EXPECT_EQ(actualTime, gpuCpuTime.gpuTimeStamp);
 }
 TEST_F(DrmTimeTest, GivenInvalidDrmWhenGettingGpuCpuTimeThenFails) {
@@ -287,17 +287,8 @@ TEST_F(DrmTimeTest, whenGettingMaxGpuTimeStampValueThenHwInfoBasedValueIsReturne
 }
 TEST_F(DrmTimeTest, whenGettingGpuTimeStampValueWithinIntervalThenReuseFromPreviousCall) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
-    // Recreate mock to apply debug flag
-    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
-    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
-    osTime = MockOSTimeLinux::create(*rootDeviceEnvironment.osInterface);
-    osTime->setResolutionFunc(resolutionFuncTrue);
-    osTime->setGetTimeFunc(getTimeFuncTrue);
-    osTime->setDeviceTimerResolution(*hwInfo);
     auto deviceTime = osTime->getDeviceTime();
     EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 0u);
     TimeStampData gpuCpuTime;
@@ -320,17 +311,6 @@ TEST_F(DrmTimeTest, whenGettingGpuTimeStampValueWithinIntervalThenReuseFromPrevi
 }
 TEST_F(DrmTimeTest, whenGettingGpuTimeStampValueAfterIntervalThenCallToKmdAndAdaptTimeout) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
-    // Recreate mock to apply debug flag
-    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
-    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
-    osTime = MockOSTimeLinux::create(*rootDeviceEnvironment.osInterface);
-    osTime->setResolutionFunc(resolutionFuncTrue);
-    osTime->setGetTimeFunc(getTimeFuncTrue);
-    osTime->setDeviceTimerResolution(*hwInfo);
     auto deviceTime = osTime->getDeviceTime();
     deviceTime->callBaseGetGpuCpuTimeImpl = false;
     EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 0u);
@@ -400,3 +380,20 @@ TEST_F(DrmTimeTest, givenReusingTimestampsDisabledWhenGetTimestampRefreshTimeout
     osTime->setGetTimeFunc(getTimeFuncTrue);
     EXPECT_EQ(0ul, osTime->getTimestampRefreshTimeout());
 }
+
+TEST_F(DrmTimeTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysCallKmd) {
+    DebugManagerStateRestore restore;
+    debugManager.flags.EnableReusingGpuTimestamps.set(0);
+    // Recreate mock to apply debug flag
+    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
+    osTime = MockOSTimeLinux::create(*rootDeviceEnvironment.osInterface);
+    osTime->setResolutionFunc(resolutionFuncTrue);
+    osTime->setGetTimeFunc(getTimeFuncTrue);
+    auto deviceTime = osTime->getDeviceTime();
+    TimeStampData gpuCpuTime;
+    osTime->getGpuCpuTime(&gpuCpuTime);
+    EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u);
+    osTime->getGpuCpuTime(&gpuCpuTime);
+    EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
+}
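Taken together, the refresh-timeout tests describe the core of the feature: within the timeout the runtime extrapolates from the cached CPU/GPU pair instead of calling the KMD, after the timeout the pair is refreshed (and the timeout adapted), and with the flag disabled the timeout is 0 so every call reaches the KMD. A rough sketch under those assumptions (`TimestampReuser`, `queryKmd`, and `cpuNow` are illustrative names, not the NEO `DeviceTime` interface):

```cpp
#include <cstdint>

struct TimestampPair {
    uint64_t gpuTimeStamp;
    uint64_t cpuTimeInNs;
};

// Illustrative reuse policy: extrapolate inside the refresh window, otherwise
// fall back to the expensive KMD query and cache the fresh pair.
struct TimestampReuser {
    TimestampPair cached{};
    uint64_t refreshTimeoutNs = 0; // 0 => reuse disabled, always query KMD
    double nsPerGpuTick = 1.0;     // device timer resolution

    template <typename QueryKmdFn, typename CpuNowFn>
    TimestampPair get(QueryKmdFn queryKmd, CpuNowFn cpuNow, bool forceKmdCall) {
        const uint64_t nowNs = cpuNow();
        const uint64_t ageNs = nowNs - cached.cpuTimeInNs;
        if (!forceKmdCall && refreshTimeoutNs != 0 && ageNs < refreshTimeoutNs) {
            // Within the interval: derive the GPU time from the cached sample.
            const auto ticks = static_cast<uint64_t>(ageNs / nsPerGpuTick);
            return {cached.gpuTimeStamp + ticks, nowNs};
        }
        cached = queryKmd(); // costly round trip, at most once per window
        return cached;
    }
};
```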

View File

@@ -53,6 +53,8 @@ struct OSTimeWinTest : public ::testing::Test {
         rootDeviceEnvironment.osInterface = std::make_unique<OSInterface>();
         rootDeviceEnvironment.osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
         osTime = std::unique_ptr<MockOSTimeWin>(new MockOSTimeWin(*rootDeviceEnvironment.osInterface));
+        auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
+        osTime->setDeviceTimerResolution(*hwInfo);
     }
     void TearDown() override {
@@ -62,8 +64,11 @@ struct OSTimeWinTest : public ::testing::Test {
 };
 TEST_F(OSTimeWinTest, given36BitGpuTimeStampWhenGpuTimeStampOverflowThenGpuTimeDoesNotDecrease) {
+    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
+    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
     auto deviceTime = new MockDeviceTimeWin();
     osTime->deviceTime.reset(deviceTime);
+    osTime->setDeviceTimerResolution(*hwInfo);
     TimeStampData gpuCpuTime = {0ull, 0ull};
@@ -95,8 +100,11 @@ TEST_F(OSTimeWinTest, given36BitGpuTimeStampWhenGpuTimeStampOverflowThenGpuTimeD
 }
 TEST_F(OSTimeWinTest, given64BitGpuTimeStampWhenGpuTimeStampOverflowThenOverflowsAreNotDetected) {
+    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
+    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
     auto deviceTime = new MockDeviceTimeWin();
     osTime->deviceTime.reset(deviceTime);
+    osTime->setDeviceTimerResolution(*hwInfo);
     TimeStampData gpuCpuTime = {0ull, 0ull};
@@ -183,9 +191,12 @@ TEST(OSTimeWinTests, givenOSInterfaceWhenGetGpuCpuTimeThenReturnsSuccess) {
     auto wddm = new WddmMock(rootDeviceEnvironment);
     TimeStampData gpuCpuTime01 = {};
     TimeStampData gpuCpuTime02 = {};
-    std::unique_ptr<OSInterface> osInterface(new OSInterface());
-    osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
-    auto osTime = OSTime::create(osInterface.get());
+    rootDeviceEnvironment.osInterface = std::make_unique<OSInterface>();
+    rootDeviceEnvironment.osInterface->setDriverModel(std::unique_ptr<DriverModel>(wddm));
+    wddm->init();
+    auto osTime = OSTime::create(rootDeviceEnvironment.osInterface.get());
+    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
+    osTime->setDeviceTimerResolution(*hwInfo);
     auto success = osTime->getGpuCpuTime(&gpuCpuTime01);
     EXPECT_TRUE(success);
     EXPECT_NE(0u, gpuCpuTime01.cpuTimeinNS);
@@ -208,8 +219,6 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueThenHwInfoBasedValueIsRetur
 }
 TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWithinIntervalThenReuseFromPreviousCall) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
     osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
     LARGE_INTEGER frequency = {};
     frequency.QuadPart = NSEC_PER_SEC;
@@ -245,8 +254,6 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWithinIntervalThenReuseFrom
 }
 TEST_F(OSTimeWinTest, whenGettingGpuTimeStampValueAfterIntervalThenCallToKmdAndAdaptTimeout) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
     osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
     LARGE_INTEGER frequency = {};
     frequency.QuadPart = NSEC_PER_SEC;
@@ -304,8 +311,6 @@ TEST_F(OSTimeWinTest, whenGetGpuCpuTimeFailedThenReturnFalse) {
 }
 TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueAfterFlagSetThenCallToKmd) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
     TimeStampData gpuCpuTime;
     auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
     auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
@@ -324,8 +329,6 @@ TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueAfterFlagSetThenCallToKmd)
 }
 TEST_F(OSTimeWinTest, whenGettingMaxGpuTimeStampValueWhenForceFlagSetThenCallToKmd) {
-    DebugManagerStateRestore restore;
-    debugManager.flags.EnableReusingGpuTimestamps.set(true);
     osTime->overrideQueryPerformanceCounterFunction(queryPerformanceCounterMock);
     LARGE_INTEGER frequency = {};
     frequency.QuadPart = NSEC_PER_SEC;
@@ -359,3 +362,19 @@ TEST_F(OSTimeWinTest, givenReusingTimestampsDisabledWhenGetTimestampRefreshTimeo
     osTime->setDeviceTimerResolution(*hwInfo);
     EXPECT_EQ(0ul, osTime->getTimestampRefreshTimeout());
 }
+
+TEST_F(OSTimeWinTest, givenReusingTimestampsDisabledWhenGetGpuCpuTimeThenAlwaysCallKmd) {
+    DebugManagerStateRestore restore;
+    debugManager.flags.EnableReusingGpuTimestamps.set(0);
+    auto &rootDeviceEnvironment = *executionEnvironment.rootDeviceEnvironments[0];
+    auto hwInfo = rootDeviceEnvironment.getHardwareInfo();
+    auto deviceTime = new MockDeviceTimeWin();
+    osTime->deviceTime.reset(deviceTime);
+    osTime->setDeviceTimerResolution(*hwInfo);
+    TimeStampData gpuCpuTime;
+    osTime->getGpuCpuTime(&gpuCpuTime);
+    EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 1u);
+    osTime->getGpuCpuTime(&gpuCpuTime);
+    EXPECT_EQ(deviceTime->getGpuCpuTimeImplCalled, 2u);
+}
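For completeness, here is how the disabled-flag expectation above maps onto the `TimestampReuser` sketch from the Linux section: with a zero refresh timeout both calls reach the KMD, mirroring `getGpuCpuTimeImplCalled` going from 1 to 2 (all names illustrative):

```cpp
// Usage example for the TimestampReuser sketch above (illustrative names).
void demoReuseDisabled() {
    int kmdCalls = 0;
    TimestampReuser reuser; // refreshTimeoutNs == 0 => reuse disabled
    auto queryKmd = [&] { ++kmdCalls; return TimestampPair{100ull, 5000ull}; };
    auto cpuNow = [] { return 5000ull; };

    reuser.get(queryKmd, cpuNow, false); // KMD call #1
    reuser.get(queryKmd, cpuNow, false); // KMD call #2, no reuse when disabled
    // kmdCalls == 2, matching the EXPECT_EQ(..., 2u) pattern in the tests
}
```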