fix: use kernel timestamp properties instead of global timestamp

Related-To: NEO-11555

Signed-off-by: Joshua Santosh Ranjan <joshua.santosh.ranjan@intel.com>
This commit is contained in:
Joshua Santosh Ranjan 2024-06-12 16:04:39 +00:00 committed by Compute-Runtime-Automation
parent ad374fbd8f
commit deefea51ee
3 changed files with 56 additions and 15 deletions

View File

@ -10,6 +10,7 @@
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/sub_device.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/memory_operations_handler.h"
@ -738,12 +739,21 @@ template <typename TagSizeT>
void EventImp<TagSizeT>::getSynchronizedKernelTimestamps(ze_synchronized_timestamp_result_ext_t *pSynchronizedTimestampsBuffer,
const uint32_t count, const ze_kernel_timestamp_result_t *pKernelTimestampsBuffer) {
auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
auto &hwInfo = device->getNEODevice()->getHardwareInfo();
const auto resolution = device->getNEODevice()->getDeviceInfo().profilingTimerResolution;
auto deviceTsInNs = gfxCoreHelper.getGpuTimeStampInNS(referenceTs.gpuTimeStamp, resolution);
const auto maxKernelTsValue = maxNBitValue(hwInfo.capabilityTable.kernelTimestampValidBits);
const auto numBitsForResolution = Math::log2(static_cast<uint64_t>(resolution)) + 1u;
const auto clampedBitsCount = std::min(hwInfo.capabilityTable.kernelTimestampValidBits, 64u - numBitsForResolution);
const auto maxClampedTsValue = maxNBitValue(clampedBitsCount);
auto convertDeviceTsToNanoseconds = [&resolution, &maxClampedTsValue](uint64_t deviceTs) {
// Use clamped maximum to avoid overflows
return static_cast<uint64_t>((deviceTs & maxClampedTsValue) * resolution);
};
auto deviceTsInNs = convertDeviceTsToNanoseconds(referenceTs.gpuTimeStamp);
auto getDuration = [&](uint64_t startTs, uint64_t endTs) {
const uint64_t maxValue = maxKernelTsValue;
startTs &= maxValue;
@ -772,8 +782,8 @@ void EventImp<TagSizeT>::getSynchronizedKernelTimestamps(ze_synchronized_timesta
int64_t offset = tsOffsetInNs;
uint64_t startTimeStampInNs = static_cast<uint64_t>(deviceTs->kernelStart * resolution) + offset;
if (startTimeStampInNs < referenceHostTsInNs) {
offset += static_cast<uint64_t>(maxNBitValue(gfxCoreHelper.getGlobalTimeStampBits()) * resolution);
startTimeStampInNs = static_cast<uint64_t>(deviceTs->kernelStart * resolution) + offset;
offset += static_cast<uint64_t>(convertDeviceTsToNanoseconds(maxKernelTsValue));
startTimeStampInNs = static_cast<uint64_t>(convertDeviceTsToNanoseconds(deviceTs->kernelStart) + offset);
}
// Get the kernel timestamp duration
@ -805,6 +815,10 @@ ze_result_t EventImp<TagSizeT>::queryKernelTimestampsExt(Device *device, uint32_
return queryTimestampsExp(device, pCount, nullptr);
}
if (queryStatus() != ZE_RESULT_SUCCESS) {
return ZE_RESULT_NOT_READY;
}
ze_result_t status = queryTimestampsExp(device, pCount, pResults->pKernelTimestampsBuffer);
if (status == ZE_RESULT_SUCCESS && hasKerneMappedTsCapability) {

View File

@ -698,6 +698,11 @@ bool testKernelMappedTimestampMap(int argc, char *argv[],
SUCCESS_OR_TERMINATE(zeCommandListClose(cmdList));
}
uint64_t referenceHostTs, referenceDeviceTs = 0;
SUCCESS_OR_TERMINATE(zeDeviceGetGlobalTimestamps(device, &referenceHostTs, &referenceDeviceTs));
std::cout << "ReferenceDeviceTs: " << referenceDeviceTs << "| ReferenceHostTs: " << referenceHostTs << "\n";
previousMaximumSyncTs = referenceHostTs;
for (uint32_t i = 0; i < 10; i++) {
if (!useImmediate) {
@ -722,7 +727,6 @@ bool testKernelMappedTimestampMap(int argc, char *argv[],
if (verboseLevel == 1) {
std::cout << "[iter(" << i << ")][event(" << j << ")]====>\n";
}
SUCCESS_OR_TERMINATE(zeEventQueryStatus(kernelTsEvent[j]));
SUCCESS_OR_TERMINATE(zeEventQueryKernelTimestampsExt(kernelTsEvent[j], device, &count, nullptr));
if (count == 0) {
return false;
@ -754,12 +758,21 @@ bool testKernelMappedTimestampMap(int argc, char *argv[],
<< "[global-ts(" << ts.global.kernelStart << " , " << ts.global.kernelEnd << " ) "
<< "| syncTs( " << syncTs.global.kernelStart << " , " << syncTs.global.kernelEnd << " )] "
<< "# [context-ts( " << ts.context.kernelStart << " , " << ts.context.kernelEnd << " ) "
<< "| syncTs ( " << syncTs.context.kernelStart << " , " << syncTs.context.kernelEnd << " )]\n";
<< "| syncTs ( " << syncTs.context.kernelStart << " , " << syncTs.context.kernelEnd << " )]"
<< "| timeTaken (" << currentMinimumSyncTs - previousMaximumSyncTs << " ns)"
<< "\n";
}
if (verboseLevel == 2) {
std::cout << "KernelSyncTs: " << syncTs.global.kernelStart << " , " << syncTs.global.kernelEnd
<< " | ContextSyncTs: " << syncTs.context.kernelStart << " , " << syncTs.context.kernelEnd << "\n";
<< " | ContextSyncTs: " << syncTs.context.kernelStart << " , " << syncTs.context.kernelEnd
<< "| timeTaken (" << currentMinimumSyncTs - previousMaximumSyncTs << " ns)"
<< "\n";
}
if ((currentMinimumSyncTs - previousMaximumSyncTs) > 10 * 1E9) {
std::cout << "\n\n!!FAILED: Time Taken Too long! (Current Minimum Ts : " << currentMinimumSyncTs << " | Previous Maximum Ts : " << previousMaximumSyncTs << ")\n\n";
return false;
}
}
SUCCESS_OR_TERMINATE(zeEventHostReset(kernelTsEvent[j]));

View File

@ -2212,6 +2212,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenpCountLargerThanSupportedWhenCallingQ
ze_event_query_kernel_timestamps_results_ext_properties_t results{};
results.pKernelTimestampsBuffer = kernelTsBuffer.data();
results.pSynchronizedTimestampsBuffer = nullptr;
event->hostSignal(false);
auto result = event->queryKernelTimestampsExt(device, &pCount, &results);
@ -2229,6 +2230,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithStaticPartitionOffThenQueryK
ze_event_query_kernel_timestamps_results_ext_properties_t results{};
results.pKernelTimestampsBuffer = kernelTsBuffer.data();
results.pSynchronizedTimestampsBuffer = nullptr;
event->hostSignal(false);
uint32_t pCount = 10;
auto result = event->queryKernelTimestampsExt(device, &pCount, &results);
@ -2236,17 +2238,34 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithStaticPartitionOffThenQueryK
EXPECT_EQ(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE, result);
}
// Checks that queryKernelTimestampsExt reports ZE_RESULT_NOT_READY when called
// with a results buffer on an event that this test never signals.
TEST_F(EventqueryKernelTimestampsExt, givenEventStatusNotReadyThenQueryKernelTimestampsExtReturnsNotReady) {
    // Declared before the flag is modified; presumably restores debug flags on scope exit.
    DebugManagerStateRestore stateRestorer;
    NEO::debugManager.flags.EnableStaticPartitioning.set(0);
    event->hasKerneMappedTsCapability = true;

    // Only a kernel timestamp buffer is supplied; the synchronized buffer is left null.
    std::vector<ze_kernel_timestamp_result_t> timestampStorage(2);
    ze_event_query_kernel_timestamps_results_ext_properties_t queryResults{};
    queryResults.pSynchronizedTimestampsBuffer = nullptr;
    queryResults.pKernelTimestampsBuffer = timestampStorage.data();

    // No hostSignal call is made here, so the event is expected to still be unsignaled.
    uint32_t requestedCount = 10;
    EXPECT_EQ(ZE_RESULT_NOT_READY, event->queryKernelTimestampsExt(device, &requestedCount, &queryResults));
}
TEST_F(EventqueryKernelTimestampsExt, givenEventWithMappedTimestampCapabilityWhenQueryKernelTimestampsExtIsCalledCorrectValuesAreReturned) {
auto &hwInfo = device->getNEODevice()->getHardwareInfo();
typename MockTimestampPackets32::Packet packetData[3];
device->getNEODevice()->getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable.kernelTimestampValidBits = 32;
auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
event->setPacketsInUse(3u);
event->hasKerneMappedTsCapability = true;
const auto deviceTsFrequency = device->getNEODevice()->getDeviceInfo().profilingTimerResolution;
const int64_t gpuReferenceTimeInNs = 2000;
const int64_t cpuReferenceTimeInNs = 3000;
const auto maxKernelTsValue = maxNBitValue(32);
const auto maxKernelTsValue = maxNBitValue(hwInfo.capabilityTable.kernelTimestampValidBits);
NEO::TimeStampData *referenceTs = event->peekReferenceTs();
referenceTs->cpuTimeinNS = cpuReferenceTimeInNs;
@ -2285,11 +2304,6 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithMappedTimestampCapabilityWhe
results.pKernelTimestampsBuffer = kernelTsBuffer.data();
results.pSynchronizedTimestampsBuffer = synchronizedTsBuffer.data();
for (uint32_t packetId = 0; packetId < count; packetId++) {
event->kernelEventCompletionData[0].assignDataToAllTimestamps(packetId, event->hostAddress);
event->hostAddress = ptrOffset(event->hostAddress, NEO::TimestampPackets<uint32_t, NEO::TimestampPacketConstants::preferredPacketCount>::getSinglePacketSize());
}
EXPECT_EQ(ZE_RESULT_SUCCESS, event->queryKernelTimestampsExt(device, &count, &results));
uint64_t errorOffset = 5;
// Packet 1
@ -2309,7 +2323,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithMappedTimestampCapabilityWhe
// Packet 2
expectedGlobalStart = (cpuReferenceTimeInNs - gpuReferenceTimeInNs) + 500u +
static_cast<uint64_t>(maxNBitValue(gfxCoreHelper.getGlobalTimeStampBits()) * deviceTsFrequency);
static_cast<uint64_t>(maxNBitValue(hwInfo.capabilityTable.kernelTimestampValidBits) * deviceTsFrequency);
expectedGlobalEnd = expectedGlobalStart + (1500 - 500);
EXPECT_GE(results.pSynchronizedTimestampsBuffer[1].global.kernelStart, expectedGlobalStart - errorOffset);
EXPECT_LE(results.pSynchronizedTimestampsBuffer[1].global.kernelStart, expectedGlobalStart + errorOffset);