feature: add initial support for host mapped timestamps

Related-To: LOCI-4171

Signed-off-by: Joshua Santosh Ranjan <joshua.santosh.ranjan@intel.com>
This commit is contained in:
Joshua Santosh Ranjan
2023-06-09 15:07:24 +00:00
committed by Compute-Runtime-Automation
parent 9214d0c635
commit 97b4d8bab5
22 changed files with 813 additions and 28 deletions

View File

@@ -9,6 +9,7 @@
#include "shared/source/helpers/timestamp_packet_constants.h"
#include "shared/source/helpers/timestamp_packet_container.h"
#include "shared/source/memory_manager/multi_graphics_allocation.h"
#include "shared/source/os_interface/os_time.h"
#include <level_zero/ze_api.h>
@@ -69,6 +70,8 @@ struct Event : _ze_event_handle_t {
virtual ze_result_t reset() = 0;
virtual ze_result_t queryKernelTimestamp(ze_kernel_timestamp_result_t *dstptr) = 0;
virtual ze_result_t queryTimestampsExp(Device *device, uint32_t *count, ze_kernel_timestamp_result_t *timestamps) = 0;
virtual ze_result_t queryKernelTimestampsExt(Device *device, uint32_t *pCount, ze_event_query_kernel_timestamps_results_ext_properties_t *pResults) = 0;
enum State : uint32_t {
STATE_SIGNALED = 0u,
HOST_CACHING_DISABLED_PERMANENT = std::numeric_limits<uint32_t>::max() - 2,
@@ -210,6 +213,10 @@ struct Event : _ze_event_handle_t {
void enableInOrderExecMode(const NEO::TimestampPacketContainer &inOrderSyncNodes);
// Returns the current value of the inOrderExecEvent flag.
bool isInOrderExecEvent() const { return inOrderExecEvent; }
// Returns the raw pointer held by inOrderTimestampPacket (via get()); the caller gets read-only access.
const NEO::TimestampPacketContainer *getInOrderTimestampPacket() const { return inOrderTimestampPacket.get(); }
void setReferenceTs(NEO::TimeStampData &timestamp) {
referenceTs = timestamp;
}
bool hasKerneMappedTsCapability = false;
protected:
// Stores the given device, event pool and index in the corresponding members; does nothing else.
Event(EventPool *eventPool, int index, Device *device) : device(device), eventPool(eventPool), index(index) {}
@@ -218,6 +225,7 @@ struct Event : _ze_event_handle_t {
uint64_t globalEndTS = 1;
uint64_t contextStartTS = 1;
uint64_t contextEndTS = 1;
NEO::TimeStampData referenceTs{};
std::chrono::microseconds gpuHangCheckPeriod{500'000};
std::bitset<EventPacketsCount::maxKernelSplit> l3FlushAppliedOnKernel;
@@ -268,6 +276,9 @@ struct EventPool : _ze_event_pool_handle_t {
DriverHandleImp *driver, ContextImp *context, uint32_t numDevices, ze_device_handle_t *deviceHandles);
// Builds a pool from a descriptor: delegates to EventPool(desc->count), then records the
// descriptor flags in eventPoolFlags. When the kernel-mapped-timestamp flag is requested,
// the kernel-timestamp flag is switched on as well.
EventPool(const ze_event_pool_desc_t *desc) : EventPool(desc->count) {
    const auto requestedFlags = desc->flags;
    const bool wantsKernelMappedTs = (requestedFlags & ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP) != 0;
    eventPoolFlags = wantsKernelMappedTs ? (requestedFlags | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP)
                                         : requestedFlags;
}
virtual ~EventPool();
MOCKABLE_VIRTUAL ze_result_t destroy();
@@ -299,6 +310,13 @@ struct EventPool : _ze_event_pool_handle_t {
return false;
}
// True when ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP is set in eventPoolFlags.
bool isEventPoolKerneMappedTsFlagSet() const {
    return (eventPoolFlags & ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP) != 0;
}
// Returns the value of the maxKernelCount member.
uint32_t getMaxKernelCount() const {
return maxKernelCount;
}

View File

@@ -52,6 +52,7 @@ struct EventImp : public Event {
ze_result_t queryKernelTimestamp(ze_kernel_timestamp_result_t *dstptr) override;
ze_result_t queryTimestampsExp(Device *device, uint32_t *count, ze_kernel_timestamp_result_t *timestamps) override;
ze_result_t queryKernelTimestampsExt(Device *device, uint32_t *pCount, ze_event_query_kernel_timestamps_results_ext_properties_t *pResults) override;
void resetDeviceCompletionData(bool resetAllPackets);
void resetKernelCountAndPacketUsedCount() override;
@@ -74,6 +75,8 @@ struct EventImp : public Event {
ze_result_t hostEventSetValueTimestamps(TagSizeT eventVal);
MOCKABLE_VIRTUAL void assignKernelEventCompletionData(void *address);
void setRemainingPackets(TagSizeT eventVal, void *nextPacketAddress, uint32_t packetsAlreadySet);
void getSynchronizedKernelTimestamps(ze_synchronized_timestamp_result_ext_t *pSynchronizedTimestampsBuffer,
const uint32_t count, const ze_kernel_timestamp_result_t *pKernelTimestampsBuffer);
};
} // namespace L0

View File

@@ -9,6 +9,7 @@
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/sub_device.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/memory_operations_handler.h"
#include "shared/source/os_interface/os_time.h"
@@ -32,6 +33,7 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *
event->setEventTimestampFlag(true);
event->setSinglePacketSize(NEO::TimestampPackets<TagSizeT>::getSinglePacketSize());
}
event->hasKerneMappedTsCapability = eventPool->isEventPoolKerneMappedTsFlagSet();
auto &hwInfo = neoDevice->getHardwareInfo();
event->signalAllEventPackets = L0GfxCoreHelper::useSignalAllEventPackets(hwInfo);
@@ -494,6 +496,85 @@ ze_result_t EventImp<TagSizeT>::queryTimestampsExp(Device *device, uint32_t *cou
return ZE_RESULT_SUCCESS;
}
// Converts raw device kernel timestamps into timestamps on the host (CPU) timescale, in nanoseconds.
//
// pSynchronizedTimestampsBuffer  destination array; `count` entries are written.
// count                          number of entries to convert.
// pKernelTimestampsBuffer        source array of device timestamps; `count` entries are read.
//
// The conversion is anchored on referenceTs (a paired gpuTimeStamp / cpuTimeinNS sample):
//   startTimeStamp = (referenceHostTsInNs - referenceDeviceTsInNs) + kernelDeviceTsStart
//   deviceDuration = kernelDeviceTsEnd - kernelDeviceTsStart
//   endTimeStamp   = startTimeStamp + deviceDuration
// Neither buffer is checked for nullptr here; callers must pass valid pointers.
template <typename TagSizeT>
void EventImp<TagSizeT>::getSynchronizedKernelTimestamps(ze_synchronized_timestamp_result_ext_t *pSynchronizedTimestampsBuffer,
                                                         const uint32_t count, const ze_kernel_timestamp_result_t *pKernelTimestampsBuffer) {

    auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
    auto &hwInfo = device->getNEODevice()->getHardwareInfo();
    // Used below as a ticks -> nanoseconds multiplier.
    const auto timerResolution = device->getNEODevice()->getDeviceInfo().profilingTimerResolution;
    auto deviceTsInNs = gfxCoreHelper.getGpuTimeStampInNS(referenceTs.gpuTimeStamp, timerResolution);
    const auto maxKernelTsValue = maxNBitValue(hwInfo.capabilityTable.kernelTimestampValidBits);

    // Tick count between two kernel timestamps, restricted to the valid timestamp bits.
    auto getDuration = [&](uint64_t startTs, uint64_t endTs) {
        const uint64_t maxValue = maxKernelTsValue;
        // Modular subtraction within the valid bit width. For startTs <= endTs this is the plain
        // difference. When the counter wrapped (startTs > endTs) it evaluates to
        // endTs + (maxValue + 1 - startTs): the counter period is maxValue + 1 ticks, so the step
        // from maxValue back to 0 has to be counted too.
        return (endTs - startTs) & maxValue;
    };

    const auto &referenceHostTsInNs = referenceTs.cpuTimeinNS;

    // Offset between the device and host timescales at the reference sample.
    const int64_t tsOffsetInNs = referenceHostTsInNs - deviceTsInNs;

    auto calculateSynchronizedTs = [&](ze_synchronized_timestamp_data_ext_t *synchronizedTs, const ze_kernel_timestamp_data_t *deviceTs) {
        // Add the offset to the kernel timestamp to find the start timestamp on the CPU timescale
        int64_t offset = tsOffsetInNs;
        uint64_t startTimeStampInNs = static_cast<uint64_t>(deviceTs->kernelStart * timerResolution) + offset;
        if (startTimeStampInNs < referenceHostTsInNs) {
            // A start that lands before the reference sample is treated as a wrap of the global
            // device timestamp: shift by one full range of that counter and recompute.
            offset += static_cast<uint64_t>(maxNBitValue(gfxCoreHelper.getGlobalTimeStampBits()) * timerResolution);
            startTimeStampInNs = static_cast<uint64_t>(deviceTs->kernelStart * timerResolution) + offset;
        }

        // Get the kernel timestamp duration
        uint64_t deviceDuration = getDuration(deviceTs->kernelStart, deviceTs->kernelEnd);
        uint64_t deviceDurationNs = static_cast<uint64_t>(deviceDuration * timerResolution);
        // Add the duration to the startTimeStamp to get the endTimeStamp
        uint64_t endTimeStampInNs = startTimeStampInNs + deviceDurationNs;

        synchronizedTs->kernelStart = startTimeStampInNs;
        synchronizedTs->kernelEnd = endTimeStampInNs;
    };

    for (uint32_t index = 0; index < count; index++) {
        calculateSynchronizedTs(&pSynchronizedTimestampsBuffer[index].global, &pKernelTimestampsBuffer[index].global);

        // The context start reuses the synchronized global start; only the context duration is
        // taken from the context timestamps.
        pSynchronizedTimestampsBuffer[index].context.kernelStart = pSynchronizedTimestampsBuffer[index].global.kernelStart;
        uint64_t deviceDuration = getDuration(pKernelTimestampsBuffer[index].context.kernelStart,
                                              pKernelTimestampsBuffer[index].context.kernelEnd);
        uint64_t deviceDurationNs = static_cast<uint64_t>(deviceDuration * timerResolution);
        pSynchronizedTimestampsBuffer[index].context.kernelEnd = pSynchronizedTimestampsBuffer[index].context.kernelStart +
                                                                 deviceDurationNs;
    }
}
// Queries the kernel timestamps of this event and, when the event has the kernel-mapped-timestamp
// capability, also fills the host-synchronized timestamps.
//
// device    forwarded unchanged to queryTimestampsExp.
// pCount    in/out entry count; when *pCount is 0 the call is forwarded to queryTimestampsExp
//           without a destination buffer and pResults is not touched.
// pResults  holds the destination buffers; dereferenced only when *pCount is non-zero.
//
// Returns the status reported by queryTimestampsExp.
template <typename TagSizeT>
ze_result_t EventImp<TagSizeT>::queryKernelTimestampsExt(Device *device, uint32_t *pCount, ze_event_query_kernel_timestamps_results_ext_properties_t *pResults) {

    if (*pCount == 0) {
        return queryTimestampsExp(device, pCount, nullptr);
    }

    const ze_result_t status = queryTimestampsExp(device, pCount, pResults->pKernelTimestampsBuffer);

    // The synchronized buffer is optional for the caller. getSynchronizedKernelTimestamps writes
    // through it unconditionally, so skip the conversion when no buffer was supplied.
    const bool canSynchronize = hasKerneMappedTsCapability && (pResults->pSynchronizedTimestampsBuffer != nullptr);
    if (status == ZE_RESULT_SUCCESS && canSynchronize) {
        getSynchronizedKernelTimestamps(pResults->pSynchronizedTimestampsBuffer, *pCount, pResults->pKernelTimestampsBuffer);
    }
    return status;
}
template <typename TagSizeT>
uint32_t EventImp<TagSizeT>::getPacketsInUse() const {
uint32_t packetsInUse = 0;