mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 06:49:52 +08:00
feature: add initial support for host mapped timestamps
Related-To: LOCI-4171 Signed-off-by: Joshua Santosh Ranjan <joshua.santosh.ranjan@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
9214d0c635
commit
97b4d8bab5
@@ -9,6 +9,7 @@
|
||||
#include "shared/source/helpers/timestamp_packet_constants.h"
|
||||
#include "shared/source/helpers/timestamp_packet_container.h"
|
||||
#include "shared/source/memory_manager/multi_graphics_allocation.h"
|
||||
#include "shared/source/os_interface/os_time.h"
|
||||
|
||||
#include <level_zero/ze_api.h>
|
||||
|
||||
@@ -69,6 +70,8 @@ struct Event : _ze_event_handle_t {
|
||||
virtual ze_result_t reset() = 0;
|
||||
virtual ze_result_t queryKernelTimestamp(ze_kernel_timestamp_result_t *dstptr) = 0;
|
||||
virtual ze_result_t queryTimestampsExp(Device *device, uint32_t *count, ze_kernel_timestamp_result_t *timestamps) = 0;
|
||||
virtual ze_result_t queryKernelTimestampsExt(Device *device, uint32_t *pCount, ze_event_query_kernel_timestamps_results_ext_properties_t *pResults) = 0;
|
||||
|
||||
enum State : uint32_t {
|
||||
STATE_SIGNALED = 0u,
|
||||
HOST_CACHING_DISABLED_PERMANENT = std::numeric_limits<uint32_t>::max() - 2,
|
||||
@@ -210,6 +213,10 @@ struct Event : _ze_event_handle_t {
|
||||
void enableInOrderExecMode(const NEO::TimestampPacketContainer &inOrderSyncNodes);
|
||||
// Returns the current value of the inOrderExecEvent flag.
bool isInOrderExecEvent() const {
    return inOrderExecEvent;
}
|
||||
// Returns a non-owning pointer to the container held by inOrderTimestampPacket
// (whatever inOrderTimestampPacket.get() yields).
const NEO::TimestampPacketContainer *getInOrderTimestampPacket() const {
    return inOrderTimestampPacket.get();
}
|
||||
void setReferenceTs(NEO::TimeStampData ×tamp) {
|
||||
referenceTs = timestamp;
|
||||
}
|
||||
bool hasKerneMappedTsCapability = false;
|
||||
|
||||
protected:
|
||||
Event(EventPool *eventPool, int index, Device *device) : device(device), eventPool(eventPool), index(index) {}
|
||||
@@ -218,6 +225,7 @@ struct Event : _ze_event_handle_t {
|
||||
uint64_t globalEndTS = 1;
|
||||
uint64_t contextStartTS = 1;
|
||||
uint64_t contextEndTS = 1;
|
||||
NEO::TimeStampData referenceTs{};
|
||||
|
||||
std::chrono::microseconds gpuHangCheckPeriod{500'000};
|
||||
std::bitset<EventPacketsCount::maxKernelSplit> l3FlushAppliedOnKernel;
|
||||
@@ -268,6 +276,9 @@ struct EventPool : _ze_event_pool_handle_t {
|
||||
DriverHandleImp *driver, ContextImp *context, uint32_t numDevices, ze_device_handle_t *deviceHandles);
|
||||
EventPool(const ze_event_pool_desc_t *desc) : EventPool(desc->count) {
|
||||
eventPoolFlags = desc->flags;
|
||||
if (eventPoolFlags & ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP) {
|
||||
eventPoolFlags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
|
||||
}
|
||||
}
|
||||
virtual ~EventPool();
|
||||
MOCKABLE_VIRTUAL ze_result_t destroy();
|
||||
@@ -299,6 +310,13 @@ struct EventPool : _ze_event_pool_handle_t {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool isEventPoolKerneMappedTsFlagSet() const {
|
||||
if (eventPoolFlags & ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t getMaxKernelCount() const {
|
||||
return maxKernelCount;
|
||||
}
|
||||
|
||||
@@ -52,6 +52,7 @@ struct EventImp : public Event {
|
||||
|
||||
ze_result_t queryKernelTimestamp(ze_kernel_timestamp_result_t *dstptr) override;
|
||||
ze_result_t queryTimestampsExp(Device *device, uint32_t *count, ze_kernel_timestamp_result_t *timestamps) override;
|
||||
ze_result_t queryKernelTimestampsExt(Device *device, uint32_t *pCount, ze_event_query_kernel_timestamps_results_ext_properties_t *pResults) override;
|
||||
|
||||
void resetDeviceCompletionData(bool resetAllPackets);
|
||||
void resetKernelCountAndPacketUsedCount() override;
|
||||
@@ -74,6 +75,8 @@ struct EventImp : public Event {
|
||||
ze_result_t hostEventSetValueTimestamps(TagSizeT eventVal);
|
||||
MOCKABLE_VIRTUAL void assignKernelEventCompletionData(void *address);
|
||||
void setRemainingPackets(TagSizeT eventVal, void *nextPacketAddress, uint32_t packetsAlreadySet);
|
||||
void getSynchronizedKernelTimestamps(ze_synchronized_timestamp_result_ext_t *pSynchronizedTimestampsBuffer,
|
||||
const uint32_t count, const ze_kernel_timestamp_result_t *pKernelTimestampsBuffer);
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
@@ -9,6 +9,7 @@
|
||||
#include "shared/source/command_container/implicit_scaling.h"
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/device/sub_device.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/memory_manager/internal_allocation_storage.h"
|
||||
#include "shared/source/memory_manager/memory_operations_handler.h"
|
||||
#include "shared/source/os_interface/os_time.h"
|
||||
@@ -32,6 +33,7 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *
|
||||
event->setEventTimestampFlag(true);
|
||||
event->setSinglePacketSize(NEO::TimestampPackets<TagSizeT>::getSinglePacketSize());
|
||||
}
|
||||
event->hasKerneMappedTsCapability = eventPool->isEventPoolKerneMappedTsFlagSet();
|
||||
auto &hwInfo = neoDevice->getHardwareInfo();
|
||||
|
||||
event->signalAllEventPackets = L0GfxCoreHelper::useSignalAllEventPackets(hwInfo);
|
||||
@@ -494,6 +496,85 @@ ze_result_t EventImp<TagSizeT>::queryTimestampsExp(Device *device, uint32_t *cou
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
template <typename TagSizeT>
|
||||
void EventImp<TagSizeT>::getSynchronizedKernelTimestamps(ze_synchronized_timestamp_result_ext_t *pSynchronizedTimestampsBuffer,
|
||||
const uint32_t count, const ze_kernel_timestamp_result_t *pKernelTimestampsBuffer) {
|
||||
|
||||
auto &gfxCoreHelper = device->getNEODevice()->getGfxCoreHelper();
|
||||
auto &hwInfo = device->getNEODevice()->getHardwareInfo();
|
||||
const auto frequency = device->getNEODevice()->getDeviceInfo().profilingTimerResolution;
|
||||
auto deviceTsInNs = gfxCoreHelper.getGpuTimeStampInNS(referenceTs.gpuTimeStamp, frequency);
|
||||
const auto maxKernelTsValue = maxNBitValue(hwInfo.capabilityTable.kernelTimestampValidBits);
|
||||
|
||||
auto getDuration = [&](uint64_t startTs, uint64_t endTs) {
|
||||
const uint64_t maxValue = maxKernelTsValue;
|
||||
startTs &= maxValue;
|
||||
endTs &= maxValue;
|
||||
|
||||
if (startTs > endTs) {
|
||||
// Resolve overflows
|
||||
return endTs + (maxValue - startTs);
|
||||
} else {
|
||||
return endTs - startTs;
|
||||
}
|
||||
};
|
||||
|
||||
const auto &referenceHostTsInNs = referenceTs.cpuTimeinNS;
|
||||
|
||||
// High Level Approach:
|
||||
// startTimeStamp = (referenceHostTsInNs - submitDeviceTs) + kernelDeviceTsStart
|
||||
// deviceDuration = kernelDeviceTsEnd - kernelDeviceTsStart
|
||||
// endTimeStamp = startTimeStamp + deviceDuration
|
||||
|
||||
// Get offset between Device and Host timestamps
|
||||
const int64_t tsOffsetInNs = referenceHostTsInNs - deviceTsInNs;
|
||||
|
||||
auto calculateSynchronizedTs = [&](ze_synchronized_timestamp_data_ext_t *synchronizedTs, const ze_kernel_timestamp_data_t *deviceTs) {
|
||||
// Add the offset to the kernel timestamp to find the start timestamp on the CPU timescale
|
||||
int64_t offset = tsOffsetInNs;
|
||||
uint64_t startTimeStampInNs = static_cast<uint64_t>(deviceTs->kernelStart * frequency) + offset;
|
||||
if (startTimeStampInNs < referenceHostTsInNs) {
|
||||
offset += static_cast<uint64_t>(maxNBitValue(gfxCoreHelper.getGlobalTimeStampBits()) * frequency);
|
||||
startTimeStampInNs = static_cast<uint64_t>(deviceTs->kernelStart * frequency) + offset;
|
||||
}
|
||||
|
||||
// Get the kernel timestamp duration
|
||||
uint64_t deviceDuration = getDuration(deviceTs->kernelStart, deviceTs->kernelEnd);
|
||||
uint64_t deviceDurationNs = static_cast<uint64_t>(deviceDuration * frequency);
|
||||
// Add the duration to the startTimeStamp to get the endTimeStamp
|
||||
uint64_t endTimeStampInNs = startTimeStampInNs + deviceDurationNs;
|
||||
|
||||
synchronizedTs->kernelStart = startTimeStampInNs;
|
||||
synchronizedTs->kernelEnd = endTimeStampInNs;
|
||||
};
|
||||
|
||||
for (uint32_t index = 0; index < count; index++) {
|
||||
calculateSynchronizedTs(&pSynchronizedTimestampsBuffer[index].global, &pKernelTimestampsBuffer[index].global);
|
||||
|
||||
pSynchronizedTimestampsBuffer[index].context.kernelStart = pSynchronizedTimestampsBuffer[index].global.kernelStart;
|
||||
uint64_t deviceDuration = getDuration(pKernelTimestampsBuffer[index].context.kernelStart,
|
||||
pKernelTimestampsBuffer[index].context.kernelEnd);
|
||||
uint64_t deviceDurationNs = static_cast<uint64_t>(deviceDuration * frequency);
|
||||
pSynchronizedTimestampsBuffer[index].context.kernelEnd = pSynchronizedTimestampsBuffer[index].context.kernelStart +
|
||||
deviceDurationNs;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TagSizeT>
|
||||
ze_result_t EventImp<TagSizeT>::queryKernelTimestampsExt(Device *device, uint32_t *pCount, ze_event_query_kernel_timestamps_results_ext_properties_t *pResults) {
|
||||
|
||||
if (*pCount == 0) {
|
||||
return queryTimestampsExp(device, pCount, nullptr);
|
||||
}
|
||||
|
||||
ze_result_t status = queryTimestampsExp(device, pCount, pResults->pKernelTimestampsBuffer);
|
||||
|
||||
if (status == ZE_RESULT_SUCCESS && hasKerneMappedTsCapability) {
|
||||
getSynchronizedKernelTimestamps(pResults->pSynchronizedTimestampsBuffer, *pCount, pResults->pKernelTimestampsBuffer);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
template <typename TagSizeT>
|
||||
uint32_t EventImp<TagSizeT>::getPacketsInUse() const {
|
||||
uint32_t packetsInUse = 0;
|
||||
|
||||
Reference in New Issue
Block a user