fix: Mapped time stamp retrieval with IPC

Create shared allocations to maintain reference timestamps.
Add a flag to the IPC pool data so the importing process can
check whether the kernel-mapped timestamp flag was set on the
exporting event pool.

Related-To: NEO-12313

Signed-off-by: Bellekallu Rajkiran <bellekallu.rajkiran@intel.com>
This commit is contained in:
Bellekallu Rajkiran
2025-01-20 11:25:34 +00:00
committed by Compute-Runtime-Automation
parent 474b91aa36
commit ad3110ff67
7 changed files with 73 additions and 55 deletions

View File

@@ -83,7 +83,7 @@ zexCounterBasedEventCreate2(ze_context_handle_t hContext, ze_device_handle_t hDe
signalScope, // signalScope
desc->waitScope, // waitScope
timestampFlag, // timestampPool
mappedTimestampFlag, // kerneMappedTsPoolFlag
mappedTimestampFlag, // kernelMappedTsPoolFlag
false, // importedIpcPool
ipcFlag, // ipcPool
};

View File

@@ -329,7 +329,7 @@ void CommandListImp::storeReferenceTsToMappedEvents(bool isClearEnabled) {
}
void CommandListImp::addToMappedEventList(Event *event) {
if (event && event->hasKerneMappedTsCapability) {
if (event && event->hasKernelMappedTsCapability) {
if (std::find(mappedTsEventList.begin(), mappedTsEventList.end(), event) == mappedTsEventList.end()) {
mappedTsEventList.push_back(event);
}

View File

@@ -210,9 +210,15 @@ void EventPool::initializeSizeParameters(uint32_t numDevices, ze_device_handle_t
eventPackets = driver.getEventMaxPacketCount(numDevices, deviceHandles);
maxKernelCount = driver.getEventMaxKernelCount(numDevices, deviceHandles);
}
setEventSize(static_cast<uint32_t>(alignUp(eventPackets * gfxCoreHelper.getSingleTimestampPacketSize(), eventAlignment)));
eventPoolSize = alignUp<size_t>(this->numEvents * eventSize, MemoryConstants::pageSize64k);
auto eventSize = eventPackets * gfxCoreHelper.getSingleTimestampPacketSize();
if (eventPoolFlags & ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP) {
eventSize += sizeof(NEO::TimeStampData);
}
setEventSize(static_cast<uint32_t>(alignUp(eventSize, eventAlignment)));
eventPoolSize = alignUp<size_t>(this->numEvents * this->eventSize, MemoryConstants::pageSize64k);
}
EventPool *EventPool::create(DriverHandle *driver, Context *context, uint32_t numDevices, ze_device_handle_t *deviceHandles, const ze_event_pool_desc_t *desc, ze_result_t &result) {
@@ -311,7 +317,7 @@ ze_result_t Event::openCounterBasedIpcHandle(const IpcCounterBasedEventData &ipc
ipcData.signalScopeFlags, // signalScope
ipcData.waitScopeFlags, // waitScope
false, // timestampPool
false, // kerneMappedTsPoolFlag
false, // kernelMappedTsPoolFlag
true, // importedIpcPool
false, // ipcPool
};
@@ -385,6 +391,7 @@ ze_result_t EventPool::getIpcHandle(ze_ipc_event_pool_handle_t *ipcHandle) {
poolData.isImplicitScalingCapable = this->isImplicitScalingCapable;
poolData.maxEventPackets = this->getEventMaxPackets();
poolData.numDevices = static_cast<uint32_t>(this->devices.size());
poolData.isEventPoolKernelMappedTsFlagSet = this->isEventPoolKernelMappedTsFlagSet();
auto memoryManager = this->context->getDriverHandle()->getMemoryManager();
auto allocation = this->eventPoolAllocations->getDefaultGraphicsAllocation();
@@ -402,6 +409,9 @@ ze_result_t EventPool::openEventPoolIpcHandle(const ze_ipc_event_pool_handle_t &
const IpcEventPoolData &poolData = *reinterpret_cast<const IpcEventPoolData *>(ipcEventPoolHandle.data);
ze_event_pool_desc_t desc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC};
if (poolData.isEventPoolKernelMappedTsFlagSet) {
desc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP;
}
desc.count = static_cast<uint32_t>(poolData.numEvents);
auto eventPool = std::make_unique<EventPool>(&desc);
eventPool->isDeviceEventPoolAllocation = poolData.isDeviceEventPoolAllocation;
@@ -640,10 +650,11 @@ void Event::unsetCmdQueue() {
}
// Refreshes the GPU/CPU reference timestamp pair stored at the end of this
// event's host allocation (just past the timestamp packets).
//
// The reference pair lives at offset maxPacketCount * singlePacketSize from
// the event's host address, i.e. in the extra sizeof(NEO::TimeStampData)
// region reserved when the pool was created with the kernel-mapped
// timestamp flag. Storing it in the shared allocation (instead of a member)
// keeps it visible across IPC-imported pools.
//
// @param currentCpuTimeStamp current CPU time in nanoseconds, used to decide
//                            whether the cached reference pair is stale.
void Event::setReferenceTs(uint64_t currentCpuTimeStamp) {
    NEO::TimeStampData *referenceTs = static_cast<NEO::TimeStampData *>(ptrOffset(getHostAddress(), maxPacketCount * singlePacketSize));
    // Re-sample when the cached pair is older than the refresh interval.
    const auto recalculate =
        (currentCpuTimeStamp - referenceTs->cpuTimeinNS) > timestampRefreshIntervalInNanoSec;
    // cpuTimeinNS == 0 means the pair was never initialized (zero-filled
    // allocation), so force the first sample.
    if (referenceTs->cpuTimeinNS == 0 || recalculate) {
        device->getNEODevice()->getOSTime()->getGpuCpuTime(referenceTs, true);
    }
}

View File

@@ -8,6 +8,7 @@
#pragma once
#include "shared/source/helpers/common_types.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/helpers/timestamp_packet_constants.h"
#include "shared/source/helpers/timestamp_packet_container.h"
#include "shared/source/memory_manager/multi_graphics_allocation.h"
@@ -61,6 +62,7 @@ struct IpcEventPoolData {
bool isDeviceEventPoolAllocation = false;
bool isHostVisibleEventPoolAllocation = false;
bool isImplicitScalingCapable = false;
bool isEventPoolKernelMappedTsFlagSet = false;
};
struct IpcCounterBasedEventData {
@@ -95,7 +97,7 @@ struct EventDescriptor {
uint32_t signalScope = 0;
uint32_t waitScope = 0;
bool timestampPool = false;
bool kerneMappedTsPoolFlag = false;
bool kernelMappedTsPoolFlag = false;
bool importedIpcPool = false;
bool ipcPool = false;
};
@@ -303,11 +305,11 @@ struct Event : _ze_event_handle_t {
uint32_t getInOrderAllocationOffset() const { return inOrderAllocationOffset; }
void setLatestUsedCmdQueue(CommandQueue *newCmdQ);
// Returns a pointer to the GPU/CPU reference timestamp pair kept in the
// event's host allocation, located immediately after the timestamp packets
// (offset = max packets * single packet size). The pair is stored in the
// shared allocation rather than as a member so IPC-imported pools observe
// the same reference values.
NEO::TimeStampData *peekReferenceTs() {
    return static_cast<NEO::TimeStampData *>(ptrOffset(getHostAddress(), getMaxPacketsCount() * getSinglePacketSize()));
}
void setReferenceTs(uint64_t currentCpuTimeStamp);
const CommandQueue *getLatestUsedCmdQueue() const { return latestUsedCmdQueue; }
bool hasKerneMappedTsCapability = false;
bool hasKernelMappedTsCapability = false;
std::shared_ptr<NEO::InOrderExecInfo> &getInOrderExecInfo() { return inOrderExecInfo; }
void enableKmdWaitMode() { kmdWaitMode = true; }
void enableInterruptMode() { interruptMode = true; }
@@ -348,7 +350,6 @@ struct Event : _ze_event_handle_t {
uint64_t globalEndTS = 1;
uint64_t contextStartTS = 1;
uint64_t contextEndTS = 1;
NEO::TimeStampData referenceTs{};
uint64_t inOrderExecSignalValue = 0;
uint32_t inOrderAllocationOffset = 0;
@@ -446,7 +447,7 @@ struct EventPool : _ze_event_pool_handle_t {
return false;
}
bool isEventPoolKerneMappedTsFlagSet() const {
bool isEventPoolKernelMappedTsFlagSet() const {
if (eventPoolFlags & ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP) {
return true;
}

View File

@@ -41,7 +41,7 @@ Event *Event::create(const EventDescriptor &eventDescriptor, Device *device, ze_
event->setEventTimestampFlag(true);
event->setSinglePacketSize(NEO::TimestampPackets<TagSizeT, NEO::TimestampPacketConstants::preferredPacketCount>::getSinglePacketSize());
}
event->hasKerneMappedTsCapability = eventDescriptor.kerneMappedTsPoolFlag;
event->hasKernelMappedTsCapability = eventDescriptor.kernelMappedTsPoolFlag;
event->signalAllEventPackets = L0GfxCoreHelper::useSignalAllEventPackets(hwInfo);
@@ -118,19 +118,19 @@ Event *Event::create(const EventDescriptor &eventDescriptor, Device *device, ze_
template <typename TagSizeT>
Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *device) {
EventDescriptor eventDescriptor = {
&eventPool->getAllocation(), // eventPoolAllocation
desc->pNext, // extensions
eventPool->getEventSize(), // totalEventSize
eventPool->getMaxKernelCount(), // maxKernelCount
eventPool->getEventMaxPackets(), // maxPacketsCount
eventPool->getCounterBasedFlags(), // counterBasedFlags
desc->index, // index
desc->signal, // signalScope
desc->wait, // waitScope
eventPool->isEventPoolTimestampFlagSet(), // timestampPool
eventPool->isEventPoolKerneMappedTsFlagSet(), // kerneMappedTsPoolFlag
eventPool->getImportedIpcPool(), // importedIpcPool
eventPool->isIpcPoolFlagSet(), // ipcPool
&eventPool->getAllocation(), // eventPoolAllocation
desc->pNext, // extensions
eventPool->getEventSize(), // totalEventSize
eventPool->getMaxKernelCount(), // maxKernelCount
eventPool->getEventMaxPackets(), // maxPacketsCount
eventPool->getCounterBasedFlags(), // counterBasedFlags
desc->index, // index
desc->signal, // signalScope
desc->wait, // waitScope
eventPool->isEventPoolTimestampFlagSet(), // timestampPool
eventPool->isEventPoolKernelMappedTsFlagSet(), // kernelMappedTsPoolFlag
eventPool->getImportedIpcPool(), // importedIpcPool
eventPool->isIpcPoolFlagSet(), // ipcPool
};
if (eventPool->getCounterBasedFlags() != 0 && standaloneInOrderTimestampAllocationEnabled()) {
@@ -886,7 +886,8 @@ void EventImp<TagSizeT>::getSynchronizedKernelTimestamps(ze_synchronized_timesta
return static_cast<uint64_t>((deviceTs & maxClampedTsValue) * resolution);
};
auto deviceTsInNs = convertDeviceTsToNanoseconds(referenceTs.gpuTimeStamp);
NEO::TimeStampData *referenceTs = static_cast<NEO::TimeStampData *>(ptrOffset(getHostAddress(), maxPacketCount * singlePacketSize));
auto deviceTsInNs = convertDeviceTsToNanoseconds(referenceTs->gpuTimeStamp);
auto getDuration = [&](uint64_t startTs, uint64_t endTs) {
const uint64_t maxValue = maxKernelTsValue;
@@ -901,7 +902,7 @@ void EventImp<TagSizeT>::getSynchronizedKernelTimestamps(ze_synchronized_timesta
}
};
const auto &referenceHostTsInNs = referenceTs.cpuTimeinNS;
const auto &referenceHostTsInNs = referenceTs->cpuTimeinNS;
// High Level Approach:
// startTimeStamp = (referenceHostTsInNs - submitDeviceTs) + kernelDeviceTsStart
@@ -955,7 +956,7 @@ ze_result_t EventImp<TagSizeT>::queryKernelTimestampsExt(Device *device, uint32_
ze_result_t status = queryTimestampsExp(device, pCount, pResults->pKernelTimestampsBuffer);
if (status == ZE_RESULT_SUCCESS && hasKerneMappedTsCapability) {
if (status == ZE_RESULT_SUCCESS && hasKernelMappedTsCapability) {
getSynchronizedKernelTimestamps(pResults->pSynchronizedTimestampsBuffer, *pCount, pResults->pKernelTimestampsBuffer);
}
return status;

View File

@@ -98,7 +98,7 @@ struct InOrderCmdListFixture : public ::Test<ModuleFixture> {
0, // signalScope
0, // waitScope
false, // timestampPool
false, // kerneMappedTsPoolFlag
false, // kernelMappedTsPoolFlag
false, // importedIpcPool
false, // ipcPool
};

View File

@@ -2264,7 +2264,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithStaticPartitionOffThenQueryK
DebugManagerStateRestore restore;
NEO::debugManager.flags.EnableStaticPartitioning.set(0);
event->hasKerneMappedTsCapability = true;
event->hasKernelMappedTsCapability = true;
std::vector<ze_kernel_timestamp_result_t> kernelTsBuffer(2);
ze_event_query_kernel_timestamps_results_ext_properties_t results{};
@@ -2282,7 +2282,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventStatusNotReadyThenQueryKernelTim
DebugManagerStateRestore restore;
NEO::debugManager.flags.EnableStaticPartitioning.set(0);
event->hasKerneMappedTsCapability = true;
event->hasKernelMappedTsCapability = true;
std::vector<ze_kernel_timestamp_result_t> kernelTsBuffer(2);
ze_event_query_kernel_timestamps_results_ext_properties_t results{};
@@ -2297,43 +2297,48 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventStatusNotReadyThenQueryKernelTim
TEST_F(EventqueryKernelTimestampsExt, givenEventWithMappedTimestampCapabilityWhenQueryKernelTimestampsExtIsCalledCorrectValuesAreReturned) {
struct MappedTimeStampData {
typename MockTimestampPackets32::Packet packetData[3];
NEO::TimeStampData referenceTs{};
} mappedTimeStampData;
auto &hwInfo = device->getNEODevice()->getHardwareInfo();
typename MockTimestampPackets32::Packet packetData[3];
device->getNEODevice()->getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable.kernelTimestampValidBits = 32;
event->setPacketsInUse(3u);
event->hasKerneMappedTsCapability = true;
event->hasKernelMappedTsCapability = true;
const auto deviceTsFrequency = device->getNEODevice()->getDeviceInfo().profilingTimerResolution;
const int64_t gpuReferenceTimeInNs = 2000;
const int64_t cpuReferenceTimeInNs = 3000;
const auto maxKernelTsValue = maxNBitValue(hwInfo.capabilityTable.kernelTimestampValidBits);
NEO::TimeStampData *referenceTs = event->peekReferenceTs();
referenceTs->cpuTimeinNS = cpuReferenceTimeInNs;
referenceTs->gpuTimeStamp = static_cast<uint64_t>(gpuReferenceTimeInNs / deviceTsFrequency);
auto timeToTimeStamp = [&](uint32_t timeInNs) {
return static_cast<uint32_t>(timeInNs / deviceTsFrequency);
};
packetData[0].contextStart = 50u;
packetData[0].contextEnd = 100u;
packetData[0].globalStart = timeToTimeStamp(4000u);
packetData[0].globalEnd = timeToTimeStamp(5000u);
mappedTimeStampData.packetData[0].contextStart = 50u;
mappedTimeStampData.packetData[0].contextEnd = 100u;
mappedTimeStampData.packetData[0].globalStart = timeToTimeStamp(4000u);
mappedTimeStampData.packetData[0].globalEnd = timeToTimeStamp(5000u);
// Device Ts overflow case
packetData[1].contextStart = 20u;
packetData[1].contextEnd = 30u;
packetData[1].globalStart = timeToTimeStamp(500u);
packetData[1].globalEnd = timeToTimeStamp(1500u);
mappedTimeStampData.packetData[1].contextStart = 20u;
mappedTimeStampData.packetData[1].contextEnd = 30u;
mappedTimeStampData.packetData[1].globalStart = timeToTimeStamp(500u);
mappedTimeStampData.packetData[1].globalEnd = timeToTimeStamp(1500u);
packetData[2].contextStart = 20u;
packetData[2].contextEnd = 30u;
packetData[2].globalStart = timeToTimeStamp(5000u);
packetData[2].globalEnd = timeToTimeStamp(500u);
mappedTimeStampData.packetData[2].contextStart = 20u;
mappedTimeStampData.packetData[2].contextEnd = 30u;
mappedTimeStampData.packetData[2].globalStart = timeToTimeStamp(5000u);
mappedTimeStampData.packetData[2].globalEnd = timeToTimeStamp(500u);
event->hostAddressFromPool = packetData;
event->hostAddressFromPool = &mappedTimeStampData;
event->maxPacketCount = 3;
uint32_t count = 0;
NEO::TimeStampData *referenceTs = event->peekReferenceTs();
referenceTs->cpuTimeinNS = cpuReferenceTimeInNs;
referenceTs->gpuTimeStamp = static_cast<uint64_t>(gpuReferenceTimeInNs / deviceTsFrequency);
EXPECT_EQ(ZE_RESULT_SUCCESS, event->queryKernelTimestampsExt(device, &count, nullptr));
EXPECT_EQ(count, 3u);
@@ -2355,7 +2360,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithMappedTimestampCapabilityWhe
EXPECT_LE(results.pSynchronizedTimestampsBuffer[0].global.kernelEnd, expectedGlobalEnd + errorOffset);
auto expectedContextStart = expectedGlobalStart;
auto expectedContextEnd = expectedContextStart + (packetData[0].contextEnd - packetData[0].contextStart) * deviceTsFrequency;
auto expectedContextEnd = expectedContextStart + (mappedTimeStampData.packetData[0].contextEnd - mappedTimeStampData.packetData[0].contextStart) * deviceTsFrequency;
EXPECT_GE(results.pSynchronizedTimestampsBuffer[0].context.kernelStart, expectedContextStart - errorOffset);
EXPECT_LE(results.pSynchronizedTimestampsBuffer[0].context.kernelStart, expectedContextStart + errorOffset);
EXPECT_GE(results.pSynchronizedTimestampsBuffer[0].context.kernelEnd, expectedContextEnd - errorOffset);
@@ -2371,7 +2376,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithMappedTimestampCapabilityWhe
EXPECT_LE(results.pSynchronizedTimestampsBuffer[1].global.kernelEnd, expectedGlobalEnd + errorOffset);
expectedContextStart = expectedGlobalStart;
expectedContextEnd = expectedContextStart + (packetData[1].contextEnd - packetData[1].contextStart) * deviceTsFrequency;
expectedContextEnd = expectedContextStart + (mappedTimeStampData.packetData[1].contextEnd - mappedTimeStampData.packetData[1].contextStart) * deviceTsFrequency;
EXPECT_GE(results.pSynchronizedTimestampsBuffer[1].context.kernelStart, expectedContextStart - errorOffset);
EXPECT_LE(results.pSynchronizedTimestampsBuffer[1].context.kernelStart, expectedContextStart + errorOffset);
EXPECT_GE(results.pSynchronizedTimestampsBuffer[1].context.kernelEnd, expectedContextEnd - errorOffset);
@@ -2386,7 +2391,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithMappedTimestampCapabilityWhe
EXPECT_LE(results.pSynchronizedTimestampsBuffer[2].global.kernelEnd, expectedGlobalEnd + errorOffset);
expectedContextStart = expectedGlobalStart;
expectedContextEnd = expectedContextStart + (packetData[2].contextEnd - packetData[1].contextStart) * deviceTsFrequency;
expectedContextEnd = expectedContextStart + (mappedTimeStampData.packetData[2].contextEnd - mappedTimeStampData.packetData[1].contextStart) * deviceTsFrequency;
EXPECT_GE(results.pSynchronizedTimestampsBuffer[2].context.kernelStart, expectedContextStart - errorOffset);
EXPECT_LE(results.pSynchronizedTimestampsBuffer[2].context.kernelStart, expectedContextStart + errorOffset);
EXPECT_GE(results.pSynchronizedTimestampsBuffer[2].context.kernelEnd, expectedContextEnd - errorOffset);