From ad3110ff678753c4422556bf15fe232980046e50 Mon Sep 17 00:00:00 2001 From: Bellekallu Rajkiran Date: Mon, 20 Jan 2025 11:25:34 +0000 Subject: [PATCH] fix: Mapped time stamp retrieval with IPC Create shared allocations to maintain reference timestamps. Add flag to IPC pool data to check whether mapped time stamp flag is set. Related-To: NEO-12313 Signed-off-by: Bellekallu Rajkiran --- .../driver_experimental/public/zex_event.cpp | 2 +- .../core/source/cmdlist/cmdlist_imp.cpp | 2 +- level_zero/core/source/event/event.cpp | 23 +++++--- level_zero/core/source/event/event.h | 11 ++-- level_zero/core/source/event/event_impl.inl | 35 ++++++------ .../fixtures/in_order_cmd_list_fixture.h | 2 +- .../unit_tests/sources/event/test_event.cpp | 53 ++++++++++--------- 7 files changed, 73 insertions(+), 55 deletions(-) diff --git a/level_zero/api/driver_experimental/public/zex_event.cpp b/level_zero/api/driver_experimental/public/zex_event.cpp index 56aed26a16..be69f3fdd4 100644 --- a/level_zero/api/driver_experimental/public/zex_event.cpp +++ b/level_zero/api/driver_experimental/public/zex_event.cpp @@ -83,7 +83,7 @@ zexCounterBasedEventCreate2(ze_context_handle_t hContext, ze_device_handle_t hDe signalScope, // signalScope desc->waitScope, // waitScope timestampFlag, // timestampPool - mappedTimestampFlag, // kerneMappedTsPoolFlag + mappedTimestampFlag, // kernelMappedTsPoolFlag false, // importedIpcPool ipcFlag, // ipcPool }; diff --git a/level_zero/core/source/cmdlist/cmdlist_imp.cpp b/level_zero/core/source/cmdlist/cmdlist_imp.cpp index 9c8893144f..73288a409d 100644 --- a/level_zero/core/source/cmdlist/cmdlist_imp.cpp +++ b/level_zero/core/source/cmdlist/cmdlist_imp.cpp @@ -329,7 +329,7 @@ void CommandListImp::storeReferenceTsToMappedEvents(bool isClearEnabled) { } void CommandListImp::addToMappedEventList(Event *event) { - if (event && event->hasKerneMappedTsCapability) { + if (event && event->hasKernelMappedTsCapability) { if (std::find(mappedTsEventList.begin(), mappedTsEventList.end(), event) == mappedTsEventList.end()) { mappedTsEventList.push_back(event); } diff --git a/level_zero/core/source/event/event.cpp b/level_zero/core/source/event/event.cpp index 29d474fa1f..ca2b61ded5 100644 --- a/level_zero/core/source/event/event.cpp +++ b/level_zero/core/source/event/event.cpp @@ -210,9 +210,15 @@ void EventPool::initializeSizeParameters(uint32_t numDevices, ze_device_handle_t eventPackets = driver.getEventMaxPacketCount(numDevices, deviceHandles); maxKernelCount = driver.getEventMaxKernelCount(numDevices, deviceHandles); } - setEventSize(static_cast(alignUp(eventPackets * gfxCoreHelper.getSingleTimestampPacketSize(), eventAlignment))); - eventPoolSize = alignUp(this->numEvents * eventSize, MemoryConstants::pageSize64k); + auto eventSize = eventPackets * gfxCoreHelper.getSingleTimestampPacketSize(); + if (eventPoolFlags & ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP) { + eventSize += sizeof(NEO::TimeStampData); + } + + setEventSize(static_cast(alignUp(eventSize, eventAlignment))); + + eventPoolSize = alignUp(this->numEvents * this->eventSize, MemoryConstants::pageSize64k); } EventPool *EventPool::create(DriverHandle *driver, Context *context, uint32_t numDevices, ze_device_handle_t *deviceHandles, const ze_event_pool_desc_t *desc, ze_result_t &result) { @@ -311,7 +317,7 @@ ze_result_t Event::openCounterBasedIpcHandle(const IpcCounterBasedEventData &ipc ipcData.signalScopeFlags, // signalScope ipcData.waitScopeFlags, // waitScope false, // timestampPool - false, // kerneMappedTsPoolFlag + false, // kernelMappedTsPoolFlag true, // importedIpcPool false, // ipcPool }; @@ -385,6 +391,7 @@ ze_result_t EventPool::getIpcHandle(ze_ipc_event_pool_handle_t *ipcHandle) { poolData.isImplicitScalingCapable = this->isImplicitScalingCapable; poolData.maxEventPackets = this->getEventMaxPackets(); poolData.numDevices = static_cast(this->devices.size()); + poolData.isEventPoolKernelMappedTsFlagSet = this->isEventPoolKernelMappedTsFlagSet(); auto memoryManager = this->context->getDriverHandle()->getMemoryManager(); auto allocation = this->eventPoolAllocations->getDefaultGraphicsAllocation(); @@ -402,6 +409,9 @@ ze_result_t EventPool::openEventPoolIpcHandle(const ze_ipc_event_pool_handle_t & const IpcEventPoolData &poolData = *reinterpret_cast(ipcEventPoolHandle.data); ze_event_pool_desc_t desc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC}; + if (poolData.isEventPoolKernelMappedTsFlagSet) { + desc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP; + } desc.count = static_cast(poolData.numEvents); auto eventPool = std::make_unique(&desc); eventPool->isDeviceEventPoolAllocation = poolData.isDeviceEventPoolAllocation; @@ -640,10 +650,11 @@ void Event::unsetCmdQueue() { } void Event::setReferenceTs(uint64_t currentCpuTimeStamp) { + NEO::TimeStampData *referenceTs = static_cast(ptrOffset(getHostAddress(), maxPacketCount * singlePacketSize)); const auto recalculate = - (currentCpuTimeStamp - referenceTs.cpuTimeinNS) > timestampRefreshIntervalInNanoSec; - if (referenceTs.cpuTimeinNS == 0 || recalculate) { - device->getNEODevice()->getOSTime()->getGpuCpuTime(&referenceTs, true); + (currentCpuTimeStamp - referenceTs->cpuTimeinNS) > timestampRefreshIntervalInNanoSec; + if (referenceTs->cpuTimeinNS == 0 || recalculate) { + device->getNEODevice()->getOSTime()->getGpuCpuTime(referenceTs, true); } } diff --git a/level_zero/core/source/event/event.h b/level_zero/core/source/event/event.h index 3cc4d23714..aa827db007 100644 --- a/level_zero/core/source/event/event.h +++ b/level_zero/core/source/event/event.h @@ -8,6 +8,7 @@ #pragma once #include "shared/source/helpers/common_types.h" #include "shared/source/helpers/constants.h" +#include "shared/source/helpers/ptr_math.h" #include "shared/source/helpers/timestamp_packet_constants.h" #include "shared/source/helpers/timestamp_packet_container.h" #include "shared/source/memory_manager/multi_graphics_allocation.h" @@ -61,6 +62,7 @@ struct IpcEventPoolData { bool isDeviceEventPoolAllocation = false; bool isHostVisibleEventPoolAllocation = false; bool isImplicitScalingCapable = false; + bool isEventPoolKernelMappedTsFlagSet = false; }; struct IpcCounterBasedEventData { @@ -95,7 +97,7 @@ struct EventDescriptor { uint32_t signalScope = 0; uint32_t waitScope = 0; bool timestampPool = false; - bool kerneMappedTsPoolFlag = false; + bool kernelMappedTsPoolFlag = false; bool importedIpcPool = false; bool ipcPool = false; }; @@ -303,11 +305,11 @@ struct Event : _ze_event_handle_t { uint32_t getInOrderAllocationOffset() const { return inOrderAllocationOffset; } void setLatestUsedCmdQueue(CommandQueue *newCmdQ); NEO::TimeStampData *peekReferenceTs() { - return &referenceTs; + return static_cast(ptrOffset(getHostAddress(), getMaxPacketsCount() * getSinglePacketSize())); } void setReferenceTs(uint64_t currentCpuTimeStamp); const CommandQueue *getLatestUsedCmdQueue() const { return latestUsedCmdQueue; } - bool hasKerneMappedTsCapability = false; + bool hasKernelMappedTsCapability = false; std::shared_ptr &getInOrderExecInfo() { return inOrderExecInfo; } void enableKmdWaitMode() { kmdWaitMode = true; } void enableInterruptMode() { interruptMode = true; } @@ -348,7 +350,6 @@ struct Event : _ze_event_handle_t { uint64_t globalEndTS = 1; uint64_t contextStartTS = 1; uint64_t contextEndTS = 1; - NEO::TimeStampData referenceTs{}; uint64_t inOrderExecSignalValue = 0; uint32_t inOrderAllocationOffset = 0; @@ -446,7 +447,7 @@ struct EventPool : _ze_event_pool_handle_t { return false; } - bool isEventPoolKerneMappedTsFlagSet() const { + bool isEventPoolKernelMappedTsFlagSet() const { if (eventPoolFlags & ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP) { return true; } diff --git a/level_zero/core/source/event/event_impl.inl b/level_zero/core/source/event/event_impl.inl index c4edbeb82b..0db8c98e8b 100644 --- a/level_zero/core/source/event/event_impl.inl +++ b/level_zero/core/source/event/event_impl.inl @@ -41,7 +41,7 @@ Event *Event::create(const EventDescriptor &eventDescriptor, Device *device, ze_ event->setEventTimestampFlag(true); event->setSinglePacketSize(NEO::TimestampPackets::getSinglePacketSize()); } - event->hasKerneMappedTsCapability = eventDescriptor.kerneMappedTsPoolFlag; + event->hasKernelMappedTsCapability = eventDescriptor.kernelMappedTsPoolFlag; event->signalAllEventPackets = L0GfxCoreHelper::useSignalAllEventPackets(hwInfo); @@ -118,19 +118,19 @@ Event *Event::create(const EventDescriptor &eventDescriptor, Device *device, ze_ template Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *device) { EventDescriptor eventDescriptor = { - &eventPool->getAllocation(), // eventPoolAllocation - desc->pNext, // extensions - eventPool->getEventSize(), // totalEventSize - eventPool->getMaxKernelCount(), // maxKernelCount - eventPool->getEventMaxPackets(), // maxPacketsCount - eventPool->getCounterBasedFlags(), // counterBasedFlags - desc->index, // index - desc->signal, // signalScope - desc->wait, // waitScope - eventPool->isEventPoolTimestampFlagSet(), // timestampPool - eventPool->isEventPoolKerneMappedTsFlagSet(), // kerneMappedTsPoolFlag - eventPool->getImportedIpcPool(), // importedIpcPool - eventPool->isIpcPoolFlagSet(), // ipcPool + &eventPool->getAllocation(), // eventPoolAllocation + desc->pNext, // extensions + eventPool->getEventSize(), // totalEventSize + eventPool->getMaxKernelCount(), // maxKernelCount + eventPool->getEventMaxPackets(), // maxPacketsCount + eventPool->getCounterBasedFlags(), // counterBasedFlags + desc->index, // index + desc->signal, // signalScope + desc->wait, // waitScope + eventPool->isEventPoolTimestampFlagSet(), // timestampPool + eventPool->isEventPoolKernelMappedTsFlagSet(), // kernelMappedTsPoolFlag + eventPool->getImportedIpcPool(), // importedIpcPool + eventPool->isIpcPoolFlagSet(), // ipcPool }; if (eventPool->getCounterBasedFlags() != 0 && standaloneInOrderTimestampAllocationEnabled()) { @@ -886,7 +886,8 @@ void EventImp::getSynchronizedKernelTimestamps(ze_synchronized_timesta return static_cast((deviceTs & maxClampedTsValue) * resolution); }; - auto deviceTsInNs = convertDeviceTsToNanoseconds(referenceTs.gpuTimeStamp); + NEO::TimeStampData *referenceTs = static_cast(ptrOffset(getHostAddress(), maxPacketCount * singlePacketSize)); + auto deviceTsInNs = convertDeviceTsToNanoseconds(referenceTs->gpuTimeStamp); auto getDuration = [&](uint64_t startTs, uint64_t endTs) { const uint64_t maxValue = maxKernelTsValue; @@ -901,7 +902,7 @@ void EventImp::getSynchronizedKernelTimestamps(ze_synchronized_timesta } }; - const auto &referenceHostTsInNs = referenceTs.cpuTimeinNS; + const auto &referenceHostTsInNs = referenceTs->cpuTimeinNS; // High Level Approach: // startTimeStamp = (referenceHostTsInNs - submitDeviceTs) + kernelDeviceTsStart @@ -955,7 +956,7 @@ ze_result_t EventImp::queryKernelTimestampsExt(Device *device, uint32_ ze_result_t status = queryTimestampsExp(device, pCount, pResults->pKernelTimestampsBuffer); - if (status == ZE_RESULT_SUCCESS && hasKerneMappedTsCapability) { + if (status == ZE_RESULT_SUCCESS && hasKernelMappedTsCapability) { getSynchronizedKernelTimestamps(pResults->pSynchronizedTimestampsBuffer, *pCount, pResults->pKernelTimestampsBuffer); } return status; diff --git a/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h b/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h index 280bce13d9..dfc1c0709f 100644 --- a/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/in_order_cmd_list_fixture.h @@ -98,7 +98,7 @@ struct InOrderCmdListFixture : public ::Test { 0, // signalScope 0, // waitScope false, // timestampPool - false, // kerneMappedTsPoolFlag + false, // kernelMappedTsPoolFlag false, // importedIpcPool false, // ipcPool }; diff --git a/level_zero/core/test/unit_tests/sources/event/test_event.cpp b/level_zero/core/test/unit_tests/sources/event/test_event.cpp index d723602fb2..166a188476 100644 --- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp +++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp @@ -2264,7 +2264,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithStaticPartitionOffThenQueryK DebugManagerStateRestore restore; NEO::debugManager.flags.EnableStaticPartitioning.set(0); - event->hasKerneMappedTsCapability = true; + event->hasKernelMappedTsCapability = true; std::vector kernelTsBuffer(2); ze_event_query_kernel_timestamps_results_ext_properties_t results{}; @@ -2282,7 +2282,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventStatusNotReadyThenQueryKernelTim DebugManagerStateRestore restore; NEO::debugManager.flags.EnableStaticPartitioning.set(0); - event->hasKerneMappedTsCapability = true; + event->hasKernelMappedTsCapability = true; std::vector kernelTsBuffer(2); ze_event_query_kernel_timestamps_results_ext_properties_t results{}; @@ -2297,43 +2297,48 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventStatusNotReadyThenQueryKernelTim TEST_F(EventqueryKernelTimestampsExt, givenEventWithMappedTimestampCapabilityWhenQueryKernelTimestampsExtIsCalledCorrectValuesAreReturned) { + struct MappedTimeStampData { + typename MockTimestampPackets32::Packet packetData[3]; + NEO::TimeStampData referenceTs{}; + } mappedTimeStampData; + auto &hwInfo = device->getNEODevice()->getHardwareInfo(); - typename MockTimestampPackets32::Packet packetData[3]; device->getNEODevice()->getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable.kernelTimestampValidBits = 32; event->setPacketsInUse(3u); - event->hasKerneMappedTsCapability = true; + event->hasKernelMappedTsCapability = true; const auto deviceTsFrequency = device->getNEODevice()->getDeviceInfo().profilingTimerResolution; const int64_t gpuReferenceTimeInNs = 2000; const int64_t cpuReferenceTimeInNs = 3000; const auto maxKernelTsValue = maxNBitValue(hwInfo.capabilityTable.kernelTimestampValidBits); - NEO::TimeStampData *referenceTs = event->peekReferenceTs(); - referenceTs->cpuTimeinNS = cpuReferenceTimeInNs; - referenceTs->gpuTimeStamp = static_cast(gpuReferenceTimeInNs / deviceTsFrequency); - auto timeToTimeStamp = [&](uint32_t timeInNs) { return static_cast(timeInNs / deviceTsFrequency); }; - packetData[0].contextStart = 50u; - packetData[0].contextEnd = 100u; - packetData[0].globalStart = timeToTimeStamp(4000u); - packetData[0].globalEnd = timeToTimeStamp(5000u); + mappedTimeStampData.packetData[0].contextStart = 50u; + mappedTimeStampData.packetData[0].contextEnd = 100u; + mappedTimeStampData.packetData[0].globalStart = timeToTimeStamp(4000u); + mappedTimeStampData.packetData[0].globalEnd = timeToTimeStamp(5000u); // Device Ts overflow case - packetData[1].contextStart = 20u; - packetData[1].contextEnd = 30u; - packetData[1].globalStart = timeToTimeStamp(500u); - packetData[1].globalEnd = timeToTimeStamp(1500u); + mappedTimeStampData.packetData[1].contextStart = 20u; + mappedTimeStampData.packetData[1].contextEnd = 30u; + mappedTimeStampData.packetData[1].globalStart = timeToTimeStamp(500u); + mappedTimeStampData.packetData[1].globalEnd = timeToTimeStamp(1500u); - packetData[2].contextStart = 20u; - packetData[2].contextEnd = 30u; - packetData[2].globalStart = timeToTimeStamp(5000u); - packetData[2].globalEnd = timeToTimeStamp(500u); + mappedTimeStampData.packetData[2].contextStart = 20u; + mappedTimeStampData.packetData[2].contextEnd = 30u; + mappedTimeStampData.packetData[2].globalStart = timeToTimeStamp(5000u); + mappedTimeStampData.packetData[2].globalEnd = timeToTimeStamp(500u); - event->hostAddressFromPool = packetData; + event->hostAddressFromPool = &mappedTimeStampData; + event->maxPacketCount = 3; uint32_t count = 0; + NEO::TimeStampData *referenceTs = event->peekReferenceTs(); + referenceTs->cpuTimeinNS = cpuReferenceTimeInNs; + referenceTs->gpuTimeStamp = static_cast(gpuReferenceTimeInNs / deviceTsFrequency); + EXPECT_EQ(ZE_RESULT_SUCCESS, event->queryKernelTimestampsExt(device, &count, nullptr)); EXPECT_EQ(count, 3u); @@ -2355,7 +2360,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithMappedTimestampCapabilityWhe EXPECT_LE(results.pSynchronizedTimestampsBuffer[0].global.kernelEnd, expectedGlobalEnd + errorOffset); auto expectedContextStart = expectedGlobalStart; - auto expectedContextEnd = expectedContextStart + (packetData[0].contextEnd - packetData[0].contextStart) * deviceTsFrequency; + auto expectedContextEnd = expectedContextStart + (mappedTimeStampData.packetData[0].contextEnd - mappedTimeStampData.packetData[0].contextStart) * deviceTsFrequency; EXPECT_GE(results.pSynchronizedTimestampsBuffer[0].context.kernelStart, expectedContextStart - errorOffset); EXPECT_LE(results.pSynchronizedTimestampsBuffer[0].context.kernelStart, expectedContextStart + errorOffset); EXPECT_GE(results.pSynchronizedTimestampsBuffer[0].context.kernelEnd, expectedContextEnd - errorOffset); @@ -2371,7 +2376,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithMappedTimestampCapabilityWhe EXPECT_LE(results.pSynchronizedTimestampsBuffer[1].global.kernelEnd, expectedGlobalEnd + errorOffset); expectedContextStart = expectedGlobalStart; - expectedContextEnd = expectedContextStart + (packetData[1].contextEnd - packetData[1].contextStart) * deviceTsFrequency; + expectedContextEnd = expectedContextStart + (mappedTimeStampData.packetData[1].contextEnd - mappedTimeStampData.packetData[1].contextStart) * deviceTsFrequency; EXPECT_GE(results.pSynchronizedTimestampsBuffer[1].context.kernelStart, expectedContextStart - errorOffset); EXPECT_LE(results.pSynchronizedTimestampsBuffer[1].context.kernelStart, expectedContextStart + errorOffset); EXPECT_GE(results.pSynchronizedTimestampsBuffer[1].context.kernelEnd, expectedContextEnd - errorOffset); @@ -2386,7 +2391,7 @@ TEST_F(EventqueryKernelTimestampsExt, givenEventWithMappedTimestampCapabilityWhe EXPECT_LE(results.pSynchronizedTimestampsBuffer[2].global.kernelEnd, expectedGlobalEnd + errorOffset); expectedContextStart = expectedGlobalStart; - expectedContextEnd = expectedContextStart + (packetData[2].contextEnd - packetData[1].contextStart) * deviceTsFrequency; + expectedContextEnd = expectedContextStart + (mappedTimeStampData.packetData[2].contextEnd - mappedTimeStampData.packetData[1].contextStart) * deviceTsFrequency; EXPECT_GE(results.pSynchronizedTimestampsBuffer[2].context.kernelStart, expectedContextStart - errorOffset); EXPECT_LE(results.pSynchronizedTimestampsBuffer[2].context.kernelStart, expectedContextStart + errorOffset); EXPECT_GE(results.pSynchronizedTimestampsBuffer[2].context.kernelEnd, expectedContextEnd - errorOffset);