mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-05 18:06:32 +08:00
Add extended functionality for timestamps at Event
Related-To: NEO-4584 Signed-off-by: Daria Hinz <daria.hinz@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
55fb319517
commit
1ef9a1c35f
@@ -17,6 +17,11 @@ enum class ImageType;
|
||||
}
|
||||
|
||||
namespace L0 {
|
||||
// Per-event record built on the host by appendQueryKernelTimestamps and copied
// verbatim into a GPU-visible buffer. Field order and width are part of that
// buffer's layout, so keep both 64-bit members in this order.
struct EventData {
    uint64_t address = 0u;      // value of Event::getGpuAddress() for the event
    uint64_t packetsInUse = 0u; // value of Event::getPacketsInUse() for the event
};
|
||||
|
||||
struct AlignedAllocationData {
|
||||
uintptr_t alignedAllocationPtr = 0u;
|
||||
size_t offset = 0u;
|
||||
@@ -140,7 +145,8 @@ struct CommandListCoreFamily : CommandListImp {
|
||||
uint64_t dstOffset, void *srcPtr,
|
||||
NEO::GraphicsAllocation *srcPtrAlloc,
|
||||
uint64_t srcOffset, uint32_t size,
|
||||
uint32_t elementSize, Builtin builtin);
|
||||
uint32_t elementSize, Builtin builtin,
|
||||
ze_event_handle_t hSignalEvent);
|
||||
|
||||
MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyBlit(uintptr_t dstPtr,
|
||||
NEO::GraphicsAllocation *dstPtrAlloc,
|
||||
@@ -198,6 +204,7 @@ struct CommandListCoreFamily : CommandListImp {
|
||||
const void **pRanges);
|
||||
|
||||
ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]);
|
||||
void appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker);
|
||||
void appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker);
|
||||
void appendEventForProfilingCopyCommand(ze_event_handle_t hEvent, bool beforeWalker);
|
||||
void appendSignalEventPostWalker(ze_event_handle_t hEvent);
|
||||
|
||||
@@ -208,7 +208,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_hand
|
||||
uint64_t baseAddr = event->getGpuAddress();
|
||||
size_t eventOffset = 0;
|
||||
if (event->isTimestampEvent) {
|
||||
eventOffset = offsetof(KernelTimestampEvent, contextEnd);
|
||||
eventOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
|
||||
}
|
||||
commandContainer.addToResidencyContainer(&event->getAllocation());
|
||||
if (isCopyOnly()) {
|
||||
@@ -650,7 +650,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(v
|
||||
uint64_t srcOffset,
|
||||
uint32_t size,
|
||||
uint32_t elementSize,
|
||||
Builtin builtin) {
|
||||
Builtin builtin,
|
||||
ze_event_handle_t hSignalEvent) {
|
||||
|
||||
auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();
|
||||
|
||||
@@ -679,7 +680,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(v
|
||||
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
|
||||
|
||||
return CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(builtinFunction->toHandle(), &dispatchFuncArgs,
|
||||
nullptr, 0, nullptr);
|
||||
hSignalEvent, 0, nullptr);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
@@ -866,7 +867,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
|
||||
reinterpret_cast<void *>(&srcAllocationStruct.alignedAllocationPtr),
|
||||
srcAllocationStruct.alloc, srcAllocationStruct.offset,
|
||||
static_cast<uint32_t>(leftSize), 1,
|
||||
Builtin::CopyBufferToBufferSide);
|
||||
Builtin::CopyBufferToBufferSide,
|
||||
nullptr);
|
||||
}
|
||||
|
||||
if (ret == ZE_RESULT_SUCCESS && middleSizeBytes) {
|
||||
@@ -880,7 +882,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
|
||||
srcAllocationStruct.alloc, leftSize + srcAllocationStruct.offset,
|
||||
static_cast<uint32_t>(middleSizeBytes),
|
||||
static_cast<uint32_t>(middleElSize),
|
||||
Builtin::CopyBufferToBufferMiddle);
|
||||
Builtin::CopyBufferToBufferMiddle,
|
||||
nullptr);
|
||||
}
|
||||
|
||||
if (ret == ZE_RESULT_SUCCESS && rightSize) {
|
||||
@@ -893,11 +896,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
|
||||
reinterpret_cast<void *>(&srcAllocationStruct.alignedAllocationPtr),
|
||||
srcAllocationStruct.alloc, leftSize + middleSizeBytes + srcAllocationStruct.offset,
|
||||
static_cast<uint32_t>(rightSize), 1u,
|
||||
Builtin::CopyBufferToBufferSide);
|
||||
Builtin::CopyBufferToBufferSide,
|
||||
nullptr);
|
||||
}
|
||||
|
||||
this->appendSignalEventPostWalker(hSignalEvent);
|
||||
|
||||
if (dstAllocationStruct.needsFlush && !isCopyOnly()) {
|
||||
NEO::PipeControlArgs args(true);
|
||||
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
|
||||
@@ -1266,14 +1269,11 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingCopyCommand(ze
|
||||
return;
|
||||
}
|
||||
commandContainer.addToResidencyContainer(&event->getAllocation());
|
||||
auto baseAddr = event->getGpuAddress();
|
||||
auto contextOffset = beforeWalker ? offsetof(KernelTimestampEvent, contextStart) : offsetof(KernelTimestampEvent, contextEnd);
|
||||
auto globalOffset = beforeWalker ? offsetof(KernelTimestampEvent, globalStart) : offsetof(KernelTimestampEvent, globalEnd);
|
||||
|
||||
if (!beforeWalker) {
|
||||
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, false, false);
|
||||
}
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, ptrOffset(baseAddr, globalOffset));
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, ptrOffset(baseAddr, contextOffset));
|
||||
appendWriteKernelTimestamp(hEvent, beforeWalker);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
@@ -1361,7 +1361,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
|
||||
uint64_t baseAddr = event->getGpuAddress();
|
||||
size_t eventSignalOffset = 0;
|
||||
if (event->isTimestampEvent) {
|
||||
eventSignalOffset = offsetof(KernelTimestampEvent, contextEnd);
|
||||
eventSignalOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
|
||||
}
|
||||
|
||||
if (isCopyOnly()) {
|
||||
@@ -1396,7 +1396,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
|
||||
|
||||
gpuAddr = event->getGpuAddress();
|
||||
if (event->isTimestampEvent) {
|
||||
gpuAddr += offsetof(KernelTimestampEvent, contextEnd);
|
||||
gpuAddr += offsetof(TimestampPacketStorage::Packet, contextEnd);
|
||||
}
|
||||
NEO::EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(),
|
||||
gpuAddr,
|
||||
@@ -1418,6 +1418,18 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
void CommandListCoreFamily<gfxCoreFamily>::appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker) {
|
||||
auto event = Event::fromHandle(hEvent);
|
||||
|
||||
auto baseAddr = event->getGpuAddress();
|
||||
auto contextOffset = beforeWalker ? offsetof(TimestampPacketStorage::Packet, contextStart) : offsetof(TimestampPacketStorage::Packet, contextEnd);
|
||||
auto globalOffset = beforeWalker ? offsetof(TimestampPacketStorage::Packet, globalStart) : offsetof(TimestampPacketStorage::Packet, globalEnd);
|
||||
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, ptrOffset(baseAddr, globalOffset));
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, ptrOffset(baseAddr, contextOffset));
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker) {
|
||||
if (!hEvent) {
|
||||
@@ -1433,29 +1445,16 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(ze_event_hand
|
||||
}
|
||||
|
||||
commandContainer.addToResidencyContainer(&event->getAllocation());
|
||||
auto baseAddr = event->getGpuAddress();
|
||||
|
||||
if (beforeWalker) {
|
||||
auto contextStartAddr = baseAddr;
|
||||
auto globalStartAddr = baseAddr + offsetof(KernelTimestampEvent, globalStart);
|
||||
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(),
|
||||
REG_GLOBAL_TIMESTAMP_LDW, globalStartAddr);
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(),
|
||||
GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddr);
|
||||
|
||||
appendWriteKernelTimestamp(hEvent, beforeWalker);
|
||||
} else {
|
||||
auto contextEndAddr = baseAddr + offsetof(KernelTimestampEvent, contextEnd);
|
||||
auto globalEndAddr = baseAddr + offsetof(KernelTimestampEvent, globalEnd);
|
||||
|
||||
NEO::PipeControlArgs args;
|
||||
args.dcFlushEnable = true;
|
||||
|
||||
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
|
||||
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(),
|
||||
REG_GLOBAL_TIMESTAMP_LDW, globalEndAddr);
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(),
|
||||
GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddr);
|
||||
appendWriteKernelTimestamp(hEvent, beforeWalker);
|
||||
|
||||
args.dcFlushEnable = (!event->signalScope) ? false : true;
|
||||
if (args.dcFlushEnable) {
|
||||
@@ -1525,15 +1524,16 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
|
||||
auto dstptrAllocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(ze_kernel_timestamp_result_t) * numEvents);
|
||||
commandContainer.addToResidencyContainer(dstptrAllocationStruct.alloc);
|
||||
|
||||
std::unique_ptr<uint64_t[]> timestampsAddress = std::make_unique<uint64_t[]>(numEvents);
|
||||
std::unique_ptr<EventData[]> timestampsAddress = std::make_unique<EventData[]>(numEvents);
|
||||
|
||||
for (uint32_t i = 0u; i < numEvents; ++i) {
|
||||
auto event = Event::fromHandle(phEvents[i]);
|
||||
commandContainer.addToResidencyContainer(&event->getAllocation());
|
||||
timestampsAddress[i] = event->getGpuAddress();
|
||||
timestampsAddress[i].address = event->getGpuAddress();
|
||||
timestampsAddress[i].packetsInUse = event->getPacketsInUse();
|
||||
}
|
||||
|
||||
size_t alignedSize = alignUp<size_t>(sizeof(uint64_t) * numEvents, MemoryConstants::pageSize64k);
|
||||
size_t alignedSize = alignUp<size_t>(sizeof(EventData) * numEvents, MemoryConstants::pageSize64k);
|
||||
NEO::GraphicsAllocation::AllocationType allocationType = NEO::GraphicsAllocation::AllocationType::BUFFER;
|
||||
NEO::AllocationProperties allocationProperties{device->getRootDeviceIndex(),
|
||||
true,
|
||||
@@ -1549,13 +1549,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
|
||||
commandContainer.addToResidencyContainer(timestampsGPUAddress);
|
||||
commandContainer.getDeallocationContainer().push_back(timestampsGPUAddress);
|
||||
|
||||
bool result = device->getDriverHandle()->getMemoryManager()->copyMemoryToAllocation(timestampsGPUAddress, 0, timestampsAddress.get(), sizeof(uint64_t) * numEvents);
|
||||
bool result = device->getDriverHandle()->getMemoryManager()->copyMemoryToAllocation(timestampsGPUAddress, 0, timestampsAddress.get(), sizeof(EventData) * numEvents);
|
||||
|
||||
UNRECOVERABLE_IF(!result);
|
||||
|
||||
Kernel *builtinFunction = nullptr;
|
||||
auto useOnlyGlobalTimestamps = NEO::HwHelper::get(device->getHwInfo().platform.eRenderCoreFamily).useOnlyGlobalTimestamps() ? 1u : 0u;
|
||||
|
||||
auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();
|
||||
|
||||
if (pOffsets == nullptr) {
|
||||
|
||||
@@ -121,6 +121,7 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *
|
||||
|
||||
if (eventPool->isEventPoolUsedForTimestamp) {
|
||||
event->isTimestampEvent = true;
|
||||
event->timestampsData = std::make_unique<TimestampPacketStorage>();
|
||||
}
|
||||
|
||||
auto alloc = eventPool->getAllocation().getGraphicsAllocation(device->getNEODevice()->getRootDeviceIndex());
|
||||
@@ -128,11 +129,9 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *
|
||||
uint64_t baseHostAddr = reinterpret_cast<uint64_t>(alloc->getUnderlyingBuffer());
|
||||
event->hostAddress = reinterpret_cast<void *>(baseHostAddr + (desc->index * eventPool->getEventSize()));
|
||||
event->gpuAddress = alloc->getGpuAddress() + (desc->index * eventPool->getEventSize());
|
||||
|
||||
event->signalScope = desc->signal;
|
||||
event->waitScope = desc->wait;
|
||||
event->csr = static_cast<DeviceImp *>(device)->neoDevice->getDefaultEngine().commandStreamReceiver;
|
||||
|
||||
event->reset();
|
||||
|
||||
return event;
|
||||
@@ -144,6 +143,49 @@ NEO::GraphicsAllocation &Event::getAllocation() {
|
||||
return *eventImp->eventPool->getAllocation().getGraphicsAllocation(eventImp->device->getNEODevice()->getRootDeviceIndex());
|
||||
}
|
||||
|
||||
// Reduces the per-packet timestamps held in timestampsData to a single
// start/end pair for each of the global and context clocks:
//   *StartTS = smallest start over packets [0, packetsInUse)
//   *EndTS   = largest end over packets [0, packetsInUse)
// Packet 0 always seeds the result. Always returns ZE_RESULT_SUCCESS.
ze_result_t EventImp::calculateProfilingData() {
    const auto &seed = timestampsData->packets[0];
    globalStartTS = seed.globalStart;
    globalEndTS = seed.globalEnd;
    contextStartTS = seed.contextStart;
    contextEndTS = seed.contextEnd;

    for (uint32_t packetId = 1u; packetId < packetsInUse; ++packetId) {
        const auto &current = timestampsData->packets[packetId];

        // Earliest start wins.
        globalStartTS = (current.globalStart < globalStartTS) ? current.globalStart : globalStartTS;
        contextStartTS = (current.contextStart < contextStartTS) ? current.contextStart : contextStartTS;

        // Latest end wins.
        contextEndTS = (current.contextEnd > contextEndTS) ? current.contextEnd : contextEndTS;
        globalEndTS = (current.globalEnd > globalEndTS) ? current.globalEnd : globalEndTS;
    }

    return ZE_RESULT_SUCCESS;
}
|
||||
|
||||
void EventImp::assignTimestampData(void *address) {
|
||||
auto baseAddr = reinterpret_cast<uint64_t>(address);
|
||||
|
||||
auto copyData = [&](uint32_t ×tampField, auto tsAddr) {
|
||||
memcpy_s(static_cast<void *>(×tampField), sizeof(uint32_t), reinterpret_cast<void *>(tsAddr), sizeof(uint32_t));
|
||||
};
|
||||
|
||||
for (uint32_t i = 0; i < packetsInUse; i++) {
|
||||
|
||||
auto &packet = timestampsData->packets[i];
|
||||
copyData(packet.globalStart, baseAddr + offsetof(TimestampPacketStorage::Packet, globalStart));
|
||||
copyData(packet.contextStart, baseAddr + offsetof(TimestampPacketStorage::Packet, contextStart));
|
||||
copyData(packet.globalEnd, baseAddr + offsetof(TimestampPacketStorage::Packet, globalEnd));
|
||||
copyData(packet.contextEnd, baseAddr + offsetof(TimestampPacketStorage::Packet, contextEnd));
|
||||
baseAddr += sizeof(struct TimestampPacketStorage::Packet);
|
||||
}
|
||||
}
|
||||
|
||||
ze_result_t Event::destroy() {
|
||||
delete this;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
@@ -158,7 +200,7 @@ ze_result_t EventImp::queryStatus() {
|
||||
this->csr->downloadAllocations();
|
||||
if (isTimestampEvent) {
|
||||
auto baseAddr = reinterpret_cast<uint64_t>(hostAddress);
|
||||
auto timeStampAddress = baseAddr + offsetof(KernelTimestampEvent, contextEnd);
|
||||
auto timeStampAddress = baseAddr + offsetof(TimestampPacketStorage::Packet, contextEnd);
|
||||
hostAddr = reinterpret_cast<uint64_t *>(timeStampAddress);
|
||||
}
|
||||
memcpy_s(static_cast<void *>(&queryVal), sizeof(uint32_t), static_cast<void *>(hostAddr), sizeof(uint32_t));
|
||||
@@ -172,16 +214,17 @@ ze_result_t EventImp::hostEventSetValueTimestamps(uint32_t eventVal) {
|
||||
|
||||
auto eventTsSetFunc = [&](auto tsAddr) {
|
||||
auto tsptr = reinterpret_cast<void *>(tsAddr);
|
||||
|
||||
memcpy_s(tsptr, sizeof(uint32_t), static_cast<void *>(&eventVal), sizeof(uint32_t));
|
||||
if (!signalScopeFlag) {
|
||||
NEO::CpuIntrinsics::clFlush(tsptr);
|
||||
}
|
||||
};
|
||||
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextStart));
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalStart));
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextEnd));
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalEnd));
|
||||
eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, contextStart));
|
||||
eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, globalStart));
|
||||
eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, contextEnd));
|
||||
eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, globalEnd));
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
@@ -247,33 +290,30 @@ ze_result_t EventImp::reset() {
|
||||
}
|
||||
|
||||
ze_result_t EventImp::queryKernelTimestamp(ze_kernel_timestamp_result_t *dstptr) {
|
||||
auto baseAddr = reinterpret_cast<uint64_t>(hostAddress);
|
||||
constexpr uint64_t tsMask = (1ull << 32) - 1;
|
||||
uint64_t tsData = Event::STATE_INITIAL & tsMask;
|
||||
|
||||
ze_kernel_timestamp_result_t &result = *dstptr;
|
||||
|
||||
// Ensure timestamps have been written
|
||||
if (queryStatus() != ZE_RESULT_SUCCESS) {
|
||||
return ZE_RESULT_NOT_READY;
|
||||
}
|
||||
|
||||
auto eventTsSetFunc = [&](auto tsAddr, uint64_t ×tampField) {
|
||||
memcpy_s(static_cast<void *>(&tsData), sizeof(uint32_t), reinterpret_cast<void *>(tsAddr), sizeof(uint32_t));
|
||||
assignTimestampData(hostAddress);
|
||||
calculateProfilingData();
|
||||
|
||||
tsData &= tsMask;
|
||||
memcpy_s(&(timestampField), sizeof(uint64_t), static_cast<void *>(&tsData), sizeof(uint64_t));
|
||||
auto eventTsSetFunc = [&](uint64_t ×tampFieldToCopy, uint64_t ×tampFieldForWriting) {
|
||||
memcpy_s(&(timestampFieldForWriting), sizeof(uint64_t), static_cast<void *>(×tampFieldToCopy), sizeof(uint64_t));
|
||||
};
|
||||
|
||||
if (!NEO::HwHelper::get(device->getHwInfo().platform.eRenderCoreFamily).useOnlyGlobalTimestamps()) {
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextStart), result.context.kernelStart);
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalStart), result.global.kernelStart);
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextEnd), result.context.kernelEnd);
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalEnd), result.global.kernelEnd);
|
||||
eventTsSetFunc(contextStartTS, result.context.kernelStart);
|
||||
eventTsSetFunc(globalStartTS, result.global.kernelStart);
|
||||
eventTsSetFunc(contextEndTS, result.context.kernelEnd);
|
||||
eventTsSetFunc(globalEndTS, result.global.kernelEnd);
|
||||
} else {
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalStart), result.context.kernelStart);
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalStart), result.global.kernelStart);
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalEnd), result.context.kernelEnd);
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalEnd), result.global.kernelEnd);
|
||||
eventTsSetFunc(globalStartTS, result.context.kernelStart);
|
||||
eventTsSetFunc(globalStartTS, result.global.kernelStart);
|
||||
eventTsSetFunc(globalEndTS, result.context.kernelEnd);
|
||||
eventTsSetFunc(globalEndTS, result.global.kernelEnd);
|
||||
}
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
|
||||
@@ -7,6 +7,8 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared/source/helpers/timestamp_packet.h"
|
||||
|
||||
#include "level_zero/core/source/cmdlist/cmdlist.h"
|
||||
#include "level_zero/core/source/device/device.h"
|
||||
#include "level_zero/core/source/driver/driver_handle.h"
|
||||
@@ -20,6 +22,7 @@ namespace L0 {
|
||||
typedef uint64_t FlushStamp;
|
||||
struct EventPool;
|
||||
struct MetricStreamer;
|
||||
using TimestampPacketStorage = NEO::TimestampPackets<uint32_t>;
|
||||
|
||||
struct Event : _ze_event_handle_t {
|
||||
virtual ~Event() = default;
|
||||
@@ -29,7 +32,6 @@ struct Event : _ze_event_handle_t {
|
||||
virtual ze_result_t queryStatus() = 0;
|
||||
virtual ze_result_t reset() = 0;
|
||||
virtual ze_result_t queryKernelTimestamp(ze_kernel_timestamp_result_t *dstptr) = 0;
|
||||
|
||||
enum State : uint32_t {
|
||||
STATE_SIGNALED = 0u,
|
||||
STATE_CLEARED = static_cast<uint32_t>(-1),
|
||||
@@ -48,15 +50,22 @@ struct Event : _ze_event_handle_t {
|
||||
|
||||
void *hostAddress = nullptr;
|
||||
uint64_t gpuAddress;
|
||||
uint32_t getPacketsInUse() { return packetsInUse; }
|
||||
|
||||
ze_event_scope_flags_t signalScope = 0u;
|
||||
ze_event_scope_flags_t waitScope = 0u;
|
||||
|
||||
bool isTimestampEvent = false;
|
||||
|
||||
std::unique_ptr<TimestampPacketStorage> timestampsData = nullptr;
|
||||
uint64_t globalStartTS;
|
||||
uint64_t globalEndTS;
|
||||
uint64_t contextStartTS;
|
||||
uint64_t contextEndTS;
|
||||
|
||||
uint32_t packetsInUse = 1;
|
||||
|
||||
// Metric streamer instance associated with the event.
|
||||
MetricStreamer *metricStreamer = nullptr;
|
||||
|
||||
NEO::CommandStreamReceiver *csr = nullptr;
|
||||
|
||||
protected:
|
||||
@@ -83,18 +92,13 @@ struct EventImp : public Event {
|
||||
EventPool *eventPool;
|
||||
|
||||
protected:
|
||||
ze_result_t calculateProfilingData();
|
||||
ze_result_t hostEventSetValue(uint32_t eventValue);
|
||||
ze_result_t hostEventSetValueTimestamps(uint32_t eventVal);
|
||||
void assignTimestampData(void *address);
|
||||
void makeAllocationResident();
|
||||
};
|
||||
|
||||
struct KernelTimestampEvent {
|
||||
uint32_t contextStart = Event::STATE_INITIAL;
|
||||
uint32_t globalStart = Event::STATE_INITIAL;
|
||||
uint32_t contextEnd = Event::STATE_INITIAL;
|
||||
uint32_t globalEnd = Event::STATE_INITIAL;
|
||||
};
|
||||
|
||||
struct EventPool : _ze_event_pool_handle_t {
|
||||
static EventPool *create(DriverHandle *driver, uint32_t numDevices, ze_device_handle_t *phDevices, const ze_event_pool_desc_t *desc);
|
||||
virtual ~EventPool() = default;
|
||||
@@ -151,8 +155,8 @@ struct EventPoolImp : public EventPool {
|
||||
size_t numEvents;
|
||||
|
||||
protected:
|
||||
const uint32_t eventSize = static_cast<uint32_t>(alignUp(sizeof(struct KernelTimestampEvent),
|
||||
MemoryConstants::cacheLineSize));
|
||||
const uint32_t eventSize = static_cast<uint32_t>(NEO::TimestampPacketSizeControl::preferredPacketCount * alignUp(sizeof(struct TimestampPacketStorage::Packet),
|
||||
MemoryConstants::cacheLineSize));
|
||||
const uint32_t eventAlignment = MemoryConstants::cacheLineSize;
|
||||
};
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ extern bool verbose;
|
||||
bool verbose = false;
|
||||
|
||||
inline std::vector<uint8_t> loadBinaryFile(const std::string &filePath) {
|
||||
std::ifstream stream(filePath, std::ios::in);
|
||||
std::ifstream stream(filePath, std::ios::binary);
|
||||
if (!stream.good()) {
|
||||
std::cerr << "Failed to load binary file: " << filePath << " " << strerror(errno) << "\n";
|
||||
return {};
|
||||
|
||||
@@ -42,7 +42,7 @@ struct TimestampEvent : public Test<DeviceFixture> {
|
||||
};
|
||||
|
||||
GEN12LPTEST_F(TimestampEvent, givenEventTimestampsWhenQueryKernelTimestampThenCorrectDataAreSet) {
|
||||
KernelTimestampEvent data = {};
|
||||
TimestampPacketStorage::Packet data = {};
|
||||
data.contextStart = 1u;
|
||||
data.contextEnd = 2u;
|
||||
data.globalStart = 3u;
|
||||
@@ -50,6 +50,7 @@ GEN12LPTEST_F(TimestampEvent, givenEventTimestampsWhenQueryKernelTimestampThenCo
|
||||
|
||||
event->hostAddress = &data;
|
||||
|
||||
event->packetsInUse = 1;
|
||||
ze_kernel_timestamp_result_t result = {};
|
||||
|
||||
event->queryKernelTimestamp(&result);
|
||||
@@ -58,5 +59,31 @@ GEN12LPTEST_F(TimestampEvent, givenEventTimestampsWhenQueryKernelTimestampThenCo
|
||||
EXPECT_EQ(data.globalStart, result.global.kernelStart);
|
||||
EXPECT_EQ(data.globalEnd, result.global.kernelEnd);
|
||||
}
|
||||
|
||||
// Three packets are placed behind the event's host address; packet 1 carries
// both the smallest start values and the largest end values. After
// queryKernelTimestamp the context and global results are all expected to
// equal packet 1's global start/end.
// NOTE(review): context results matching the *global* values presumably
// reflects a global-timestamps-only path on GEN12LP — confirm.
GEN12LPTEST_F(TimestampEvent, givenEventMoreThanOneTimestampsPacketWhenQueryKernelTimestampThenCorrectCalculationAreMade) {
    constexpr uint32_t packetCount = 3u;

    // One row per packet: contextStart, contextEnd, globalStart, globalEnd.
    const uint32_t stamps[packetCount][4] = {
        {3u, 4u, 5u, 6u},
        {2u, 6u, 4u, 8u},
        {4u, 5u, 6u, 7u},
    };

    TimestampPacketStorage::Packet data[packetCount] = {};
    for (uint32_t packetId = 0u; packetId < packetCount; ++packetId) {
        data[packetId].contextStart = stamps[packetId][0];
        data[packetId].contextEnd = stamps[packetId][1];
        data[packetId].globalStart = stamps[packetId][2];
        data[packetId].globalEnd = stamps[packetId][3];
    }

    event->hostAddress = &data;
    event->packetsInUse = packetCount;

    ze_kernel_timestamp_result_t result = {};
    event->queryKernelTimestamp(&result);

    const auto &widestPacket = data[1];
    EXPECT_EQ(widestPacket.globalStart, result.context.kernelStart);
    EXPECT_EQ(widestPacket.globalEnd, result.context.kernelEnd);
    EXPECT_EQ(widestPacket.globalStart, result.global.kernelStart);
    EXPECT_EQ(widestPacket.globalEnd, result.global.kernelEnd);
}
|
||||
} // namespace ult
|
||||
} // namespace L0
|
||||
@@ -979,8 +979,8 @@ HWTEST2_F(CommandListCreate, givenCopyCommandListWhenProfilingBeforeCommandForCo
|
||||
|
||||
commandList->appendEventForProfilingCopyCommand(event->toHandle(), true);
|
||||
|
||||
auto contextOffset = offsetof(KernelTimestampEvent, contextStart);
|
||||
auto globalOffset = offsetof(KernelTimestampEvent, globalStart);
|
||||
auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextStart);
|
||||
auto globalOffset = offsetof(TimestampPacketStorage::Packet, globalStart);
|
||||
auto baseAddr = event->getGpuAddress();
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
|
||||
@@ -1012,8 +1012,8 @@ HWTEST2_F(CommandListCreate, givenCopyCommandListWhenProfilingAfterCommandForCop
|
||||
|
||||
commandList->appendEventForProfilingCopyCommand(event->toHandle(), false);
|
||||
|
||||
auto contextOffset = offsetof(KernelTimestampEvent, contextEnd);
|
||||
auto globalOffset = offsetof(KernelTimestampEvent, globalEnd);
|
||||
auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
|
||||
auto globalOffset = offsetof(TimestampPacketStorage::Packet, globalEnd);
|
||||
auto baseAddr = event->getGpuAddress();
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
|
||||
|
||||
@@ -42,7 +42,8 @@ class MockCommandListHw : public WhiteBox<::L0::CommandListCoreFamily<gfxCoreFam
|
||||
uint64_t srcOffset,
|
||||
uint32_t size,
|
||||
uint32_t elementSize,
|
||||
Builtin builtin) override {
|
||||
Builtin builtin,
|
||||
ze_event_handle_t hSignalEvent) override {
|
||||
appendMemoryCopyKernelWithGACalledTimes++;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -137,7 +137,7 @@ HWTEST2_F(CommandListAppendEventReset, givenTimestampEventUsedInResetThenPipeCon
|
||||
auto event = std::unique_ptr<L0::Event>(L0::Event::create(eventPool.get(), &eventDesc, device));
|
||||
|
||||
commandList->appendEventReset(event->toHandle());
|
||||
auto contextOffset = offsetof(KernelTimestampEvent, contextEnd);
|
||||
auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
|
||||
auto baseAddr = event->getGpuAddress();
|
||||
auto gpuAddress = ptrOffset(baseAddr, contextOffset);
|
||||
|
||||
|
||||
@@ -201,7 +201,7 @@ HWTEST2_F(CommandListAppendSignalEvent, givenTimestampEventUsedInSignalThenPipeC
|
||||
auto event = std::unique_ptr<L0::Event>(L0::Event::create(eventPool.get(), &eventDesc, device));
|
||||
|
||||
commandList->appendSignalEvent(event->toHandle());
|
||||
auto contextOffset = offsetof(KernelTimestampEvent, contextEnd);
|
||||
auto contextOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
|
||||
auto baseAddr = event->getGpuAddress();
|
||||
auto gpuAddress = ptrOffset(baseAddr, contextOffset);
|
||||
|
||||
|
||||
@@ -327,10 +327,10 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListWhenTimestampPassedToMemoryCopyR
|
||||
GenCmdList cmdList;
|
||||
|
||||
auto baseAddr = event->getGpuAddress();
|
||||
auto contextStartOffset = offsetof(KernelTimestampEvent, contextStart);
|
||||
auto globalStartOffset = offsetof(KernelTimestampEvent, globalStart);
|
||||
auto contextEndOffset = offsetof(KernelTimestampEvent, contextEnd);
|
||||
auto globalEndOffset = offsetof(KernelTimestampEvent, globalEnd);
|
||||
auto contextStartOffset = offsetof(TimestampPacketStorage::Packet, contextStart);
|
||||
auto globalStartOffset = offsetof(TimestampPacketStorage::Packet, globalStart);
|
||||
auto contextEndOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
|
||||
auto globalEndOffset = offsetof(TimestampPacketStorage::Packet, globalEnd);
|
||||
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
|
||||
cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed()));
|
||||
|
||||
@@ -43,8 +43,8 @@ TEST_F(EventPoolCreate, givenTimestampEventsThenEventSizeSufficientForAllKernelT
|
||||
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), 0, nullptr, &eventPoolDesc));
|
||||
ASSERT_NE(nullptr, eventPool);
|
||||
|
||||
uint32_t kernelTimestampsSize = static_cast<uint32_t>(alignUp(sizeof(struct KernelTimestampEvent),
|
||||
MemoryConstants::cacheLineSize));
|
||||
uint32_t kernelTimestampsSize = static_cast<uint32_t>(NEO::TimestampPacketSizeControl::preferredPacketCount *
|
||||
alignUp(sizeof(struct TimestampPacketStorage::Packet), MemoryConstants::cacheLineSize));
|
||||
EXPECT_EQ(kernelTimestampsSize, eventPool->getEventSize());
|
||||
}
|
||||
|
||||
@@ -213,7 +213,7 @@ TEST_F(TimestampEventCreate, givenTimestampEventThenAllocationsIsOfPacketTagBuff
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_GEN9_CORE, TimestampEventCreate, givenEventTimestampsWhenQueryKernelTimestampThenCorrectDataAreSet) {
|
||||
KernelTimestampEvent data = {};
|
||||
TimestampPacketStorage::Packet data = {};
|
||||
data.contextStart = 1u;
|
||||
data.contextEnd = 2u;
|
||||
data.globalStart = 3u;
|
||||
|
||||
Reference in New Issue
Block a user