Add extended timestamp functionality to Event

Related-To: NEO-4584
Signed-off-by: Daria Hinz <daria.hinz@intel.com>
This commit is contained in:
Daria Hinz
2020-09-15 09:33:12 +02:00
committed by Compute-Runtime-Automation
parent 55fb319517
commit 1ef9a1c35f
21 changed files with 406 additions and 156 deletions

View File

@@ -17,6 +17,11 @@ enum class ImageType;
}
namespace L0 {
// Per-event record built on the host by appendQueryKernelTimestamps: one entry
// per queried event, filled from Event::getGpuAddress() and
// Event::getPacketsInUse(). The array of records is copied byte-wise into a
// graphics allocation (sizeof(EventData) * numEvents), so the layout must stay
// two consecutive 64-bit fields.
struct EventData {
    // GPU address of the event, as returned by Event::getGpuAddress().
    uint64_t address = 0u;
    // Number of timestamp packets in use by the event. Stored as 64 bits even
    // though Event::getPacketsInUse() returns uint32_t.
    uint64_t packetsInUse = 0u;
};
struct AlignedAllocationData {
uintptr_t alignedAllocationPtr = 0u;
size_t offset = 0u;
@@ -140,7 +145,8 @@ struct CommandListCoreFamily : CommandListImp {
uint64_t dstOffset, void *srcPtr,
NEO::GraphicsAllocation *srcPtrAlloc,
uint64_t srcOffset, uint32_t size,
uint32_t elementSize, Builtin builtin);
uint32_t elementSize, Builtin builtin,
ze_event_handle_t hSignalEvent);
MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyBlit(uintptr_t dstPtr,
NEO::GraphicsAllocation *dstPtrAlloc,
@@ -198,6 +204,7 @@ struct CommandListCoreFamily : CommandListImp {
const void **pRanges);
ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]);
void appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker);
void appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker);
void appendEventForProfilingCopyCommand(ze_event_handle_t hEvent, bool beforeWalker);
void appendSignalEventPostWalker(ze_event_handle_t hEvent);

View File

@@ -208,7 +208,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_hand
uint64_t baseAddr = event->getGpuAddress();
size_t eventOffset = 0;
if (event->isTimestampEvent) {
eventOffset = offsetof(KernelTimestampEvent, contextEnd);
eventOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
}
commandContainer.addToResidencyContainer(&event->getAllocation());
if (isCopyOnly()) {
@@ -650,7 +650,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(v
uint64_t srcOffset,
uint32_t size,
uint32_t elementSize,
Builtin builtin) {
Builtin builtin,
ze_event_handle_t hSignalEvent) {
auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();
@@ -679,7 +680,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(v
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
return CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(builtinFunction->toHandle(), &dispatchFuncArgs,
nullptr, 0, nullptr);
hSignalEvent, 0, nullptr);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -866,7 +867,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
reinterpret_cast<void *>(&srcAllocationStruct.alignedAllocationPtr),
srcAllocationStruct.alloc, srcAllocationStruct.offset,
static_cast<uint32_t>(leftSize), 1,
Builtin::CopyBufferToBufferSide);
Builtin::CopyBufferToBufferSide,
nullptr);
}
if (ret == ZE_RESULT_SUCCESS && middleSizeBytes) {
@@ -880,7 +882,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
srcAllocationStruct.alloc, leftSize + srcAllocationStruct.offset,
static_cast<uint32_t>(middleSizeBytes),
static_cast<uint32_t>(middleElSize),
Builtin::CopyBufferToBufferMiddle);
Builtin::CopyBufferToBufferMiddle,
nullptr);
}
if (ret == ZE_RESULT_SUCCESS && rightSize) {
@@ -893,11 +896,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
reinterpret_cast<void *>(&srcAllocationStruct.alignedAllocationPtr),
srcAllocationStruct.alloc, leftSize + middleSizeBytes + srcAllocationStruct.offset,
static_cast<uint32_t>(rightSize), 1u,
Builtin::CopyBufferToBufferSide);
Builtin::CopyBufferToBufferSide,
nullptr);
}
this->appendSignalEventPostWalker(hSignalEvent);
if (dstAllocationStruct.needsFlush && !isCopyOnly()) {
NEO::PipeControlArgs args(true);
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
@@ -1266,14 +1269,11 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingCopyCommand(ze
return;
}
commandContainer.addToResidencyContainer(&event->getAllocation());
auto baseAddr = event->getGpuAddress();
auto contextOffset = beforeWalker ? offsetof(KernelTimestampEvent, contextStart) : offsetof(KernelTimestampEvent, contextEnd);
auto globalOffset = beforeWalker ? offsetof(KernelTimestampEvent, globalStart) : offsetof(KernelTimestampEvent, globalEnd);
if (!beforeWalker) {
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, false, false);
}
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, ptrOffset(baseAddr, globalOffset));
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, ptrOffset(baseAddr, contextOffset));
appendWriteKernelTimestamp(hEvent, beforeWalker);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -1361,7 +1361,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
uint64_t baseAddr = event->getGpuAddress();
size_t eventSignalOffset = 0;
if (event->isTimestampEvent) {
eventSignalOffset = offsetof(KernelTimestampEvent, contextEnd);
eventSignalOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
}
if (isCopyOnly()) {
@@ -1396,7 +1396,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
gpuAddr = event->getGpuAddress();
if (event->isTimestampEvent) {
gpuAddr += offsetof(KernelTimestampEvent, contextEnd);
gpuAddr += offsetof(TimestampPacketStorage::Packet, contextEnd);
}
NEO::EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(),
gpuAddr,
@@ -1418,6 +1418,18 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
return ZE_RESULT_SUCCESS;
}
// Encodes two register-to-memory stores into the command stream that write
// REG_GLOBAL_TIMESTAMP_LDW and GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW into the
// event's timestamp packet.
//   hEvent       - event whose GPU address is the base of the stores; it is
//                  used without a null check in this function.
//   beforeWalker - true selects the globalStart/contextStart fields of the
//                  packet, false selects globalEnd/contextEnd.
// The global timestamp store is always encoded before the context one.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker) {
    using Packet = TimestampPacketStorage::Packet;

    const auto eventGpuAddress = Event::fromHandle(hEvent)->getGpuAddress();

    // Default to the "end" fields and switch to the "start" fields when the
    // timestamps are taken ahead of the walker.
    size_t globalFieldOffset = offsetof(Packet, globalEnd);
    size_t contextFieldOffset = offsetof(Packet, contextEnd);
    if (beforeWalker) {
        globalFieldOffset = offsetof(Packet, globalStart);
        contextFieldOffset = offsetof(Packet, contextStart);
    }

    NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(),
                                            REG_GLOBAL_TIMESTAMP_LDW,
                                            ptrOffset(eventGpuAddress, globalFieldOffset));
    NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(),
                                            GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW,
                                            ptrOffset(eventGpuAddress, contextFieldOffset));
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker) {
if (!hEvent) {
@@ -1433,29 +1445,16 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(ze_event_hand
}
commandContainer.addToResidencyContainer(&event->getAllocation());
auto baseAddr = event->getGpuAddress();
if (beforeWalker) {
auto contextStartAddr = baseAddr;
auto globalStartAddr = baseAddr + offsetof(KernelTimestampEvent, globalStart);
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(),
REG_GLOBAL_TIMESTAMP_LDW, globalStartAddr);
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(),
GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddr);
appendWriteKernelTimestamp(hEvent, beforeWalker);
} else {
auto contextEndAddr = baseAddr + offsetof(KernelTimestampEvent, contextEnd);
auto globalEndAddr = baseAddr + offsetof(KernelTimestampEvent, globalEnd);
NEO::PipeControlArgs args;
args.dcFlushEnable = true;
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(),
REG_GLOBAL_TIMESTAMP_LDW, globalEndAddr);
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(),
GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddr);
appendWriteKernelTimestamp(hEvent, beforeWalker);
args.dcFlushEnable = (!event->signalScope) ? false : true;
if (args.dcFlushEnable) {
@@ -1525,15 +1524,16 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
auto dstptrAllocationStruct = getAlignedAllocation(this->device, dstptr, sizeof(ze_kernel_timestamp_result_t) * numEvents);
commandContainer.addToResidencyContainer(dstptrAllocationStruct.alloc);
std::unique_ptr<uint64_t[]> timestampsAddress = std::make_unique<uint64_t[]>(numEvents);
std::unique_ptr<EventData[]> timestampsAddress = std::make_unique<EventData[]>(numEvents);
for (uint32_t i = 0u; i < numEvents; ++i) {
auto event = Event::fromHandle(phEvents[i]);
commandContainer.addToResidencyContainer(&event->getAllocation());
timestampsAddress[i] = event->getGpuAddress();
timestampsAddress[i].address = event->getGpuAddress();
timestampsAddress[i].packetsInUse = event->getPacketsInUse();
}
size_t alignedSize = alignUp<size_t>(sizeof(uint64_t) * numEvents, MemoryConstants::pageSize64k);
size_t alignedSize = alignUp<size_t>(sizeof(EventData) * numEvents, MemoryConstants::pageSize64k);
NEO::GraphicsAllocation::AllocationType allocationType = NEO::GraphicsAllocation::AllocationType::BUFFER;
NEO::AllocationProperties allocationProperties{device->getRootDeviceIndex(),
true,
@@ -1549,13 +1549,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendQueryKernelTimestamps(
commandContainer.addToResidencyContainer(timestampsGPUAddress);
commandContainer.getDeallocationContainer().push_back(timestampsGPUAddress);
bool result = device->getDriverHandle()->getMemoryManager()->copyMemoryToAllocation(timestampsGPUAddress, 0, timestampsAddress.get(), sizeof(uint64_t) * numEvents);
bool result = device->getDriverHandle()->getMemoryManager()->copyMemoryToAllocation(timestampsGPUAddress, 0, timestampsAddress.get(), sizeof(EventData) * numEvents);
UNRECOVERABLE_IF(!result);
Kernel *builtinFunction = nullptr;
auto useOnlyGlobalTimestamps = NEO::HwHelper::get(device->getHwInfo().platform.eRenderCoreFamily).useOnlyGlobalTimestamps() ? 1u : 0u;
auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();
if (pOffsets == nullptr) {

View File

@@ -121,6 +121,7 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *
if (eventPool->isEventPoolUsedForTimestamp) {
event->isTimestampEvent = true;
event->timestampsData = std::make_unique<TimestampPacketStorage>();
}
auto alloc = eventPool->getAllocation().getGraphicsAllocation(device->getNEODevice()->getRootDeviceIndex());
@@ -128,11 +129,9 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *
uint64_t baseHostAddr = reinterpret_cast<uint64_t>(alloc->getUnderlyingBuffer());
event->hostAddress = reinterpret_cast<void *>(baseHostAddr + (desc->index * eventPool->getEventSize()));
event->gpuAddress = alloc->getGpuAddress() + (desc->index * eventPool->getEventSize());
event->signalScope = desc->signal;
event->waitScope = desc->wait;
event->csr = static_cast<DeviceImp *>(device)->neoDevice->getDefaultEngine().commandStreamReceiver;
event->reset();
return event;
@@ -144,6 +143,49 @@ NEO::GraphicsAllocation &Event::getAllocation() {
return *eventImp->eventPool->getAllocation().getGraphicsAllocation(eventImp->device->getNEODevice()->getRootDeviceIndex());
}
// Reduces the per-packet timestamps held in timestampsData into the four
// event-level values: globalStartTS/contextStartTS become the smallest start
// value and globalEndTS/contextEndTS the largest end value across the first
// packetsInUse packets. Packet 0 always seeds the result.
// Always returns ZE_RESULT_SUCCESS.
// NOTE(review): timestampsData is dereferenced unconditionally; it is only
// allocated in Event::create for timestamp event pools - confirm callers
// never reach this for other events.
ze_result_t EventImp::calculateProfilingData() {
    const auto &firstPacket = timestampsData->packets[0];
    globalStartTS = firstPacket.globalStart;
    globalEndTS = firstPacket.globalEnd;
    contextStartTS = firstPacket.contextStart;
    contextEndTS = firstPacket.contextEnd;

    for (uint32_t packetId = 1u; packetId < packetsInUse; ++packetId) {
        const auto &current = timestampsData->packets[packetId];

        // Earliest start wins.
        globalStartTS = (current.globalStart < globalStartTS) ? current.globalStart : globalStartTS;
        contextStartTS = (current.contextStart < contextStartTS) ? current.contextStart : contextStartTS;

        // Latest end wins.
        contextEndTS = (current.contextEnd > contextEndTS) ? current.contextEnd : contextEndTS;
        globalEndTS = (current.globalEnd > globalEndTS) ? current.globalEnd : globalEndTS;
    }

    return ZE_RESULT_SUCCESS;
}
void EventImp::assignTimestampData(void *address) {
auto baseAddr = reinterpret_cast<uint64_t>(address);
auto copyData = [&](uint32_t &timestampField, auto tsAddr) {
memcpy_s(static_cast<void *>(&timestampField), sizeof(uint32_t), reinterpret_cast<void *>(tsAddr), sizeof(uint32_t));
};
for (uint32_t i = 0; i < packetsInUse; i++) {
auto &packet = timestampsData->packets[i];
copyData(packet.globalStart, baseAddr + offsetof(TimestampPacketStorage::Packet, globalStart));
copyData(packet.contextStart, baseAddr + offsetof(TimestampPacketStorage::Packet, contextStart));
copyData(packet.globalEnd, baseAddr + offsetof(TimestampPacketStorage::Packet, globalEnd));
copyData(packet.contextEnd, baseAddr + offsetof(TimestampPacketStorage::Packet, contextEnd));
baseAddr += sizeof(struct TimestampPacketStorage::Packet);
}
}
ze_result_t Event::destroy() {
delete this;
return ZE_RESULT_SUCCESS;
@@ -158,7 +200,7 @@ ze_result_t EventImp::queryStatus() {
this->csr->downloadAllocations();
if (isTimestampEvent) {
auto baseAddr = reinterpret_cast<uint64_t>(hostAddress);
auto timeStampAddress = baseAddr + offsetof(KernelTimestampEvent, contextEnd);
auto timeStampAddress = baseAddr + offsetof(TimestampPacketStorage::Packet, contextEnd);
hostAddr = reinterpret_cast<uint64_t *>(timeStampAddress);
}
memcpy_s(static_cast<void *>(&queryVal), sizeof(uint32_t), static_cast<void *>(hostAddr), sizeof(uint32_t));
@@ -172,16 +214,17 @@ ze_result_t EventImp::hostEventSetValueTimestamps(uint32_t eventVal) {
auto eventTsSetFunc = [&](auto tsAddr) {
auto tsptr = reinterpret_cast<void *>(tsAddr);
memcpy_s(tsptr, sizeof(uint32_t), static_cast<void *>(&eventVal), sizeof(uint32_t));
if (!signalScopeFlag) {
NEO::CpuIntrinsics::clFlush(tsptr);
}
};
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextStart));
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalStart));
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextEnd));
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalEnd));
eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, contextStart));
eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, globalStart));
eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, contextEnd));
eventTsSetFunc(baseAddr + offsetof(TimestampPacketStorage::Packet, globalEnd));
return ZE_RESULT_SUCCESS;
}
@@ -247,33 +290,30 @@ ze_result_t EventImp::reset() {
}
ze_result_t EventImp::queryKernelTimestamp(ze_kernel_timestamp_result_t *dstptr) {
auto baseAddr = reinterpret_cast<uint64_t>(hostAddress);
constexpr uint64_t tsMask = (1ull << 32) - 1;
uint64_t tsData = Event::STATE_INITIAL & tsMask;
ze_kernel_timestamp_result_t &result = *dstptr;
// Ensure timestamps have been written
if (queryStatus() != ZE_RESULT_SUCCESS) {
return ZE_RESULT_NOT_READY;
}
auto eventTsSetFunc = [&](auto tsAddr, uint64_t &timestampField) {
memcpy_s(static_cast<void *>(&tsData), sizeof(uint32_t), reinterpret_cast<void *>(tsAddr), sizeof(uint32_t));
assignTimestampData(hostAddress);
calculateProfilingData();
tsData &= tsMask;
memcpy_s(&(timestampField), sizeof(uint64_t), static_cast<void *>(&tsData), sizeof(uint64_t));
auto eventTsSetFunc = [&](uint64_t &timestampFieldToCopy, uint64_t &timestampFieldForWriting) {
memcpy_s(&(timestampFieldForWriting), sizeof(uint64_t), static_cast<void *>(&timestampFieldToCopy), sizeof(uint64_t));
};
if (!NEO::HwHelper::get(device->getHwInfo().platform.eRenderCoreFamily).useOnlyGlobalTimestamps()) {
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextStart), result.context.kernelStart);
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalStart), result.global.kernelStart);
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextEnd), result.context.kernelEnd);
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalEnd), result.global.kernelEnd);
eventTsSetFunc(contextStartTS, result.context.kernelStart);
eventTsSetFunc(globalStartTS, result.global.kernelStart);
eventTsSetFunc(contextEndTS, result.context.kernelEnd);
eventTsSetFunc(globalEndTS, result.global.kernelEnd);
} else {
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalStart), result.context.kernelStart);
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalStart), result.global.kernelStart);
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalEnd), result.context.kernelEnd);
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalEnd), result.global.kernelEnd);
eventTsSetFunc(globalStartTS, result.context.kernelStart);
eventTsSetFunc(globalStartTS, result.global.kernelStart);
eventTsSetFunc(globalEndTS, result.context.kernelEnd);
eventTsSetFunc(globalEndTS, result.global.kernelEnd);
}
return ZE_RESULT_SUCCESS;

View File

@@ -7,6 +7,8 @@
#pragma once
#include "shared/source/helpers/timestamp_packet.h"
#include "level_zero/core/source/cmdlist/cmdlist.h"
#include "level_zero/core/source/device/device.h"
#include "level_zero/core/source/driver/driver_handle.h"
@@ -20,6 +22,7 @@ namespace L0 {
typedef uint64_t FlushStamp;
struct EventPool;
struct MetricStreamer;
using TimestampPacketStorage = NEO::TimestampPackets<uint32_t>;
struct Event : _ze_event_handle_t {
virtual ~Event() = default;
@@ -29,7 +32,6 @@ struct Event : _ze_event_handle_t {
virtual ze_result_t queryStatus() = 0;
virtual ze_result_t reset() = 0;
virtual ze_result_t queryKernelTimestamp(ze_kernel_timestamp_result_t *dstptr) = 0;
enum State : uint32_t {
STATE_SIGNALED = 0u,
STATE_CLEARED = static_cast<uint32_t>(-1),
@@ -48,15 +50,22 @@ struct Event : _ze_event_handle_t {
void *hostAddress = nullptr;
uint64_t gpuAddress;
uint32_t getPacketsInUse() { return packetsInUse; }
ze_event_scope_flags_t signalScope = 0u;
ze_event_scope_flags_t waitScope = 0u;
bool isTimestampEvent = false;
std::unique_ptr<TimestampPacketStorage> timestampsData = nullptr;
uint64_t globalStartTS;
uint64_t globalEndTS;
uint64_t contextStartTS;
uint64_t contextEndTS;
uint32_t packetsInUse = 1;
// Metric streamer instance associated with the event.
MetricStreamer *metricStreamer = nullptr;
NEO::CommandStreamReceiver *csr = nullptr;
protected:
@@ -83,18 +92,13 @@ struct EventImp : public Event {
EventPool *eventPool;
protected:
ze_result_t calculateProfilingData();
ze_result_t hostEventSetValue(uint32_t eventValue);
ze_result_t hostEventSetValueTimestamps(uint32_t eventVal);
void assignTimestampData(void *address);
void makeAllocationResident();
};
struct KernelTimestampEvent {
uint32_t contextStart = Event::STATE_INITIAL;
uint32_t globalStart = Event::STATE_INITIAL;
uint32_t contextEnd = Event::STATE_INITIAL;
uint32_t globalEnd = Event::STATE_INITIAL;
};
struct EventPool : _ze_event_pool_handle_t {
static EventPool *create(DriverHandle *driver, uint32_t numDevices, ze_device_handle_t *phDevices, const ze_event_pool_desc_t *desc);
virtual ~EventPool() = default;
@@ -151,8 +155,8 @@ struct EventPoolImp : public EventPool {
size_t numEvents;
protected:
const uint32_t eventSize = static_cast<uint32_t>(alignUp(sizeof(struct KernelTimestampEvent),
MemoryConstants::cacheLineSize));
const uint32_t eventSize = static_cast<uint32_t>(NEO::TimestampPacketSizeControl::preferredPacketCount * alignUp(sizeof(struct TimestampPacketStorage::Packet),
MemoryConstants::cacheLineSize));
const uint32_t eventAlignment = MemoryConstants::cacheLineSize;
};