mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-05 09:09:04 +08:00
Fix event sizing for kernel timestamps
Standardize the format for kernel timestamps and the logic used to read the data.
Change-Id: I9418c2e09987dc778302026b705d056c84996983
Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@intel.com>
This commit is contained in:
committed by
sys_ocldev
parent
6ede3107ca
commit
b42d789e04
@@ -1242,7 +1242,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
|
||||
|
||||
gpuAddr = event->getGpuAddress();
|
||||
if (event->isTimestampEvent) {
|
||||
gpuAddr += event->getOffsetOfEventTimestampRegister(Event::CONTEXT_END);
|
||||
gpuAddr += offsetof(KernelTimestampEvent, contextEnd);
|
||||
}
|
||||
|
||||
NEO::HardwareCommandsHelper<GfxFamily>::programMiSemaphoreWait(*(commandContainer.getCommandStream()),
|
||||
|
||||
@@ -87,37 +87,34 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(ze_event_hand
|
||||
return;
|
||||
}
|
||||
|
||||
uint64_t timeStampAddress = 0;
|
||||
|
||||
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
|
||||
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
|
||||
|
||||
commandContainer.addToResidencyContainer(&event->getAllocation());
|
||||
auto baseAddr = event->getGpuAddress();
|
||||
|
||||
if (beforeWalker) {
|
||||
timeStampAddress = event->getGpuAddress() + event->getOffsetOfEventTimestampRegister(Event::GLOBAL_START);
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, timeStampAddress);
|
||||
auto contextStartAddr = baseAddr;
|
||||
auto globalStartAddr = baseAddr + offsetof(KernelTimestampEvent, globalStart);
|
||||
|
||||
timeStampAddress = event->getGpuAddress() + event->getOffsetOfEventTimestampRegister(Event::CONTEXT_START);
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timeStampAddress);
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, globalStartAddr);
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextStartAddr);
|
||||
} else {
|
||||
|
||||
timeStampAddress = event->getGpuAddress() + event->getOffsetOfEventTimestampRegister(Event::GLOBAL_END);
|
||||
NEO::PipeControlArgs args;
|
||||
args.dcFlushEnable = (event->signalScope == ZE_EVENT_SCOPE_FLAG_NONE) ? false : true;
|
||||
auto contextEndAddr = baseAddr + offsetof(KernelTimestampEvent, contextEnd);
|
||||
auto globalEndAddr = baseAddr + offsetof(KernelTimestampEvent, globalEnd);
|
||||
|
||||
if (isCopyOnlyCmdList) {
|
||||
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), timeStampAddress, 0llu, true, true);
|
||||
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), globalEndAddr, 0llu, true, true);
|
||||
} else {
|
||||
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
|
||||
*(commandContainer.getCommandStream()), POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_TIMESTAMP,
|
||||
timeStampAddress,
|
||||
0llu,
|
||||
device->getHwInfo(),
|
||||
args);
|
||||
NEO::PipeControlArgs args;
|
||||
args.dcFlushEnable = false;
|
||||
|
||||
timeStampAddress = event->getGpuAddress() + event->getOffsetOfEventTimestampRegister(Event::CONTEXT_END);
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timeStampAddress);
|
||||
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
|
||||
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, globalEndAddr);
|
||||
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, contextEndAddr);
|
||||
|
||||
args.dcFlushEnable = (event->signalScope == ZE_EVENT_SCOPE_FLAG_NONE) ? false : true;
|
||||
if (args.dcFlushEnable) {
|
||||
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
|
||||
}
|
||||
|
||||
@@ -51,7 +51,7 @@ struct EventImp : public Event {
|
||||
if (isTimestampEvent) {
|
||||
auto baseAddr = reinterpret_cast<uint64_t>(hostAddress);
|
||||
|
||||
auto timeStampAddress = baseAddr + getOffsetOfEventTimestampRegister(Event::CONTEXT_END);
|
||||
auto timeStampAddress = baseAddr + offsetof(KernelTimestampEvent, contextEnd);
|
||||
hostAddr = reinterpret_cast<uint64_t *>(timeStampAddress);
|
||||
}
|
||||
|
||||
@@ -80,10 +80,8 @@ struct EventPoolImp : public EventPool {
|
||||
pool[i] = EventPool::EVENT_STATE_INITIAL;
|
||||
}
|
||||
|
||||
auto timestampMultiplier = 1;
|
||||
if (flags & ZE_EVENT_POOL_FLAG_TIMESTAMP) {
|
||||
isEventPoolUsedForTimestamp = true;
|
||||
timestampMultiplier = numEventTimestampsToRead;
|
||||
}
|
||||
|
||||
ze_device_handle_t hDevice;
|
||||
@@ -98,7 +96,7 @@ struct EventPoolImp : public EventPool {
|
||||
device = Device::fromHandle(hDevice);
|
||||
|
||||
NEO::AllocationProperties properties(
|
||||
device->getRootDeviceIndex(), count * eventSize * timestampMultiplier,
|
||||
device->getRootDeviceIndex(), count * eventSize,
|
||||
NEO::GraphicsAllocation::AllocationType::BUFFER_HOST_MEMORY);
|
||||
properties.alignment = MemoryConstants::cacheLineSize;
|
||||
eventPoolAllocation = driver->getMemoryManager()->allocateGraphicsMemoryWithProperties(properties);
|
||||
@@ -142,8 +140,6 @@ struct EventPoolImp : public EventPool {
|
||||
|
||||
uint32_t getEventSize() override { return eventSize; }
|
||||
|
||||
uint32_t getNumEventTimestampsToRead() override { return numEventTimestampsToRead; }
|
||||
|
||||
ze_result_t destroyPool() {
|
||||
if (eventPoolUsedCount != 0) {
|
||||
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
|
||||
@@ -164,9 +160,8 @@ struct EventPoolImp : public EventPool {
|
||||
std::queue<int> lastEventPoolOffsetUsed;
|
||||
|
||||
protected:
|
||||
const uint32_t eventSize = 16u;
|
||||
const uint32_t eventSize = sizeof(struct KernelTimestampEvent);
|
||||
const uint32_t eventAlignment = MemoryConstants::cacheLineSize;
|
||||
const int32_t numEventTimestampsToRead = 4u;
|
||||
};
|
||||
|
||||
Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *device) {
|
||||
@@ -220,17 +215,22 @@ void EventImp::makeAllocationResident() {
|
||||
}
|
||||
|
||||
ze_result_t EventImp::hostEventSetValueTimestamps(uint32_t eventVal) {
|
||||
for (uint32_t i = 0; i < this->eventPool->getNumEventTimestampsToRead(); i++) {
|
||||
auto baseAddr = reinterpret_cast<uint64_t>(hostAddress);
|
||||
auto timeStampAddress = baseAddr + getOffsetOfEventTimestampRegister(i);
|
||||
auto tsptr = reinterpret_cast<uint64_t *>(timeStampAddress);
|
||||
|
||||
*(tsptr) = eventVal;
|
||||
auto baseAddr = reinterpret_cast<uint64_t>(hostAddress);
|
||||
auto signalScopeFlag = this->signalScope;
|
||||
|
||||
if (this->signalScope != ZE_EVENT_SCOPE_FLAG_NONE) {
|
||||
auto eventTsSetFunc = [&](auto tsAddr) {
|
||||
auto tsptr = reinterpret_cast<void *>(tsAddr);
|
||||
memcpy_s(tsptr, sizeof(uint32_t), static_cast<void *>(&eventVal), sizeof(uint32_t));
|
||||
if (signalScopeFlag != ZE_EVENT_SCOPE_FLAG_NONE) {
|
||||
NEO::CpuIntrinsics::clFlush(tsptr);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextStart));
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalStart));
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, contextEnd));
|
||||
eventTsSetFunc(baseAddr + offsetof(KernelTimestampEvent, globalEnd));
|
||||
|
||||
makeAllocationResident();
|
||||
|
||||
@@ -239,7 +239,7 @@ ze_result_t EventImp::hostEventSetValueTimestamps(uint32_t eventVal) {
|
||||
|
||||
ze_result_t EventImp::hostEventSetValue(uint32_t eventVal) {
|
||||
if (isTimestampEvent) {
|
||||
hostEventSetValueTimestamps(eventVal);
|
||||
return hostEventSetValueTimestamps(eventVal);
|
||||
}
|
||||
|
||||
auto hostAddr = static_cast<uint64_t *>(hostAddress);
|
||||
@@ -303,7 +303,7 @@ ze_result_t EventImp::reset() {
|
||||
|
||||
ze_result_t EventImp::getTimestamp(ze_event_timestamp_type_t timestampType, void *dstptr) {
|
||||
auto baseAddr = reinterpret_cast<uint64_t>(hostAddress);
|
||||
uint64_t *tsptr = nullptr;
|
||||
uint64_t tsAddr = 0u;
|
||||
constexpr uint64_t tsMask = (1ull << 32) - 1;
|
||||
uint64_t tsData = Event::STATE_INITIAL & tsMask;
|
||||
|
||||
@@ -316,17 +316,19 @@ ze_result_t EventImp::getTimestamp(ze_event_timestamp_type_t timestampType, void
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
if (timestampType == ZE_EVENT_TIMESTAMP_GLOBAL_START) {
|
||||
tsptr = reinterpret_cast<uint64_t *>(baseAddr + getOffsetOfEventTimestampRegister(Event::GLOBAL_START));
|
||||
} else if (timestampType == ZE_EVENT_TIMESTAMP_GLOBAL_END) {
|
||||
tsptr = reinterpret_cast<uint64_t *>(baseAddr + getOffsetOfEventTimestampRegister(Event::GLOBAL_END));
|
||||
} else if (timestampType == ZE_EVENT_TIMESTAMP_CONTEXT_START) {
|
||||
tsptr = reinterpret_cast<uint64_t *>(baseAddr + getOffsetOfEventTimestampRegister(Event::CONTEXT_START));
|
||||
if (timestampType == ZE_EVENT_TIMESTAMP_CONTEXT_START) {
|
||||
tsAddr = baseAddr + offsetof(KernelTimestampEvent, contextStart);
|
||||
} else if (timestampType == ZE_EVENT_TIMESTAMP_GLOBAL_START) {
|
||||
tsAddr = baseAddr + offsetof(KernelTimestampEvent, globalStart);
|
||||
} else if (timestampType == ZE_EVENT_TIMESTAMP_CONTEXT_END) {
|
||||
tsAddr = baseAddr + offsetof(KernelTimestampEvent, contextEnd);
|
||||
} else {
|
||||
tsptr = reinterpret_cast<uint64_t *>(baseAddr + getOffsetOfEventTimestampRegister(Event::CONTEXT_END));
|
||||
tsAddr = baseAddr + offsetof(KernelTimestampEvent, globalEnd);
|
||||
}
|
||||
|
||||
tsData = (*tsptr & tsMask);
|
||||
memcpy_s(static_cast<void *>(&tsData), sizeof(uint32_t), reinterpret_cast<void *>(tsAddr), sizeof(uint32_t));
|
||||
|
||||
tsData &= tsMask;
|
||||
memcpy_s(dstptr, sizeof(uint64_t), static_cast<void *>(&tsData), sizeof(uint64_t));
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
@@ -353,14 +355,9 @@ ze_result_t EventPoolImp::reserveEventFromPool(int index, Event *event) {
|
||||
lastEventPoolOffsetUsed.pop();
|
||||
}
|
||||
|
||||
auto timestampMultiplier = 1;
|
||||
if (static_cast<struct EventPool *>(this)->isEventPoolUsedForTimestamp) {
|
||||
timestampMultiplier = numEventTimestampsToRead;
|
||||
}
|
||||
|
||||
uint64_t baseHostAddr = reinterpret_cast<uint64_t>(eventPoolAllocation->getUnderlyingBuffer());
|
||||
event->hostAddress = reinterpret_cast<void *>(baseHostAddr + (event->offsetUsed * eventSize * timestampMultiplier));
|
||||
event->gpuAddress = eventPoolAllocation->getGpuAddress() + (event->offsetUsed * eventSize * timestampMultiplier);
|
||||
event->hostAddress = reinterpret_cast<void *>(baseHostAddr + (event->offsetUsed * eventSize));
|
||||
event->gpuAddress = eventPoolAllocation->getGpuAddress() + (event->offsetUsed * eventSize);
|
||||
|
||||
eventPoolUsedCount++;
|
||||
|
||||
|
||||
@@ -36,13 +36,6 @@ struct Event : _ze_event_handle_t {
|
||||
STATE_INITIAL = STATE_CLEARED
|
||||
};
|
||||
|
||||
enum EventTimestampRegister : uint32_t {
|
||||
GLOBAL_START = 0u,
|
||||
GLOBAL_END,
|
||||
CONTEXT_START,
|
||||
CONTEXT_END
|
||||
};
|
||||
|
||||
static Event *create(EventPool *eventPool, const ze_event_desc_t *desc, Device *device);
|
||||
|
||||
static Event *fromHandle(ze_event_handle_t handle) { return static_cast<Event *>(handle); }
|
||||
@@ -72,6 +65,13 @@ struct Event : _ze_event_handle_t {
|
||||
NEO::GraphicsAllocation *allocation = nullptr;
|
||||
};
|
||||
|
||||
// Storage layout of a single timestamp event: four 32-bit slots, each
// default-initialized to Event::STATE_INITIAL.
//
// Other code in this change addresses the slots as
// "event base address + offsetof(KernelTimestampEvent, <field>)" and uses
// sizeof(KernelTimestampEvent) as the per-event size in the event pool, so the
// order and width of the members are part of the contract.
struct KernelTimestampEvent {
|
||||
// First member; appendEventForProfiling stores GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW
// to the event base address (the address used for this slot) when beforeWalker is set.
uint32_t contextStart = Event::STATE_INITIAL;
|
||||
// Receives REG_GLOBAL_TIMESTAMP_LDW in appendEventForProfiling when beforeWalker is set.
uint32_t globalStart = Event::STATE_INITIAL;
|
||||
// Receives GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW on the non-copy path after the walker.
// For timestamp events this is also the slot whose address is used by
// appendWaitOnEvents and by the host-side status read.
uint32_t contextEnd = Event::STATE_INITIAL;
|
||||
// Receives REG_GLOBAL_TIMESTAMP_LDW after the walker; on copy-only command lists
// its address is passed to programMiFlushDw instead.
uint32_t globalEnd = Event::STATE_INITIAL;
|
||||
};
|
||||
|
||||
struct EventPool : _ze_event_pool_handle_t {
|
||||
static EventPool *create(DriverHandle *driver, uint32_t numDevices, ze_device_handle_t *phDevices, const ze_event_pool_desc_t *desc);
|
||||
virtual ~EventPool() = default;
|
||||
@@ -100,7 +100,6 @@ struct EventPool : _ze_event_pool_handle_t {
|
||||
virtual NEO::GraphicsAllocation &getAllocation() { return *eventPoolAllocation; }
|
||||
|
||||
virtual uint32_t getEventSize() = 0;
|
||||
virtual uint32_t getNumEventTimestampsToRead() = 0;
|
||||
|
||||
bool isEventPoolUsedForTimestamp = false;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user