Implementation of profiling for copy/fill APIs

Signed-off-by: Daria Hinz <daria.hinz@intel.com>
This commit is contained in:
Daria Hinz
2020-12-18 18:01:08 +01:00
committed by Compute-Runtime-Automation
parent 0e52c6b08d
commit 4591101541
8 changed files with 140 additions and 102 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -204,7 +204,7 @@ struct CommandListCoreFamily : CommandListImp {
ze_event_handle_t hEvent,
bool isIndirect,
bool isPredicate);
ze_result_t appendLaunchKernelSplit(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, ze_event_handle_t hEvent);
ze_result_t prepareIndirectParams(const ze_group_count_t *pThreadGroupDimensions);
void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes,
@@ -213,6 +213,7 @@ struct CommandListCoreFamily : CommandListImp {
ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]);
void appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker, bool maskLsb);
void appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker);
void appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker);
void appendEventForProfilingCopyCommand(ze_event_handle_t hEvent, bool beforeWalker);
void appendSignalEventPostWalker(ze_event_handle_t hEvent);
void programStateBaseAddress(NEO::CommandContainer &container, bool genericMediaStateClearRequired);
@@ -220,7 +221,7 @@ struct CommandListCoreFamily : CommandListImp {
uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region);
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize);
ze_result_t addEventsToCmdList(ze_event_handle_t hEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
};
template <PRODUCT_FAMILY gfxProductFamily>

View File

@@ -135,7 +135,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(ze_kernel_h
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents) {
ze_result_t ret = addEventsToCmdList(hEvent, numWaitEvents, phWaitEvents);
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
if (ret) {
return ret;
}
@@ -161,14 +161,13 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelIndirect(ze_
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents) {
ze_result_t ret = addEventsToCmdList(hEvent, numWaitEvents, phWaitEvents);
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
if (ret) {
return ret;
}
appendEventForProfiling(hEvent, true);
ret = appendLaunchKernelWithParams(hKernel, pDispatchArgumentsBuffer,
nullptr, true, false);
appendSignalEventPostWalker(hEvent);
return ret;
@@ -183,11 +182,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchMultipleKernelsInd
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents) {
ze_result_t ret = addEventsToCmdList(hEvent, numWaitEvents, phWaitEvents);
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
if (ret) {
return ret;
}
appendEventForProfiling(hEvent, true);
const bool haveLaunchArguments = pLaunchArgumentsBuffer != nullptr;
auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(pNumLaunchArguments);
auto alloc = allocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());
@@ -219,6 +218,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_hand
size_t eventOffset = 0;
if (event->isTimestampEvent) {
eventOffset = offsetof(TimestampPacketStorage::Packet, contextEnd);
event->resetPackets();
}
commandContainer.addToResidencyContainer(&event->getAllocation());
if (isCopyOnly()) {
@@ -243,10 +243,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents) {
ze_result_t ret = addEventsToCmdList(hSignalEvent, numWaitEvents, phWaitEvents);
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
if (ret) {
return ret;
}
appendEventForProfiling(hSignalEvent, true);
if (isCopyOnly()) {
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, false, false);
@@ -268,14 +269,14 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryRangesBarrier(uint
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents) {
ze_result_t ret = addEventsToCmdList(hSignalEvent, numWaitEvents, phWaitEvents);
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
if (ret) {
return ret;
}
appendEventForProfiling(hSignalEvent, true);
applyMemoryRangesBarrier(numRanges, pRangeSizes, pRanges);
this->appendSignalEventPostWalker(hSignalEvent);
appendSignalEventPostWalker(hSignalEvent);
if (this->cmdListType == CommandListType::TYPE_IMMEDIATE) {
executeCommandListImmediate(true);
@@ -624,8 +625,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendImageCopyRegion(ze_image
kernel->setArgumentValue(2, sizeof(srcOffset), &srcOffset);
kernel->setArgumentValue(3, sizeof(dstOffset), &dstOffset);
appendEventForProfiling(hEvent, true);
return CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(kernel->toHandle(), &functionArgs,
hEvent, numWaitEvents, phWaitEvents);
}
@@ -691,8 +690,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyKernelWithGA(v
uint32_t groups = (size + ((groupSizeX * elementSize) - 1)) / (groupSizeX * elementSize);
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
return CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(builtinFunction->toHandle(), &dispatchFuncArgs,
hSignalEvent, 0, nullptr);
return CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -751,11 +749,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(NEO
blitProperties.srcSize = srcSize;
blitProperties.dstSize = dstSize;
ze_result_t ret = addEventsToCmdList(hSignalEvent, numWaitEvents, phWaitEvents);
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
if (ret) {
return ret;
}
appendEventForProfiling(hSignalEvent, true);
if (copyOneCommand) {
NEO::BlitCommandsHelper<GfxFamily>::dispatchBlitCommandsRegion(blitProperties, *commandContainer.getCommandStream(), *device->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]);
} else {
@@ -869,11 +868,14 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
auto dstAllocationStruct = getAlignedAllocation(this->device, dstptr, size);
auto srcAllocationStruct = getAlignedAllocation(this->device, srcptr, size);
ze_result_t ret = addEventsToCmdList(hSignalEvent, numWaitEvents, phWaitEvents);
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
if (ret) {
return ret;
}
appendEventForProfilingAllWalkers(hSignalEvent, true);
if (ret == ZE_RESULT_SUCCESS && leftSize) {
ret = isCopyOnly() ? appendMemoryCopyBlit(dstAllocationStruct.alignedAllocationPtr,
dstAllocationStruct.alloc, dstAllocationStruct.offset,
@@ -885,7 +887,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
srcAllocationStruct.alloc, srcAllocationStruct.offset,
static_cast<uint32_t>(leftSize), 1,
Builtin::CopyBufferToBufferSide,
nullptr);
hSignalEvent);
}
if (ret == ZE_RESULT_SUCCESS && middleSizeBytes) {
@@ -900,7 +902,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
static_cast<uint32_t>(middleSizeBytes),
static_cast<uint32_t>(middleElSize),
Builtin::CopyBufferToBufferMiddle,
nullptr);
hSignalEvent);
}
if (ret == ZE_RESULT_SUCCESS && rightSize) {
@@ -914,10 +916,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
srcAllocationStruct.alloc, leftSize + middleSizeBytes + srcAllocationStruct.offset,
static_cast<uint32_t>(rightSize), 1u,
Builtin::CopyBufferToBufferSide,
nullptr);
hSignalEvent);
}
this->appendSignalEventPostWalker(hSignalEvent);
appendEventForProfilingAllWalkers(hSignalEvent, false);
if (dstAllocationStruct.needsFlush && !isCopyOnly()) {
NEO::PipeControlArgs args(true);
@@ -1133,7 +1135,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
return appendBlitFill(ptr, pattern, patternSize, size, hSignalEvent, numWaitEvents, phWaitEvents);
}
ze_result_t ret = addEventsToCmdList(hSignalEvent, numWaitEvents, phWaitEvents);
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
if (ret) {
return ret;
}
@@ -1207,11 +1209,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
builtinFunction->setArgumentValue(3, sizeof(srcOffset), &srcOffset);
}
appendEventForProfilingAllWalkers(hSignalEvent, true);
uint32_t groups = static_cast<uint32_t>(size) / groupSizeX;
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
ze_result_t res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(builtinFunction->toHandle(),
&dispatchFuncArgs, nullptr,
0, nullptr);
ze_result_t res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
if (res) {
return res;
@@ -1226,12 +1229,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset);
res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(builtinFunction->toHandle(),
&dispatchFuncArgs, nullptr,
0, nullptr);
res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
}
this->appendSignalEventPostWalker(hSignalEvent);
appendEventForProfilingAllWalkers(hSignalEvent, false);
if (hostPointerNeedsFlush) {
NEO::PipeControlArgs args(true);
@@ -1253,11 +1254,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBlitFill(void *ptr,
if (NEO::HwHelper::get(device->getHwInfo().platform.eRenderCoreFamily).getMaxFillPaternSizeForCopyEngine() < patternSize) {
return ZE_RESULT_ERROR_INVALID_SIZE;
} else {
ze_result_t ret = addEventsToCmdList(hSignalEvent, numWaitEvents, phWaitEvents);
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
if (ret) {
return ret;
}
appendEventForProfiling(hSignalEvent, true);
NEO::GraphicsAllocation *gpuAllocation = device->getDriverHandle()->getDriverSystemMemoryAllocation(ptr,
size,
neoDevice->getRootDeviceIndex(),
@@ -1289,6 +1290,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSignalEventPostWalker(ze_event_
CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(hEvent);
}
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingCopyCommand(ze_event_handle_t hEvent, bool beforeWalker) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
@@ -1363,8 +1365,7 @@ inline AlignedAllocationData CommandListCoreFamily<gfxCoreFamily>::getAlignedAll
}
template <GFXCORE_FAMILY gfxCoreFamily>
inline ze_result_t CommandListCoreFamily<gfxCoreFamily>::addEventsToCmdList(ze_event_handle_t hEvent,
uint32_t numWaitEvents,
inline ze_result_t CommandListCoreFamily<gfxCoreFamily>::addEventsToCmdList(uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents) {
if (numWaitEvents > 0) {
@@ -1375,8 +1376,6 @@ inline ze_result_t CommandListCoreFamily<gfxCoreFamily>::addEventsToCmdList(ze_e
}
}
appendEventForProfiling(hEvent, true);
return ZE_RESULT_SUCCESS;
}
@@ -1455,6 +1454,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWriteKernelTimestamp(ze_event_h
auto baseAddr = event->getGpuAddress();
auto contextOffset = beforeWalker ? offsetof(TimestampPacketStorage::Packet, contextStart) : offsetof(TimestampPacketStorage::Packet, contextEnd);
auto globalOffset = beforeWalker ? offsetof(TimestampPacketStorage::Packet, globalStart) : offsetof(TimestampPacketStorage::Packet, globalEnd);
if (maskLsb) {
NEO::EncodeMathMMIO<GfxFamily>::encodeBitwiseAndVal(commandContainer, REG_GLOBAL_TIMESTAMP_LDW, mask, ptrOffset(baseAddr, globalOffset));
NEO::EncodeMathMMIO<GfxFamily>::encodeBitwiseAndVal(commandContainer, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, mask, ptrOffset(baseAddr, contextOffset));
@@ -1462,6 +1462,10 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWriteKernelTimestamp(ze_event_h
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, ptrOffset(baseAddr, globalOffset));
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, ptrOffset(baseAddr, contextOffset));
}
if (beforeWalker) {
event->increasePacketsInUse();
}
}
template <GFXCORE_FAMILY gfxCoreFamily>

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -31,6 +31,22 @@ size_t CommandListCoreFamily<gfxCoreFamily>::getReserveSshSize() {
return helper.getRenderSurfaceStateSize();
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(ze_kernel_handle_t hKernel,
                                                                          const ze_group_count_t *pThreadGroupDimensions,
                                                                          ze_event_handle_t hEvent) {
    // Launches one kernel of a multi-kernel (split) operation by forwarding to
    // appendLaunchKernelWithParams.
    //
    // hKernel                - kernel to dispatch.
    // pThreadGroupDimensions - group counts for the dispatch.
    // hEvent                 - not used by this implementation; a null event handle
    //                          is forwarded to appendLaunchKernelWithParams instead.
    //
    // Returns whatever appendLaunchKernelWithParams returns.
    //
    // NOTE(review): the two flags below presumably map to the isIndirect / isPredicate
    // parameters of appendLaunchKernelWithParams - confirm against its declaration.
    constexpr bool indirectDispatch = false;
    constexpr bool predicatedDispatch = false;

    const ze_result_t launchResult = appendLaunchKernelWithParams(hKernel, pThreadGroupDimensions, nullptr,
                                                                  indirectDispatch, predicatedDispatch);
    return launchResult;
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(ze_event_handle_t hEvent, bool beforeWalker) {
    // Single entry point for the event handling that brackets a group of walkers.
    //
    // hEvent       - event handle passed through unchanged to the helper that is called.
    // beforeWalker - true:  call appendEventForProfiling(hEvent, true);
    //                false: call appendSignalEventPostWalker(hEvent).
    if (!beforeWalker) {
        appendSignalEventPostWalker(hEvent);
        return;
    }

    appendEventForProfiling(hEvent, true);
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(ze_kernel_handle_t hKernel,
const ze_group_count_t *pThreadGroupDimensions,
@@ -39,6 +55,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
bool isPredicate) {
const auto kernel = Kernel::fromHandle(hKernel);
UNRECOVERABLE_IF(kernel == nullptr);
appendEventForProfiling(hEvent, true);
const auto functionImmutableData = kernel->getImmutableData();
commandListPerThreadScratchSize = std::max<std::uint32_t>(commandListPerThreadScratchSize,
kernel->getImmutableData()->getDescriptor().kernelAttributes.perThreadScratchSize[0]);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -141,6 +141,10 @@ NEO::GraphicsAllocation &Event::getAllocation() {
return *eventImp->eventPool->getAllocation().getGraphicsAllocation(eventImp->device->getNEODevice()->getRootDeviceIndex());
}
uint64_t Event::getTimestampPacketAddress() {
    // Returns gpuAddress advanced by one TimestampPacketStorage::Packet for every
    // packet currently counted in packetsInUse.
    const auto bytesOccupiedByPackets = packetsInUse * sizeof(TimestampPacketStorage::Packet);
    return gpuAddress + bytesOccupiedByPackets;
}
ze_result_t EventImp::calculateProfilingData() {
globalStartTS = timestampsData->packets[0].globalStart;
globalEndTS = timestampsData->packets[0].globalEnd;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -46,11 +46,15 @@ struct Event : _ze_event_handle_t {
virtual NEO::GraphicsAllocation &getAllocation();
void increasePacketsInUse() { packetsInUse++; }
void resetPackets() { packetsInUse = 0; }
uint64_t getGpuAddress() { return gpuAddress; }
uint32_t getPacketsInUse() { return packetsInUse; }
uint64_t getTimestampPacketAddress();
void *hostAddress = nullptr;
uint64_t gpuAddress;
uint32_t getPacketsInUse() { return packetsInUse; }
uint32_t packetsInUse = 0;
ze_event_scope_flags_t signalScope = 0u;
ze_event_scope_flags_t waitScope = 0u;
@@ -62,8 +66,6 @@ struct Event : _ze_event_handle_t {
uint64_t contextStartTS;
uint64_t contextEndTS;
uint32_t packetsInUse = 1;
// Metric streamer instance associated with the event.
MetricStreamer *metricStreamer = nullptr;
NEO::CommandStreamReceiver *csr = nullptr;