Fix profiling in BCS split

-program profiling start after event waitlist

Resolves: NEO-7723

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2023-02-13 11:42:15 +00:00
committed by Compute-Runtime-Automation
parent 5ec9de90ee
commit 7ad78a28ff
3 changed files with 86 additions and 9 deletions

View File

@@ -396,8 +396,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size);
if (isSplitNeeded) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, true, relaxedOrderingDispatch, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, relaxedOrderingDispatch);
});
} else {
ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptr, srcptr, size, hSignalEvent,
@@ -432,7 +432,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch));
if (isSplitNeeded) {
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uint32_t, uint32_t>(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, true, relaxedOrderingDispatch, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uint32_t, uint32_t>(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
ze_copy_region_t dstRegionLocal = {};
ze_copy_region_t srcRegionLocal = {};
memcpy(&dstRegionLocal, dstRegion, sizeof(ze_copy_region_t));
@@ -443,7 +443,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
srcRegionLocal.width = static_cast<uint32_t>(sizeParam);
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, &dstRegionLocal, dstPitch, dstSlicePitch,
srcPtr, &srcRegionLocal, srcPitch, srcSlicePitch,
hSignalEventParam, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
hSignalEventParam, 0u, nullptr, relaxedOrderingDispatch);
});
} else {
ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, dstRegion, dstPitch, dstSlicePitch,
@@ -516,7 +516,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendPageFaultCopy(N
relaxedOrdering = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
uintptr_t dstAddress = static_cast<uintptr_t>(dstAllocation->getGpuAddress());
uintptr_t srcAddress = static_cast<uintptr_t>(srcAllocation->getGpuAddress());
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uintptr_t, uintptr_t>(this, dstAddress, srcAddress, size, nullptr, false, relaxedOrdering, [&](uintptr_t dstAddressParam, uintptr_t srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uintptr_t, uintptr_t>(this, dstAddress, srcAddress, size, nullptr, 0u, nullptr, false, relaxedOrdering, [&](uintptr_t dstAddressParam, uintptr_t srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
this->appendMemoryCopyBlit(dstAddressParam, dstAllocation, 0u,
srcAddressParam, srcAllocation, 0u,
sizeParam);

View File

@@ -59,15 +59,13 @@ struct BcsSplit {
K srcptr,
size_t size,
ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents,
bool performMigration,
bool hasRelaxedOrderingDependencies,
std::function<ze_result_t(T, K, size_t, ze_event_handle_t)> appendCall) {
ze_result_t result = ZE_RESULT_SUCCESS;
if (hSignalEvent) {
cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), true, true);
}
auto markerEventIndex = this->events.obtainForSplit(Context::fromHandle(cmdList->hContext), MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate<gfxCoreFamily>::GfxFamily::TimestampPacketType));
auto barrierRequired = cmdList->isBarrierRequired();
@@ -86,6 +84,11 @@ struct BcsSplit {
cmdList->addEventsToCmdList(1u, &barrierEventHandle, hasRelaxedOrderingDependencies, false);
}
cmdList->addEventsToCmdList(numWaitEvents, phWaitEvents, hasRelaxedOrderingDependencies, false);
if (hSignalEvent && i == 0u) {
cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), true, true);
}
auto localSize = totalSize / engineCount;
auto localDstPtr = ptrOffset(dstptr, size - totalSize);
auto localSrcPtr = ptrOffset(srcptr, size - totalSize);

View File

@@ -908,6 +908,80 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
context->freeMem(dstPtr);
}
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyWithProfilingEventThenSuccessIsReturnedAndMiSemaphoresProgrammedBeforeProfiling, IsXeHpcCore) {
using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
DebugManagerStateRestore restorer;
DebugManager.flags.SplitBcsCopy.set(1);
DebugManager.flags.EnableFlushTaskSubmission.set(0);
ze_result_t returnValue;
auto hwInfo = *NEO::defaultHwInfo;
hwInfo.featureTable.ftrBcsInfo = 0b111111111;
hwInfo.capabilityTable.blitterOperationsSupported = true;
auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo);
auto testL0Device = std::unique_ptr<L0::Device>(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue));
ze_command_queue_desc_t desc = {};
desc.ordinal = static_cast<uint32_t>(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy));
std::unique_ptr<L0::CommandList> commandList0(CommandList::createImmediate(productFamily,
testL0Device.get(),
&desc,
false,
NEO::EngineGroupType::Copy,
returnValue));
ASSERT_NE(nullptr, commandList0);
EXPECT_EQ(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs.size(), 4u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 0u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 0u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 0u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u);
constexpr size_t alignment = 4096u;
constexpr size_t size = 8 * MemoryConstants::megaByte;
void *srcPtr;
void *dstPtr;
ze_device_mem_alloc_desc_t deviceDesc = {};
context->allocDeviceMem(device->toHandle(),
&deviceDesc,
size, alignment, &srcPtr);
ze_host_mem_alloc_desc_t hostDesc = {};
context->allocHostMem(&hostDesc, size, alignment, &dstPtr);
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
eventPoolDesc.count = 1;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.wait = 0;
eventDesc.signal = 0;
std::unique_ptr<EventPool> eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
std::unique_ptr<Event> event = std::unique_ptr<Event>(Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));
auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, event->toHandle(), 0, nullptr, false);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 1u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 1u);
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 1u);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, commandList0->commandContainer.getCommandStream()->getCpuBase(), commandList0->commandContainer.getCommandStream()->getUsed()));
auto itor = find<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
EXPECT_NE(cmdList.end(), itor);
itor = find<MI_STORE_REGISTER_MEM *>(itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
context->freeMem(srcPtr);
context->freeMem(dstPtr);
}
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAllocateNewEventsForSplitThenEventsAreManagedProperly, IsXeHpcCore) {
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;