mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-29 09:03:14 +08:00
Fix profiling in BCS split
- Program the profiling start after the event waitlist.
Resolves: NEO-7723
Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
5ec9de90ee
commit
7ad78a28ff
@@ -396,8 +396,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopy(
|
||||
auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size);
|
||||
if (isSplitNeeded) {
|
||||
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, true, relaxedOrderingDispatch, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
|
||||
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, void *, const void *>(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, [&](void *dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
|
||||
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptrParam, srcptrParam, sizeParam, hSignalEventParam, 0u, nullptr, relaxedOrderingDispatch);
|
||||
});
|
||||
} else {
|
||||
ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(dstptr, srcptr, size, hSignalEvent,
|
||||
@@ -432,7 +432,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
|
||||
auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch));
|
||||
if (isSplitNeeded) {
|
||||
relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uint32_t, uint32_t>(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, true, relaxedOrderingDispatch, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uint32_t, uint32_t>(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
|
||||
ze_copy_region_t dstRegionLocal = {};
|
||||
ze_copy_region_t srcRegionLocal = {};
|
||||
memcpy(&dstRegionLocal, dstRegion, sizeof(ze_copy_region_t));
|
||||
@@ -443,7 +443,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendMemoryCopyRegio
|
||||
srcRegionLocal.width = static_cast<uint32_t>(sizeParam);
|
||||
return CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, &dstRegionLocal, dstPitch, dstSlicePitch,
|
||||
srcPtr, &srcRegionLocal, srcPitch, srcSlicePitch,
|
||||
hSignalEventParam, numWaitEvents, phWaitEvents, relaxedOrderingDispatch);
|
||||
hSignalEventParam, 0u, nullptr, relaxedOrderingDispatch);
|
||||
});
|
||||
} else {
|
||||
ret = CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(dstPtr, dstRegion, dstPitch, dstSlicePitch,
|
||||
@@ -516,7 +516,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendPageFaultCopy(N
|
||||
relaxedOrdering = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event
|
||||
uintptr_t dstAddress = static_cast<uintptr_t>(dstAllocation->getGpuAddress());
|
||||
uintptr_t srcAddress = static_cast<uintptr_t>(srcAllocation->getGpuAddress());
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uintptr_t, uintptr_t>(this, dstAddress, srcAddress, size, nullptr, false, relaxedOrdering, [&](uintptr_t dstAddressParam, uintptr_t srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
|
||||
ret = static_cast<DeviceImp *>(this->device)->bcsSplit.appendSplitCall<gfxCoreFamily, uintptr_t, uintptr_t>(this, dstAddress, srcAddress, size, nullptr, 0u, nullptr, false, relaxedOrdering, [&](uintptr_t dstAddressParam, uintptr_t srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) {
|
||||
this->appendMemoryCopyBlit(dstAddressParam, dstAllocation, 0u,
|
||||
srcAddressParam, srcAllocation, 0u,
|
||||
sizeParam);
|
||||
|
||||
@@ -59,15 +59,13 @@ struct BcsSplit {
|
||||
K srcptr,
|
||||
size_t size,
|
||||
ze_event_handle_t hSignalEvent,
|
||||
uint32_t numWaitEvents,
|
||||
ze_event_handle_t *phWaitEvents,
|
||||
bool performMigration,
|
||||
bool hasRelaxedOrderingDependencies,
|
||||
std::function<ze_result_t(T, K, size_t, ze_event_handle_t)> appendCall) {
|
||||
ze_result_t result = ZE_RESULT_SUCCESS;
|
||||
|
||||
if (hSignalEvent) {
|
||||
cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), true, true);
|
||||
}
|
||||
|
||||
auto markerEventIndex = this->events.obtainForSplit(Context::fromHandle(cmdList->hContext), MemoryConstants::pageSize64k / sizeof(typename CommandListCoreFamilyImmediate<gfxCoreFamily>::GfxFamily::TimestampPacketType));
|
||||
|
||||
auto barrierRequired = cmdList->isBarrierRequired();
|
||||
@@ -86,6 +84,11 @@ struct BcsSplit {
|
||||
cmdList->addEventsToCmdList(1u, &barrierEventHandle, hasRelaxedOrderingDependencies, false);
|
||||
}
|
||||
|
||||
cmdList->addEventsToCmdList(numWaitEvents, phWaitEvents, hasRelaxedOrderingDependencies, false);
|
||||
if (hSignalEvent && i == 0u) {
|
||||
cmdList->appendEventForProfilingAllWalkers(Event::fromHandle(hSignalEvent), true, true);
|
||||
}
|
||||
|
||||
auto localSize = totalSize / engineCount;
|
||||
auto localDstPtr = ptrOffset(dstptr, size - totalSize);
|
||||
auto localSrcPtr = ptrOffset(srcptr, size - totalSize);
|
||||
|
||||
@@ -908,6 +908,80 @@ HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhe
|
||||
context->freeMem(dstPtr);
|
||||
}
|
||||
|
||||
// Checks the split-BCS copy path of an immediate copy command list when the
// copy signals a kernel-timestamp event:
//  - appendMemoryCopy returns ZE_RESULT_SUCCESS,
//  - each of the four split queues goes from task count 0 to task count 1,
//  - the command list's stream contains an MI_SEMAPHORE_WAIT and, searching
//    onward from it, an MI_STORE_REGISTER_MEM.
// Per the test name, the MI_STORE_REGISTER_MEM is presumably the profiling
// start programmed after the semaphore waits — confirm against appendSplitCall.
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAppendingMemoryCopyWithProfilingEventThenSuccessIsReturnedAndMiSemaphoresProgrammedBeforeProfiling, IsXeHpcCore) {
|
||||
using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
|
||||
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
|
||||
|
||||
// Debug flags are overridden for this test only; `restorer` is declared first,
// presumably to restore them on scope exit — verify against DebugManagerStateRestore.
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.SplitBcsCopy.set(1);
|
||||
DebugManager.flags.EnableFlushTaskSubmission.set(0);
|
||||
|
||||
ze_result_t returnValue;
|
||||
// Start from a copy of the default hardware info and force blitter support on.
auto hwInfo = *NEO::defaultHwInfo;
|
||||
// Sets the nine low bits; presumably one bit per available copy engine — TODO confirm.
hwInfo.featureTable.ftrBcsInfo = 0b111111111;
|
||||
hwInfo.capabilityTable.blitterOperationsSupported = true;
|
||||
auto testNeoDevice = NEO::MockDevice::createWithNewExecutionEnvironment<NEO::MockDevice>(&hwInfo);
|
||||
auto testL0Device = std::unique_ptr<L0::Device>(L0::Device::create(driverHandle.get(), testNeoDevice, false, &returnValue));
|
||||
|
||||
ze_command_queue_desc_t desc = {};
|
||||
// Target the engine group index that the device reports for the Copy group.
desc.ordinal = static_cast<uint32_t>(testNeoDevice->getEngineGroupIndexFromEngineGroupType(NEO::EngineGroupType::Copy));
|
||||
|
||||
std::unique_ptr<L0::CommandList> commandList0(CommandList::createImmediate(productFamily,
|
||||
testL0Device.get(),
|
||||
&desc,
|
||||
false,
|
||||
NEO::EngineGroupType::Copy,
|
||||
returnValue));
|
||||
ASSERT_NE(nullptr, commandList0);
|
||||
// Precondition: four split queues exist and none of them has submitted work yet.
EXPECT_EQ(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs.size(), 4u);
|
||||
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 0u);
|
||||
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 0u);
|
||||
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 0u);
|
||||
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 0u);
|
||||
|
||||
constexpr size_t alignment = 4096u;
|
||||
constexpr size_t size = 8 * MemoryConstants::megaByte;
|
||||
void *srcPtr;
|
||||
void *dstPtr;
|
||||
// NOTE(review): the source buffer is allocated on `device` (not testL0Device)
// and the allocation return codes are not checked — confirm this is intended.
ze_device_mem_alloc_desc_t deviceDesc = {};
|
||||
context->allocDeviceMem(device->toHandle(),
|
||||
&deviceDesc,
|
||||
size, alignment, &srcPtr);
|
||||
ze_host_mem_alloc_desc_t hostDesc = {};
|
||||
context->allocHostMem(&hostDesc, size, alignment, &dstPtr);
|
||||
|
||||
// Single-entry, host-visible event pool with the kernel-timestamp flag set.
ze_event_pool_desc_t eventPoolDesc = {};
|
||||
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
|
||||
eventPoolDesc.count = 1;
|
||||
|
||||
ze_event_desc_t eventDesc = {};
|
||||
eventDesc.index = 0;
|
||||
eventDesc.wait = 0;
|
||||
eventDesc.signal = 0;
|
||||
|
||||
std::unique_ptr<EventPool> eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
|
||||
std::unique_ptr<Event> event = std::unique_ptr<Event>(Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));
|
||||
|
||||
// Copy with the timestamp event as signal event and an empty wait list.
auto result = commandList0->appendMemoryCopy(dstPtr, srcPtr, size, event->toHandle(), 0, nullptr, false);
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
// Every split queue is expected to have taken exactly one submission.
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[0])->getTaskCount(), 1u);
|
||||
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[1])->getTaskCount(), 1u);
|
||||
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[2])->getTaskCount(), 1u);
|
||||
EXPECT_EQ(static_cast<CommandQueueImp *>(static_cast<DeviceImp *>(testL0Device.get())->bcsSplit.cmdQs[3])->getTaskCount(), 1u);
|
||||
|
||||
// Parse the used portion of the command list's own command stream.
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, commandList0->commandContainer.getCommandStream()->getCpuBase(), commandList0->commandContainer.getCommandStream()->getUsed()));
|
||||
|
||||
auto itor = find<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
|
||||
// The search resumes from the semaphore found above, so this passes only when
// an MI_STORE_REGISTER_MEM appears after that MI_SEMAPHORE_WAIT in the stream.
itor = find<MI_STORE_REGISTER_MEM *>(itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
|
||||
context->freeMem(srcPtr);
|
||||
context->freeMem(dstPtr);
|
||||
}
|
||||
|
||||
HWTEST2_F(CommandQueueCommandsXeHpc, givenSplitBcsCopyAndImmediateCommandListWhenAllocateNewEventsForSplitThenEventsAreManagedProperly, IsXeHpcCore) {
|
||||
using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user