feature: don't program pipe_control for in-order barrier profiling

Related-To: NEO-7966

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-09-25 13:46:59 +00:00
committed by Compute-Runtime-Automation
parent 202a33ffac
commit c115eeb108
11 changed files with 75 additions and 45 deletions

View File

@@ -280,9 +280,9 @@ struct CommandListCoreFamily : CommandListImp {
ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t *threadGroupDimensions);
void appendWriteKernelTimestamp(Event *event, bool beforeWalker, bool maskLsb, bool workloadPartition);
void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask, bool workloadPartition);
void appendEventForProfiling(Event *event, bool beforeWalker);
void appendEventForProfiling(Event *event, bool beforeWalker, bool skipBarrierForEndProfiling);
void appendEventForProfilingCopyCommand(Event *event, bool beforeWalker);
void appendSignalEventPostWalker(Event *event);
void appendSignalEventPostWalker(Event *event, bool skipBarrierForEndProfiling);
virtual void programStateBaseAddress(NEO::CommandContainer &container, bool useSbaProperties);
void appendComputeBarrierCommand();
NEO::PipeControlArgs createBarrierFlags();

View File

@@ -423,12 +423,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelIndirect(ze_
launchParams.isHostSignalScopeEvent = event->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST);
}
appendEventForProfiling(event, true);
appendEventForProfiling(event, true, false);
launchParams.isIndirect = true;
ret = appendLaunchKernelWithParams(Kernel::fromHandle(kernelHandle), pDispatchArgumentsBuffer,
nullptr, launchParams);
addToMappedEventList(event);
appendSignalEventPostWalker(event);
appendSignalEventPostWalker(event, false);
if (isInOrderExecutionEnabled()) {
handleInOrderDependencyCounter();
@@ -461,7 +461,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchMultipleKernelsInd
launchParams.isHostSignalScopeEvent = event->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST);
}
appendEventForProfiling(event, true);
appendEventForProfiling(event, true, false);
const bool haveLaunchArguments = pLaunchArgumentsBuffer != nullptr;
auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(pNumLaunchArguments);
auto alloc = allocData->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex());
@@ -478,7 +478,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchMultipleKernelsInd
}
}
addToMappedEventList(event);
appendSignalEventPostWalker(event);
appendSignalEventPostWalker(event, false);
return ret;
}
@@ -552,9 +552,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryRangesBarrier(uint
signalEvent = Event::fromHandle(hSignalEvent);
}
appendEventForProfiling(signalEvent, true);
appendEventForProfiling(signalEvent, true, false);
applyMemoryRangesBarrier(numRanges, pRangeSizes, pRanges);
appendSignalEventPostWalker(signalEvent);
appendSignalEventPostWalker(signalEvent, false);
addToMappedEventList(signalEvent);
if (this->inOrderExecutionEnabled) {
@@ -1210,7 +1210,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(Ali
return ret;
}
appendEventForProfiling(signalEvent, true);
appendEventForProfiling(signalEvent, true, false);
auto &rootDeviceEnvironment = device->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()];
bool copyRegionPreferred = NEO::BlitCommandsHelper<GfxFamily>::isCopyRegionPreferred(copySizeModified, *rootDeviceEnvironment, blitProperties.isSystemMemoryPoolUsed);
if (copyRegionPreferred) {
@@ -1220,7 +1220,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(Ali
}
makeResidentDummyAllocation();
appendSignalEventPostWalker(signalEvent);
appendSignalEventPostWalker(signalEvent, false);
return ZE_RESULT_SUCCESS;
}
@@ -1245,11 +1245,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendCopyImageBlit(NEO::Graph
commandContainer.addToResidencyContainer(src);
commandContainer.addToResidencyContainer(clearColorAllocation);
appendEventForProfiling(signalEvent, true);
appendEventForProfiling(signalEvent, true, false);
NEO::BlitCommandsHelper<GfxFamily>::dispatchBlitCommandsForImageRegion(blitProperties, *commandContainer.getCommandStream(), dummyBlitWa);
makeResidentDummyAllocation();
appendSignalEventPostWalker(signalEvent);
appendSignalEventPostWalker(signalEvent, false);
return ZE_RESULT_SUCCESS;
}
@@ -2027,7 +2027,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBlitFill(void *ptr,
}
auto neoDevice = device->getNEODevice();
appendEventForProfiling(signalEvent, true);
appendEventForProfiling(signalEvent, true, false);
NEO::GraphicsAllocation *gpuAllocation = device->getDriverHandle()->getDriverSystemMemoryAllocation(ptr,
size,
neoDevice->getRootDeviceIndex(),
@@ -2055,7 +2055,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBlitFill(void *ptr,
this->dummyBlitWa);
makeResidentDummyAllocation();
appendSignalEventPostWalker(signalEvent);
appendSignalEventPostWalker(signalEvent, false);
if (isInOrderExecutionEnabled()) {
appendSignalInOrderDependencyCounter();
@@ -2066,12 +2066,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBlitFill(void *ptr,
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendSignalEventPostWalker(Event *event) {
void CommandListCoreFamily<gfxCoreFamily>::appendSignalEventPostWalker(Event *event, bool skipBarrierForEndProfiling) {
if (event == nullptr) {
return;
}
if (event->isEventTimestampFlagSet()) {
appendEventForProfiling(event, false);
appendEventForProfiling(event, false, skipBarrierForEndProfiling);
} else {
event->resetKernelCountAndPacketUsedCount();
commandContainer.addToResidencyContainer(&event->getAllocation(this->device));
@@ -2480,7 +2480,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWriteKernelTimestamp(Event *eve
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(Event *event, bool beforeWalker) {
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(Event *event, bool beforeWalker, bool skipBarrierForEndProfiling) {
if (!event) {
return;
}
@@ -2505,11 +2505,14 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(Event *event,
dispatchEventPostSyncOperation(event, Event::STATE_SIGNALED, true, false, false, true);
const auto &rootDeviceEnvironment = this->device->getNEODevice()->getRootDeviceEnvironment();
NEO::PipeControlArgs args;
args.dcFlushEnable = getDcFlushRequired(event->isSignalScope());
NEO::MemorySynchronizationCommands<GfxFamily>::setPostSyncExtraProperties(args);
NEO::MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(*commandContainer.getCommandStream(), args);
if (!skipBarrierForEndProfiling) {
NEO::PipeControlArgs args;
args.dcFlushEnable = getDcFlushRequired(event->isSignalScope());
NEO::MemorySynchronizationCommands<GfxFamily>::setPostSyncExtraProperties(args);
NEO::MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(*commandContainer.getCommandStream(), args);
}
uint64_t baseAddr = event->getGpuAddress(this->device);
NEO::MemorySynchronizationCommands<GfxFamily>::addAdditionalSynchronization(*commandContainer.getCommandStream(), baseAddr, false, rootDeviceEnvironment);
@@ -2535,7 +2538,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWriteGlobalTimestamp(
signalEvent = Event::fromHandle(hSignalEvent);
}
appendEventForProfiling(signalEvent, true);
appendEventForProfiling(signalEvent, true, false);
if (isCopyOnly()) {
NEO::MiFlushArgs args{this->dummyBlitWa};
@@ -2559,7 +2562,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWriteGlobalTimestamp(
args);
}
appendSignalEventPostWalker(signalEvent);
appendSignalEventPostWalker(signalEvent, false);
if (this->inOrderExecutionEnabled) {
appendSignalInOrderDependencyCounter();
@@ -3054,7 +3057,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
signalEvent = Event::fromHandle(hSignalEvent);
}
appendEventForProfiling(signalEvent, true);
appendEventForProfiling(signalEvent, true, false);
if (this->inOrderExecutionEnabled) {
appendSignalInOrderDependencyCounter();
@@ -3076,7 +3079,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
}
addToMappedEventList(signalEvent);
appendSignalEventPostWalker(signalEvent);
appendSignalEventPostWalker(signalEvent, this->inOrderExecutionEnabled);
if (isInOrderExecutionEnabled()) {
handleInOrderDependencyCounter();
@@ -3203,7 +3206,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnMemory(void *desc,
}
UNRECOVERABLE_IF(srcAllocationStruct.alloc == nullptr);
appendEventForProfiling(signalEvent, true);
appendEventForProfiling(signalEvent, true, false);
if (this->inOrderExecutionEnabled) {
handleInOrderImplicitDependencies(false);
@@ -3222,7 +3225,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnMemory(void *desc,
NEO::MemorySynchronizationCommands<GfxFamily>::addAdditionalSynchronization(*commandContainer.getCommandStream(), gpuAddress, true, rootDeviceEnvironment);
}
appendSignalEventPostWalker(signalEvent);
appendSignalEventPostWalker(signalEvent, false);
if (this->inOrderExecutionEnabled) {
appendSignalInOrderDependencyCounter();

View File

@@ -89,7 +89,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
dsh = dshReserveArgs.indirectHeapReservation;
}
appendEventForProfiling(event, true);
appendEventForProfiling(event, true, false);
auto perThreadScratchSize = std::max<std::uint32_t>(this->getCommandListPerThreadScratchSize(),
kernel->getImmutableData()->getDescriptor().kernelAttributes.perThreadScratchSize[0]);
this->setCommandListPerThreadScratchSize(perThreadScratchSize);
@@ -221,7 +221,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
*reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateSpace) = surfaceState;
}
appendSignalEventPostWalker(event);
appendSignalEventPostWalker(event, false);
commandContainer.addToResidencyContainer(kernelImmutableData->getIsaGraphicsAllocation());
auto &residencyContainer = kernel->getResidencyContainer();
@@ -303,9 +303,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(Kernel
// Selects which profiling/signal helper is emitted for `event`, based on
// whether the call is made before or after the walker:
//   beforeWalker == true  -> appendEventForProfiling(event, true, ...)
//   beforeWalker == false -> appendSignalEventPostWalker(event, ...)
// `singlePacketEvent` is not read in this variant of the function.
// The three-/two-argument calls below pass `false` for the
// skipBarrierForEndProfiling parameter.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker, bool singlePacketEvent) {
if (beforeWalker) {
appendEventForProfiling(event, true);
appendEventForProfiling(event, true, false);
} else {
appendSignalEventPostWalker(event);
appendSignalEventPostWalker(event, false);
}
}

View File

@@ -475,9 +475,9 @@ template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Event *event, bool beforeWalker, bool singlePacketEvent) {
if (isCopyOnly() || singlePacketEvent) {
if (beforeWalker) {
appendEventForProfiling(event, true);
appendEventForProfiling(event, true, false);
} else {
appendSignalEventPostWalker(event);
appendSignalEventPostWalker(event, false);
}
} else {
if (event) {

View File

@@ -2826,7 +2826,7 @@ HWTEST2_F(CommandListCreate, givenNullEventWhenAppendEventAfterWalkerThenNothing
auto usedBefore = commandList->getCmdContainer().getCommandStream()->getUsed();
commandList->appendSignalEventPostWalker(nullptr);
commandList->appendSignalEventPostWalker(nullptr, false);
EXPECT_EQ(commandList->getCmdContainer().getCommandStream()->getUsed(), usedBefore);
}

View File

@@ -1044,6 +1044,33 @@ HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenDispatchingSemaphoreThenProg
EXPECT_EQ(1u, allCmds.size());
}
// Verifies that appending a barrier with a signal event on the command list
// returned by createImmCmdList() emits no PIPE_CONTROL command in the barrier's
// portion of the command stream.
// NOTE(review): "Timestmap" in the test name looks like a typo for "Timestamp";
// renaming would change the test id, so it is left as-is here.
HWTEST2_F(InOrderCmdListTests, givenTimestmapEventWhenProgrammingBarrierThenDontAddPipeControl, IsAtLeastSkl) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
// Presumably the second argument requests timestamp (profiling) events, per
// the test name -- TODO confirm against the fixture's createEvents().
// eventPool is not referenced again below; presumably it is held only to keep
// the pool alive for the duration of the test -- confirm.
auto eventPool = createEvents<FamilyType>(1, true);
auto eventHandle = events[0]->toHandle();
auto immCmdList = createImmCmdList<gfxCoreFamily>();
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
// Append a kernel launch first, without a signal event or wait events.
immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
// Remember how much of the stream is already used so that only the commands
// written by appendBarrier() are parsed below.
auto offset = cmdStream->getUsed();
immCmdList->appendBarrier(eventHandle, 0, nullptr, false);
GenCmdList cmdList;
// Parse just the [offset, used) range, i.e. what the barrier call added.
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList,
ptrOffset(cmdStream->getCpuBase(), offset),
cmdStream->getUsed() - offset));
// No PIPE_CONTROL may appear among the barrier's commands.
auto itor = find<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
EXPECT_EQ(cmdList.end(), itor);
}
HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenDispatchingStoreDataImmThenProgramUserInterrupt, IsAtLeastSkl) {
using MI_USER_INTERRUPT = typename FamilyType::MI_USER_INTERRUPT;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;

View File

@@ -381,7 +381,7 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
event->signalScope = ZE_EVENT_SCOPE_FLAG_HOST;
commandList->partitionCount = packets;
commandList->appendSignalEventPostWalker(event.get());
commandList->appendSignalEventPostWalker(event.get(), false);
EXPECT_EQ(packets, event->getPacketsInUse());
auto gpuAddress = event->getCompletionFieldGpuAddress(device);
@@ -441,7 +441,7 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
event->signalScope = ZE_EVENT_SCOPE_FLAG_HOST;
commandList->partitionCount = packets;
commandList->appendSignalEventPostWalker(event.get());
commandList->appendSignalEventPostWalker(event.get(), false);
EXPECT_EQ(packets, event->getPacketsInUse());
auto gpuAddress = event->getCompletionFieldGpuAddress(device);
@@ -605,7 +605,7 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
event->setEventTimestampFlag(false);
commandList->appendSignalEventPostWalker(event.get());
commandList->appendSignalEventPostWalker(event.get(), false);
size_t usedAfterSize = cmdStream->getUsed();
GenCmdList cmdList;

View File

@@ -878,7 +878,7 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
commandList->setupTimestampEventForMultiTile(event.get());
size_t sizeBefore = cmdStream->getUsed();
commandList->appendEventForProfiling(event.get(), false);
commandList->appendEventForProfiling(event.get(), false, false);
size_t sizeAfter = cmdStream->getUsed();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
@@ -963,7 +963,7 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
commandList->setupTimestampEventForMultiTile(event.get());
size_t sizeBefore = cmdStream->getUsed();
commandList->appendSignalEventPostWalker(event.get());
commandList->appendSignalEventPostWalker(event.get(), false);
size_t sizeAfter = cmdStream->getUsed();
EXPECT_EQ(ZE_RESULT_SUCCESS, result);

View File

@@ -60,7 +60,7 @@ PVCTEST_F(CommandListEventFenceTestsPvc, givenCommandListWithProfilingEventAfter
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));
commandList->appendEventForProfiling(event.get(), false);
commandList->appendEventForProfiling(event.get(), false, false);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(

View File

@@ -654,7 +654,7 @@ HWTEST2_F(CommandListEventFenceTestsXeHpcCore, givenCommandListWithProfilingEven
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));
commandList->appendEventForProfiling(event.get(), false);
commandList->appendEventForProfiling(event.get(), false, false);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
@@ -684,7 +684,7 @@ HWTEST2_F(CommandListEventFenceTestsXeHpcCore, givenCommandListWithRegularEventA
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));
commandList->appendSignalEventPostWalker(event.get());
commandList->appendSignalEventPostWalker(event.get(), false);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(

View File

@@ -93,7 +93,7 @@ HWTEST2_F(CommandListCreate, givenNotCopyCommandListWhenProfilingEventBeforeComm
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));
commandList->appendEventForProfiling(event.get(), true);
commandList->appendEventForProfiling(event.get(), true, false);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
@@ -129,7 +129,7 @@ HWTEST2_F(CommandListCreate, givenNotCopyCommandListWhenProfilingEventAfterComma
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));
commandList->appendEventForProfiling(event.get(), false);
commandList->appendEventForProfiling(event.get(), false, false);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
@@ -164,7 +164,7 @@ HWTEST2_F(CommandListCreate, givenCopyCommandListWhenProfilingEventThenStoreRegC
auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto event = std::unique_ptr<L0::Event>(L0::Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));
commandList->appendEventForProfiling(event.get(), false);
commandList->appendEventForProfiling(event.get(), false, false);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList, ptrOffset(commandList->getCmdContainer().getCommandStream()->getCpuBase(), 0), commandList->getCmdContainer().getCommandStream()->getUsed()));