feature: store timestamp cb event clear and sync commands

Related-To: NEO-10385

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2024-03-07 18:18:15 +00:00
committed by Compute-Runtime-Automation
parent c72b9ec448
commit f1f874c47d
6 changed files with 122 additions and 24 deletions

View File

@@ -59,6 +59,7 @@ struct CmdListFillKernelArguments {
struct CmdListEventOperation {
size_t operationOffset = 0;
size_t completionFieldOffset = 0;
uint32_t operationCount = 0;
bool workPartitionOperation = false;
bool isTimestmapEvent = false;
@@ -268,7 +269,7 @@ struct CommandListCoreFamily : public CommandListImp {
Event *signalEvent,
CmdListKernelLaunchParams &launchParams);
void appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed);
void appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, CommandToPatch::CommandType storedSemaphore);
void appendSdiInOrderCounterSignalling(uint64_t baseGpuVa, uint64_t signalValue);
@@ -320,11 +321,11 @@ struct CommandListCoreFamily : public CommandListImp {
MOCKABLE_VIRTUAL void allocateOrReuseKernelPrivateMemory(Kernel *kernel, uint32_t sizePerHwThread, NEO::PrivateAllocsToReuseContainer &privateAllocsToReuse);
virtual void allocateOrReuseKernelPrivateMemoryIfNeeded(Kernel *kernel, uint32_t sizePerHwThread);
CmdListEventOperation estimateEventPostSync(Event *event, uint32_t operations);
void dispatchPostSyncCopy(uint64_t gpuAddress, uint32_t value, bool workloadPartition);
void dispatchPostSyncCompute(uint64_t gpuAddress, uint32_t value, bool workloadPartition);
void dispatchPostSyncCommands(const CmdListEventOperation &eventOperations, uint64_t gpuAddress, void **syncCmdBuffer, uint32_t value, bool useLastPipeControl, bool signalScope, bool skipPartitionOffsetProgramming);
void dispatchPostSyncCopy(uint64_t gpuAddress, uint32_t value, bool workloadPartition, void **outCmdBuffer);
void dispatchPostSyncCompute(uint64_t gpuAddress, uint32_t value, bool workloadPartition, void **outCmdBuffer);
void dispatchPostSyncCommands(const CmdListEventOperation &eventOperations, uint64_t gpuAddress, void **syncCmdBuffer, CommandToPatchContainer *outListCommands, uint32_t value, bool useLastPipeControl, bool signalScope, bool skipPartitionOffsetProgramming);
void dispatchEventRemainingPacketsPostSyncOperation(Event *event);
void dispatchEventPostSyncOperation(Event *event, void **syncCmdBuffer, uint32_t value, bool omitFirstOperation, bool useMax, bool useLastPipeControl, bool skipPartitionOffsetProgramming);
void dispatchEventPostSyncOperation(Event *event, void **syncCmdBuffer, CommandToPatchContainer *outListCommands, uint32_t value, bool omitFirstOperation, bool useMax, bool useLastPipeControl, bool skipPartitionOffsetProgramming);
bool isKernelUncachedMocsRequired(bool kernelState) {
this->containsStatelessUncachedResource |= kernelState;
if (this->stateBaseAddressTracking) {

View File

@@ -559,7 +559,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_hand
bool useMaxPackets = event->isEventTimestampFlagSet() || (event->getPacketsInUse() < this->partitionCount);
bool appendPipeControlWithPostSync = (!isCopyOnly()) && (event->isSignalScope() || event->isEventTimestampFlagSet());
dispatchEventPostSyncOperation(event, nullptr, Event::STATE_CLEARED, false, useMaxPackets, appendPipeControlWithPostSync, false);
dispatchEventPostSyncOperation(event, nullptr, nullptr, Event::STATE_CLEARED, false, useMaxPackets, appendPipeControlWithPostSync, false);
if (!isCopyOnly()) {
if (this->partitionCount > 1) {
@@ -2169,7 +2169,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSignalEventPostWalker(Event *ev
}
event->setPacketsInUse(this->partitionCount);
dispatchEventPostSyncOperation(event, syncCmdBuffer, Event::STATE_SIGNALED, false, false, !isCopyOnly(), false);
dispatchEventPostSyncOperation(event, syncCmdBuffer, nullptr, Event::STATE_SIGNALED, false, false, !isCopyOnly(), false);
}
}
@@ -2185,7 +2185,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingCopyCommand(Ev
} else {
NEO::MiFlushArgs args{this->dummyBlitWa};
encodeMiFlush(0, 0, args);
dispatchEventPostSyncOperation(event, nullptr, Event::STATE_SIGNALED, true, false, false, false);
dispatchEventPostSyncOperation(event, nullptr, nullptr, Event::STATE_SIGNALED, true, false, false, false);
}
appendWriteKernelTimestamp(event, nullptr, beforeWalker, false, false);
}
@@ -2379,7 +2379,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
event->setPacketsInUse(this->partitionCount);
bool appendPipeControlWithPostSync = (!isCopyOnly()) && (event->isSignalScope() || event->isEventTimestampFlagSet());
dispatchEventPostSyncOperation(event, nullptr, Event::STATE_SIGNALED, false, false, appendPipeControlWithPostSync, false);
dispatchEventPostSyncOperation(event, nullptr, nullptr, Event::STATE_SIGNALED, false, false, appendPipeControlWithPostSync, false);
if (this->isInOrderExecutionEnabled()) {
appendSignalInOrderDependencyCounter(event);
@@ -2514,7 +2514,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
commandContainer.addToResidencyContainer(event->getPoolAllocation(this->device));
}
appendWaitOnSingleEvent(event, outWaitCmds, relaxedOrderingAllowed);
appendWaitOnSingleEvent(event, outWaitCmds, relaxedOrderingAllowed, CommandToPatch::WaitEventSemaphoreWait);
}
if (isImmediateType() && isCopyOnly() && trackDependencies) {
@@ -2674,7 +2674,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(Event *event,
bool workloadPartition = setupTimestampEventForMultiTile(event);
appendWriteKernelTimestamp(event, outTimeStampSyncCmds, beforeWalker, true, workloadPartition);
} else {
dispatchEventPostSyncOperation(event, nullptr, Event::STATE_SIGNALED, true, false, false, true);
dispatchEventPostSyncOperation(event, nullptr, nullptr, Event::STATE_SIGNALED, true, false, false, true);
const auto &rootDeviceEnvironment = this->device->getNEODevice()->getRootDeviceEnvironment();
@@ -3544,12 +3544,13 @@ CmdListEventOperation CommandListCoreFamily<gfxCoreFamily>::estimateEventPostSyn
ret.operationOffset = event->getSinglePacketSize() * this->partitionCount;
ret.workPartitionOperation = this->partitionCount > 1;
ret.isTimestmapEvent = event->isEventTimestampFlagSet();
ret.completionFieldOffset = event->getCompletionFieldOffset();
return ret;
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCopy(uint64_t gpuAddress, uint32_t value, bool workloadPartition) {
void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCopy(uint64_t gpuAddress, uint32_t value, bool workloadPartition, void **outCmdBuffer) {
NEO::MiFlushArgs miFlushArgs{this->dummyBlitWa};
miFlushArgs.commandWithPostSync = true;
@@ -3558,7 +3559,7 @@ void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCopy(uint64_t gpuAddr
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCompute(uint64_t gpuAddress, uint32_t value, bool workloadPartition) {
void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCompute(uint64_t gpuAddress, uint32_t value, bool workloadPartition, void **outCmdBuffer) {
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(
*commandContainer.getCommandStream(),
gpuAddress,
@@ -3566,11 +3567,11 @@ void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCompute(uint64_t gpuA
0u,
false,
workloadPartition,
nullptr);
outCmdBuffer);
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCommands(const CmdListEventOperation &eventOperations, uint64_t gpuAddress, void **syncCmdBuffer, uint32_t value, bool useLastPipeControl, bool signalScope, bool skipPartitionOffsetProgramming) {
void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCommands(const CmdListEventOperation &eventOperations, uint64_t gpuAddress, void **syncCmdBuffer, CommandToPatchContainer *outListCommands, uint32_t value, bool useLastPipeControl, bool signalScope, bool skipPartitionOffsetProgramming) {
decltype(&CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCompute) dispatchFunction = &CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCompute;
if (isCopyOnly()) {
dispatchFunction = &CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCopy;
@@ -3585,8 +3586,20 @@ void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCommands(const CmdLis
appendDispatchOffsetRegister(eventOperations.workPartitionOperation, true);
}
void **outCmdBuffer = nullptr;
void *outCmd = nullptr;
if (outListCommands != nullptr) {
outCmdBuffer = &outCmd;
}
for (uint32_t i = 0; i < operationCount; i++) {
(this->*dispatchFunction)(gpuAddress, value, eventOperations.workPartitionOperation);
(this->*dispatchFunction)(gpuAddress, value, eventOperations.workPartitionOperation, outCmdBuffer);
if (outListCommands != nullptr) {
auto &cmdToPatch = outListCommands->emplace_back();
cmdToPatch.type = CommandToPatch::CbEventTimestampClearStoreDataImm;
cmdToPatch.offset = i * eventOperations.operationOffset + eventOperations.completionFieldOffset;
cmdToPatch.pDestination = outCmd;
}
gpuAddress += eventOperations.operationOffset;
}
@@ -3624,7 +3637,7 @@ void CommandListCoreFamily<gfxCoreFamily>::dispatchPostSyncCommands(const CmdLis
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::dispatchEventPostSyncOperation(Event *event, void **syncCmdBuffer, uint32_t value, bool omitFirstOperation, bool useMax, bool useLastPipeControl, bool skipPartitionOffsetProgramming) {
void CommandListCoreFamily<gfxCoreFamily>::dispatchEventPostSyncOperation(Event *event, void **syncCmdBuffer, CommandToPatchContainer *outListCommands, uint32_t value, bool omitFirstOperation, bool useMax, bool useLastPipeControl, bool skipPartitionOffsetProgramming) {
uint32_t packets = event->getPacketsInUse();
if (this->signalAllEventPackets || useMax) {
packets = event->getMaxPacketsCount();
@@ -3637,7 +3650,7 @@ void CommandListCoreFamily<gfxCoreFamily>::dispatchEventPostSyncOperation(Event
eventPostSync.operationCount--;
}
dispatchPostSyncCommands(eventPostSync, gpuAddress, syncCmdBuffer, value, useLastPipeControl, event->isSignalScope(), skipPartitionOffsetProgramming);
dispatchPostSyncCommands(eventPostSync, gpuAddress, syncCmdBuffer, outListCommands, value, useLastPipeControl, event->isSignalScope(), skipPartitionOffsetProgramming);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -3650,12 +3663,12 @@ void CommandListCoreFamily<gfxCoreFamily>::dispatchEventRemainingPacketsPostSync
eventAddress += event->getSinglePacketSize() * event->getPacketsInUse();
constexpr bool appendLastPipeControl = false;
dispatchPostSyncCommands(remainingPacketsOperation, eventAddress, nullptr, Event::STATE_SIGNALED, appendLastPipeControl, event->isSignalScope(), false);
dispatchPostSyncCommands(remainingPacketsOperation, eventAddress, nullptr, nullptr, Event::STATE_SIGNALED, appendLastPipeControl, event->isSignalScope(), false);
}
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed) {
void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, CommandToPatch::CommandType storedSemaphore) {
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
uint64_t gpuAddr = event->getCompletionFieldGpuAddress(this->device);
@@ -3679,7 +3692,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(Event *event,
if (outWaitCmds != nullptr) {
auto &semWaitCmd = outWaitCmds->emplace_back();
semWaitCmd.type = CommandToPatch::WaitEventSemaphoreWait;
semWaitCmd.type = storedSemaphore;
semWaitCmd.offset = i * event->getSinglePacketSize() + event->getCompletionFieldOffset();
semWaitCmd.pDestination = outSemWaitCmd;
}

View File

@@ -480,7 +480,7 @@ void CommandListCoreFamilyImmediate<gfxCoreFamily>::handleInOrderNonWalkerSignal
hasStallingCmds = hasStallingCmdsForRelaxedOrdering(1, relaxedOrderingDispatch);
}
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(event, nullptr, nonWalkerSignalingHasRelaxedOrdering);
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnSingleEvent(event, nullptr, nonWalkerSignalingHasRelaxedOrdering, CommandToPatch::Invalid);
CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(event);
}

View File

@@ -297,7 +297,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
if (inOrderExecSignalRequired) {
if (inOrderNonWalkerSignalling) {
dispatchEventPostSyncOperation(eventForInOrderExec, nullptr, Event::STATE_CLEARED, false, false, false, false);
dispatchEventPostSyncOperation(eventForInOrderExec, nullptr, launchParams.outListCommands, Event::STATE_CLEARED, false, false, false, false);
} else {
inOrderCounterValue = this->inOrderExecInfo->getCounterValue() + getInOrderIncrementValue();
inOrderExecInfo = this->inOrderExecInfo.get();
@@ -367,7 +367,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
if (inOrderExecSignalRequired) {
if (inOrderNonWalkerSignalling) {
if (!launchParams.skipInOrderNonWalkerSignaling) {
appendWaitOnSingleEvent(eventForInOrderExec, nullptr, false);
appendWaitOnSingleEvent(eventForInOrderExec, launchParams.outListCommands, false, CommandToPatch::CbEventTimestampPostSyncSemaphoreWait);
appendSignalInOrderDependencyCounter(eventForInOrderExec);
}
} else {

View File

@@ -26,6 +26,8 @@ struct CommandToPatch {
SignalEventPostSyncPipeControl,
WaitEventSemaphoreWait,
TimestampEventPostSyncStoreRegMem,
CbEventTimestampPostSyncSemaphoreWait,
CbEventTimestampClearStoreDataImm,
Invalid
};
void *pDestination = nullptr;

View File

@@ -2637,5 +2637,87 @@ HWTEST2_F(CommandListAppendLaunchKernel,
}
}
// Verifies that appending a kernel with a signal event to an in-order command list,
// while launchParams.outListCommands is set, records two patchable commands:
//   [0] the MI_STORE_DATA_IMM that clears the event completion field, and
//   [1] the MI_SEMAPHORE_WAIT that waits on the same completion field.
// The event comes from a pool created with the kernel-timestamp flag and the
// counter-based (non-immediate) extension.
HWTEST2_F(CommandListAppendLaunchKernel,
          givenInOrderCmdListAndTimeStampEventWhenAppendingKernelAndEventWithOutCmdListSetThenStoreStoreDataImmClearAndSemapohreWaitPostSyncCommands,
          IsAtLeastXeHpCore) {
    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
    using DefaultWalkerType = typename FamilyType::DefaultWalkerType;

    Mock<::L0::KernelImp> kernel;
    auto mockModule = std::unique_ptr<Module>(new Mock<Module>(device, nullptr));
    kernel.module = mockModule.get();

    // In-order compute command list under test.
    auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
    auto result = commandList->initialize(device, NEO::EngineGroupType::compute, ZE_COMMAND_LIST_FLAG_IN_ORDER);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    auto &commandContainer = commandList->getCmdContainer();
    auto cmdStream = commandContainer.getCommandStream();

    // Event pool: kernel-timestamp events, chained with the counter-based extension descriptor.
    ze_event_pool_counter_based_exp_desc_t counterBasedExtension = {ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC};
    counterBasedExtension.flags = ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_NON_IMMEDIATE;
    ze_event_pool_desc_t eventPoolDesc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
    eventPoolDesc.pNext = &counterBasedExtension;

    ze_event_desc_t eventDesc = {ZE_STRUCTURE_TYPE_EVENT_DESC};
    eventDesc.index = 0;

    auto eventPool = std::unique_ptr<L0::EventPool>(L0::EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
    // Fatal check: the pool pointer is handed to Event::create right below, so the test
    // must not continue when pool creation failed.
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    auto event = std::unique_ptr<L0::Event>(L0::Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));
    ASSERT_NE(nullptr, event.get());

    ze_group_count_t groupCount{1, 1, 1};
    CmdListKernelLaunchParams launchParams = {};
    // Container that the command list is expected to fill with the commands to patch.
    CommandToPatchContainer outCbEventCmds;
    launchParams.outListCommands = &outCbEventCmds;

    // Remember where the stream was so only the commands emitted by this append are parsed.
    auto commandStreamOffset = cmdStream->getUsed();
    result = commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event->toHandle(), 0, nullptr, launchParams, false);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
        cmdList,
        ptrOffset(cmdStream->getCpuBase(), commandStreamOffset),
        cmdStream->getUsed() - commandStreamOffset));

    auto eventCompletionAddress = event->getCompletionFieldGpuAddress(device);

    ASSERT_EQ(2u, outCbEventCmds.size());

    // One SDI is the event clear; NOTE(review): the second SDI expected without atomic
    // signaling is presumably the in-order counter signal - confirm against the implementation.
    size_t expectedSdi = commandList->inOrderAtomicSignalingEnabled ? 1 : 2;

    auto storeDataImmList = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_EQ(expectedSdi, storeDataImmList.size());
    auto computeWalkerList = findAll<DefaultWalkerType *>(cmdList.begin(), cmdList.end());
    ASSERT_EQ(1u, computeWalkerList.size());
    auto semaphoreWaitList = findAll<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
    ASSERT_EQ(1u, semaphoreWaitList.size());

    // First stored command: the store-data-imm targeting the event completion field.
    EXPECT_EQ(CommandToPatch::CbEventTimestampClearStoreDataImm, outCbEventCmds[0].type);
    EXPECT_EQ(*storeDataImmList[0], outCbEventCmds[0].pDestination);
    auto storeDataImmCmd = genCmdCast<MI_STORE_DATA_IMM *>(outCbEventCmds[0].pDestination);
    ASSERT_NE(nullptr, storeDataImmCmd);
    EXPECT_EQ(eventCompletionAddress, storeDataImmCmd->getAddress());

    // The walker reported through launchParams must be the single walker found in the stream.
    EXPECT_EQ(launchParams.outWalker, *computeWalkerList[0]);
    auto walkerCmd = genCmdCast<DefaultWalkerType *>(launchParams.outWalker);
    ASSERT_NE(nullptr, walkerCmd);
    // The walker post-sync destination is only checked when the walker is not in heapless mode.
    if constexpr (!FamilyType::template isHeaplessMode<DefaultWalkerType>()) {
        auto eventBaseAddress = event->getGpuAddress(device);
        EXPECT_EQ(eventBaseAddress, walkerCmd->getPostSync().getDestinationAddress());
    }

    // Second stored command: the semaphore wait on the event completion field.
    EXPECT_EQ(CommandToPatch::CbEventTimestampPostSyncSemaphoreWait, outCbEventCmds[1].type);
    EXPECT_EQ(*semaphoreWaitList[0], outCbEventCmds[1].pDestination);
    auto semaphoreWaitCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(outCbEventCmds[1].pDestination);
    ASSERT_NE(nullptr, semaphoreWaitCmd);
    EXPECT_EQ(eventCompletionAddress, semaphoreWaitCmd->getSemaphoreGraphicsAddress());
}
} // namespace ult
} // namespace L0