feature: initial support of in-order regular cmd lists

Related-To: NEO-7966

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz 2023-08-07 14:08:17 +00:00 committed by Compute-Runtime-Automation
parent 8e989fa333
commit d5d43ead7c
6 changed files with 154 additions and 3 deletions

View File

@ -327,6 +327,7 @@ struct CommandListCoreFamily : CommandListImp {
virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents) const { return false; } virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents) const { return false; }
virtual void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {} virtual void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {}
bool isInOrderEventWaitRequired(const Event &event) const; bool isInOrderEventWaitRequired(const Event &event) const;
// Tells the in-order paths whether this command list tracks dependencies through the
// in-order counter allocation. The base implementation answers false; in that case
// addEventsToCmdList programs a compute barrier (non copy-only lists) instead of a wait
// on the counter allocation. CommandListCoreFamilyImmediate overrides this to return true.
virtual bool useCounterAllocationForInOrderMode() const {
    return false;
}
}; };
template <PRODUCT_FAMILY gfxProductFamily> template <PRODUCT_FAMILY gfxProductFamily>

View File

@ -135,6 +135,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
cmdListCurrentStartOffset = 0; cmdListCurrentStartOffset = 0;
mappedTsEventList.clear(); mappedTsEventList.clear();
inOrderDependencyCounter = 0;
inOrderAllocationOffset = 0;
return ZE_RESULT_SUCCESS; return ZE_RESULT_SUCCESS;
} }
@ -1387,7 +1391,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
addFlushRequiredCommand(dstAllocationStruct.needsFlush, signalEvent); addFlushRequiredCommand(dstAllocationStruct.needsFlush, signalEvent);
addToMappedEventList(signalEvent); addToMappedEventList(signalEvent);
if (this->inOrderExecutionEnabled && (launchParams.isKernelSplitOperation || inOrderCopyOnlySignalingAllowed)) { if (this->inOrderExecutionEnabled && useCounterAllocationForInOrderMode() && (launchParams.isKernelSplitOperation || inOrderCopyOnlySignalingAllowed)) {
if (!signalEvent && !isCopyOnly()) { if (!signalEvent && !isCopyOnly()) {
NEO::PipeControlArgs args; NEO::PipeControlArgs args;
NEO::MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(*commandContainer.getCommandStream(), args); NEO::MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(*commandContainer.getCommandStream(), args);
@ -2130,7 +2134,11 @@ inline ze_result_t CommandListCoreFamily<gfxCoreFamily>::addEventsToCmdList(uint
} }
if (hasInOrderDependencies) { if (hasInOrderDependencies) {
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(this->inOrderDependencyCounterAllocation, this->inOrderDependencyCounter, this->inOrderAllocationOffset, relaxedOrderingAllowed); if (useCounterAllocationForInOrderMode()) {
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(this->inOrderDependencyCounterAllocation, this->inOrderDependencyCounter, this->inOrderAllocationOffset, relaxedOrderingAllowed);
} else if (!isCopyOnly()) {
appendComputeBarrierCommand();
}
} }
if (numWaitEvents > 0) { if (numWaitEvents > 0) {
@ -2141,6 +2149,10 @@ inline ze_result_t CommandListCoreFamily<gfxCoreFamily>::addEventsToCmdList(uint
} }
} }
if (cmdListType == TYPE_REGULAR && this->inOrderExecutionEnabled && !hasInOrderDependencies) {
inOrderDependencyCounter++; // First append is without dependencies. Increment counter to program barrier on next calls.
}
return ZE_RESULT_SUCCESS; return ZE_RESULT_SUCCESS;
} }
@ -2263,6 +2275,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
} }
if (event->isInOrderExecEvent()) { if (event->isInOrderExecEvent()) {
UNRECOVERABLE_IF(this->cmdListType != TYPE_IMMEDIATE);
if (isInOrderEventWaitRequired(*event)) { if (isInOrderEventWaitRequired(*event)) {
CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(event->getInOrderExecDataAllocation(), event->getInOrderExecSignalValue(), event->getInOrderAllocationOffset(), relaxedOrderingAllowed); CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(event->getInOrderExecDataAllocation(), event->getInOrderExecSignalValue(), event->getInOrderAllocationOffset(), relaxedOrderingAllowed);
} }

View File

@ -188,6 +188,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) override; void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) override;
void handleInOrderDependencyCounter(); void handleInOrderDependencyCounter();
bool isSkippingInOrderBarrierAllowed(ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) const; bool isSkippingInOrderBarrierAllowed(ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) const;
// Immediate command lists always report that the in-order counter allocation is in use,
// replacing the base-class answer of false.
bool useCounterAllocationForInOrderMode() const override {
    return true;
}
MOCKABLE_VIRTUAL void checkAssert(); MOCKABLE_VIRTUAL void checkAssert();
ComputeFlushMethodType computeFlushMethod = nullptr; ComputeFlushMethodType computeFlushMethod = nullptr;

View File

@ -284,7 +284,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
this->dcFlushSupport // dcFlushEnable this->dcFlushSupport // dcFlushEnable
}; };
bool inOrderExecSignalRequired = (this->inOrderExecutionEnabled && !launchParams.isKernelSplitOperation); bool inOrderExecSignalRequired = (this->inOrderExecutionEnabled && !launchParams.isKernelSplitOperation && useCounterAllocationForInOrderMode());
if (inOrderExecSignalRequired) { if (inOrderExecSignalRequired) {
if (isTimestampEvent) { if (isTimestampEvent) {

View File

@ -76,6 +76,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
using BaseClass::indirectAllocationsAllowed; using BaseClass::indirectAllocationsAllowed;
using BaseClass::initialize; using BaseClass::initialize;
using BaseClass::inOrderAllocationOffset; using BaseClass::inOrderAllocationOffset;
using BaseClass::inOrderDependencyCounter;
using BaseClass::isFlushTaskSubmissionEnabled; using BaseClass::isFlushTaskSubmissionEnabled;
using BaseClass::isRelaxedOrderingDispatchAllowed; using BaseClass::isRelaxedOrderingDispatchAllowed;
using BaseClass::isSyncModeQueue; using BaseClass::isSyncModeQueue;

View File

@ -726,6 +726,26 @@ struct InOrderCmdListTests : public CommandListAppendLaunchKernel {
return cmdList; return cmdList;
} }
// Builds a white-box regular command list with in-order execution enabled.
// copyOnly == true initializes it on the Copy engine group, otherwise on RenderCompute.
// A mock command queue bound to the default engine CSR is appended to mockCmdQs and
// createdCmdLists is incremented for every list created here.
template <GFXCORE_FAMILY gfxCoreFamily>
DestroyableZeUniquePtr<WhiteBox<L0::CommandListCoreFamily<gfxCoreFamily>>> createRegularCmdList(bool copyOnly) {
    using RegularCmdListType = WhiteBox<L0::CommandListCoreFamily<gfxCoreFamily>>;

    auto regularCmdList = makeZeUniquePtr<RegularCmdListType>();

    auto defaultCsr = device->getNEODevice()->getDefaultEngine().commandStreamReceiver;
    ze_command_queue_desc_t queueDesc = {};
    mockCmdQs.emplace_back(std::make_unique<Mock<CommandQueue>>(device, defaultCsr, &queueDesc));

    const auto engineGroup = copyOnly ? EngineGroupType::Copy : EngineGroupType::RenderCompute;
    regularCmdList->initialize(device, engineGroup, 0u);
    regularCmdList->enableInOrderExecution();

    ++createdCmdLists;

    return regularCmdList;
}
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
DestroyableZeUniquePtr<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>> createCopyOnlyImmCmdList() { DestroyableZeUniquePtr<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>> createCopyOnlyImmCmdList() {
auto cmdList = createImmCmdList<gfxCoreFamily>(); auto cmdList = createImmCmdList<gfxCoreFamily>();
@ -2670,6 +2690,121 @@ HWTEST2_F(BcsSplitInOrderCmdListTests, givenBcsSplitEnabledWhenDispatchingCopyRe
EXPECT_EQ(0u, sdiCmd->getDataDword1()); EXPECT_EQ(0u, sdiCmd->getDataDword1());
} }
using InOrderRegularCmdListTests = InOrderCmdListTests;

// Regular (non-immediate) compute command list in in-order mode:
//  - the first kernel append programs no leading PIPE_CONTROL and bumps inOrderDependencyCounter to 1,
//  - the second append starts with a PIPE_CONTROL and leaves the counter at 1,
//  - neither append programs a walker post-sync write nor an MI_STORE_DATA_IMM,
//  - reset() clears both inOrderDependencyCounter and inOrderAllocationOffset.
HWTEST2_F(InOrderRegularCmdListTests, givenInOrderModeWhenDispatchingRegularCmdListThenProgramPipeControlsToHandleDependencies, IsAtLeastXeHpCore) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
    using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;

    auto regularCmdList = createRegularCmdList<gfxCoreFamily>(false);

    auto cmdStream = regularCmdList->getCmdContainer().getCommandStream();

    size_t offset = cmdStream->getUsed();

    EXPECT_EQ(0u, regularCmdList->inOrderDependencyCounter);
    regularCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    EXPECT_EQ(1u, regularCmdList->inOrderDependencyCounter);

    {
        GenCmdList cmdList;
        ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
                                                          ptrOffset(cmdStream->getCpuBase(), offset),
                                                          (cmdStream->getUsed() - offset)));

        // Dereferencing begin() of an empty list is undefined behaviour; fail cleanly instead.
        ASSERT_NE(cmdList.end(), cmdList.begin());
        EXPECT_EQ(nullptr, genCmdCast<PIPE_CONTROL *>(*cmdList.begin()));

        auto walkerItor = find<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
        ASSERT_NE(cmdList.end(), walkerItor);

        auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*walkerItor);
        auto &postSync = walkerCmd->getPostSync();

        EXPECT_EQ(POSTSYNC_DATA::OPERATION_NO_WRITE, postSync.getOperation());

        auto sdiItor = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
        EXPECT_EQ(cmdList.end(), sdiItor);
    }

    // Only the commands emitted by the second append are parsed below.
    offset = cmdStream->getUsed();

    regularCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    EXPECT_EQ(1u, regularCmdList->inOrderDependencyCounter);

    {
        GenCmdList cmdList;
        ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
                                                          ptrOffset(cmdStream->getCpuBase(), offset),
                                                          (cmdStream->getUsed() - offset)));

        // Dereferencing begin() of an empty list is undefined behaviour; fail cleanly instead.
        ASSERT_NE(cmdList.end(), cmdList.begin());
        EXPECT_NE(nullptr, genCmdCast<PIPE_CONTROL *>(*cmdList.begin()));

        auto walkerItor = find<COMPUTE_WALKER *>(cmdList.begin(), cmdList.end());
        ASSERT_NE(cmdList.end(), walkerItor);

        auto walkerCmd = genCmdCast<COMPUTE_WALKER *>(*walkerItor);
        auto &postSync = walkerCmd->getPostSync();

        EXPECT_EQ(POSTSYNC_DATA::OPERATION_NO_WRITE, postSync.getOperation());

        auto sdiItor = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
        EXPECT_EQ(cmdList.end(), sdiItor);
    }

    // Seed a non-zero offset so the reset() check below is meaningful.
    regularCmdList->inOrderAllocationOffset = 123;
    regularCmdList->reset();

    EXPECT_EQ(0u, regularCmdList->inOrderDependencyCounter);
    EXPECT_EQ(0u, regularCmdList->inOrderAllocationOffset);
}
using InOrderRegularCopyOnlyCmdListTests = InOrderCmdListTests;

// Regular copy-only command list in in-order mode: neither the first nor the second
// memory copy programs an MI_STORE_DATA_IMM, and the second copy starts directly with
// the XY_COPY_BLT command (nothing is programmed ahead of it).
HWTEST2_F(InOrderRegularCopyOnlyCmdListTests, givenInOrderModeWhenDispatchingRegularCmdListThenDontProgramBarriers, IsAtLeastXeHpCore) {
    using XY_COPY_BLT = typename std::remove_const<decltype(FamilyType::cmdInitXyCopyBlt)>::type;
    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;

    auto regularCmdList = createRegularCmdList<gfxCoreFamily>(true);

    auto cmdStream = regularCmdList->getCmdContainer().getCommandStream();

    size_t offset = cmdStream->getUsed();

    // A failing ASSERT_* returns from the test body immediately, so the buffer is owned by a
    // guard that calls alignedFree on every exit path instead of relying on a trailing free.
    auto releaseAligned = [](void *ptr) { alignedFree(ptr); };
    std::unique_ptr<void, decltype(releaseAligned)> alignedBuffer(alignedMalloc(MemoryConstants::cacheLineSize, MemoryConstants::cacheLineSize), releaseAligned);
    auto alignedPtr = alignedBuffer.get();

    regularCmdList->appendMemoryCopy(alignedPtr, alignedPtr, MemoryConstants::cacheLineSize, nullptr, 0, nullptr, false, false);

    {
        GenCmdList cmdList;
        ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
                                                          ptrOffset(cmdStream->getCpuBase(), offset),
                                                          (cmdStream->getUsed() - offset)));

        auto sdiItor = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
        EXPECT_EQ(cmdList.end(), sdiItor);
    }

    // Only the commands emitted by the second copy are parsed below.
    offset = cmdStream->getUsed();

    regularCmdList->appendMemoryCopy(alignedPtr, alignedPtr, MemoryConstants::cacheLineSize, nullptr, 0, nullptr, false, false);

    {
        GenCmdList cmdList;
        ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList,
                                                          ptrOffset(cmdStream->getCpuBase(), offset),
                                                          (cmdStream->getUsed() - offset)));

        auto sdiItor = find<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
        EXPECT_EQ(cmdList.end(), sdiItor);

        // Dereferencing begin() of an empty list is undefined behaviour; fail cleanly instead.
        ASSERT_NE(cmdList.end(), cmdList.begin());
        auto copyCmd = genCmdCast<XY_COPY_BLT *>(*cmdList.begin());

        EXPECT_NE(nullptr, copyCmd);
    }
}
struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKernel { struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKernel {
template <typename FamilyType> template <typename FamilyType>
uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::KernelImp> &kernel) { uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::KernelImp> &kernel) {