From b0c924d40e55354a9bdf9e67f104636468d1652e Mon Sep 17 00:00:00 2001 From: Bartosz Dunajski Date: Fri, 14 Jun 2024 09:43:05 +0000 Subject: [PATCH] feature: relaxed ordering in copy offload path Related-To: NEO-11376 Signed-off-by: Bartosz Dunajski --- level_zero/core/source/cmdlist/cmdlist_hw.h | 2 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 4 +- .../source/cmdlist/cmdlist_hw_immediate.h | 2 +- .../source/cmdlist/cmdlist_hw_immediate.inl | 36 +++++----- .../sources/cmdlist/test_cmdlist_6.cpp | 2 +- .../sources/cmdlist/test_in_order_cmdlist.cpp | 66 ++++++++++++++++++- 6 files changed, 86 insertions(+), 26 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index d750836490..a0cdb53d56 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -356,7 +356,7 @@ struct CommandListCoreFamily : public CommandListImp { } void postInitComputeSetup(); NEO::PreemptionMode obtainKernelPreemptionMode(Kernel *kernel); - virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents) const { return false; } + virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) const { return false; } virtual void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {} bool canSkipInOrderEventWait(Event &event, bool ignorCbEventBoundToCmdList) const; bool handleInOrderImplicitDependencies(bool relaxedOrderingAllowed); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 95376cf523..de1c736c21 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -588,7 +588,7 @@ ze_result_t CommandListCoreFamily::appendEventReset(ze_event_hand } if (this->isInOrderExecutionEnabled()) { - handleInOrderImplicitDependencies(isRelaxedOrderingDispatchAllowed(0)); + 
handleInOrderImplicitDependencies(isRelaxedOrderingDispatchAllowed(0, false)); } appendSynchronizedDispatchInitializationSection(); @@ -2476,7 +2476,7 @@ inline ze_result_t CommandListCoreFamily::addEventsToCmdList(uint template ze_result_t CommandListCoreFamily::appendSignalEvent(ze_event_handle_t hEvent) { if (this->isInOrderExecutionEnabled()) { - handleInOrderImplicitDependencies(isRelaxedOrderingDispatchAllowed(0)); + handleInOrderImplicitDependencies(isRelaxedOrderingDispatchAllowed(0, false)); } auto event = Event::fromHandle(hEvent); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h index 3efa57573b..335862d0db 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h @@ -213,7 +213,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily::appendLaunchKernel( ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, CmdListKernelLaunchParams &launchParams, bool relaxedOrderingDispatch) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); bool stallingCmdsForRelaxedOrdering = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch); checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); @@ -525,7 +525,7 @@ void CommandListCoreFamilyImmediate::handleInOrderNonWalkerSignal bool nonWalkerSignalingHasRelaxedOrdering = false; if (NEO::debugManager.flags.EnableInOrderRelaxedOrderingForEventsChaining.get() != 0) { - nonWalkerSignalingHasRelaxedOrdering = isRelaxedOrderingDispatchAllowed(1); + nonWalkerSignalingHasRelaxedOrdering = isRelaxedOrderingDispatchAllowed(1, false); } if (nonWalkerSignalingHasRelaxedOrdering) { @@ -543,7 +543,7 @@ template ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernelIndirect( 
ze_kernel_handle_t kernelHandle, const ze_group_count_t &pDispatchArgumentsBuffer, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); @@ -568,7 +568,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendBarrier(ze_even return ZE_RESULT_SUCCESS; } - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); isStallingOperation = hasStallingCmdsForRelaxedOrdering(numWaitEvents, relaxedOrderingDispatch); } @@ -588,7 +588,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool forceDisableCopyOnlyInOrderSignaling) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, isCopyOffloadEnabled()); auto estimatedSize = commonImmediateCommandSize; if (isCopyOnly() || isCopyOffloadEnabled()) { @@ -615,7 +615,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopy( NEO::TransferDirection direction; auto isSplitNeeded = this->isAppendSplitNeeded(dstptr, srcptr, size, direction); if (isSplitNeeded) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event hasStallindCmds = !relaxedOrderingDispatch; ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](void 
*dstptrParam, const void *srcptrParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { @@ -642,7 +642,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch, bool forceDisableCopyOnlyInOrderSignaling) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, isCopyOffloadEnabled()); auto estimatedSize = commonImmediateCommandSize; if (isCopyOnly() || isCopyOffloadEnabled()) { @@ -661,7 +661,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio NEO::TransferDirection direction; auto isSplitNeeded = this->isAppendSplitNeeded(dstPtr, srcPtr, this->getTotalSizeForCopyRegion(dstRegion, dstPitch, dstSlicePitch), direction); if (isSplitNeeded) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event hasStallindCmds = !relaxedOrderingDispatch; ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstRegion->originX, srcRegion->originX, dstRegion->width, hSignalEvent, numWaitEvents, phWaitEvents, true, relaxedOrderingDispatch, direction, [&](uint32_t dstOriginXParam, uint32_t srcOriginXParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { @@ -692,7 +692,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryFill(void ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); @@ -736,7 +736,7 @@ ze_result_t 
CommandListCoreFamilyImmediate::appendPageFaultCopy(N bool relaxedOrdering = false; if (isSplitNeeded) { - relaxedOrdering = isRelaxedOrderingDispatchAllowed(1); // split generates more than 1 event + relaxedOrdering = isRelaxedOrderingDispatchAllowed(1, false); // split generates more than 1 event uintptr_t dstAddress = static_cast(dstAllocation->getGpuAddress()); uintptr_t srcAddress = static_cast(srcAllocation->getGpuAddress()); ret = static_cast(this->device)->bcsSplit.appendSplitCall(this, dstAddress, srcAddress, size, nullptr, 0u, nullptr, false, relaxedOrdering, direction, [&](uintptr_t dstAddressParam, uintptr_t srcAddressParam, size_t sizeParam, ze_event_handle_t hSignalEventParam) { @@ -815,7 +815,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyRegion ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); auto estimatedSize = commonImmediateCommandSize; if (isCopyOnly()) { @@ -840,7 +840,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyFromMe ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); @@ -858,7 +858,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyToMemo ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); checkAvailableSpace(numWaitEvents, 
relaxedOrderingDispatch, commonImmediateCommandSize); @@ -878,7 +878,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyFromMe ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); @@ -898,7 +898,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyToMemo ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); @@ -927,7 +927,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchCooperati ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *waitEventHandles, bool relaxedOrderingDispatch) { - relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents); + relaxedOrderingDispatch = isRelaxedOrderingDispatchAllowed(numWaitEvents, false); checkAvailableSpace(numWaitEvents, relaxedOrderingDispatch, commonImmediateCommandSize); @@ -1396,10 +1396,10 @@ void CommandListCoreFamilyImmediate::checkAssert() { } template -bool CommandListCoreFamilyImmediate::isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents) const { +bool CommandListCoreFamilyImmediate::isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents, bool copyOffload) const { auto numEvents = numWaitEvents + (this->hasInOrderDependencies() ? 
1 : 0); - return NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*getCsr(false), numEvents); + return NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*getCsr(copyOffload), numEvents); } template diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp index 55e68d7265..93ad1c0301 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp @@ -546,7 +546,7 @@ HWTEST2_F(CommandListTest, givenRegularCmdListWhenAskingForRelaxedOrderingThenRe auto commandList = std::make_unique>>(); commandList->initialize(device, NEO::EngineGroupType::renderCompute, 0u); - EXPECT_FALSE(commandList->isRelaxedOrderingDispatchAllowed(5)); + EXPECT_FALSE(commandList->isRelaxedOrderingDispatchAllowed(5, false)); } HWTEST2_F(CommandListTest, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp index 5432845f2f..082f1c824f 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp @@ -7,6 +7,7 @@ #include "shared/source/command_container/command_encoder.h" #include "shared/source/command_container/implicit_scaling.h" +#include "shared/source/direct_submission/dispatchers/blitter_dispatcher.h" #include "shared/source/gmm_helper/gmm_helper.h" #include "shared/source/helpers/constants.h" #include "shared/source/helpers/register_offsets.h" @@ -1640,7 +1641,7 @@ HWTEST2_F(InOrderCmdListTests, givenCmdsChainingWhenDispatchingKernelWithRelaxed immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, false); findConditionalBbStarts(1); // chaining - EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0)); + 
EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0, false)); offset = cmdStream->getUsed(); immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); @@ -2039,7 +2040,7 @@ HWTEST2_F(InOrderCmdListTests, givenRelaxedOrderingWhenProgrammingTimestampEvent immCmdList->inOrderExecInfo->addCounterValue(1); - EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0)); + EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0, false)); EXPECT_EQ(0u, immCmdList->flushData.size()); @@ -2146,7 +2147,7 @@ HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenChainingWithRelaxedOrderingT immCmdList->inOrderExecInfo->addCounterValue(1); - EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0)); + EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0, false)); EXPECT_EQ(0u, immCmdList->flushCount); @@ -7231,6 +7232,65 @@ HWTEST2_F(CopyOffloadInOrderTests, givenCopyOperationWithHostVisibleEventThenMar EXPECT_EQ(!immCmdList->dcFlushSupport, immCmdList->latestFlushIsHostVisible); } +HWTEST2_F(CopyOffloadInOrderTests, givenRelaxedOrderingEnabledWhenDispatchingThenUseCorrectCsr, IsAtLeastXeHpcCore) { + class MyMockCmdList : public WhiteBox> { + public: + ze_result_t flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, bool kernelOperation, bool copyOffloadSubmission, ze_event_handle_t hSignalEvent) override { + latestRelaxedOrderingMode = hasRelaxedOrderingDependencies; + + return ZE_RESULT_SUCCESS; + } + + bool latestRelaxedOrderingMode = false; + }; + + debugManager.flags.DirectSubmissionRelaxedOrdering.set(1); + + auto immCmdList = createImmCmdListImpl(true); + + auto mainQueueCsr = static_cast *>(immCmdList->getCsr(false)); + auto copyQueueCsr = static_cast *>(immCmdList->getCsr(true)); + + auto mainQueueDirectSubmission = new MockDirectSubmissionHw>(*mainQueueCsr); + auto offloadDirectSubmission = new MockDirectSubmissionHw>(*copyQueueCsr); + + 
mainQueueCsr->directSubmission.reset(mainQueueDirectSubmission); + copyQueueCsr->blitterDirectSubmission.reset(offloadDirectSubmission); + + int client1, client2; + + // first dependency + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + + // compute CSR + mainQueueCsr->registerClient(&client1); + mainQueueCsr->registerClient(&client2); + + EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0, false)); + EXPECT_FALSE(immCmdList->isRelaxedOrderingDispatchAllowed(0, true)); + + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_TRUE(immCmdList->latestRelaxedOrderingMode); + + immCmdList->appendMemoryCopy(&copyData1, &copyData2, 1, nullptr, 0, nullptr, false, false); + EXPECT_FALSE(immCmdList->latestRelaxedOrderingMode); + + // offload CSR + mainQueueCsr->unregisterClient(&client1); + mainQueueCsr->unregisterClient(&client2); + copyQueueCsr->registerClient(&client1); + copyQueueCsr->registerClient(&client2); + + EXPECT_FALSE(immCmdList->isRelaxedOrderingDispatchAllowed(0, false)); + EXPECT_TRUE(immCmdList->isRelaxedOrderingDispatchAllowed(0, true)); + + immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_FALSE(immCmdList->latestRelaxedOrderingMode); + + immCmdList->appendMemoryCopy(&copyData1, &copyData2, 1, nullptr, 0, nullptr, false, false); + EXPECT_TRUE(immCmdList->latestRelaxedOrderingMode); +} + HWTEST2_F(CopyOffloadInOrderTests, givenInOrderModeWhenCallingSyncThenHandleCompletionOnCorrectCsr, IsAtLeastXeHpCore) { auto immCmdList = createImmCmdListWithOffload();