From 9e04907c9bc9826c25616872b1b95e9cd3ebc361 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Wed, 13 Mar 2024 09:43:18 +0000 Subject: [PATCH] feature: add flag allowing noop space for cb events from the same in order pool Related-To: NEO-10385 Signed-off-by: Zbigniew Zdanowicz --- level_zero/core/source/cmdlist/cmdlist_hw.h | 9 ++- level_zero/core/source/cmdlist/cmdlist_hw.inl | 64 ++++++++++++++----- .../core/test/unit_tests/mocks/mock_cmdlist.h | 2 + .../sources/cmdlist/test_in_order_cmdlist.cpp | 60 ++++++++++++++++- 4 files changed, 114 insertions(+), 21 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 72fa072adb..5dcbaa4150 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -174,7 +174,8 @@ struct CommandListCoreFamily : public CommandListImp { ze_result_t appendWaitOnEvents(uint32_t numEvents, ze_event_handle_t *phEvent, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, bool trackDependencies, bool apiRequest, bool skipAddingWaitEventsToResidency) override; void appendWaitOnInOrderDependency(std::shared_ptr &inOrderExecInfo, CommandToPatchContainer *outListCommands, - uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency, bool skipAddingWaitEventsToResidency); + uint64_t waitValue, uint32_t offset, + bool relaxedOrderingAllowed, bool implicitDependency, bool skipAddingWaitEventsToResidency, bool noopDispatch); void appendSignalInOrderDependencyCounter(Event *signalEvent); void handleInOrderDependencyCounter(Event *signalEvent, bool nonWalkerInOrderCmdsChaining); @@ -196,6 +197,7 @@ struct CommandListCoreFamily : public CommandListImp { size_t getReserveSshSize(); void patchInOrderCmds() override; bool handleCounterBasedEventOperations(Event *signalEvent); + bool isCbEventBoundToCmdList(Event *event) const; protected: MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc, @@ -338,7 +340,7 @@ struct CommandListCoreFamily : public CommandListImp { NEO::PreemptionMode obtainKernelPreemptionMode(Kernel *kernel); virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents) const { return false; } virtual void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {} - bool canSkipInOrderEventWait(Event &event) const; + bool canSkipInOrderEventWait(Event &event, bool ignorCbEventBoundToCmdList) const; bool handleInOrderImplicitDependencies(bool relaxedOrderingAllowed); bool isQwordInOrderCounter() const { return GfxFamily::isQwordInOrderCounter; } bool isInOrderNonWalkerSignalingRequired(const Event *event) const; @@ -349,7 +351,7 @@ struct CommandListCoreFamily : public CommandListImp { bool isSkippingInOrderBarrierAllowed(ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) const; void encodeMiFlush(uint64_t immediateDataGpuAddress, uint64_t immediateData, NEO::MiFlushArgs &args); - void updateInOrderExecInfo(size_t inOrderPatchIndex, std::shared_ptr *inOrderExecInfo); + void updateInOrderExecInfo(size_t inOrderPatchIndex, std::shared_ptr *inOrderExecInfo, bool disablePatchingFlag); void disablePatching(size_t inOrderPatchIndex); void enablePatching(size_t inOrderPatchIndex); @@ -359,6 +361,7 @@ struct CommandListCoreFamily : public CommandListImp { bool latestOperationRequiredNonWalkerInOrderCmdsChaining = false; bool duplicatedInOrderCounterStorageEnabled = false; bool inOrderAtomicSignalingEnabled = false; + bool allowCbWaitEventsNoopDispatch = false; }; template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index d9dbb60098..3204a0b800 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -8,6 +8,7 @@ #include "shared/source/built_ins/built_ins.h" #include "shared/source/command_container/encode_surface_state.h" #include "shared/source/command_stream/command_stream_receiver.h" +#include "shared/source/command_stream/linear_stream.h" #include "shared/source/device/device.h" #include "shared/source/direct_submission/relaxed_ordering_helper.h" #include "shared/source/execution_environment/execution_environment.h" @@ -162,7 +163,7 @@ void CommandListCoreFamily::handleInOrderDependencyCounter(Event } if (!isQwordInOrderCounter() && ((inOrderExecInfo->getCounterValue() + 1) == std::numeric_limits::max())) { - CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue() + 1, inOrderExecInfo->getAllocationOffset(), false, true, false); + CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue() + 1, inOrderExecInfo->getAllocationOffset(), false, true, false, false); inOrderExecInfo->resetCounterValue(); @@ -2316,7 +2317,7 @@ bool CommandListCoreFamily::handleInOrderImplicitDependencies(boo NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandContainer.getCommandStream()); } - CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), relaxedOrderingAllowed, true, false); + CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), relaxedOrderingAllowed, true, false, false); return true; } @@ -2399,7 +2400,8 @@ ze_result_t CommandListCoreFamily::appendSignalEvent(ze_event_han template void CommandListCoreFamily::appendWaitOnInOrderDependency(std::shared_ptr &inOrderExecInfo, CommandToPatchContainer *outListCommands, - uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency, bool skipAddingWaitEventsToResidency) { + uint64_t waitValue, uint32_t offset, + bool relaxedOrderingAllowed, bool implicitDependency, bool skipAddingWaitEventsToResidency, bool noopDispatch) { using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; UNRECOVERABLE_IF(waitValue > static_cast(std::numeric_limits::max()) && !isQwordInOrderCounter()); @@ -2416,6 +2418,7 @@ void CommandListCoreFamily::appendWaitOnInOrderDependency(std::sh } else { using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; + using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM; bool indirectMode = false; @@ -2425,11 +2428,23 @@ void CommandListCoreFamily::appendWaitOnInOrderDependency(std::sh constexpr uint32_t firstRegister = RegisterOffsets::csGprR0; constexpr uint32_t secondRegister = RegisterOffsets::csGprR0 + 4; - auto lri1 = NEO::LriHelper::program(commandContainer.getCommandStream(), firstRegister, getLowPart(waitValue), true); - auto lri2 = NEO::LriHelper::program(commandContainer.getCommandStream(), secondRegister, getHighPart(waitValue), true); + + auto lri1 = commandContainer.getCommandStream()->template getSpaceForCmd(); + auto lri2 = commandContainer.getCommandStream()->template getSpaceForCmd(); + + if (!noopDispatch) { + NEO::LriHelper::program(lri1, firstRegister, getLowPart(waitValue), true); + NEO::LriHelper::program(lri2, secondRegister, getHighPart(waitValue), true); + } else { + memset(lri1, 0, sizeof(MI_LOAD_REGISTER_IMM)); + memset(lri2, 0, sizeof(MI_LOAD_REGISTER_IMM)); + } if (inOrderExecInfo->isRegularCmdList()) { inOrderPatchListIndex = addCmdForPatching((implicitDependency ? nullptr : &inOrderExecInfo), lri1, lri2, waitValue, NEO::InOrderPatchCommandHelpers::PatchCmdType::lri64b); + if (noopDispatch) { + disablePatching(inOrderPatchListIndex); + } } if (outListCommands != nullptr) { auto &lri1ToPatch = outListCommands->emplace_back(); @@ -2448,11 +2463,18 @@ void CommandListCoreFamily::appendWaitOnInOrderDependency(std::sh auto semaphoreCommand = reinterpret_cast(commandContainer.getCommandStream()->getSpace(sizeof(MI_SEMAPHORE_WAIT))); - NEO::EncodeSemaphore::programMiSemaphoreWait(semaphoreCommand, gpuAddress, waitValue, COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, - false, true, isQwordInOrderCounter(), indirectMode); + if (!noopDispatch) { + NEO::EncodeSemaphore::programMiSemaphoreWait(semaphoreCommand, gpuAddress, waitValue, COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, + false, true, isQwordInOrderCounter(), indirectMode); + } else { + memset(semaphoreCommand, 0, sizeof(MI_SEMAPHORE_WAIT)); + } if (inOrderExecInfo->isRegularCmdList() && !isQwordInOrderCounter()) { inOrderPatchListIndex = addCmdForPatching((implicitDependency ? nullptr : &inOrderExecInfo), semaphoreCommand, nullptr, waitValue, NEO::InOrderPatchCommandHelpers::PatchCmdType::semaphore); + if (noopDispatch) { + disablePatching(inOrderPatchListIndex); + } } else { inOrderPatchListIndex = std::numeric_limits::max(); } @@ -2471,10 +2493,10 @@ void CommandListCoreFamily::appendWaitOnInOrderDependency(std::sh } template -bool CommandListCoreFamily::canSkipInOrderEventWait(Event &event) const { +bool CommandListCoreFamily::canSkipInOrderEventWait(Event &event, bool ignorCbEventBoundToCmdList) const { if (isInOrderExecutionEnabled()) { - return ((isImmediateType() && event.getLatestUsedCmdQueue() == this->cmdQImmediate) || // 1. Immediate CmdList can skip "regular Events" from the same CmdList - (event.isCounterBased() && event.getInOrderExecInfo().get() == inOrderExecInfo.get())); // 2. Both Immediate and Regular CmdLists can skip "CounterBased Events" from the same CmdList + return ((isImmediateType() && event.getLatestUsedCmdQueue() == this->cmdQImmediate) || // 1. Immediate CmdList can skip "regular Events" from the same CmdList + (isCbEventBoundToCmdList(&event) && !ignorCbEventBoundToCmdList)); // 2. Both Immediate and Regular CmdLists can skip "CounterBased Events" from the same CmdList } return false; @@ -2521,7 +2543,7 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu auto event = Event::fromHandle(phEvent[i]); if ((isImmediateType() && event->isAlreadyCompleted()) || - canSkipInOrderEventWait(*event)) { + canSkipInOrderEventWait(*event, this->allowCbWaitEventsNoopDispatch)) { continue; } @@ -2534,7 +2556,10 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu // 2. Immediate CmdList takes current value (with submission counter) auto waitValue = !isImmediateType() ? event->getInOrderExecBaseSignalValue() : event->getInOrderExecSignalValueWithSubmissionCounter(); - CommandListCoreFamily::appendWaitOnInOrderDependency(event->getInOrderExecInfo(), outWaitCmds, waitValue, event->getInOrderAllocationOffset(), relaxedOrderingAllowed, false, skipAddingWaitEventsToResidency); + CommandListCoreFamily::appendWaitOnInOrderDependency(event->getInOrderExecInfo(), outWaitCmds, + waitValue, event->getInOrderAllocationOffset(), + relaxedOrderingAllowed, false, skipAddingWaitEventsToResidency, + isCbEventBoundToCmdList(event)); continue; } @@ -3251,7 +3276,7 @@ bool CommandListCoreFamily::isSkippingInOrderBarrierAllowed(ze_ev uint32_t eventsToWait = numWaitEvents; for (uint32_t i = 0; i < numWaitEvents; i++) { - if (CommandListCoreFamily::canSkipInOrderEventWait(*Event::fromHandle(phWaitEvents[i]))) { + if (CommandListCoreFamily::canSkipInOrderEventWait(*Event::fromHandle(phWaitEvents[i]), false)) { eventsToWait--; } } @@ -3820,22 +3845,27 @@ void CommandListCoreFamily::encodeMiFlush(uint64_t immediateDataG } template -void CommandListCoreFamily::updateInOrderExecInfo(size_t inOrderPatchIndex, std::shared_ptr *inOrderExecInfo) { +void CommandListCoreFamily::updateInOrderExecInfo(size_t inOrderPatchIndex, std::shared_ptr *inOrderExecInfo, bool disablePatchingFlag) { auto &patchCmd = inOrderPatchCmds[inOrderPatchIndex]; patchCmd.updateInOrderExecInfo(inOrderExecInfo); - patchCmd.setSkipPatching(false); + patchCmd.setSkipPatching(disablePatchingFlag); } template -void CommandListCoreFamily::disablePatching(size_t inOrderPatchIndex) { +inline void CommandListCoreFamily::disablePatching(size_t inOrderPatchIndex) { auto &patchCmd = inOrderPatchCmds[inOrderPatchIndex]; patchCmd.setSkipPatching(true); } template -void CommandListCoreFamily::enablePatching(size_t inOrderPatchIndex) { +inline void CommandListCoreFamily::enablePatching(size_t inOrderPatchIndex) { auto &patchCmd = inOrderPatchCmds[inOrderPatchIndex]; patchCmd.setSkipPatching(false); } +template +inline bool CommandListCoreFamily::isCbEventBoundToCmdList(Event *event) const { + return event->isCounterBased() && event->getInOrderExecInfo().get() == inOrderExecInfo.get(); +} + } // namespace L0 diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 83333b39c7..f4c8fa244b 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -33,6 +33,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::addCmdForPatching; using BaseClass::addFlushRequiredCommand; using BaseClass::allocateOrReuseKernelPrivateMemoryIfNeeded; + using BaseClass::allowCbWaitEventsNoopDispatch; using BaseClass::appendBlitFill; using BaseClass::appendCopyImageBlit; using BaseClass::appendDispatchOffsetRegister; @@ -168,6 +169,7 @@ struct WhiteBox> using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; using BaseClass = L0::CommandListCoreFamilyImmediate; using BaseClass::addCmdForPatching; + using BaseClass::allowCbWaitEventsNoopDispatch; using BaseClass::appendBlitFill; using BaseClass::appendLaunchKernelWithParams; using BaseClass::appendMemoryCopyBlitRegion; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp index 79116a5cfe..0e45282f85 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp @@ -5757,9 +5757,67 @@ HWTEST2_F(InOrderRegularCmdListTests, givenAddedCmdForPatchWhenUpdateNewInOrderI inOrderRegularCmdList->inOrderPatchCmds[0].patch(3); EXPECT_EQ(4u, semaphoreCmd.getSemaphoreDataDword()); - inOrderRegularCmdList->updateInOrderExecInfo(0, &inOrderExecInfo2); + inOrderRegularCmdList->updateInOrderExecInfo(0, &inOrderExecInfo2, false); inOrderRegularCmdList->inOrderPatchCmds[0].patch(3); EXPECT_EQ(6u, semaphoreCmd.getSemaphoreDataDword()); + + inOrderExecInfo->addRegularCmdListSubmissionCounter(1); + inOrderRegularCmdList->updateInOrderExecInfo(0, &inOrderExecInfo, true); + inOrderRegularCmdList->inOrderPatchCmds[0].patch(3); + EXPECT_EQ(6u, semaphoreCmd.getSemaphoreDataDword()); + + inOrderRegularCmdList->enablePatching(0); + inOrderRegularCmdList->inOrderPatchCmds[0].patch(3); + EXPECT_EQ(5u, semaphoreCmd.getSemaphoreDataDword()); +} + +HWTEST2_F(InOrderCmdListTests, givenInOrderModeAndNoopWaitEventsAllowedWhenEventBoundToCmdListThenNoopSpaceForWaitCommands, IsAtLeastXeHpCore) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + + char noopedLriBuffer[sizeof(MI_LOAD_REGISTER_IMM)] = {}; + memset(noopedLriBuffer, 0, sizeof(MI_LOAD_REGISTER_IMM)); + char noopedSemWaitBuffer[sizeof(MI_SEMAPHORE_WAIT)] = {}; + memset(noopedSemWaitBuffer, 0, sizeof(MI_SEMAPHORE_WAIT)); + + auto regularCmdList = createRegularCmdList(false); + regularCmdList->allowCbWaitEventsNoopDispatch = true; + + auto eventPool = createEvents(1, false); + auto eventHandle = events[0]->toHandle(); + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + + auto result = regularCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, eventHandle, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + CommandToPatchContainer outCbWaitEventCmds; + launchParams.outListCommands = &outCbWaitEventCmds; + result = regularCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 1, &eventHandle, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + size_t expectedLoadRegImmCount = FamilyType::isQwordInOrderCounter ? 2 : 0; + + size_t expectedWaitCmds = 1 + expectedLoadRegImmCount; + ASSERT_EQ(expectedWaitCmds, outCbWaitEventCmds.size()); + + size_t outCbWaitEventCmdsIndex = 0; + for (; outCbWaitEventCmdsIndex < expectedLoadRegImmCount; outCbWaitEventCmdsIndex++) { + EXPECT_EQ(CommandToPatch::CbWaitEventLoadRegisterImm, outCbWaitEventCmds[outCbWaitEventCmdsIndex].type); + auto registerNumber = 0x2600 + (4 * outCbWaitEventCmdsIndex); + EXPECT_EQ(registerNumber, outCbWaitEventCmds[outCbWaitEventCmdsIndex].offset); + + ASSERT_NE(nullptr, outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination); + auto memCmpRet = memcmp(outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination, noopedLriBuffer, sizeof(MI_LOAD_REGISTER_IMM)); + EXPECT_EQ(0, memCmpRet); + } + + EXPECT_EQ(CommandToPatch::CbWaitEventSemaphoreWait, outCbWaitEventCmds[outCbWaitEventCmdsIndex].type); + + ASSERT_NE(nullptr, outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination); + auto memCmpRet = memcmp(outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination, noopedSemWaitBuffer, sizeof(MI_SEMAPHORE_WAIT)); + EXPECT_EQ(0, memCmpRet); } } // namespace ult