From 791558ba7438717094ea81b152e62c8fc804efe3 Mon Sep 17 00:00:00 2001 From: Bellekallu Rajkiran Date: Fri, 14 Nov 2025 02:26:46 +0000 Subject: [PATCH] performance: Modify wait flow when signal event is used for sub copy Related-To: NEO-13003 Signed-off-by: Bellekallu Rajkiran --- level_zero/core/source/cmdlist/cmdlist_hw.inl | 18 ++++++++++--- level_zero/core/source/device/bcs_split.inl | 14 +++++----- .../source/helpers/in_order_cmd_helpers.cpp | 1 + shared/source/helpers/in_order_cmd_helpers.h | 5 ++++ .../command_encoder_tests.cpp | 26 +++++++++++++++++++ 5 files changed, 52 insertions(+), 12 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index e48cb0b2e8..dcfd196ab4 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -185,11 +185,21 @@ void CommandListCoreFamily::handleInOrderDependencyCounter(Event this->addResidency(inOrderExecInfo->getDeviceCounterAllocation(), inOrderExecInfo->getHostCounterAllocation()); - if (signalEvent && signalEvent->getInOrderIncrementValue(this->partitionCount) == 0) { - if (signalEvent->isCounterBased() || nonWalkerInOrderCmdsChaining || (isImmediateType() && this->duplicatedInOrderCounterStorageEnabled)) { - assignInOrderExecInfoToEvent(signalEvent); + if (signalEvent) { + if (signalEvent->getInOrderIncrementValue(this->partitionCount) == 0) { + if (signalEvent->isCounterBased() || nonWalkerInOrderCmdsChaining || (isImmediateType() && this->duplicatedInOrderCounterStorageEnabled)) { + assignInOrderExecInfoToEvent(signalEvent); + } else { + signalEvent->unsetInOrderExecInfo(); + } } else { - signalEvent->unsetInOrderExecInfo(); + auto incrementValue = signalEvent->getInOrderIncrementValue(1); + auto currentUsage = signalEvent->getInOrderExecInfo()->getAggregatedEventUsageCounter(); + + if ((currentUsage + incrementValue) > signalEvent->getInOrderExecBaseSignalValue()) { + signalEvent->getInOrderExecInfo()->resetAggregatedEventUsageCounter(); + } + signalEvent->getInOrderExecInfo()->addAggregatedEventUsageCounter(incrementValue); } } diff --git a/level_zero/core/source/device/bcs_split.inl b/level_zero/core/source/device/bcs_split.inl index 49cfdf4b5e..0b21851a5f 100644 --- a/level_zero/core/source/device/bcs_split.inl +++ b/level_zero/core/source/device/bcs_split.inl @@ -120,14 +120,12 @@ ze_result_t BcsSplit::appendSplitCall(CommandListCoreFamilyImmediateisDualStreamCopyOffloadOperation(cmdList->isCopyOffloadEnabled()); if (useSignalEventForSubcopy && cmdList->isInOrderExecutionEnabled()) { - for (size_t i = 0; i < cmdListsForSplit.size(); i++) { - auto subCmdList = static_cast *>(cmdListsForSplit[i]); - auto &subInOrderExecInfo = subCmdList->getInOrderExecInfo(); - cmdList->appendWaitOnInOrderDependency(subInOrderExecInfo, nullptr, - subInOrderExecInfo->getCounterValue(), - subInOrderExecInfo->getAllocationOffset(), - hasRelaxedOrderingDependencies, false, false, false, dualStreamCopyOffload); - } + auto currentCounter = signalEvent->getInOrderExecInfo()->getAggregatedEventUsageCounter(); + auto expectedCounter = currentCounter + signalEvent->getInOrderIncrementValue(1); + cmdList->appendWaitOnInOrderDependency(signalEvent->getInOrderExecInfo(), nullptr, + expectedCounter, + signalEvent->getInOrderAllocationOffset(), + hasRelaxedOrderingDependencies, false, false, false, dualStreamCopyOffload); } if (!useSignalEventForSubcopy) { diff --git a/shared/source/helpers/in_order_cmd_helpers.cpp b/shared/source/helpers/in_order_cmd_helpers.cpp index b48200b501..5db2986f6e 100644 --- a/shared/source/helpers/in_order_cmd_helpers.cpp +++ b/shared/source/helpers/in_order_cmd_helpers.cpp @@ -120,6 +120,7 @@ void InOrderExecInfo::initializeAllocationsFromHost() { void InOrderExecInfo::reset() { resetCounterValue(); regularCmdListSubmissionCounter = 0; + aggregatedEventUsageCounter = 0; allocationOffset = 0; initializeAllocationsFromHost(); diff --git a/shared/source/helpers/in_order_cmd_helpers.h b/shared/source/helpers/in_order_cmd_helpers.h index abab7404d8..42187a0afe 100644 --- a/shared/source/helpers/in_order_cmd_helpers.h +++ b/shared/source/helpers/in_order_cmd_helpers.h @@ -88,6 +88,10 @@ class InOrderExecInfo : public NEO::NonCopyableClass { uint64_t getRegularCmdListSubmissionCounter() const { return regularCmdListSubmissionCounter; } void addRegularCmdListSubmissionCounter(uint64_t addValue) { regularCmdListSubmissionCounter += addValue; } + uint64_t getAggregatedEventUsageCounter() const { return aggregatedEventUsageCounter; } + void addAggregatedEventUsageCounter(uint64_t addValue) { aggregatedEventUsageCounter += addValue; } + void resetAggregatedEventUsageCounter() { aggregatedEventUsageCounter = 0; } + bool isRegularCmdList() const { return regularCmdList; } bool isHostStorageDuplicated() const { return duplicatedHostStorage; } bool isAtomicDeviceSignalling() const { return atomicDeviceSignalling; } @@ -132,6 +136,7 @@ class InOrderExecInfo : public NEO::NonCopyableClass { uint64_t counterValue = 0; uint64_t regularCmdListSubmissionCounter = 0; + uint64_t aggregatedEventUsageCounter = 0; uint64_t deviceAddress = 0; uint64_t *hostAddress = nullptr; uint32_t numDevicePartitionsToWait = 0; diff --git a/shared/test/unit_test/command_container/command_encoder_tests.cpp b/shared/test/unit_test/command_container/command_encoder_tests.cpp index f115961f41..a6150436b6 100644 --- a/shared/test/unit_test/command_container/command_encoder_tests.cpp +++ b/shared/test/unit_test/command_container/command_encoder_tests.cpp @@ -1066,3 +1066,29 @@ HWTEST_F(CommandEncoderTests, whenGetScratchPtrOffsetOfImplicitArgsIsCalledThenZ auto scratchOffset = EncodeDispatchKernel::getScratchPtrOffsetOfImplicitArgs(); EXPECT_EQ(0u, scratchOffset); } + +HWTEST_F(CommandEncoderTests, givenInOrderExecInfoWhenAggregatedEventUsageCounterIsUsedThenVerifyCorrectBehavior) { + MockDevice mockDevice; + + uint64_t counterValue = 20; + uint64_t *hostAddress = &counterValue; + uint64_t gpuAddress = castToUint64(ptrOffset(&counterValue, 64)); + + MockGraphicsAllocation deviceAlloc(nullptr, gpuAddress, 1); + + auto inOrderExecInfo = InOrderExecInfo::createFromExternalAllocation(mockDevice, &deviceAlloc, gpuAddress, nullptr, hostAddress, counterValue, 1, 1); + + EXPECT_EQ(0u, inOrderExecInfo->getAggregatedEventUsageCounter()); + + inOrderExecInfo->addAggregatedEventUsageCounter(5); + EXPECT_EQ(5u, inOrderExecInfo->getAggregatedEventUsageCounter()); + + inOrderExecInfo->addAggregatedEventUsageCounter(10); + EXPECT_EQ(15u, inOrderExecInfo->getAggregatedEventUsageCounter()); + + inOrderExecInfo->resetAggregatedEventUsageCounter(); + EXPECT_EQ(0u, inOrderExecInfo->getAggregatedEventUsageCounter()); + + inOrderExecInfo->addAggregatedEventUsageCounter(7); + EXPECT_EQ(7u, inOrderExecInfo->getAggregatedEventUsageCounter()); +}