performance: Modify wait flow when signal event is used for sub copy

Related-To: NEO-13003

Signed-off-by: Bellekallu Rajkiran <bellekallu.rajkiran@intel.com>
This commit is contained in:
Bellekallu Rajkiran
2025-11-14 02:26:46 +00:00
committed by Compute-Runtime-Automation
parent 7861610e52
commit 791558ba74
5 changed files with 52 additions and 12 deletions

View File

@@ -185,11 +185,21 @@ void CommandListCoreFamily<gfxCoreFamily>::handleInOrderDependencyCounter(Event
this->addResidency(inOrderExecInfo->getDeviceCounterAllocation(), inOrderExecInfo->getHostCounterAllocation());
if (signalEvent && signalEvent->getInOrderIncrementValue(this->partitionCount) == 0) {
if (signalEvent->isCounterBased() || nonWalkerInOrderCmdsChaining || (isImmediateType() && this->duplicatedInOrderCounterStorageEnabled)) {
assignInOrderExecInfoToEvent(signalEvent);
if (signalEvent) {
if (signalEvent->getInOrderIncrementValue(this->partitionCount) == 0) {
if (signalEvent->isCounterBased() || nonWalkerInOrderCmdsChaining || (isImmediateType() && this->duplicatedInOrderCounterStorageEnabled)) {
assignInOrderExecInfoToEvent(signalEvent);
} else {
signalEvent->unsetInOrderExecInfo();
}
} else {
signalEvent->unsetInOrderExecInfo();
auto incrementValue = signalEvent->getInOrderIncrementValue(1);
auto currentUsage = signalEvent->getInOrderExecInfo()->getAggregatedEventUsageCounter();
if ((currentUsage + incrementValue) > signalEvent->getInOrderExecBaseSignalValue()) {
signalEvent->getInOrderExecInfo()->resetAggregatedEventUsageCounter();
}
signalEvent->getInOrderExecInfo()->addAggregatedEventUsageCounter(incrementValue);
}
}

View File

@@ -120,14 +120,12 @@ ze_result_t BcsSplit::appendSplitCall(CommandListCoreFamilyImmediate<gfxCoreFami
const bool dualStreamCopyOffload = cmdList->isDualStreamCopyOffloadOperation(cmdList->isCopyOffloadEnabled());
if (useSignalEventForSubcopy && cmdList->isInOrderExecutionEnabled()) {
for (size_t i = 0; i < cmdListsForSplit.size(); i++) {
auto subCmdList = static_cast<CommandListCoreFamilyImmediate<gfxCoreFamily> *>(cmdListsForSplit[i]);
auto &subInOrderExecInfo = subCmdList->getInOrderExecInfo();
cmdList->appendWaitOnInOrderDependency(subInOrderExecInfo, nullptr,
subInOrderExecInfo->getCounterValue(),
subInOrderExecInfo->getAllocationOffset(),
hasRelaxedOrderingDependencies, false, false, false, dualStreamCopyOffload);
}
auto currentCounter = signalEvent->getInOrderExecInfo()->getAggregatedEventUsageCounter();
auto expectedCounter = currentCounter + signalEvent->getInOrderIncrementValue(1);
cmdList->appendWaitOnInOrderDependency(signalEvent->getInOrderExecInfo(), nullptr,
expectedCounter,
signalEvent->getInOrderAllocationOffset(),
hasRelaxedOrderingDependencies, false, false, false, dualStreamCopyOffload);
}
if (!useSignalEventForSubcopy) {

View File

@@ -120,6 +120,7 @@ void InOrderExecInfo::initializeAllocationsFromHost() {
void InOrderExecInfo::reset() {
resetCounterValue();
regularCmdListSubmissionCounter = 0;
aggregatedEventUsageCounter = 0;
allocationOffset = 0;
initializeAllocationsFromHost();

View File

@@ -88,6 +88,10 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
uint64_t getRegularCmdListSubmissionCounter() const { return regularCmdListSubmissionCounter; }
void addRegularCmdListSubmissionCounter(uint64_t addValue) { regularCmdListSubmissionCounter += addValue; }
uint64_t getAggregatedEventUsageCounter() const { return aggregatedEventUsageCounter; }
void addAggregatedEventUsageCounter(uint64_t addValue) { aggregatedEventUsageCounter += addValue; }
void resetAggregatedEventUsageCounter() { aggregatedEventUsageCounter = 0; }
bool isRegularCmdList() const { return regularCmdList; }
bool isHostStorageDuplicated() const { return duplicatedHostStorage; }
bool isAtomicDeviceSignalling() const { return atomicDeviceSignalling; }
@@ -132,6 +136,7 @@ class InOrderExecInfo : public NEO::NonCopyableClass {
uint64_t counterValue = 0;
// Zeroed by reset(); increased through addRegularCmdListSubmissionCounter().
uint64_t regularCmdListSubmissionCounter = 0;
// Zeroed by reset() and resetAggregatedEventUsageCounter(); increased through addAggregatedEventUsageCounter().
uint64_t aggregatedEventUsageCounter = 0;
uint64_t deviceAddress = 0;
uint64_t *hostAddress = nullptr;
uint32_t numDevicePartitionsToWait = 0;

View File

@@ -1066,3 +1066,29 @@ HWTEST_F(CommandEncoderTests, whenGetScratchPtrOffsetOfImplicitArgsIsCalledThenZ
auto scratchOffset = EncodeDispatchKernel<FamilyType>::getScratchPtrOffsetOfImplicitArgs();
EXPECT_EQ(0u, scratchOffset);
}
// Exercises the aggregated event usage counter accessors of InOrderExecInfo:
// the counter reads zero after creation, add calls accumulate, and reset
// returns it to zero so that later add calls start from zero again.
HWTEST_F(CommandEncoderTests, givenInOrderExecInfoWhenAggregatedEventUsageCounterIsUsedThenVerifyCorrectBehavior) {
    MockDevice mockDevice;

    // A local 64-bit value supplies the host pointer argument and is also passed by value.
    uint64_t hostCounter = 20;
    uint64_t *hostCounterPtr = &hostCounter;
    uint64_t deviceVa = castToUint64(ptrOffset(&hostCounter, 64));
    MockGraphicsAllocation deviceCounterAlloc(nullptr, deviceVa, 1);

    auto execInfo = InOrderExecInfo::createFromExternalAllocation(mockDevice, &deviceCounterAlloc, deviceVa, nullptr, hostCounterPtr, hostCounter, 1, 1);

    // Shorthand for reading the counter under test.
    auto usage = [&execInfo]() { return execInfo->getAggregatedEventUsageCounter(); };

    // Freshly created object reports zero.
    EXPECT_EQ(0u, usage());

    // Consecutive additions accumulate.
    execInfo->addAggregatedEventUsageCounter(5);
    EXPECT_EQ(5u, usage());
    execInfo->addAggregatedEventUsageCounter(10);
    EXPECT_EQ(15u, usage());

    // Reset clears the accumulated value.
    execInfo->resetAggregatedEventUsageCounter();
    EXPECT_EQ(0u, usage());

    // Additions after a reset start from zero.
    execInfo->addAggregatedEventUsageCounter(7);
    EXPECT_EQ(7u, usage());
}