performance: Signal inOrder counter with pipe control, part 5

On platforms with DC flush support, resolve the in-order implicit
dependency with a pipe control.

Related-To: NEO-13441

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2025-01-10 10:01:02 +00:00
committed by Compute-Runtime-Automation
parent 112abeeeef
commit 3735ccaed7
3 changed files with 96 additions and 55 deletions

View File

@@ -2623,75 +2623,88 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWaitOnInOrderDependency(std::sh
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddress, waitValue, NEO::CompareOperation::less, true, isQwordInOrderCounter(), isCopyOnly(copyOffloadOperation));
} else {
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
auto resolveDependenciesViaPipeControls = !this->isCopyOnly(copyOffloadOperation) && !this->asMutable() && implicitDependency && this->dcFlushSupport;
bool indirectMode = false;
if (NEO::debugManager.flags.ResolveDependenciesViaPipeControls.get() != -1) {
resolveDependenciesViaPipeControls = NEO::debugManager.flags.ResolveDependenciesViaPipeControls.get();
}
size_t inOrderPatchListIndex = std::numeric_limits<size_t>::max();
if (isQwordInOrderCounter()) {
indirectMode = true;
if (resolveDependenciesViaPipeControls) {
NEO::PipeControlArgs args;
args.csStallOnly = true;
NEO::MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(*commandContainer.getCommandStream(), args);
break;
} else {
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
constexpr uint32_t firstRegister = RegisterOffsets::csGprR0;
constexpr uint32_t secondRegister = RegisterOffsets::csGprR0 + 4;
bool indirectMode = false;
auto lri1 = commandContainer.getCommandStream()->template getSpaceForCmd<MI_LOAD_REGISTER_IMM>();
auto lri2 = commandContainer.getCommandStream()->template getSpaceForCmd<MI_LOAD_REGISTER_IMM>();
size_t inOrderPatchListIndex = std::numeric_limits<size_t>::max();
if (isQwordInOrderCounter()) {
indirectMode = true;
if (!noopDispatch) {
NEO::LriHelper<GfxFamily>::program(lri1, firstRegister, getLowPart(waitValue), true, isCopyOnly(copyOffloadOperation));
NEO::LriHelper<GfxFamily>::program(lri2, secondRegister, getHighPart(waitValue), true, isCopyOnly(copyOffloadOperation));
} else {
memset(lri1, 0, sizeof(MI_LOAD_REGISTER_IMM));
memset(lri2, 0, sizeof(MI_LOAD_REGISTER_IMM));
constexpr uint32_t firstRegister = RegisterOffsets::csGprR0;
constexpr uint32_t secondRegister = RegisterOffsets::csGprR0 + 4;
auto lri1 = commandContainer.getCommandStream()->template getSpaceForCmd<MI_LOAD_REGISTER_IMM>();
auto lri2 = commandContainer.getCommandStream()->template getSpaceForCmd<MI_LOAD_REGISTER_IMM>();
if (!noopDispatch) {
NEO::LriHelper<GfxFamily>::program(lri1, firstRegister, getLowPart(waitValue), true, isCopyOnly(copyOffloadOperation));
NEO::LriHelper<GfxFamily>::program(lri2, secondRegister, getHighPart(waitValue), true, isCopyOnly(copyOffloadOperation));
} else {
memset(lri1, 0, sizeof(MI_LOAD_REGISTER_IMM));
memset(lri2, 0, sizeof(MI_LOAD_REGISTER_IMM));
}
if (inOrderExecInfo->isRegularCmdList()) {
inOrderPatchListIndex = addCmdForPatching((implicitDependency ? nullptr : &inOrderExecInfo), lri1, lri2, waitValue, NEO::InOrderPatchCommandHelpers::PatchCmdType::lri64b);
if (noopDispatch) {
disablePatching(inOrderPatchListIndex);
}
}
if (outListCommands != nullptr) {
auto &lri1ToPatch = outListCommands->emplace_back();
lri1ToPatch.type = CommandToPatch::CbWaitEventLoadRegisterImm;
lri1ToPatch.pDestination = lri1;
lri1ToPatch.inOrderPatchListIndex = inOrderPatchListIndex;
lri1ToPatch.offset = firstRegister;
auto &lri2ToPatch = outListCommands->emplace_back();
lri2ToPatch.type = CommandToPatch::CbWaitEventLoadRegisterImm;
lri2ToPatch.pDestination = lri2;
lri2ToPatch.inOrderPatchListIndex = inOrderPatchListIndex;
lri2ToPatch.offset = secondRegister;
}
}
if (inOrderExecInfo->isRegularCmdList()) {
inOrderPatchListIndex = addCmdForPatching((implicitDependency ? nullptr : &inOrderExecInfo), lri1, lri2, waitValue, NEO::InOrderPatchCommandHelpers::PatchCmdType::lri64b);
auto semaphoreCommand = reinterpret_cast<MI_SEMAPHORE_WAIT *>(commandContainer.getCommandStream()->getSpace(sizeof(MI_SEMAPHORE_WAIT)));
if (!noopDispatch) {
NEO::EncodeSemaphore<GfxFamily>::programMiSemaphoreWait(semaphoreCommand, gpuAddress, waitValue, COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD,
false, true, isQwordInOrderCounter(), indirectMode, false);
} else {
memset(semaphoreCommand, 0, sizeof(MI_SEMAPHORE_WAIT));
}
if (inOrderExecInfo->isRegularCmdList() && !isQwordInOrderCounter()) {
inOrderPatchListIndex = addCmdForPatching((implicitDependency ? nullptr : &inOrderExecInfo), semaphoreCommand, nullptr, waitValue, NEO::InOrderPatchCommandHelpers::PatchCmdType::semaphore);
if (noopDispatch) {
disablePatching(inOrderPatchListIndex);
}
} else {
inOrderPatchListIndex = std::numeric_limits<size_t>::max();
}
if (outListCommands != nullptr) {
auto &lri1ToPatch = outListCommands->emplace_back();
lri1ToPatch.type = CommandToPatch::CbWaitEventLoadRegisterImm;
lri1ToPatch.pDestination = lri1;
lri1ToPatch.inOrderPatchListIndex = inOrderPatchListIndex;
lri1ToPatch.offset = firstRegister;
auto &lri2ToPatch = outListCommands->emplace_back();
lri2ToPatch.type = CommandToPatch::CbWaitEventLoadRegisterImm;
lri2ToPatch.pDestination = lri2;
lri2ToPatch.inOrderPatchListIndex = inOrderPatchListIndex;
lri2ToPatch.offset = secondRegister;
auto &semaphoreWaitPatch = outListCommands->emplace_back();
semaphoreWaitPatch.type = CommandToPatch::CbWaitEventSemaphoreWait;
semaphoreWaitPatch.pDestination = semaphoreCommand;
semaphoreWaitPatch.offset = i * immWriteOffset;
semaphoreWaitPatch.inOrderPatchListIndex = inOrderPatchListIndex;
}
}
auto semaphoreCommand = reinterpret_cast<MI_SEMAPHORE_WAIT *>(commandContainer.getCommandStream()->getSpace(sizeof(MI_SEMAPHORE_WAIT)));
if (!noopDispatch) {
NEO::EncodeSemaphore<GfxFamily>::programMiSemaphoreWait(semaphoreCommand, gpuAddress, waitValue, COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD,
false, true, isQwordInOrderCounter(), indirectMode, false);
} else {
memset(semaphoreCommand, 0, sizeof(MI_SEMAPHORE_WAIT));
}
if (inOrderExecInfo->isRegularCmdList() && !isQwordInOrderCounter()) {
inOrderPatchListIndex = addCmdForPatching((implicitDependency ? nullptr : &inOrderExecInfo), semaphoreCommand, nullptr, waitValue, NEO::InOrderPatchCommandHelpers::PatchCmdType::semaphore);
if (noopDispatch) {
disablePatching(inOrderPatchListIndex);
}
} else {
inOrderPatchListIndex = std::numeric_limits<size_t>::max();
}
if (outListCommands != nullptr) {
auto &semaphoreWaitPatch = outListCommands->emplace_back();
semaphoreWaitPatch.type = CommandToPatch::CbWaitEventSemaphoreWait;
semaphoreWaitPatch.pDestination = semaphoreCommand;
semaphoreWaitPatch.offset = i * immWriteOffset;
semaphoreWaitPatch.inOrderPatchListIndex = inOrderPatchListIndex;
}
}
gpuAddress += immWriteOffset;

View File

@@ -70,6 +70,7 @@ struct InOrderCmdListFixture : public ::Test<ModuleFixture> {
void SetUp() override {
NEO::debugManager.flags.ForcePreemptionMode.set(static_cast<int32_t>(NEO::PreemptionMode::Disabled));
NEO::debugManager.flags.ResolveDependenciesViaPipeControls.set(0u);
::Test<ModuleFixture>::SetUp();
createKernel();

View File

@@ -977,6 +977,33 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenSubmittingThenProgramSemaphor
ASSERT_TRUE(verifyInOrderDependency<FamilyType>(itor, 1, immCmdList->inOrderExecInfo->getBaseDeviceAddress() + counterOffset, immCmdList->isQwordInOrderCounter(), false));
}
HWTEST2_F(InOrderCmdListTests, givenResolveDependenciesViaPipeControlsForInOrderModeWhenSubmittingThenProgramPipeControlInBetweenDispatches, IsAtLeastXeHpCore) {
DebugManagerStateRestore restorer;
NEO::debugManager.flags.ResolveDependenciesViaPipeControls.set(1);
uint32_t counterOffset = 64;
auto immCmdList = createImmCmdList<gfxCoreFamily>();
immCmdList->inOrderExecInfo->setAllocationOffset(counterOffset);
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
auto offset = cmdStream->getUsed();
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
cmdList,
ptrOffset(cmdStream->getCpuBase(), offset),
cmdStream->getUsed() - offset));
auto itor = find<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), itor);
}
HWTEST2_F(InOrderCmdListTests, givenDependencyFromDifferentRootDeviceWhenAppendCalledThenCreatePeerAllocation, MatchAny) {
NEO::UltDeviceFactory deviceFactory{2, 0};