feature: support to use mi_atomic for signalling in-order counter

Related-To: NEO-7966

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-11-30 09:47:58 +00:00
committed by Compute-Runtime-Automation
parent 35ead06765
commit 2c921ec940
6 changed files with 117 additions and 14 deletions

View File

@@ -340,6 +340,9 @@ struct CommandListCoreFamily : public CommandListImp {
void addCmdForPatching(std::shared_ptr<InOrderExecInfo> *externalInOrderExecInfo, void *cmd1, void *cmd2, uint64_t counterValue, InOrderPatchCommandHelpers::PatchCmdType patchCmdType);
// Returns true only when the InOrderAtomicSignallingEnabled debug flag is set to 1
// and the command list partition count is greater than 1.
bool inOrderAtomicSignallingEnabled() const;
// Value added to the in-order counter for each signal: the partition count when
// atomic signalling is enabled, otherwise 1.
uint64_t getInOrderIncrementValue() const;
InOrderPatchCommandsContainer<GfxFamily> inOrderPatchCmds;
uint64_t latestHostWaitedInOrderSyncValue = 0;

View File

@@ -174,10 +174,10 @@ void CommandListCoreFamily<gfxCoreFamily>::handleInOrderDependencyCounter(Event
UNRECOVERABLE_IF(inOrderAllocationOffset + offset >= inOrderExecInfo->getDeviceCounterAllocation().getUnderlyingBufferSize());
CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(nullptr); // write 1 on new offset
CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(nullptr); // signal counter on new offset
}
inOrderExecInfo->addCounterValue(1);
inOrderExecInfo->addCounterValue(getInOrderIncrementValue());
this->commandContainer.addToResidencyContainer(&inOrderExecInfo->getDeviceCounterAllocation());
@@ -2554,21 +2554,30 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendSignalInOrderDependencyCounter(Event *signalEvent) {
using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM;
uint64_t signalValue = inOrderExecInfo->getCounterValue() + 1;
uint64_t gpuVa = inOrderExecInfo->getDeviceCounterAllocation().getGpuAddress() + this->inOrderAllocationOffset;
auto cmdStream = commandContainer.getCommandStream();
auto miStoreCmd = reinterpret_cast<MI_STORE_DATA_IMM *>(commandContainer.getCommandStream()->getSpace(sizeof(MI_STORE_DATA_IMM)));
if (inOrderAtomicSignallingEnabled()) {
using ATOMIC_OPCODES = typename GfxFamily::MI_ATOMIC::ATOMIC_OPCODES;
using DATA_SIZE = typename GfxFamily::MI_ATOMIC::DATA_SIZE;
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(miStoreCmd, gpuVa, getLowPart(signalValue), getHighPart(signalValue),
isQwordInOrderCounter(), (this->partitionCount > 1));
NEO::EncodeAtomic<GfxFamily>::programMiAtomic(*cmdStream, gpuVa, ATOMIC_OPCODES::ATOMIC_8B_INCREMENT,
DATA_SIZE::DATA_SIZE_QWORD, 0, 0, 0, 0);
} else {
using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM;
addCmdForPatching(nullptr, miStoreCmd, nullptr, signalValue, InOrderPatchCommandHelpers::PatchCmdType::Sdi);
uint64_t signalValue = inOrderExecInfo->getCounterValue() + 1;
auto miStoreCmd = reinterpret_cast<MI_STORE_DATA_IMM *>(cmdStream->getSpace(sizeof(MI_STORE_DATA_IMM)));
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(miStoreCmd, gpuVa, getLowPart(signalValue), getHighPart(signalValue),
isQwordInOrderCounter(), (this->partitionCount > 1));
addCmdForPatching(nullptr, miStoreCmd, nullptr, signalValue, InOrderPatchCommandHelpers::PatchCmdType::Sdi);
}
if ((NEO::debugManager.flags.ProgramUserInterruptOnResolvedDependency.get() == 1) && signalEvent && signalEvent->isKmdWaitModeEnabled()) {
NEO::EnodeUserInterrupt<GfxFamily>::encode(*commandContainer.getCommandStream());
NEO::EnodeUserInterrupt<GfxFamily>::encode(*cmdStream);
}
}
@@ -3661,4 +3670,18 @@ bool CommandListCoreFamily<gfxCoreFamily>::handleCounterBasedEventOperations(Eve
return true;
}
// Tells whether the in-order counter is signalled with an atomic GPU command.
// This requires both: the InOrderAtomicSignallingEnabled debug flag equal to 1
// and more than one partition on this command list.
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamily<gfxCoreFamily>::inOrderAtomicSignallingEnabled() const {
    const bool requestedByDebugFlag = (NEO::debugManager.flags.InOrderAtomicSignallingEnabled.get() == 1);

    // Partition count is queried only when the flag requested atomic signalling.
    return requestedByDebugFlag && (this->getPartitionCount() > 1);
}
// Returns the step by which the in-order counter grows per signal:
// the partition count when atomic signalling is enabled, 1 otherwise.
template <GFXCORE_FAMILY gfxCoreFamily>
uint64_t CommandListCoreFamily<gfxCoreFamily>::getInOrderIncrementValue() const {
    if (inOrderAtomicSignallingEnabled()) {
        return this->getPartitionCount();
    }

    return 1;
}
} // namespace L0

View File

@@ -75,11 +75,13 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
using BaseClass::getAllocationFromHostPtrMap;
using BaseClass::getDcFlushRequired;
using BaseClass::getHostPtrAlloc;
using BaseClass::getInOrderIncrementValue;
using BaseClass::hostPtrMap;
using BaseClass::immediateCmdListHeapSharing;
using BaseClass::indirectAllocationsAllowed;
using BaseClass::initialize;
using BaseClass::inOrderAllocationOffset;
using BaseClass::inOrderAtomicSignallingEnabled;
using BaseClass::inOrderExecInfo;
using BaseClass::inOrderPatchCmds;
using BaseClass::isFlushTaskSubmissionEnabled;
@@ -181,8 +183,10 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
using BaseClass::frontEndStateTracking;
using BaseClass::getDcFlushRequired;
using BaseClass::getHostPtrAlloc;
using BaseClass::getInOrderIncrementValue;
using BaseClass::hostSynchronize;
using BaseClass::immediateCmdListHeapSharing;
using BaseClass::inOrderAtomicSignallingEnabled;
using BaseClass::inOrderExecInfo;
using BaseClass::inOrderPatchCmds;
using BaseClass::isBcsSplitNeeded;

View File

@@ -1204,6 +1204,18 @@ HWTEST2_F(InOrderCmdListTests, givenRegularEventWithInOrderExecInfoWhenReusedOnR
EXPECT_EQ(nullptr, events[0]->inOrderExecInfo.get());
}
// For a command list created without a multi-partition setup, atomic signalling must be
// reported as disabled and the increment must be 1, regardless of the debug flag.
HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetAndSingleTileCmdListWhenAskingForAtomicSignallingThenReturnFalse, IsAtLeastSkl) {
    auto singleTileCmdList = createImmCmdList<gfxCoreFamily>();

    auto expectNonAtomicSignalling = [&singleTileCmdList]() {
        EXPECT_FALSE(singleTileCmdList->inOrderAtomicSignallingEnabled());
        EXPECT_EQ(1u, singleTileCmdList->getInOrderIncrementValue());
    };

    // Debug flag untouched.
    expectNonAtomicSignalling();

    // Debug flag set to 1: the result must not change for this command list.
    debugManager.flags.InOrderAtomicSignallingEnabled.set(1);
    expectNonAtomicSignalling();
}
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenSubmittingThenProgramSemaphoreForPreviousDispatch, IsAtLeastXeHpCore) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
@@ -4075,8 +4087,8 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingKernelSplitWithEve
struct MultiTileInOrderCmdListTests : public InOrderCmdListTests {
void SetUp() override {
NEO::debugManager.flags.CreateMultipleSubDevices.set(2);
NEO::debugManager.flags.EnableImplicitScaling.set(1);
NEO::debugManager.flags.CreateMultipleSubDevices.set(partitionCount);
NEO::debugManager.flags.EnableImplicitScaling.set(4);
InOrderCmdListTests::SetUp();
}
@@ -4085,12 +4097,71 @@ struct MultiTileInOrderCmdListTests : public InOrderCmdListTests {
DestroyableZeUniquePtr<WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>> createMultiTileImmCmdList() {
auto cmdList = createImmCmdList<gfxCoreFamily>();
cmdList->partitionCount = 2;
cmdList->partitionCount = partitionCount;
return cmdList;
}
const uint32_t partitionCount = 2;
};
// On a multi-tile command list, atomic signalling is off until the debug flag is set to 1;
// once set, the increment value equals the fixture's partitionCount.
HWTEST2_F(MultiTileInOrderCmdListTests, givenDebugFlagSetWhenAskingForAtomicSignallingThenReturnTrue, IsAtLeastXeHpCore) {
    auto multiTileCmdList = createMultiTileImmCmdList<gfxCoreFamily>();

    // Before the debug flag is set.
    EXPECT_FALSE(multiTileCmdList->inOrderAtomicSignallingEnabled());
    EXPECT_EQ(1u, multiTileCmdList->getInOrderIncrementValue());

    debugManager.flags.InOrderAtomicSignallingEnabled.set(1);

    // After the debug flag is set to 1.
    EXPECT_TRUE(multiTileCmdList->inOrderAtomicSignallingEnabled());
    EXPECT_EQ(partitionCount, multiTileCmdList->getInOrderIncrementValue());
}
// With atomic signalling enabled on a multi-tile immediate command list, every append that
// signals the in-order counter must advance the host-side counter value by partitionCount and
// program a single MI_ATOMIC (8-byte increment, qword) targeting the device counter allocation.
HWTEST2_F(MultiTileInOrderCmdListTests, givenAtomicSignallingEnabledWhenSignallingCounterThenUseMiAtomicCmd, IsAtLeastXeHpCore) {
    using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
    using ATOMIC_OPCODES = typename FamilyType::MI_ATOMIC::ATOMIC_OPCODES;
    using DATA_SIZE = typename FamilyType::MI_ATOMIC::DATA_SIZE;

    // The flag is set before the command list is created.
    debugManager.flags.InOrderAtomicSignallingEnabled.set(1);

    auto immCmdList = createMultiTileImmCmdList<gfxCoreFamily>();
    auto eventPool = createEvents<FamilyType>(1, false);
    auto cmdStream = immCmdList->getCmdContainer().getCommandStream();

    EXPECT_EQ(0u, immCmdList->inOrderExecInfo->getCounterValue());

    auto handle = events[0]->toHandle();

    // First signalling append: counter value moves from 0 to partitionCount.
    immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, handle, 0, nullptr, launchParams, false);
    EXPECT_EQ(partitionCount, immCmdList->inOrderExecInfo->getCounterValue());

    // Remember where the next append starts so only its commands are parsed below.
    size_t offset = cmdStream->getUsed();

    // Second signalling append: counter value moves by another partitionCount.
    immCmdList->appendWaitOnEvents(1, &handle, false, false, true);
    EXPECT_EQ(partitionCount * 2, immCmdList->inOrderExecInfo->getCounterValue());

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset)));

    auto miAtomics = findAll<MI_ATOMIC *>(cmdList.begin(), cmdList.end());
    // Fatal assertion: a non-fatal EXPECT would fall through to miAtomics[0] on an empty vector.
    ASSERT_EQ(1u, miAtomics.size());

    auto atomicCmd = genCmdCast<MI_ATOMIC *>(*miAtomics[0]);
    ASSERT_NE(nullptr, atomicCmd);

    auto gpuAddress = immCmdList->inOrderExecInfo->getDeviceCounterAllocation().getGpuAddress();

    EXPECT_EQ(gpuAddress, NEO::UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*atomicCmd));
    EXPECT_EQ(ATOMIC_OPCODES::ATOMIC_8B_INCREMENT, atomicCmd->getAtomicOpcode());
    EXPECT_EQ(DATA_SIZE::DATA_SIZE_QWORD, atomicCmd->getDataSize());
    EXPECT_EQ(0u, atomicCmd->getReturnDataControl());
    EXPECT_EQ(0u, atomicCmd->getCsStall());
}
HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenProgrammingWaitOnEventsThenHandleAllEventPackets, IsAtLeastXeHpCore) {
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;

View File

@@ -262,6 +262,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, DisableSystemPointerKernelArgument, -1, "-1: def
DECLARE_DEBUG_VARIABLE(int32_t, ProgramUserInterruptOnResolvedDependency, -1, "-1: default, 0: Disabled, 1: On signaling append completion (if possible) - for example in-order counter update")
DECLARE_DEBUG_VARIABLE(int32_t, EnableInOrderRegularCmdListPatching, -1, "-1: default, 0: Disabled, 1: If set, patch counter value on execute call")
DECLARE_DEBUG_VARIABLE(int32_t, EnableInOrderRelaxedOrderingForEventsChaining, -1, "-1: default, 0: Disabled, 1: If set, send 2 immediate flushes to avoid stalling RelaxedOrdering Scheduler.")
DECLARE_DEBUG_VARIABLE(int32_t, InOrderAtomicSignallingEnabled, -1, "-1: default, 0: disabled, 1: Use atomic GPU operations to increment the counter. Otherwise use non-atomic commands like SDI.")
/*LOGGING FLAGS*/
DECLARE_DEBUG_VARIABLE(int32_t, PrintDriverDiagnostics, -1, "prints driver diagnostics messages to standard output, value corresponds to hint level")

View File

@@ -565,4 +565,5 @@ EnableImplicitConvertionToCounterBasedEvents = -1
SetAmountOfInternalHeapsToPreallocate = -1
DoNotUseProductConfigForValidationWa = 0
EnableDeviceStateVerificationAfterFailedSubmission = -1
InOrderAtomicSignallingEnabled = -1
# Please don't edit below this line