feature: full sync dispatch mode initialization path

Related-To: NEO-8171

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski 2024-03-27 15:15:25 +00:00 committed by Compute-Runtime-Automation
parent b109094e4b
commit 50908a0809
3 changed files with 164 additions and 14 deletions

View File

@ -358,6 +358,7 @@ struct CommandListCoreFamily : public CommandListImp {
// Returns the GfxFamily compile-time flag selecting 64-bit (QWORD) in-order counters.
bool isQwordInOrderCounter() const { return GfxFamily::isQwordInOrderCounter; }
bool isInOrderNonWalkerSignalingRequired(const Event *event) const;
bool hasInOrderDependencies() const;
// Emits the token-acquisition initialization sequence used by full
// synchronized dispatch mode: the primary tile atomically claims the
// device-wide sync dispatch token, secondary tiles wait until it is claimed.
void appendFullSynchronizedDispatchInit();
size_t addCmdForPatching(std::shared_ptr<NEO::InOrderExecInfo> *externalInOrderExecInfo, void *cmd1, void *cmd2, uint64_t counterValue, NEO::InOrderPatchCommandHelpers::PatchCmdType patchCmdType);
uint64_t getInOrderIncrementValue() const;

View File

@ -3955,7 +3955,74 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSynchronizedDispatchInitializat
NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(), syncAlloc->getGpuAddress() + sizeof(uint32_t), 0u,
GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD,
false, false, false, true, nullptr);
} else if (this->synchronizedDispatchMode == NEO::SynchronizedDispatchMode::full) {
appendFullSynchronizedDispatchInit();
}
}
// Programs the GPU-side initialization sequence for full synchronized
// dispatch mode. The emitted commands implement a token-acquisition protocol
// on the device-wide sync dispatch token allocation:
//   - the primary tile loops, atomically trying to claim the token,
//   - secondary tiles skip that loop and instead wait until the token's
//     queue-id dword shows this queue as the owner,
//   - all tiles converge on a common end section with the predicate cleared.
// Two conditional MI_BATCH_BUFFER_START slots are reserved up front and
// patched once their forward jump targets are known.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendFullSynchronizedDispatchInit() {
using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;
using ATOMIC_OPCODES = typename MI_ATOMIC::ATOMIC_OPCODES;
using DATA_SIZE = typename MI_ATOMIC::DATA_SIZE;
// Size of one conditional data-mem MI_BATCH_BUFFER_START; used to reserve
// patchable slots in the stream before the jump targets exist.
constexpr size_t conditionalDataMemBbStartSize = NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataMemBatchBufferStart(false);
// Token layout (QWORD): high dword = queue id (biased by 1 so a value of 0
// can mean "token free"), low dword = partition count of this dispatch.
const uint32_t queueId = this->syncDispatchQueueId + 1;
const uint64_t queueIdToken = static_cast<uint64_t>(queueId) << 32;
const uint64_t tokenInitialValue = queueIdToken + this->partitionCount;
auto syncAllocationGpuVa = device->getSyncDispatchTokenAllocation()->getGpuAddress();
auto workPartitionAllocationGpuVa = device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocation()->getGpuAddress();
auto cmdStream = commandContainer.getCommandStream();
// If Secondary Tile, then jump to Secondary Tile section.
// Reserve space for now. Will be patched later, once the address of the
// Secondary Tile section is known.
NEO::LinearStream skipPrimaryTileSectionCmdStream(cmdStream->getSpace(conditionalDataMemBbStartSize), conditionalDataMemBbStartSize);
// If token acquired, jump to the end. The slot itself is reserved inside the
// Primary Tile section below and patched once the end address is known.
NEO::LinearStream jumpToEndSectionFromPrimaryTile;
// Primary Tile section
{
// Try acquire token. Loop head: the batch-buffer-start at the bottom of
// this scope jumps back here until acquisition succeeds.
uint64_t acquireTokenCmdBufferVa = cmdStream->getCurrentGpuAddressPosition();
NEO::EncodeMiPredicate<GfxFamily>::encode(*cmdStream, NEO::MiPredicateType::disable);
// Atomic QWORD compare-and-write on the token: claim it (write queue id +
// partition count) when it is currently free (0).
// NOTE(review): operand roles assume ATOMIC_8B_CMP_WR compares operand1 (0)
// and writes operand2 (tokenInitialValue) -- confirm against the HW spec.
NEO::EncodeAtomic<GfxFamily>::programMiAtomic(*cmdStream, syncAllocationGpuVa, ATOMIC_OPCODES::ATOMIC_8B_CMP_WR,
DATA_SIZE::DATA_SIZE_QWORD, 1, 1, 0, tokenInitialValue);
// If token acquired, jump to the end.
// Reserve space for now. Will be patched later.
jumpToEndSectionFromPrimaryTile.replaceBuffer(cmdStream->getSpace(conditionalDataMemBbStartSize), conditionalDataMemBbStartSize);
// Semaphore for potential switch: wait until the token's queue-id dword
// (high dword of the QWORD) reads 0, i.e. the token was released, before
// looping back to retry the acquisition.
NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*cmdStream, syncAllocationGpuVa + sizeof(uint32_t), 0u,
GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD,
false, false, false, true, nullptr);
// Loop back to acquire again
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(cmdStream, acquireTokenCmdBufferVa, false, false, false);
}
// Patch Primary Tile section skip (to Secondary Tile section): jump taken
// when the work-partition allocation value is non-zero.
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(skipPrimaryTileSectionCmdStream, cmdStream->getCurrentGpuAddressPosition(), workPartitionAllocationGpuVa, 0,
NEO::CompareOperation::notEqual, false, false);
// Secondary Tile section
{
NEO::EncodeMiPredicate<GfxFamily>::encode(*cmdStream, NEO::MiPredicateType::disable);
// Wait for token acquisition by Primary Tile: the token's queue-id dword
// must equal this queue's (biased) id.
NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*cmdStream, syncAllocationGpuVa + sizeof(uint32_t), queueId,
GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD,
false, false, false, true, nullptr);
}
// Patch Primary Tile section jump to end: jump taken once the token's
// queue-id dword equals this queue's id, i.e. the acquisition succeeded.
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(jumpToEndSectionFromPrimaryTile, cmdStream->getCurrentGpuAddressPosition(), syncAllocationGpuVa + sizeof(uint32_t), queueId,
NEO::CompareOperation::equal, false, false);
// End section: clear the predicate so subsequent commands run on all tiles.
NEO::EncodeMiPredicate<GfxFamily>::encode(*cmdStream, NEO::MiPredicateType::disable);
}
} // namespace L0

View File

@ -6131,34 +6131,116 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppend
context->freeMem(alloc);
}
HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppendingThenDontProgramTokenCheck, IsAtLeastSkl) {
HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppendingThenProgramTokenAcquire, IsAtLeastXeHpcCore) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using MI_SET_PREDICATE = typename FamilyType::MI_SET_PREDICATE;
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
auto immCmdList = createMultiTileImmCmdList<gfxCoreFamily>();
immCmdList->synchronizedDispatchMode = NEO::SynchronizedDispatchMode::full;
immCmdList->syncDispatchQueueId = 0x1234;
const uint32_t queueId = immCmdList->syncDispatchQueueId + 1;
const uint64_t queueIdToken = static_cast<uint64_t>(queueId) << 32;
const uint64_t tokenInitialValue = queueIdToken + partitionCount;
auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
size_t offset = cmdStream->getUsed();
auto verifyTokenCheck = [&](bool hasDependencySemaphore) {
auto verifyTokenAcquisition = [&](bool hasDependencySemaphore) {
GenCmdList cmdList;
EXPECT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset)));
if (::testing::Test::HasFailure()) {
return false;
}
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
for (auto &semaphore : semaphores) {
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*semaphore);
EXPECT_NE(nullptr, semaphoreCmd);
if (::testing::Test::HasFailure()) {
return false;
auto itor = cmdList.begin();
if (hasDependencySemaphore) {
for (uint32_t i = 0; i < partitionCount; i++) {
itor = find<MI_SEMAPHORE_WAIT *>(itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
itor++;
}
}
EXPECT_NE(device->getSyncDispatchTokenAllocation()->getGpuAddress() + sizeof(uint32_t), semaphoreCmd->getSemaphoreGraphicsAddress());
if (::testing::Test::HasFailure()) {
return false;
}
// Primary-secondary path selection
void *primaryTileSectionSkipVa = *itor;
// Primary Tile section
auto miPredicate = reinterpret_cast<MI_SET_PREDICATE *>(
ptrOffset(primaryTileSectionSkipVa, NEO::EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataMemBatchBufferStart(false)));
void *loopBackToAcquireVa = miPredicate;
if (!RelaxedOrderingCommandsHelper::verifyMiPredicate<FamilyType>(miPredicate, MiPredicateType::disable)) {
return false;
}
auto miAtomic = reinterpret_cast<MI_ATOMIC *>(++miPredicate);
EXPECT_EQ(MI_ATOMIC::DWORD_LENGTH::DWORD_LENGTH_INLINE_DATA_1, miAtomic->getDwordLength());
EXPECT_EQ(1u, miAtomic->getInlineData());
EXPECT_EQ(0u, miAtomic->getOperand1DataDword0());
EXPECT_EQ(0u, miAtomic->getOperand1DataDword1());
EXPECT_EQ(getLowPart(tokenInitialValue), miAtomic->getOperand2DataDword0());
EXPECT_EQ(getHighPart(tokenInitialValue), miAtomic->getOperand2DataDword1());
EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_8B_CMP_WR, miAtomic->getAtomicOpcode());
EXPECT_EQ(MI_ATOMIC::DATA_SIZE::DATA_SIZE_QWORD, miAtomic->getDataSize());
if (::testing::Test::HasFailure()) {
return false;
}
void *jumpToEndSectionFromPrimaryTile = ++miAtomic;
auto semaphore = reinterpret_cast<MI_SEMAPHORE_WAIT *>(
ptrOffset(jumpToEndSectionFromPrimaryTile, NEO::EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataMemBatchBufferStart(false)));
EXPECT_EQ(0u, semaphore->getSemaphoreDataDword());
uint64_t syncAllocGpuVa = device->getSyncDispatchTokenAllocation()->getGpuAddress();
EXPECT_EQ(syncAllocGpuVa + sizeof(uint32_t), semaphore->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD, semaphore->getCompareOperation());
if (::testing::Test::HasFailure()) {
return false;
}
auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(++semaphore);
EXPECT_EQ(castToUint64(loopBackToAcquireVa), bbStart->getBatchBufferStartAddress());
if (::testing::Test::HasFailure()) {
return false;
}
uint64_t workPartitionGpuVa = device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocation()->getGpuAddress();
// Secondary Tile section
miPredicate = reinterpret_cast<MI_SET_PREDICATE *>(++bbStart);
if (!RelaxedOrderingCommandsHelper::verifyMiPredicate<FamilyType>(miPredicate, MiPredicateType::disable)) {
return false;
}
// Primary Tile section skip - patching
if (!RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart<FamilyType>(primaryTileSectionSkipVa, castToUint64(miPredicate), workPartitionGpuVa, 0, NEO::CompareOperation::notEqual, false, false)) {
return false;
}
semaphore = reinterpret_cast<MI_SEMAPHORE_WAIT *>(++miPredicate);
EXPECT_EQ(queueId, semaphore->getSemaphoreDataDword());
EXPECT_EQ(syncAllocGpuVa + sizeof(uint32_t), semaphore->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD, semaphore->getCompareOperation());
// End section
miPredicate = reinterpret_cast<MI_SET_PREDICATE *>(++semaphore);
if (!RelaxedOrderingCommandsHelper::verifyMiPredicate<FamilyType>(miPredicate, MiPredicateType::disable)) {
return false;
}
// Jump to end from Primary Tile section - patching
if (!RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart<FamilyType>(jumpToEndSectionFromPrimaryTile, castToUint64(miPredicate), syncAllocGpuVa + sizeof(uint32_t), queueId, NEO::CompareOperation::equal, false, false)) {
return false;
}
return true;
@ -6166,11 +6248,11 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppending
// first run without dependency
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_TRUE(verifyTokenCheck(false));
EXPECT_TRUE(verifyTokenAcquisition(false));
offset = cmdStream->getUsed();
immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
EXPECT_TRUE(verifyTokenCheck(true));
EXPECT_TRUE(verifyTokenAcquisition(true));
}
} // namespace ult