From 50908a0809099acc712ab033581c0bfbc4e77c1b Mon Sep 17 00:00:00 2001
From: Bartosz Dunajski
Date: Wed, 27 Mar 2024 15:15:25 +0000
Subject: [PATCH] feature: full sync dispatch mode initialization path

Related-To: NEO-8171
Signed-off-by: Bartosz Dunajski
---
 level_zero/core/source/cmdlist/cmdlist_hw.h   |   1 +
 level_zero/core/source/cmdlist/cmdlist_hw.inl |  67 +++++++++++
 .../sources/cmdlist/test_in_order_cmdlist.cpp | 110 +++++++++++++++---
 3 files changed, 164 insertions(+), 14 deletions(-)

diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h
index 8ded0e4687..d12c9e9750 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw.h
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.h
@@ -358,6 +358,7 @@ struct CommandListCoreFamily : public CommandListImp {
     bool isQwordInOrderCounter() const { return GfxFamily::isQwordInOrderCounter; }
     bool isInOrderNonWalkerSignalingRequired(const Event *event) const;
     bool hasInOrderDependencies() const;
+    void appendFullSynchronizedDispatchInit();
 
     size_t addCmdForPatching(std::shared_ptr<NEO::InOrderExecInfo> *externalInOrderExecInfo, void *cmd1, void *cmd2, uint64_t counterValue, NEO::InOrderPatchCommandHelpers::PatchCmdType patchCmdType);
     uint64_t getInOrderIncrementValue() const;
diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl
index ecaffd6189..6cb8c9aa9e 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl
@@ -3955,7 +3955,74 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSynchronizedDispatchInitializat
         NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(), syncAlloc->getGpuAddress() + sizeof(uint32_t), 0u,
                                                                    GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD,
                                                                    false, false, false, true, nullptr);
+    } else if (this->synchronizedDispatchMode == NEO::SynchronizedDispatchMode::full) {
+        appendFullSynchronizedDispatchInit();
     }
 }
 
+template <GFXCORE_FAMILY gfxCoreFamily>
+void CommandListCoreFamily<gfxCoreFamily>::appendFullSynchronizedDispatchInit() {
+    using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;
+    using ATOMIC_OPCODES = typename MI_ATOMIC::ATOMIC_OPCODES;
+    using DATA_SIZE = typename MI_ATOMIC::DATA_SIZE;
+
+    constexpr size_t conditionalDataMemBbStartSize = NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataMemBatchBufferStart(false);
+
+    const uint32_t queueId = this->syncDispatchQueueId + 1;
+    const uint64_t queueIdToken = static_cast<uint64_t>(queueId) << 32;
+    const uint64_t tokenInitialValue = queueIdToken + this->partitionCount;
+
+    auto syncAllocationGpuVa = device->getSyncDispatchTokenAllocation()->getGpuAddress();
+    auto workPartitionAllocationGpuVa = device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocation()->getGpuAddress();
+    auto cmdStream = commandContainer.getCommandStream();
+
+    // If Secondary Tile, then jump to Secondary Tile section
+    // Reserve space for now. Will be patched later
+    NEO::LinearStream skipPrimaryTileSectionCmdStream(cmdStream->getSpace(conditionalDataMemBbStartSize), conditionalDataMemBbStartSize);
+
+    // If token acquired, jump to the end
+    NEO::LinearStream jumpToEndSectionFromPrimaryTile;
+
+    // Primary Tile section
+    {
+        // Try acquire token
+        uint64_t acquireTokenCmdBufferVa = cmdStream->getCurrentGpuAddressPosition();
+        NEO::EncodeMiPredicate<GfxFamily>::encode(*cmdStream, NEO::MiPredicateType::disable);
+        NEO::EncodeAtomic<GfxFamily>::programMiAtomic(*cmdStream, syncAllocationGpuVa, ATOMIC_OPCODES::ATOMIC_8B_CMP_WR,
+                                                      DATA_SIZE::DATA_SIZE_QWORD, 1, 1, 0, tokenInitialValue);
+
+        // If token acquired, jump to the end
+        // Reserve space for now. Will be patched later
+        jumpToEndSectionFromPrimaryTile.replaceBuffer(cmdStream->getSpace(conditionalDataMemBbStartSize), conditionalDataMemBbStartSize);
+
+        // Semaphore for potential switch
+        NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*cmdStream, syncAllocationGpuVa + sizeof(uint32_t), 0u,
+                                                                   GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD,
+                                                                   false, false, false, true, nullptr);
+
+        // Loop back to acquire again
+        NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(cmdStream, acquireTokenCmdBufferVa, false, false, false);
+    }
+
+    // Patch Primary Tile section skip (to Secondary Tile section)
+    NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(skipPrimaryTileSectionCmdStream, cmdStream->getCurrentGpuAddressPosition(), workPartitionAllocationGpuVa, 0,
+                                                                                           NEO::CompareOperation::notEqual, false, false);
+
+    // Secondary Tile section
+    {
+        NEO::EncodeMiPredicate<GfxFamily>::encode(*cmdStream, NEO::MiPredicateType::disable);
+
+        // Wait for token acquisition by Primary Tile
+        NEO::EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*cmdStream, syncAllocationGpuVa + sizeof(uint32_t), queueId,
+                                                                   GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD,
+                                                                   false, false, false, true, nullptr);
+    }
+
+    // Patch Primary Tile section jump to end
+    NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataMemBatchBufferStart(jumpToEndSectionFromPrimaryTile, cmdStream->getCurrentGpuAddressPosition(), syncAllocationGpuVa + sizeof(uint32_t), queueId,
+                                                                                           NEO::CompareOperation::equal, false, false);
+
+    // End section
+    NEO::EncodeMiPredicate<GfxFamily>::encode(*cmdStream, NEO::MiPredicateType::disable);
+}
+
 } // namespace L0
diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp
index 81177a71c1..5b27a24631 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp
@@ -6131,34 +6131,116 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenLimitedSyncDispatchWhenAppend
     context->freeMem(alloc);
 }
 
-HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppendingThenDontProgramTokenCheck, IsAtLeastSkl) {
+HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppendingThenProgramTokenAcquire, IsAtLeastXeHpcCore) {
     using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
+    using MI_SET_PREDICATE = typename FamilyType::MI_SET_PREDICATE;
+    using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
+    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
 
     auto immCmdList = createMultiTileImmCmdList<gfxCoreFamily>();
     immCmdList->synchronizedDispatchMode = NEO::SynchronizedDispatchMode::full;
+    immCmdList->syncDispatchQueueId = 0x1234;
+
+    const uint32_t queueId = immCmdList->syncDispatchQueueId + 1;
+    const uint64_t queueIdToken = static_cast<uint64_t>(queueId) << 32;
+    const uint64_t tokenInitialValue = queueIdToken + partitionCount;
 
     auto cmdStream = immCmdList->getCmdContainer().getCommandStream();
 
     size_t offset = cmdStream->getUsed();
 
-    auto verifyTokenCheck = [&](bool hasDependencySemaphore) {
+    auto verifyTokenAcquisition = [&](bool hasDependencySemaphore) {
         GenCmdList cmdList;
         EXPECT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset)));
         if (::testing::Test::HasFailure()) {
             return false;
         }
 
-        auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
-        for (auto &semaphore : semaphores) {
-            auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*semaphore);
-            EXPECT_NE(nullptr, semaphoreCmd);
-            if (::testing::Test::HasFailure()) {
-                return false;
+        auto itor = cmdList.begin();
+        if (hasDependencySemaphore) {
+            for (uint32_t i = 0; i < partitionCount; i++) {
+                itor = find<MI_SEMAPHORE_WAIT *>(itor, cmdList.end());
+                EXPECT_NE(cmdList.end(), itor);
+                itor++;
             }
+        }
 
-            EXPECT_NE(device->getSyncDispatchTokenAllocation()->getGpuAddress() + sizeof(uint32_t), semaphoreCmd->getSemaphoreGraphicsAddress());
-            if (::testing::Test::HasFailure()) {
-                return false;
-            }
+        // Primary-secondary path selection
+        void *primaryTileSectionSkipVa = *itor;
+
+        // Primary Tile section
+        auto miPredicate = reinterpret_cast<MI_SET_PREDICATE *>(ptrOffset(primaryTileSectionSkipVa, NEO::EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataMemBatchBufferStart(false)));
+        void *loopBackToAcquireVa = miPredicate;
+
+        if (!RelaxedOrderingCommandsHelper::verifyMiPredicate<FamilyType>(miPredicate, MiPredicateType::disable)) {
+            return false;
+        }
+
+        auto miAtomic = reinterpret_cast<MI_ATOMIC *>(++miPredicate);
+        EXPECT_EQ(MI_ATOMIC::DWORD_LENGTH::DWORD_LENGTH_INLINE_DATA_1, miAtomic->getDwordLength());
+        EXPECT_EQ(1u, miAtomic->getInlineData());
+
+        EXPECT_EQ(0u, miAtomic->getOperand1DataDword0());
+        EXPECT_EQ(0u, miAtomic->getOperand1DataDword1());
+
+        EXPECT_EQ(getLowPart(tokenInitialValue), miAtomic->getOperand2DataDword0());
+        EXPECT_EQ(getHighPart(tokenInitialValue), miAtomic->getOperand2DataDword1());
+
+        EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_8B_CMP_WR, miAtomic->getAtomicOpcode());
+        EXPECT_EQ(MI_ATOMIC::DATA_SIZE::DATA_SIZE_QWORD, miAtomic->getDataSize());
+
+        if (::testing::Test::HasFailure()) {
+            return false;
+        }
+
+        void *jumpToEndSectionFromPrimaryTile = ++miAtomic;
+
+        auto semaphore = reinterpret_cast<MI_SEMAPHORE_WAIT *>(ptrOffset(jumpToEndSectionFromPrimaryTile, NEO::EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataMemBatchBufferStart(false)));
+
+        EXPECT_EQ(0u, semaphore->getSemaphoreDataDword());
+        uint64_t syncAllocGpuVa = device->getSyncDispatchTokenAllocation()->getGpuAddress();
+        EXPECT_EQ(syncAllocGpuVa + sizeof(uint32_t), semaphore->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD, semaphore->getCompareOperation());
+
+        if (::testing::Test::HasFailure()) {
+            return false;
+        }
+
+        auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(++semaphore);
+        EXPECT_EQ(castToUint64(loopBackToAcquireVa), bbStart->getBatchBufferStartAddress());
+
+        if (::testing::Test::HasFailure()) {
+            return false;
+        }
+
+        uint64_t workPartitionGpuVa = device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocation()->getGpuAddress();
+
+        // Secondary Tile section
+        miPredicate = reinterpret_cast<MI_SET_PREDICATE *>(++bbStart);
+        if (!RelaxedOrderingCommandsHelper::verifyMiPredicate<FamilyType>(miPredicate, MiPredicateType::disable)) {
+            return false;
+        }
+
+        // Primary Tile section skip - patching
+        if (!RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart<FamilyType>(primaryTileSectionSkipVa, castToUint64(miPredicate), workPartitionGpuVa, 0, NEO::CompareOperation::notEqual, false, false)) {
+            return false;
+        }
+
+        semaphore = reinterpret_cast<MI_SEMAPHORE_WAIT *>(++miPredicate);
+        EXPECT_EQ(queueId, semaphore->getSemaphoreDataDword());
+        EXPECT_EQ(syncAllocGpuVa + sizeof(uint32_t), semaphore->getSemaphoreGraphicsAddress());
+        EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD, semaphore->getCompareOperation());
+
+        // End section
+        miPredicate = reinterpret_cast<MI_SET_PREDICATE *>(++semaphore);
+        if (!RelaxedOrderingCommandsHelper::verifyMiPredicate<FamilyType>(miPredicate, MiPredicateType::disable)) {
+            return false;
+        }
+
+        // Jump to end from Primary Tile section - patching
+        if (!RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart<FamilyType>(jumpToEndSectionFromPrimaryTile, castToUint64(miPredicate), syncAllocGpuVa + sizeof(uint32_t), queueId, NEO::CompareOperation::equal, false, false)) {
+            return false;
         }
 
         return true;
@@ -6166,11 +6248,11 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppending
 
     // first run without dependency
     immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
-    EXPECT_TRUE(verifyTokenCheck(false));
+    EXPECT_TRUE(verifyTokenAcquisition(false));
 
     offset = cmdStream->getUsed();
    immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
-    EXPECT_TRUE(verifyTokenCheck(true));
+    EXPECT_TRUE(verifyTokenAcquisition(true));
 }
 
 } // namespace ult
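
A minimal host-side sketch (not part of the patch itself) of the token handshake that the new appendFullSynchronizedDispatchInit() programs on the GPU: the Primary Tile attempts to install a 64-bit token, upper dword = syncDispatchQueueId + 1 and lower dword = partition count, into the sync dispatch token allocation with an 8-byte compare-and-write that only succeeds while the allocation still holds 0, otherwise it waits and retries; Secondary Tiles simply wait until the upper dword reports that queue id. The helper names below (makeSyncDispatchToken, tryAcquireToken) are hypothetical illustrations of that layout; the real dispatch uses MI_ATOMIC and MI_SEMAPHORE_WAIT exactly as shown in the diff above.

    #include <atomic>
    #include <cstdint>
    #include <cstdio>

    // Token layout used by the patch: upper dword = syncDispatchQueueId + 1,
    // lower dword = number of active partitions (tiles) for this queue.
    constexpr uint64_t makeSyncDispatchToken(uint32_t syncDispatchQueueId, uint32_t partitionCount) {
        const uint64_t queueIdToken = static_cast<uint64_t>(syncDispatchQueueId + 1) << 32;
        return queueIdToken + partitionCount;
    }

    // CPU analogue of the GPU-side ATOMIC_8B_CMP_WR: the token is installed only
    // while the allocation still holds 0; otherwise the Primary Tile loops
    // (semaphore wait until the upper dword returns to 0, then retry).
    bool tryAcquireToken(std::atomic<uint64_t> &tokenAllocation, uint64_t token) {
        uint64_t expected = 0;
        return tokenAllocation.compare_exchange_strong(expected, token);
    }

    int main() {
        std::atomic<uint64_t> tokenAllocation{0};
        // 0x1234 mirrors the queue id used by the new ULT; 2 partitions assumed.
        const uint64_t token = makeSyncDispatchToken(0x1234, 2);

        std::printf("first acquire:  %d\n", tryAcquireToken(tokenAllocation, token)); // 1 - token was free
        std::printf("second acquire: %d\n", tryAcquireToken(tokenAllocation, token)); // 0 - already owned
        return 0;
    }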