From 76cb97de9d18fda7eab9a2400aa2be8dba7742c2 Mon Sep 17 00:00:00 2001 From: Bartosz Dunajski Date: Tue, 21 May 2024 14:14:07 +0000 Subject: [PATCH] feature: handle in-order counter in copy offload path Related-To: NEO-11376 Signed-off-by: Bartosz Dunajski --- level_zero/core/source/cmdlist/cmdlist_hw.h | 6 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 102 +++++++++------- .../source/cmdlist/cmdlist_hw_immediate.inl | 4 +- .../cmdlist/cmdlist_hw_skl_to_tgllp.inl | 2 +- .../cmdlist/cmdlist_hw_xehp_and_later.inl | 2 +- level_zero/core/source/device/bcs_split.h | 4 +- .../sources/cmdlist/test_in_order_cmdlist.cpp | 109 +++++++++++++++++- .../command_container/command_encoder.inl | 2 +- .../gen11/hw_cmds_generated_gen11.inl | 1 + .../gen12lp/hw_cmds_generated_gen12lp.inl | 1 + .../generated/gen8/hw_cmds_generated_gen8.inl | 1 + .../generated/gen9/hw_cmds_generated_gen9.inl | 1 + .../hw_cmds_generated_xe_hpc_core.inl | 1 + .../hw_cmds_generated_xe_hpg_core.inl | 1 + .../unit_test/encoders/test_encode_atomic.cpp | 23 +++- 15 files changed, 205 insertions(+), 55 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index e16cb2beca..9d1588651d 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -187,8 +187,8 @@ struct CommandListCoreFamily : public CommandListImp { void appendWaitOnInOrderDependency(std::shared_ptr &inOrderExecInfo, CommandToPatchContainer *outListCommands, uint64_t waitValue, uint32_t offset, bool relaxedOrderingAllowed, bool implicitDependency, bool skipAddingWaitEventsToResidency, bool noopDispatch); - void appendSignalInOrderDependencyCounter(Event *signalEvent); - void handleInOrderDependencyCounter(Event *signalEvent, bool nonWalkerInOrderCmdsChaining); + void appendSignalInOrderDependencyCounter(Event *signalEvent, bool copyOffloadOperation); + void handleInOrderDependencyCounter(Event *signalEvent, bool nonWalkerInOrderCmdsChaining, bool copyOffloadOperation); ze_result_t appendWriteGlobalTimestamp(uint64_t *dstptr, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) override; @@ -290,7 +290,7 @@ struct CommandListCoreFamily : public CommandListImp { void appendWaitOnSingleEvent(Event *event, CommandToPatchContainer *outWaitCmds, bool relaxedOrderingAllowed, CommandToPatch::CommandType storedSemaphore); - void appendSdiInOrderCounterSignalling(uint64_t baseGpuVa, uint64_t signalValue); + void appendSdiInOrderCounterSignalling(uint64_t baseGpuVa, uint64_t signalValue, bool copyOffloadOperation); ze_result_t prepareIndirectParams(const ze_group_count_t *threadGroupDimensions); void updateStreamPropertiesForRegularCommandLists(Kernel &kernel, bool isCooperative, const ze_group_count_t &threadGroupDimensions, bool isIndirect); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 069b55a1c3..79fff9752e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -158,7 +158,7 @@ ze_result_t CommandListCoreFamily::reset() { } template -void CommandListCoreFamily::handleInOrderDependencyCounter(Event *signalEvent, bool nonWalkerInOrderCmdsChaining) { +void CommandListCoreFamily::handleInOrderDependencyCounter(Event *signalEvent, bool nonWalkerInOrderCmdsChaining, bool copyOffloadOperation) { if (!isInOrderExecutionEnabled()) { if (signalEvent && signalEvent->getInOrderExecInfo().get()) { UNRECOVERABLE_IF(signalEvent->isCounterBased()); @@ -182,7 +182,7 @@ void CommandListCoreFamily::handleInOrderDependencyCounter(Event inOrderExecInfo->setAllocationOffset(newOffset); inOrderExecInfo->initializeAllocationsFromHost(); - CommandListCoreFamily::appendSignalInOrderDependencyCounter(nullptr); // signal counter on new offset + CommandListCoreFamily::appendSignalInOrderDependencyCounter(nullptr, copyOffloadOperation); // signal counter on new offset } inOrderExecInfo->addCounterValue(getInOrderIncrementValue()); @@ -407,7 +407,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernel(ze_kernel_h event, launchParams); if (!launchParams.skipInOrderNonWalkerSignaling) { - handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event)); + handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event), false); } appendSynchronizedDispatchCleanupSection(); @@ -459,7 +459,7 @@ ze_result_t CommandListCoreFamily::appendLaunchCooperativeKernel( event, launchParams); addToMappedEventList(event); - handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event)); + handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event), false); appendSynchronizedDispatchCleanupSection(); @@ -504,7 +504,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelIndirect(ze_ addToMappedEventList(event); appendSignalEventPostWalker(event, nullptr, nullptr, false, false); - handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event)); + handleInOrderDependencyCounter(event, isInOrderNonWalkerSignalingRequired(event), false); appendSynchronizedDispatchCleanupSection(); @@ -608,9 +608,9 @@ ze_result_t CommandListCoreFamily::appendEventReset(ze_event_hand } if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(event); + appendSignalInOrderDependencyCounter(event, false); } - handleInOrderDependencyCounter(event, false); + handleInOrderDependencyCounter(event, false, false); appendSynchronizedDispatchCleanupSection(); @@ -655,9 +655,9 @@ ze_result_t CommandListCoreFamily::appendMemoryRangesBarrier(uint addToMappedEventList(signalEvent); if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(signalEvent); + appendSignalInOrderDependencyCounter(signalEvent, false); } - handleInOrderDependencyCounter(signalEvent, false); + handleInOrderDependencyCounter(signalEvent, false, false); appendSynchronizedDispatchCleanupSection(); @@ -1643,15 +1643,15 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, NEO::PipeControlArgs args; NEO::MemorySynchronizationCommands::addSingleBarrier(*commandContainer.getCommandStream(), args); } - appendSignalInOrderDependencyCounter(signalEvent); + appendSignalInOrderDependencyCounter(signalEvent, isCopyOffloadEnabled()); } if (!isCopyOnlyEnabled || inOrderCopyOnlySignalingAllowed) { bool nonWalkerInOrderCmdChaining = !isCopyOnlyEnabled && isInOrderNonWalkerSignalingRequired(signalEvent) && !emitPipeControl; - handleInOrderDependencyCounter(signalEvent, nonWalkerInOrderCmdChaining); + handleInOrderDependencyCounter(signalEvent, nonWalkerInOrderCmdChaining, isCopyOffloadEnabled()); } } else { - handleInOrderDependencyCounter(signalEvent, false); + handleInOrderDependencyCounter(signalEvent, false, isCopyOffloadEnabled()); } appendSynchronizedDispatchCleanupSection(); @@ -1748,11 +1748,11 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyRegion(void *d if (this->isInOrderExecutionEnabled()) { if (inOrderCopyOnlySignalingAllowed) { - appendSignalInOrderDependencyCounter(signalEvent); - handleInOrderDependencyCounter(signalEvent, false); + appendSignalInOrderDependencyCounter(signalEvent, isCopyOffloadEnabled()); + handleInOrderDependencyCounter(signalEvent, false, isCopyOffloadEnabled()); } } else { - handleInOrderDependencyCounter(signalEvent, false); + handleInOrderDependencyCounter(signalEvent, false, isCopyOffloadEnabled()); } if (NEO::debugManager.flags.EnableSWTags.get()) { @@ -2187,12 +2187,12 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, NEO::PipeControlArgs args; NEO::MemorySynchronizationCommands::addSingleBarrier(*commandContainer.getCommandStream(), args); } - appendSignalInOrderDependencyCounter(signalEvent); + appendSignalInOrderDependencyCounter(signalEvent, false); } else { nonWalkerInOrderCmdChaining = isInOrderNonWalkerSignalingRequired(signalEvent); } } - handleInOrderDependencyCounter(signalEvent, nonWalkerInOrderCmdChaining); + handleInOrderDependencyCounter(signalEvent, nonWalkerInOrderCmdChaining, false); appendSynchronizedDispatchCleanupSection(); @@ -2259,9 +2259,9 @@ ze_result_t CommandListCoreFamily::appendBlitFill(void *ptr, appendSignalEventPostWalker(signalEvent, nullptr, nullptr, false, false); if (isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(signalEvent); + appendSignalInOrderDependencyCounter(signalEvent, false); } - handleInOrderDependencyCounter(signalEvent, false); + handleInOrderDependencyCounter(signalEvent, false, false); } return ZE_RESULT_SUCCESS; } @@ -2494,9 +2494,9 @@ ze_result_t CommandListCoreFamily::appendSignalEvent(ze_event_han dispatchEventPostSyncOperation(event, nullptr, nullptr, Event::STATE_SIGNALED, false, false, appendPipeControlWithPostSync, false); if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(event); + appendSignalInOrderDependencyCounter(event, false); } - handleInOrderDependencyCounter(event, false); + handleInOrderDependencyCounter(event, false, false); if (NEO::debugManager.flags.EnableSWTags.get()) { neoDevice->getRootDeviceEnvironment().tagsManager->insertTag( @@ -2691,9 +2691,9 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu if (apiRequest) { if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(nullptr); + appendSignalInOrderDependencyCounter(nullptr, false); } - handleInOrderDependencyCounter(nullptr, false); + handleInOrderDependencyCounter(nullptr, false, false); } if (NEO::debugManager.flags.EnableSWTags.get()) { @@ -2708,21 +2708,33 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu } template -void CommandListCoreFamily::appendSdiInOrderCounterSignalling(uint64_t baseGpuVa, uint64_t signalValue) { +void CommandListCoreFamily::appendSdiInOrderCounterSignalling(uint64_t baseGpuVa, uint64_t signalValue, bool copyOffloadOperation) { using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM; uint64_t gpuVa = baseGpuVa + inOrderExecInfo->getAllocationOffset(); - auto miStoreCmd = reinterpret_cast(commandContainer.getCommandStream()->getSpace(sizeof(MI_STORE_DATA_IMM))); + uint32_t numWrites = 1; + bool partitionOffsetEnabled = this->partitionCount > 1; - NEO::EncodeStoreMemory::programStoreDataImm(miStoreCmd, gpuVa, getLowPart(signalValue), getHighPart(signalValue), - isQwordInOrderCounter(), (this->partitionCount > 1)); + if (copyOffloadOperation && partitionOffsetEnabled) { + numWrites = this->partitionCount; + partitionOffsetEnabled = false; + } - addCmdForPatching(nullptr, miStoreCmd, nullptr, signalValue, NEO::InOrderPatchCommandHelpers::PatchCmdType::sdi); + for (uint32_t i = 0; i < numWrites; i++) { + auto miStoreCmd = reinterpret_cast(commandContainer.getCommandStream()->getSpace(sizeof(MI_STORE_DATA_IMM))); + + NEO::EncodeStoreMemory::programStoreDataImm(miStoreCmd, gpuVa, getLowPart(signalValue), getHighPart(signalValue), + isQwordInOrderCounter(), partitionOffsetEnabled); + + addCmdForPatching(nullptr, miStoreCmd, nullptr, signalValue, NEO::InOrderPatchCommandHelpers::PatchCmdType::sdi); + + gpuVa += device->getL0GfxCoreHelper().getImmediateWritePostSyncOffset(); + } } template -void CommandListCoreFamily::appendSignalInOrderDependencyCounter(Event *signalEvent) { +void CommandListCoreFamily::appendSignalInOrderDependencyCounter(Event *signalEvent, bool copyOffloadOperation) { uint64_t deviceAllocGpuVa = inOrderExecInfo->getBaseDeviceAddress(); uint64_t signalValue = inOrderExecInfo->getCounterValue() + getInOrderIncrementValue(); @@ -2732,15 +2744,23 @@ void CommandListCoreFamily::appendSignalInOrderDependencyCounter( using ATOMIC_OPCODES = typename GfxFamily::MI_ATOMIC::ATOMIC_OPCODES; using DATA_SIZE = typename GfxFamily::MI_ATOMIC::DATA_SIZE; - NEO::EncodeAtomic::programMiAtomic(*cmdStream, deviceAllocGpuVa, ATOMIC_OPCODES::ATOMIC_8B_INCREMENT, - DATA_SIZE::DATA_SIZE_QWORD, 0, 0, 0, 0); + ATOMIC_OPCODES opcode = ATOMIC_OPCODES::ATOMIC_8B_INCREMENT; + uint64_t operand1Data = 0; + + if (copyOffloadOperation && this->partitionCount > 1) { + opcode = ATOMIC_OPCODES::ATOMIC_8B_ADD; + operand1Data = this->partitionCount; + } + + NEO::EncodeAtomic::programMiAtomic(*cmdStream, deviceAllocGpuVa, opcode, + DATA_SIZE::DATA_SIZE_QWORD, 0, 0, operand1Data, 0); } else { - appendSdiInOrderCounterSignalling(deviceAllocGpuVa, signalValue); + appendSdiInOrderCounterSignalling(deviceAllocGpuVa, signalValue, copyOffloadOperation); } if (inOrderExecInfo->isHostStorageDuplicated()) { - appendSdiInOrderCounterSignalling(inOrderExecInfo->getBaseHostGpuAddress(), signalValue); + appendSdiInOrderCounterSignalling(inOrderExecInfo->getBaseHostGpuAddress(), signalValue, copyOffloadOperation); } if ((NEO::debugManager.flags.ProgramUserInterruptOnResolvedDependency.get() == 1) && signalEvent && signalEvent->isInterruptModeEnabled()) { @@ -2910,9 +2930,9 @@ ze_result_t CommandListCoreFamily::appendWriteGlobalTimestamp( appendSignalEventPostWalker(signalEvent, nullptr, nullptr, false, false); if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(signalEvent); + appendSignalInOrderDependencyCounter(signalEvent, false); } - handleInOrderDependencyCounter(signalEvent, false); + handleInOrderDependencyCounter(signalEvent, false, false); appendSynchronizedDispatchCleanupSection(); @@ -3457,9 +3477,9 @@ ze_result_t CommandListCoreFamily::appendBarrier(ze_event_handle_ appendSignalEventPostWalker(signalEvent, nullptr, nullptr, this->isInOrderExecutionEnabled(), false); if (isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(signalEvent); + appendSignalInOrderDependencyCounter(signalEvent, false); } - handleInOrderDependencyCounter(signalEvent, false); + handleInOrderDependencyCounter(signalEvent, false, false); appendSynchronizedDispatchCleanupSection(); @@ -3624,9 +3644,9 @@ ze_result_t CommandListCoreFamily::appendWaitOnMemory(void *desc, appendSignalEventPostWalker(signalEvent, nullptr, nullptr, false, false); if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(signalEvent); + appendSignalInOrderDependencyCounter(signalEvent, false); } - handleInOrderDependencyCounter(signalEvent, false); + handleInOrderDependencyCounter(signalEvent, false, false); return ZE_RESULT_SUCCESS; } @@ -3671,9 +3691,9 @@ ze_result_t CommandListCoreFamily::appendWriteToMemory(void *desc } if (this->isInOrderExecutionEnabled()) { - appendSignalInOrderDependencyCounter(nullptr); + appendSignalInOrderDependencyCounter(nullptr, false); } - handleInOrderDependencyCounter(nullptr, false); + handleInOrderDependencyCounter(nullptr, false, false); return ZE_RESULT_SUCCESS; } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index bf838e1767..40266f3cea 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -510,7 +510,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernel( // Skip only in base appendLaunchKernel(). Handle remaining operations here. handleInOrderNonWalkerSignaling(event, stallingCmdsForRelaxedOrdering, relaxedOrderingDispatch, ret); } - CommandListCoreFamily::handleInOrderDependencyCounter(event, true); + CommandListCoreFamily::handleInOrderDependencyCounter(event, true, false); } return flushImmediate(ret, true, stallingCmdsForRelaxedOrdering, relaxedOrderingDispatch, true, hSignalEvent); @@ -532,7 +532,7 @@ void CommandListCoreFamilyImmediate::handleInOrderNonWalkerSignal } CommandListCoreFamily::appendWaitOnSingleEvent(event, nullptr, nonWalkerSignalingHasRelaxedOrdering, CommandToPatch::Invalid); - CommandListCoreFamily::appendSignalInOrderDependencyCounter(event); + CommandListCoreFamily::appendSignalInOrderDependencyCounter(event, false); } template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl index 40f42984ea..5475e131c6 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl @@ -290,7 +290,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K NEO::PipeControlArgs args; NEO::MemorySynchronizationCommands::addSingleBarrier(*commandContainer.getCommandStream(), args); - appendSignalInOrderDependencyCounter(event); + appendSignalInOrderDependencyCounter(event, false); } return ZE_RESULT_SUCCESS; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 81ebd64f69..bd06745158 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -399,7 +399,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K if (inOrderNonWalkerSignalling) { if (!launchParams.skipInOrderNonWalkerSignaling) { appendWaitOnSingleEvent(eventForInOrderExec, launchParams.outListCommands, false, CommandToPatch::CbEventTimestampPostSyncSemaphoreWait); - appendSignalInOrderDependencyCounter(eventForInOrderExec); + appendSignalInOrderDependencyCounter(eventForInOrderExec, false); } } else { UNRECOVERABLE_IF(!dispatchKernelArgs.outWalkerPtr); diff --git a/level_zero/core/source/device/bcs_split.h b/level_zero/core/source/device/bcs_split.h index 3684628016..52f74944d4 100644 --- a/level_zero/core/source/device/bcs_split.h +++ b/level_zero/core/source/device/bcs_split.h @@ -143,9 +143,9 @@ struct BcsSplit { cmdList->appendEventForProfilingAllWalkers(this->events.marker[markerEventIndex], nullptr, nullptr, false, true, false); if (cmdList->isInOrderExecutionEnabled()) { - cmdList->appendSignalInOrderDependencyCounter(signalEvent); + cmdList->appendSignalInOrderDependencyCounter(signalEvent, false); } - cmdList->handleInOrderDependencyCounter(signalEvent, false); + cmdList->handleInOrderDependencyCounter(signalEvent, false, false); return result; } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp index bd530aaf2f..ef02224777 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp @@ -4782,7 +4782,7 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenSignalingSy auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); - immCmdList->appendSignalInOrderDependencyCounter(nullptr); + immCmdList->appendSignalInOrderDependencyCounter(nullptr, false); GenCmdList cmdList; ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, cmdStream->getCpuBase(), cmdStream->getUsed())); @@ -6825,6 +6825,14 @@ struct CopyOffloadInOrderTests : public InOrderCmdListTests { return createImmCmdListImpl>>(true); } + template + DestroyableZeUniquePtr>> createMultiTileImmCmdListWithOffload(uint32_t partitionCount) { + auto cmdList = createImmCmdListWithOffload(); + cmdList->partitionCount = partitionCount; + return cmdList; + } + + uint32_t copyData = 0; std::unique_ptr> backupHwInfo; }; @@ -6920,8 +6928,6 @@ HWTEST2_F(CopyOffloadInOrderTests, givenCopyOffloadEnabledWhenProgrammingHwCmdsT auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); - uint32_t copyData = 0; - { auto offset = cmdStream->getUsed(); @@ -6952,5 +6958,102 @@ HWTEST2_F(CopyOffloadInOrderTests, givenCopyOffloadEnabledWhenProgrammingHwCmdsT } } +HWTEST2_F(CopyOffloadInOrderTests, givenAtomicSignalingModeWhenUpdatingCounterThenUseCorrectHwCommands, IsAtLeastXeHpCore) { + using MI_ATOMIC = typename FamilyType::MI_ATOMIC; + using ATOMIC_OPCODES = typename FamilyType::MI_ATOMIC::ATOMIC_OPCODES; + using DATA_SIZE = typename FamilyType::MI_ATOMIC::DATA_SIZE; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + + constexpr uint32_t partitionCount = 4; + + debugManager.flags.InOrderDuplicatedCounterStorageEnabled.set(0); + + { + debugManager.flags.InOrderAtomicSignallingEnabled.set(1); + + auto immCmdList = createMultiTileImmCmdListWithOffload(partitionCount); + + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + size_t offset = cmdStream->getUsed(); + + immCmdList->appendMemoryCopy(©Data, ©Data, 1, nullptr, 0, nullptr, false, false); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset))); + + auto miAtomics = findAll(cmdList.begin(), cmdList.end()); + EXPECT_EQ(1u, miAtomics.size()); + + auto atomicCmd = genCmdCast(*miAtomics[0]); + ASSERT_NE(nullptr, atomicCmd); + + auto gpuAddress = immCmdList->inOrderExecInfo->getBaseDeviceAddress(); + + EXPECT_EQ(gpuAddress, NEO::UnitTestHelper::getAtomicMemoryAddress(*atomicCmd)); + EXPECT_EQ(ATOMIC_OPCODES::ATOMIC_8B_ADD, atomicCmd->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_QWORD, atomicCmd->getDataSize()); + EXPECT_EQ(getLowPart(partitionCount), atomicCmd->getOperand1DataDword0()); + EXPECT_EQ(getHighPart(partitionCount), atomicCmd->getOperand1DataDword1()); + } + + { + debugManager.flags.InOrderAtomicSignallingEnabled.set(0); + + auto immCmdList = createMultiTileImmCmdListWithOffload(partitionCount); + + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + size_t offset = cmdStream->getUsed(); + + immCmdList->appendMemoryCopy(©Data, ©Data, 1, nullptr, 0, nullptr, false, false); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset))); + + auto miStoreDws = findAll(cmdList.begin(), cmdList.end()); + EXPECT_EQ(partitionCount, miStoreDws.size()); + + for (uint32_t i = 0; i < partitionCount; i++) { + + auto storeDw = genCmdCast(*miStoreDws[i]); + ASSERT_NE(nullptr, storeDw); + + auto gpuAddress = immCmdList->inOrderExecInfo->getBaseDeviceAddress() + (i * device->getL0GfxCoreHelper().getImmediateWritePostSyncOffset()); + EXPECT_EQ(gpuAddress, storeDw->getAddress()); + EXPECT_EQ(1u, storeDw->getDataDword0()); + } + } + + { + debugManager.flags.InOrderAtomicSignallingEnabled.set(0); + debugManager.flags.InOrderDuplicatedCounterStorageEnabled.set(1); + + auto immCmdList = createMultiTileImmCmdListWithOffload(partitionCount); + + auto cmdStream = immCmdList->getCmdContainer().getCommandStream(); + + size_t offset = cmdStream->getUsed(); + + immCmdList->appendMemoryCopy(©Data, ©Data, 1, nullptr, 0, nullptr, false, false); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset))); + + auto miStoreDws = findAll(cmdList.begin(), cmdList.end()); + EXPECT_EQ(partitionCount * 2, miStoreDws.size()); + + for (uint32_t i = 0; i < partitionCount; i++) { + + auto storeDw = genCmdCast(*miStoreDws[i + partitionCount]); + ASSERT_NE(nullptr, storeDw); + + auto gpuAddress = immCmdList->inOrderExecInfo->getBaseHostGpuAddress() + (i * device->getL0GfxCoreHelper().getImmediateWritePostSyncOffset()); + EXPECT_EQ(gpuAddress, storeDw->getAddress()); + EXPECT_EQ(1u, storeDw->getDataDword0()); + } + } +} + } // namespace ult } // namespace L0 diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 1cc171e009..ad10f00d85 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -985,7 +985,7 @@ void EncodeAtomic::programMiAtomic(MI_ATOMIC *atomic, cmd.setReturnDataControl(returnDataControl); cmd.setCsStall(csStall); - if (opcode == ATOMIC_OPCODES::ATOMIC_4B_MOVE || opcode == ATOMIC_OPCODES::ATOMIC_8B_MOVE || opcode == ATOMIC_OPCODES::ATOMIC_8B_CMP_WR) { + if (opcode == ATOMIC_OPCODES::ATOMIC_4B_MOVE || opcode == ATOMIC_OPCODES::ATOMIC_8B_MOVE || opcode == ATOMIC_OPCODES::ATOMIC_8B_CMP_WR || opcode == ATOMIC_OPCODES::ATOMIC_8B_ADD) { cmd.setDwordLength(MI_ATOMIC::DWORD_LENGTH::DWORD_LENGTH_INLINE_DATA_1); cmd.setInlineData(0x1); diff --git a/shared/source/generated/gen11/hw_cmds_generated_gen11.inl b/shared/source/generated/gen11/hw_cmds_generated_gen11.inl index ebb71dbf7c..84ae72f82b 100644 --- a/shared/source/generated/gen11/hw_cmds_generated_gen11.inl +++ b/shared/source/generated/gen11/hw_cmds_generated_gen11.inl @@ -879,6 +879,7 @@ typedef struct tagMI_ATOMIC { ATOMIC_8B_MOVE = 0x24, ATOMIC_8B_INCREMENT = 0x25, ATOMIC_8B_DECREMENT = 0x26, + ATOMIC_8B_ADD = 0x27, ATOMIC_8B_CMP_WR = 0x2E, } ATOMIC_OPCODES; inline void init() { diff --git a/shared/source/generated/gen12lp/hw_cmds_generated_gen12lp.inl b/shared/source/generated/gen12lp/hw_cmds_generated_gen12lp.inl index f420ad2934..d51bf357ab 100644 --- a/shared/source/generated/gen12lp/hw_cmds_generated_gen12lp.inl +++ b/shared/source/generated/gen12lp/hw_cmds_generated_gen12lp.inl @@ -915,6 +915,7 @@ typedef struct tagMI_ATOMIC { ATOMIC_8B_MOVE = 0x24, ATOMIC_8B_INCREMENT = 0x25, ATOMIC_8B_DECREMENT = 0x26, + ATOMIC_8B_ADD = 0x27, ATOMIC_8B_CMP_WR = 0x2E, } ATOMIC_OPCODES; inline void init() { diff --git a/shared/source/generated/gen8/hw_cmds_generated_gen8.inl b/shared/source/generated/gen8/hw_cmds_generated_gen8.inl index 149f301c8b..649a64f817 100644 --- a/shared/source/generated/gen8/hw_cmds_generated_gen8.inl +++ b/shared/source/generated/gen8/hw_cmds_generated_gen8.inl @@ -1015,6 +1015,7 @@ typedef struct tagMI_ATOMIC { ATOMIC_8B_MOVE = 0x24, ATOMIC_8B_INCREMENT = 0x25, ATOMIC_8B_DECREMENT = 0x26, + ATOMIC_8B_ADD = 0x27, ATOMIC_8B_CMP_WR = 0x2E, } ATOMIC_OPCODES; inline void init() { diff --git a/shared/source/generated/gen9/hw_cmds_generated_gen9.inl b/shared/source/generated/gen9/hw_cmds_generated_gen9.inl index 5ad2cb0fa3..2efc8eb635 100644 --- a/shared/source/generated/gen9/hw_cmds_generated_gen9.inl +++ b/shared/source/generated/gen9/hw_cmds_generated_gen9.inl @@ -1012,6 +1012,7 @@ typedef struct tagMI_ATOMIC { ATOMIC_8B_MOVE = 0x24, ATOMIC_8B_INCREMENT = 0x25, ATOMIC_8B_DECREMENT = 0x26, + ATOMIC_8B_ADD = 0x27, ATOMIC_8B_CMP_WR = 0x2E, } ATOMIC_OPCODES; inline void init() { diff --git a/shared/source/generated/xe_hpc_core/hw_cmds_generated_xe_hpc_core.inl b/shared/source/generated/xe_hpc_core/hw_cmds_generated_xe_hpc_core.inl index cc005ec61e..b34394835d 100644 --- a/shared/source/generated/xe_hpc_core/hw_cmds_generated_xe_hpc_core.inl +++ b/shared/source/generated/xe_hpc_core/hw_cmds_generated_xe_hpc_core.inl @@ -1094,6 +1094,7 @@ typedef struct tagMI_ATOMIC { ATOMIC_8B_MOVE = 0x24, ATOMIC_8B_INCREMENT = 0x25, ATOMIC_8B_DECREMENT = 0x26, + ATOMIC_8B_ADD = 0x27, ATOMIC_8B_CMP_WR = 0x2E, } ATOMIC_OPCODES; inline void init() { diff --git a/shared/source/generated/xe_hpg_core/hw_cmds_generated_xe_hpg_core.inl b/shared/source/generated/xe_hpg_core/hw_cmds_generated_xe_hpg_core.inl index 43bd1f9fdf..5d58aa5cdd 100644 --- a/shared/source/generated/xe_hpg_core/hw_cmds_generated_xe_hpg_core.inl +++ b/shared/source/generated/xe_hpg_core/hw_cmds_generated_xe_hpg_core.inl @@ -836,6 +836,7 @@ typedef struct tagMI_ATOMIC { ATOMIC_8B_MOVE = 0x24, ATOMIC_8B_INCREMENT = 0x25, ATOMIC_8B_DECREMENT = 0x26, + ATOMIC_8B_ADD = 0x27, ATOMIC_8B_CMP_WR = 0x2E, } ATOMIC_OPCODES; inline void init() { diff --git a/shared/test/unit_test/encoders/test_encode_atomic.cpp b/shared/test/unit_test/encoders/test_encode_atomic.cpp index 57f3229e61..c9980da27b 100644 --- a/shared/test/unit_test/encoders/test_encode_atomic.cpp +++ b/shared/test/unit_test/encoders/test_encode_atomic.cpp @@ -50,7 +50,7 @@ HWTEST_F(CommandEncodeAtomic, WhenProgrammingMiAtomicMoveOperationThenExpectInli using DATA_SIZE = typename FamilyType::MI_ATOMIC::DATA_SIZE; using DWORD_LENGTH = typename FamilyType::MI_ATOMIC::DWORD_LENGTH; - constexpr size_t bufferSize = sizeof(MI_ATOMIC) * 3; + constexpr size_t bufferSize = sizeof(MI_ATOMIC) * 4; uint8_t buffer[bufferSize]; uint64_t address = (static_cast(3) << 32) + 0x123400; LinearStream cmdbuffer(buffer, bufferSize); @@ -90,6 +90,15 @@ HWTEST_F(CommandEncodeAtomic, WhenProgrammingMiAtomicMoveOperationThenExpectInli operand1Data, operand2Data); + EncodeAtomic::programMiAtomic(cmdbuffer, + address, + ATOMIC_OPCODES::ATOMIC_8B_ADD, + DATA_SIZE::DATA_SIZE_QWORD, + 0x0u, + 0x0u, + operand1Data, + operand2Data); + MI_ATOMIC *miAtomicCmd = reinterpret_cast(cmdbuffer.getCpuBase()); EXPECT_EQ(ATOMIC_OPCODES::ATOMIC_4B_MOVE, miAtomicCmd->getAtomicOpcode()); @@ -126,4 +135,16 @@ HWTEST_F(CommandEncodeAtomic, WhenProgrammingMiAtomicMoveOperationThenExpectInli EXPECT_EQ(operand1DataHigh, miAtomicCmd->getOperand1DataDword1()); EXPECT_EQ(operand2DataLow, miAtomicCmd->getOperand2DataDword0()); EXPECT_EQ(operand2DataHigh, miAtomicCmd->getOperand2DataDword1()); + + miAtomicCmd++; + EXPECT_EQ(ATOMIC_OPCODES::ATOMIC_8B_ADD, miAtomicCmd->getAtomicOpcode()); + EXPECT_EQ(DATA_SIZE::DATA_SIZE_QWORD, miAtomicCmd->getDataSize()); + EXPECT_EQ(address, UnitTestHelper::getAtomicMemoryAddress(*miAtomicCmd)); + EXPECT_EQ(0x0u, miAtomicCmd->getReturnDataControl()); + EXPECT_EQ(DWORD_LENGTH::DWORD_LENGTH_INLINE_DATA_1, miAtomicCmd->getDwordLength()); + EXPECT_EQ(0x1u, miAtomicCmd->getInlineData()); + EXPECT_EQ(operand1DataLow, miAtomicCmd->getOperand1DataDword0()); + EXPECT_EQ(operand1DataHigh, miAtomicCmd->getOperand1DataDword1()); + EXPECT_EQ(operand2DataLow, miAtomicCmd->getOperand2DataDword0()); + EXPECT_EQ(operand2DataHigh, miAtomicCmd->getOperand2DataDword1()); }