diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 52553e9419..ac2601a4bd 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -13,6 +13,7 @@ #include "level_zero/core/source/cmdlist/cmdlist_imp.h" #include "igfxfmid.h" +#include "pipe_control_args.h" namespace NEO { enum class ImageType; @@ -240,6 +241,8 @@ struct CommandListCoreFamily : CommandListImp { void appendSignalEventPostWalker(ze_event_handle_t hEvent); void programStateBaseAddress(NEO::CommandContainer &container, bool genericMediaStateClearRequired); void programThreadArbitrationPolicy(Device *device); + void appendComputeBarrierCommand(); + NEO::PipeControlArgs createBarrierFlags(); uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region); MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index e49b5c3cdb..c9b13d5aa2 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2288,4 +2288,32 @@ void CommandListCoreFamily::programStateBaseAddress(NEO::CommandC template void CommandListCoreFamily::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask) {} +template +ze_result_t CommandListCoreFamily::appendBarrier(ze_event_handle_t hSignalEvent, + uint32_t numWaitEvents, + ze_event_handle_t *phWaitEvents) { + + ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents); + if (ret) { + return ret; + } + appendEventForProfiling(hSignalEvent, true); + + if (!hSignalEvent) { + if (isCopyOnly()) { + size_t estimatedSizeRequired = NEO::EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite(); + increaseCommandStreamSpace(estimatedSizeRequired); + + NEO::MiFlushArgs args; + NEO::EncodeMiFlushDW::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args); + } else { + appendComputeBarrierCommand(); + } + } else { + appendSignalEventPostWalker(hSignalEvent); + } + + return ZE_RESULT_SUCCESS; +} + } // namespace L0 diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index b8b9ab7955..c5f0ec9e3e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -33,32 +33,6 @@ size_t CommandListCoreFamily::getReserveSshSize() { return helper.getRenderSurfaceStateSize(); } -template -ze_result_t CommandListCoreFamily::appendBarrier(ze_event_handle_t hSignalEvent, - uint32_t numWaitEvents, - ze_event_handle_t *phWaitEvents) { - - ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents); - if (ret) { - return ret; - } - appendEventForProfiling(hSignalEvent, true); - - if (!hSignalEvent) { - if (isCopyOnly()) { - NEO::MiFlushArgs args; - NEO::EncodeMiFlushDW::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args); - } else { - NEO::PipeControlArgs args; - NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); - } - } else { - appendSignalEventPostWalker(hSignalEvent); - } - - return ZE_RESULT_SUCCESS; -} - template ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, @@ -203,4 +177,19 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z template void CommandListCoreFamily::appendMultiPartitionPrologue(uint32_t partitionDataSize) {} +template +void CommandListCoreFamily::appendComputeBarrierCommand() { + size_t estimatedSizeRequired = NEO::MemorySynchronizationCommands::getSizeForSinglePipeControl(); + increaseCommandStreamSpace(estimatedSizeRequired); + + NEO::PipeControlArgs args = createBarrierFlags(); + NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); +} + +template +NEO::PipeControlArgs CommandListCoreFamily::createBarrierFlags() { + NEO::PipeControlArgs args; + return args; +} + } // namespace L0 diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 724fb504a6..db6a957d5e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -36,33 +36,6 @@ size_t CommandListCoreFamily::getReserveSshSize() { return 4 * MemoryConstants::pageSize; } -template -ze_result_t CommandListCoreFamily::appendBarrier(ze_event_handle_t hSignalEvent, - uint32_t numWaitEvents, - ze_event_handle_t *phWaitEvents) { - - ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents); - if (ret) { - return ret; - } - appendEventForProfiling(hSignalEvent, true); - - if (!hSignalEvent) { - if (isCopyOnly()) { - NEO::MiFlushArgs args; - NEO::EncodeMiFlushDW::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args); - } else { - NEO::PipeControlArgs args; - args.hdcPipelineFlush = true; - NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); - } - } else { - appendSignalEventPostWalker(hSignalEvent); - } - - return ZE_RESULT_SUCCESS; -} - template void CommandListCoreFamily::applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes, @@ -345,4 +318,30 @@ void CommandListCoreFamily::appendMultiPartitionPrologue(uint32_t true); } +template +void CommandListCoreFamily::appendComputeBarrierCommand() { + NEO::PipeControlArgs args = createBarrierFlags(); + if (this->partitionCount > 1) { + size_t estimatedSizeRequired = NEO::ImplicitScalingDispatch::getBarrierSize(true); + increaseCommandStreamSpace(estimatedSizeRequired); + + NEO::ImplicitScalingDispatch::dispatchBarrierCommands(*commandContainer.getCommandStream(), + device->getNEODevice()->getDeviceBitfield(), + args, + true, + true); + } else { + size_t estimatedSizeRequired = NEO::MemorySynchronizationCommands::getSizeForSinglePipeControl(); + increaseCommandStreamSpace(estimatedSizeRequired); + NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); + } +} + +template +NEO::PipeControlArgs CommandListCoreFamily::createBarrierFlags() { + NEO::PipeControlArgs args; + args.hdcPipelineFlush = true; + return args; +} + } // namespace L0 diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h index 797e0f8886..64f1af13b2 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h @@ -44,5 +44,35 @@ class CommandListFixture : public DeviceFixture { std::unique_ptr event; }; +struct MultiTileCommandListFixture : public SingleRootMultiSubDeviceFixture { + void SetUp() { + SingleRootMultiSubDeviceFixture::SetUp(); + ze_result_t returnValue; + commandList.reset(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue))); + + commandList->partitionCount = 2; + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + eventPoolDesc.count = 2; + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.wait = 0; + eventDesc.signal = 0; + + eventPool = std::unique_ptr(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc)); + event = std::unique_ptr(Event::create(eventPool.get(), &eventDesc, device)); + } + + void TearDown() { + SingleRootMultiSubDeviceFixture::TearDown(); + } + + std::unique_ptr commandList; + std::unique_ptr eventPool; + std::unique_ptr event; +}; + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index b1e87a67a0..c86a956ff6 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -51,6 +51,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::hostPtrMap; using BaseClass::indirectAllocationsAllowed; using BaseClass::initialize; + using BaseClass::partitionCount; using BaseClass::patternAllocations; using BaseClass::requiredStreamState; using BaseClass::unifiedMemoryControls; @@ -70,6 +71,7 @@ struct WhiteBox> using BaseClass::clearCommandsToPatch; using BaseClass::commandsToPatch; using BaseClass::finalStreamState; + using BaseClass::partitionCount; using BaseClass::requiredStreamState; WhiteBox() : BaseClass(BaseClass::defaultNumIddsPerBlock) {} @@ -82,6 +84,7 @@ struct WhiteBox<::L0::CommandList> : public ::L0::CommandListImp { using BaseClass::commandContainer; using BaseClass::commandListPreemptionMode; using BaseClass::initialize; + using BaseClass::partitionCount; WhiteBox(Device *device); ~WhiteBox() override; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp index 87228b2e25..d866c22a1f 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp @@ -7,6 +7,7 @@ #include "shared/source/command_container/command_encoder.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" +#include "shared/test/common/helpers/unit_test_helper.h" #include "test.h" @@ -79,5 +80,142 @@ HWTEST_F(CommandListAppendBarrier, GivenEventVsNoEventWhenAppendingBarrierThenCo ASSERT_LE(sizeWithoutEvent, sizeWithEvent); } + +using MultiTileCommandListAppendBarrier = Test; + +HWTEST2_F(MultiTileCommandListAppendBarrier, WhenAppendingBarrierThenPipeControlIsGenerated, IsWithinXeGfxFamily) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; + using MI_ATOMIC = typename FamilyType::MI_ATOMIC; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + size_t beforeControlSectionOffset = sizeof(MI_STORE_DATA_IMM) + + sizeof(PIPE_CONTROL) + + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + + sizeof(MI_BATCH_BUFFER_START); + + size_t startOffset = beforeControlSectionOffset + + (2 * sizeof(uint32_t)); + + size_t expectedUseBuffer = startOffset + + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + + sizeof(MI_STORE_DATA_IMM) + + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT); + + auto usedSpaceBefore = commandList->commandContainer.getCommandStream()->getUsed(); + auto gpuBaseAddress = commandList->commandContainer.getCommandStream()->getGraphicsAllocation()->getGpuAddress() + + usedSpaceBefore; + + auto gpuCrossTileSyncAddress = gpuBaseAddress + + beforeControlSectionOffset; + + auto gpuFinalSyncAddress = gpuCrossTileSyncAddress + + sizeof(uint32_t); + + auto gpuStartAddress = gpuBaseAddress + + startOffset; + + auto result = commandList->appendBarrier(nullptr, 0, nullptr); + ASSERT_EQ(ZE_RESULT_SUCCESS, result); + + auto usedSpaceAfter = commandList->commandContainer.getCommandStream()->getUsed(); + ASSERT_GT(usedSpaceAfter, usedSpaceBefore); + size_t usedBuffer = usedSpaceAfter - usedSpaceBefore; + EXPECT_EQ(expectedUseBuffer, usedBuffer); + + void *cmdBuffer = ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), usedSpaceBefore); + size_t parsedOffset = 0; + + { + auto storeDataImm = genCmdCast(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, storeDataImm); + EXPECT_EQ(gpuFinalSyncAddress, storeDataImm->getAddress()); + EXPECT_EQ(0u, storeDataImm->getDataDword0()); + parsedOffset += sizeof(MI_STORE_DATA_IMM); + } + { + auto pipeControl = genCmdCast(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, pipeControl); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_FALSE(pipeControl->getDcFlushEnable()); + parsedOffset += sizeof(PIPE_CONTROL); + } + { + auto miAtomic = genCmdCast(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + auto miAtomicProgrammedAddress = NEO::UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(gpuCrossTileSyncAddress, miAtomicProgrammedAddress); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(MI_ATOMIC); + } + { + auto miSemaphore = genCmdCast(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphore); + EXPECT_EQ(gpuCrossTileSyncAddress, miSemaphore->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphore->getCompareOperation()); + EXPECT_EQ(2u, miSemaphore->getSemaphoreDataDword()); + parsedOffset += sizeof(MI_SEMAPHORE_WAIT); + } + { + auto bbStart = genCmdCast(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, bbStart); + EXPECT_EQ(gpuStartAddress, bbStart->getBatchBufferStartAddress()); + EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer()); + parsedOffset += sizeof(MI_BATCH_BUFFER_START); + } + { + auto crossField = reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); + EXPECT_EQ(0u, *crossField); + parsedOffset += sizeof(uint32_t); + auto finalField = reinterpret_cast(ptrOffset(cmdBuffer, parsedOffset)); + EXPECT_EQ(0u, *finalField); + parsedOffset += sizeof(uint32_t); + } + { + auto miAtomic = genCmdCast(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + auto miAtomicProgrammedAddress = NEO::UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(gpuFinalSyncAddress, miAtomicProgrammedAddress); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(MI_ATOMIC); + } + { + auto miSemaphore = genCmdCast(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphore); + EXPECT_EQ(gpuFinalSyncAddress, miSemaphore->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphore->getCompareOperation()); + EXPECT_EQ(2u, miSemaphore->getSemaphoreDataDword()); + parsedOffset += sizeof(MI_SEMAPHORE_WAIT); + } + { + auto storeDataImm = genCmdCast(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, storeDataImm); + EXPECT_EQ(gpuCrossTileSyncAddress, storeDataImm->getAddress()); + EXPECT_EQ(0u, storeDataImm->getDataDword0()); + parsedOffset += sizeof(MI_STORE_DATA_IMM); + } + { + auto miAtomic = genCmdCast(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miAtomic); + auto miAtomicProgrammedAddress = NEO::UnitTestHelper::getAtomicMemoryAddress(*miAtomic); + EXPECT_EQ(gpuFinalSyncAddress, miAtomicProgrammedAddress); + EXPECT_FALSE(miAtomic->getReturnDataControl()); + EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode()); + parsedOffset += sizeof(MI_ATOMIC); + } + { + auto miSemaphore = genCmdCast(ptrOffset(cmdBuffer, parsedOffset)); + ASSERT_NE(nullptr, miSemaphore); + EXPECT_EQ(gpuFinalSyncAddress, miSemaphore->getSemaphoreGraphicsAddress()); + EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphore->getCompareOperation()); + EXPECT_EQ(4u, miSemaphore->getSemaphoreDataDword()); + parsedOffset += sizeof(MI_SEMAPHORE_WAIT); + } + EXPECT_EQ(expectedUseBuffer, parsedOffset); +} + } // namespace ult -} // namespace L0 \ No newline at end of file +} // namespace L0