diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 62bad2d7ee..a91ffb8d58 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -244,6 +244,8 @@ struct CommandListCoreFamily : CommandListImp { void programThreadArbitrationPolicy(Device *device); void appendComputeBarrierCommand(); NEO::PipeControlArgs createBarrierFlags(); + void appendMultiTileBarrier(NEO::Device &neoDevice); + size_t estimateBufferSizeMultiTileBarrier(const NEO::HardwareInfo &hwInfo); uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region); MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize, bool hostCopyAllowed); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 1b1934b6b3..a7d686f2e3 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -295,8 +295,7 @@ ze_result_t CommandListCoreFamily::appendEventReset(ze_event_hand auto event = Event::fromHandle(hEvent); uint64_t baseAddr = event->getGpuAddress(this->device); - - uint32_t packetsToReset = 1; + uint32_t packetsToReset = event->getPacketsInUse(); NEO::Device *neoDevice = device->getNEODevice(); uint32_t callId = 0; @@ -312,8 +311,8 @@ ze_result_t CommandListCoreFamily::appendEventReset(ze_event_hand if (event->isEventTimestampFlagSet()) { baseAddr += event->getContextEndOffset(); packetsToReset = EventPacketsCount::eventPackets; - event->resetPackets(); } + event->resetPackets(); commandContainer.addToResidencyContainer(&event->getAllocation(this->device)); if (isCopyOnly()) { NEO::MiFlushArgs args; @@ -324,11 +323,15 @@ ze_result_t CommandListCoreFamily::appendEventReset(ze_event_hand } else { NEO::PipeControlArgs args; if (NEO::MemorySynchronizationCommands::isDcFlushAllowed()) { - args.dcFlushEnable = (!event->signalScope) ? false : true; + args.dcFlushEnable = !!event->signalScope; } - auto &hwInfo = device->getNEODevice()->getHardwareInfo(); - increaseCommandStreamSpace(NEO::MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo) * packetsToReset); + auto &hwInfo = neoDevice->getHardwareInfo(); + size_t estimateSize = NEO::MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo) * packetsToReset; + if (this->partitionCount > 1) { + estimateSize += estimateBufferSizeMultiTileBarrier(hwInfo); + } + increaseCommandStreamSpace(estimateSize); for (uint32_t i = 0u; i < packetsToReset; i++) { NEO::MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( @@ -340,6 +343,9 @@ ze_result_t CommandListCoreFamily::appendEventReset(ze_event_hand args); baseAddr += event->getSinglePacketSize(); } + if (this->partitionCount > 1) { + appendMultiTileBarrier(*neoDevice); + } } if (NEO::DebugManager.flags.EnableSWTags.get()) { diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index 947174868b..905cfd980a 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -191,9 +191,18 @@ void CommandListCoreFamily::appendComputeBarrierCommand() { } template -NEO::PipeControlArgs CommandListCoreFamily::createBarrierFlags() { +inline NEO::PipeControlArgs CommandListCoreFamily::createBarrierFlags() { NEO::PipeControlArgs args; return args; } +template +inline void CommandListCoreFamily::appendMultiTileBarrier(NEO::Device &neoDevice) { +} + +template +inline size_t CommandListCoreFamily::estimateBufferSizeMultiTileBarrier(const NEO::HardwareInfo &hwInfo) { + return 0; +} + } // namespace L0 diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 3a280479c5..4e14d39b12 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -313,25 +313,14 @@ void CommandListCoreFamily::appendMultiPartitionEpilogue() { template void CommandListCoreFamily::appendComputeBarrierCommand() { - NEO::PipeControlArgs args = createBarrierFlags(); if (this->partitionCount > 1) { auto neoDevice = device->getNEODevice(); auto &hwInfo = neoDevice->getHardwareInfo(); - size_t estimatedSizeRequired = NEO::ImplicitScalingDispatch::getBarrierSize(hwInfo, - true, - false); - increaseCommandStreamSpace(estimatedSizeRequired); - - NEO::ImplicitScalingDispatch::dispatchBarrierCommands(*commandContainer.getCommandStream(), - neoDevice->getDeviceBitfield(), - args, - hwInfo, - 0, - 0, - true, - true); + increaseCommandStreamSpace(estimateBufferSizeMultiTileBarrier(hwInfo)); + appendMultiTileBarrier(*neoDevice); } else { + NEO::PipeControlArgs args = createBarrierFlags(); size_t estimatedSizeRequired = NEO::MemorySynchronizationCommands::getSizeForSinglePipeControl(); increaseCommandStreamSpace(estimatedSizeRequired); NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); @@ -345,4 +334,25 @@ NEO::PipeControlArgs CommandListCoreFamily::createBarrierFlags() return args; } +template +void CommandListCoreFamily::appendMultiTileBarrier(NEO::Device &neoDevice) { + NEO::PipeControlArgs args = createBarrierFlags(); + auto &hwInfo = neoDevice.getHardwareInfo(); + NEO::ImplicitScalingDispatch::dispatchBarrierCommands(*commandContainer.getCommandStream(), + neoDevice.getDeviceBitfield(), + args, + hwInfo, + 0, + 0, + true, + true); +} + +template +inline size_t CommandListCoreFamily::estimateBufferSizeMultiTileBarrier(const NEO::HardwareInfo &hwInfo) { + return NEO::ImplicitScalingDispatch::getBarrierSize(hwInfo, + true, + false); +} + } // namespace L0 diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 5b5335f2fd..83afecfbfe 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -35,11 +35,13 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::appendLaunchKernelWithParams; using BaseClass::appendMemoryCopyBlit; using BaseClass::appendMemoryCopyBlitRegion; + using BaseClass::appendMultiTileBarrier; using BaseClass::appendSignalEventPostWalker; using BaseClass::appendWriteKernelTimestamp; using BaseClass::applyMemoryRangesBarrier; using BaseClass::clearCommandsToPatch; using BaseClass::cmdQImmediate; + using BaseClass::commandContainer; using BaseClass::commandListPerThreadScratchSize; using BaseClass::commandListPreemptionMode; using BaseClass::commandsToPatch; @@ -47,6 +49,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::containsCooperativeKernelsFlag; using BaseClass::csr; using BaseClass::engineGroupType; + using BaseClass::estimateBufferSizeMultiTileBarrier; using BaseClass::finalStreamState; using BaseClass::flags; using BaseClass::getAlignedAllocation; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp index ee930f4eb6..4ac556e0a2 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp @@ -668,5 +668,26 @@ HWTEST2_F(HostPointerManagerCommandListTest, givenDebugModeToRegisterAllHostPoin EXPECT_EQ(ZE_RESULT_SUCCESS, result); } +using SingleTileOnlyPlatforms = IsWithinGfxCore; +HWTEST2_F(CommandListCreate, givenSingleTileOnlyPlatformsWhenProgrammingMultiTileBarrierThenNoProgrammingIsExpected, SingleTileOnlyPlatforms) { + using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; + + auto neoDevice = device->getNEODevice(); + auto &hwInfo = neoDevice->getHardwareInfo(); + + auto commandList = std::make_unique<::L0::ult::CommandListCoreFamily>(); + ASSERT_NE(nullptr, commandList); + ze_result_t returnValue = commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + EXPECT_EQ(0u, commandList->estimateBufferSizeMultiTileBarrier(hwInfo)); + + auto cmdListStream = commandList->commandContainer.getCommandStream(); + size_t usedBefore = cmdListStream->getUsed(); + commandList->appendMultiTileBarrier(*neoDevice); + size_t usedAfter = cmdListStream->getUsed(); + EXPECT_EQ(usedBefore, usedAfter); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp index c3ef359440..a2e4673da9 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp @@ -224,5 +224,81 @@ HWTEST2_F(CommandListAppendEventReset, givenEventWithHostScopeUsedInResetThenPip } ASSERT_TRUE(postSyncFound); } + +HWTEST2_F(CommandListAppendEventReset, + givenMultiTileCommandListWhenAppendingMultiPacketEventThenExpectSameNumberOfResetPostSyncAndMultiBarrierCommands, IsAtLeastXeHpCore) { + using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END; + using MI_ATOMIC = typename FamilyType::MI_ATOMIC; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + auto commandList = std::make_unique<::L0::ult::CommandListCoreFamily>(); + ASSERT_NE(nullptr, commandList); + ze_result_t returnValue = commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + auto cmdStream = commandList->commandContainer.getCommandStream(); + + size_t useSize = cmdStream->getAvailableSpace(); + useSize -= sizeof(MI_BATCH_BUFFER_END); + cmdStream->getSpace(useSize); + + constexpr uint32_t packets = 2u; + event->setPacketsInUse(packets); + event->setEventTimestampFlag(false); + event->signalScope = ZE_EVENT_SCOPE_FLAG_HOST; + + commandList->partitionCount = packets; + returnValue = commandList->appendEventReset(event->toHandle()); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + EXPECT_EQ(1u, event->getPacketsInUse()); + + auto gpuAddress = event->getGpuAddress(device); + auto &hwInfo = device->getNEODevice()->getHardwareInfo(); + + size_t expectedSize = NEO::MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo) * packets + + commandList->estimateBufferSizeMultiTileBarrier(hwInfo); + size_t usedSize = cmdStream->getUsed(); + EXPECT_EQ(expectedSize, usedSize); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + cmdStream->getCpuBase(), + usedSize)); + + auto pipeControlList = findAll(cmdList.begin(), cmdList.end()); + ASSERT_NE(0u, pipeControlList.size()); + uint32_t postSyncFound = 0; + auto postSyncPipeControlItor = cmdList.end(); + for (auto &it : pipeControlList) { + auto cmd = genCmdCast(*it); + if (cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + EXPECT_EQ(cmd->getImmediateData(), Event::STATE_CLEARED); + EXPECT_TRUE(cmd->getCommandStreamerStallEnable()); + EXPECT_EQ(gpuAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*cmd)); + EXPECT_EQ(MemorySynchronizationCommands::isDcFlushAllowed(), cmd->getDcFlushEnable()); + postSyncFound++; + gpuAddress += event->getSinglePacketSize(); + postSyncPipeControlItor = it; + } + } + EXPECT_EQ(packets, postSyncFound); + postSyncPipeControlItor++; + ASSERT_NE(cmdList.end(), postSyncPipeControlItor); + + //find multi tile barrier section: pipe control + atomic/semaphore + auto itorPipeControl = find(postSyncPipeControlItor, cmdList.end()); + ASSERT_NE(cmdList.end(), itorPipeControl); + + auto itorAtomic = find(itorPipeControl, cmdList.end()); + ASSERT_NE(cmdList.end(), itorAtomic); + + auto itorSemaphore = find(itorAtomic, cmdList.end()); + ASSERT_NE(cmdList.end(), itorSemaphore); +} + } // namespace ult } // namespace L0