From 56bef797338a0463c01ebfd0a134d739fc2c6748 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Thu, 9 Dec 2021 19:31:27 +0000 Subject: [PATCH] Add multi tile support for OCL post sync barrier Related-To: NEO-6262 Signed-off-by: Zbigniew Zdanowicz --- .../command_stream_receiver_hw_1_tests.cpp | 44 +++++ ...tream_receiver_hw_tests_xehp_and_later.cpp | 168 ++++++++++++++++++ .../command_stream_receiver_hw.h | 2 + .../command_stream_receiver_hw_base.inl | 15 +- ...mmand_stream_receiver_hw_bdw_and_later.inl | 18 ++ ...mand_stream_receiver_hw_xehp_and_later.inl | 37 ++++ .../libult/ult_command_stream_receiver.h | 2 + 7 files changed, 274 insertions(+), 12 deletions(-) diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp index 0c8bdc2b24..51529ce228 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp @@ -1424,3 +1424,47 @@ HWCMDTEST_F(IGFX_GEN8_CORE, UltCommandStreamReceiverTest, WhenProgrammingActiveP size_t usedAfter = commandStreamReceiver.commandStream.getUsed(); EXPECT_EQ(usedBefore, usedAfter); } + +HWCMDTEST_F(IGFX_GEN8_CORE, UltCommandStreamReceiverTest, givenBarrierNodeSetWhenProgrammingBarrierCommandThenExpectPostSyncPipeControl) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto &hwInfo = pDevice->getHardwareInfo(); + auto commandStreamReceiver = &pDevice->getUltCommandStreamReceiver(); + + auto &commandStreamCSR = commandStreamReceiver->getCS(); + + TagNodeBase *tagNode = commandStreamReceiver->getTimestampPacketAllocator()->getTag(); + uint64_t gpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tagNode); + + TimestampPacketDependencies timestampPacketDependencies; + timestampPacketDependencies.barrierNodes.add(tagNode); + + DispatchFlags dispatchFlags = 
DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies.barrierNodes; + + size_t expectedCmdSize = MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo); + size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags); + EXPECT_EQ(expectedCmdSize, estimatedCmdSize); + + commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags); + EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed()); + + parseCommands(commandStreamCSR, 0); + findHardwareCommands(); + auto cmdItor = cmdList.begin(); + + if (MemorySynchronizationCommands::isPipeControlWArequired(hwInfo)) { + PIPE_CONTROL *pipeControl = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, pipeControl); + cmdItor++; + if (MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(hwInfo) > 0) { + cmdItor++; + } + } + PIPE_CONTROL *pipeControl = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation()); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(0u, pipeControl->getImmediateData()); + EXPECT_EQ(gpuAddress, UnitTestHelper::getPipeControlPostSyncAddress(*pipeControl)); +} diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp index af8312f3c6..d2f481e05f 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp @@ -950,3 +950,171 @@ HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionEnabledWh EXPECT_EQ(estimatedCmdSize, offset); } + +HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, 
givenStaticPartitionEnabledWhenSinglePartitionUsedForPostSyncBarrierThenExpectOnlyPostSyncCommands, IsAtLeastXeHpCore) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto &hwInfo = pDevice->getHardwareInfo(); + + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + auto &commandStreamCSR = commandStreamReceiver->getCS(); + + TagNodeBase *tagNode = commandStreamReceiver->getTimestampPacketAllocator()->getTag(); + uint64_t gpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tagNode); + + TimestampPacketDependencies timestampPacketDependencies; + timestampPacketDependencies.barrierNodes.add(tagNode); + + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies.barrierNodes; + + commandStreamReceiver->staticWorkPartitioningEnabled = true; + commandStreamReceiver->activePartitions = 1; + + size_t expectedCmdSize = MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo); + size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags); + EXPECT_EQ(expectedCmdSize, estimatedCmdSize); + + commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags); + EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed()); + + parseCommands(commandStreamCSR, 0); + findHardwareCommands(); + auto cmdItor = cmdList.begin(); + + if (MemorySynchronizationCommands::isPipeControlWArequired(hwInfo)) { + PIPE_CONTROL *pipeControl = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, pipeControl); + cmdItor++; + if (MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(hwInfo) > 0) { + cmdItor++; + } + } + PIPE_CONTROL *pipeControl = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, pipeControl); + 
EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation()); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(0u, pipeControl->getImmediateData()); + EXPECT_EQ(gpuAddress, UnitTestHelper::getPipeControlPostSyncAddress(*pipeControl)); +} + +HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionDisabledWhenMultiplePartitionsUsedForPostSyncBarrierThenExpectOnlyPostSyncCommands, IsAtLeastXeHpCore) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto &hwInfo = pDevice->getHardwareInfo(); + + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + auto &commandStreamCSR = commandStreamReceiver->getCS(); + + TagNodeBase *tagNode = commandStreamReceiver->getTimestampPacketAllocator()->getTag(); + uint64_t gpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tagNode); + + TimestampPacketDependencies timestampPacketDependencies; + timestampPacketDependencies.barrierNodes.add(tagNode); + + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies.barrierNodes; + + commandStreamReceiver->staticWorkPartitioningEnabled = false; + commandStreamReceiver->activePartitions = 2; + + size_t expectedCmdSize = MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo); + size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags); + EXPECT_EQ(expectedCmdSize, estimatedCmdSize); + + commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags); + EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed()); + + parseCommands(commandStreamCSR, 0); + findHardwareCommands(); + auto cmdItor = cmdList.begin(); + + if 
(MemorySynchronizationCommands::isPipeControlWArequired(hwInfo)) { + PIPE_CONTROL *pipeControl = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, pipeControl); + cmdItor++; + if (MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(hwInfo) > 0) { + cmdItor++; + } + } + PIPE_CONTROL *pipeControl = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation()); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(0u, pipeControl->getImmediateData()); + EXPECT_EQ(gpuAddress, UnitTestHelper::getPipeControlPostSyncAddress(*pipeControl)); +} + +HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionEnabledWhenMultiplePartitionsUsedThenExpectImplicitScalingPostSyncBarrierWithoutSelfCleanup, IsAtLeastXeHpCore) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + using MI_ATOMIC = typename FamilyType::MI_ATOMIC; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + auto &hwInfo = pDevice->getHardwareInfo(); + + auto commandStreamReceiver = new MockCsrHw(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + pDevice->resetCommandStreamReceiver(commandStreamReceiver); + auto &commandStreamCSR = commandStreamReceiver->getCS(); + + TagNodeBase *tagNode = commandStreamReceiver->getTimestampPacketAllocator()->getTag(); + uint64_t gpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tagNode); + + TimestampPacketDependencies timestampPacketDependencies; + timestampPacketDependencies.barrierNodes.add(tagNode); + + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies.barrierNodes; + + commandStreamReceiver->staticWorkPartitioningEnabled = true; + 
commandStreamReceiver->activePartitions = 2; + + size_t expectedSize = MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo) + + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + + sizeof(MI_BATCH_BUFFER_START) + + 2 * sizeof(uint32_t); + size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags); + EXPECT_EQ(expectedSize, estimatedCmdSize); + + commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags); + EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed()); + EXPECT_EQ(2u, tagNode->getPacketsUsed()); + + parseCommands(commandStreamCSR, 0); + findHardwareCommands(); + auto cmdItor = cmdList.begin(); + + if (MemorySynchronizationCommands::isPipeControlWArequired(hwInfo)) { + PIPE_CONTROL *pipeControl = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, pipeControl); + cmdItor++; + if (MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(hwInfo) > 0) { + cmdItor++; + } + } + PIPE_CONTROL *pipeControl = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation()); + EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); + EXPECT_EQ(0u, pipeControl->getImmediateData()); + EXPECT_EQ(gpuAddress, UnitTestHelper::getPipeControlPostSyncAddress(*pipeControl)); + EXPECT_TRUE(pipeControl->getWorkloadPartitionIdOffsetEnable()); + cmdItor++; + + if (MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(hwInfo) > 0) { + cmdItor++; + } + + MI_ATOMIC *miAtomic = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, miAtomic); + cmdItor++; + + MI_SEMAPHORE_WAIT *miSemaphore = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, miSemaphore); + cmdItor++; + + MI_BATCH_BUFFER_START *bbStart = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, bbStart); +} diff --git a/shared/source/command_stream/command_stream_receiver_hw.h 
b/shared/source/command_stream/command_stream_receiver_hw.h index eeef4ac23d..2055ea6750 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -70,6 +70,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { size_t getCmdSizeForActivePartitionConfig() const; size_t getCmdSizeForStallingCommands(const DispatchFlags &dispatchFlags) const; size_t getCmdSizeForStallingNoPostSyncCommands() const; + size_t getCmdSizeForStallingPostSyncCommands() const; bool isComputeModeNeeded() const; bool isPipelineSelectAlreadyProgrammed() const; @@ -149,6 +150,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads); void programStallingCommandsForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags); void programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream); + void programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode); void programEngineModeCommands(LinearStream &csr, const DispatchFlags &dispatchFlags); void programEngineModeEpliogue(LinearStream &csr, const DispatchFlags &dispatchFlags); void programActivePartitionConfigFlushTask(LinearStream &csr); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 60655e0ce4..50c2a5773a 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -660,17 +660,8 @@ inline void CommandStreamReceiverHw::programStallingCommandsForBarrie auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes; if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() != 0) { - auto barrierTimestampPacketGpuAddress = 
TimestampPacketHelper::getContextEndGpuAddress(*dispatchFlags.barrierTimestampPacketNodes->peekNodes()[0]); - - PipeControlArgs args(true); - MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( - cmdStream, - PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, - barrierTimestampPacketGpuAddress, - 0, - peekHwInfo(), - args); - dispatchFlags.barrierTimestampPacketNodes->makeResident(*this); + programStallingPostSyncCommandsForBarrier(cmdStream, *barrierTimestampPacketNodes->peekNodes()[0]); + barrierTimestampPacketNodes->makeResident(*this); } else { programStallingNoPostSyncCommandsForBarrier(cmdStream); } @@ -1474,7 +1465,7 @@ template size_t CommandStreamReceiverHw::getCmdSizeForStallingCommands(const DispatchFlags &dispatchFlags) const { auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes; if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() > 0) { - return MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(peekHwInfo()); + return getCmdSizeForStallingPostSyncCommands(); } else { return getCmdSizeForStallingNoPostSyncCommands(); } diff --git a/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl index 65e7abc29a..f706592007 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl @@ -159,12 +159,30 @@ inline size_t CommandStreamReceiverHw::getCmdSizeForStallingNoPostSyn return sizeof(typename GfxFamily::PIPE_CONTROL); } +template +inline size_t CommandStreamReceiverHw::getCmdSizeForStallingPostSyncCommands() const { + return MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(peekHwInfo()); +} + template inline void CommandStreamReceiverHw::programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream) { 
PipeControlArgs args; MemorySynchronizationCommands::addPipeControl(cmdStream, args); } +template +inline void CommandStreamReceiverHw::programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode) { + auto barrierTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(tagNode); + PipeControlArgs args(true); + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + cmdStream, + PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + barrierTimestampPacketGpuAddress, + 0, + peekHwInfo(), + args); +} + template inline void CommandStreamReceiverHw::configurePostSyncWriteOffset() { } diff --git a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl index 554db51e43..0106a19614 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl @@ -206,6 +206,17 @@ inline size_t CommandStreamReceiverHw::getCmdSizeForStallingNoPostSyn } } +template +inline size_t CommandStreamReceiverHw::getCmdSizeForStallingPostSyncCommands() const { + if (this->activePartitions > 1 && this->staticWorkPartitioningEnabled) { + return ImplicitScalingDispatch::getBarrierSize(peekHwInfo(), + false, + true); + } else { + return MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(peekHwInfo()); + } +} + template inline void CommandStreamReceiverHw::programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream) { PipeControlArgs args; @@ -223,6 +234,32 @@ inline void CommandStreamReceiverHw::programStallingNoPostSyncCommand } } +template +inline void CommandStreamReceiverHw::programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode) { + auto barrierTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(tagNode); + PipeControlArgs 
args(true); + if (this->activePartitions > 1 && this->staticWorkPartitioningEnabled) { + args.workloadPartitionOffset = true; + ImplicitScalingDispatch::dispatchBarrierCommands(cmdStream, + this->deviceBitfield, + args, + peekHwInfo(), + barrierTimestampPacketGpuAddress, + 0, + false, + false); + tagNode.setPacketsUsed(this->activePartitions); + } else { + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + cmdStream, + PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + barrierTimestampPacketGpuAddress, + 0, + peekHwInfo(), + args); + } +} + template inline void CommandStreamReceiverHw::configurePostSyncWriteOffset() { this->postSyncWriteOffset = ImplicitScalingDispatch::getPostSyncOffset(); diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index 0f4a876e70..309b5649cc 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -48,7 +48,9 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ using BaseClass::programEnginePrologue; using BaseClass::programPerDssBackedBuffer; using BaseClass::programPreamble; + using BaseClass::programStallingCommandsForBarrier; using BaseClass::programStallingNoPostSyncCommandsForBarrier; + using BaseClass::programStallingPostSyncCommandsForBarrier; using BaseClass::programStateSip; using BaseClass::programVFEState; using BaseClass::requiresInstructionCacheFlush;