From b3ebcfe811e95001866133065be33906291fdf8f Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Tue, 27 Jun 2023 13:42:31 +0000 Subject: [PATCH] performance: immediate flush add ending commands to command list buffer Related-To: NEO-7808 Signed-off-by: Zbigniew Zdanowicz --- .../command_stream_receiver_hw.h | 4 + .../command_stream_receiver_hw_base.inl | 34 ++++++++ .../source/command_stream/csr_definitions.h | 2 + shared/test/common/cmd_parse/hw_parse.h | 13 +++ .../command_stream_receiver_tests.cpp | 81 +++++++++++++++++++ 5 files changed, 134 insertions(+) diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index 4fb15571af..b94a39eff4 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -26,6 +26,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { struct ImmediateFlushData { PipelineSelectArgs pipelineSelectArgs{}; size_t estimatedSize = 0; + void *endPtr = nullptr; bool pipelineSelectFullConfigurationNeeded = false; bool pipelineSelectDirty = false; @@ -257,6 +258,9 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { size_t immediateCommandStreamStart, ImmediateFlushData &flushData, LinearStream &csrStream); + inline void dispatchImmediateFlushClientBufferCommands(ImmediateDispatchFlags &dispatchFlags, + LinearStream &immediateCommandStream, + ImmediateFlushData &flushData); inline void handleImmediateFlushAllocationsResidency(Device &device); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index fa49699d1b..a46e28bbbb 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -317,8 +317,12 @@ CompletionStamp CommandStreamReceiverHw::flushImmediateTask( dispatchImmediateFlushJumpToImmediateCommand(immediateCommandStream, immediateCommandStreamStart, flushData, csrCommandStream); + dispatchImmediateFlushClientBufferCommands(dispatchFlags, immediateCommandStream, flushData); + this->latestSentTaskCount = taskCount + 1; + handleImmediateFlushAllocationsResidency(device); + ++taskCount; CompletionStamp completionStamp = { this->taskCount, this->taskLevel, @@ -2041,6 +2045,8 @@ void CommandStreamReceiverHw::dispatchImmediateFlushOneTimeContextIni template void CommandStreamReceiverHw::handleImmediateFlushAllocationsResidency(Device &device) { + this->makeResident(*tagAllocation); + if (globalFenceAllocation) { makeResident(*globalFenceAllocation); } @@ -2075,4 +2081,32 @@ void CommandStreamReceiverHw::dispatchImmediateFlushJumpToImmediateCo } } +template +void CommandStreamReceiverHw::dispatchImmediateFlushClientBufferCommands(ImmediateDispatchFlags &dispatchFlags, + LinearStream &immediateCommandStream, + ImmediateFlushData &flushData) { + if (dispatchFlags.blockingAppend) { + auto address = getTagAllocation()->getGpuAddress(); + + PipeControlArgs args = {}; + args.dcFlushEnable = this->dcFlushSupport; + args.notifyEnable = isUsedNotifyEnableForPostSync(); + args.workloadPartitionOffset = isMultiTileOperationEnabled(); + MemorySynchronizationCommands::addBarrierWithPostSyncOperation( + immediateCommandStream, + PostSyncMode::ImmediateData, + address, + this->taskCount + 1, + peekRootDeviceEnvironment(), + args); + + this->latestFlushedTaskCount = this->taskCount + 1; + } + + makeResident(*immediateCommandStream.getGraphicsAllocation()); + + programEndingCmd(immediateCommandStream, &flushData.endPtr, isDirectSubmissionEnabled(), dispatchFlags.hasRelaxedOrderingDependencies, true); + EncodeNoop::alignToCacheLine(immediateCommandStream); +} + } // namespace NEO diff --git a/shared/source/command_stream/csr_definitions.h b/shared/source/command_stream/csr_definitions.h index 62dab8562d..9dbb234e8e 100644 --- a/shared/source/command_stream/csr_definitions.h +++ b/shared/source/command_stream/csr_definitions.h @@ -139,6 +139,8 @@ struct CsrSizeRequestFlags { struct ImmediateDispatchFlags { StreamProperties *requiredState = nullptr; void *sshCpuBase = nullptr; + bool blockingAppend = false; + bool hasRelaxedOrderingDependencies = false; }; } // namespace NEO diff --git a/shared/test/common/cmd_parse/hw_parse.h b/shared/test/common/cmd_parse/hw_parse.h index 70c3da1fc2..523935e304 100644 --- a/shared/test/common/cmd_parse/hw_parse.h +++ b/shared/test/common/cmd_parse/hw_parse.h @@ -130,6 +130,19 @@ struct HardwareParse { return getCommand(cmdList.begin(), cmdList.end()); } + template + GenCmdList::iterator getCommandItor(GenCmdList::iterator itorStart, GenCmdList::iterator itorEnd) { + auto itorCmd = find(itorStart, itorEnd); + return itorCmd != cmdList.end() + ? itorCmd + : cmdList.end(); + } + + template + GenCmdList::iterator getCommandItor() { + return getCommandItor(cmdList.begin(), cmdList.end()); + } + template int getNumberOfPipelineSelectsThatEnablePipelineSelect() { using PIPELINE_SELECT = typename FamilyType::PIPELINE_SELECT; diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 0585de09c0..a9dd2ceec8 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -3813,3 +3813,84 @@ HWTEST2_F(CommandStreamReceiverHwTest, bbStartCmd = hwParserCsr.getCommand(); ASSERT_EQ(nullptr, bbStartCmd); } + +HWTEST2_F(CommandStreamReceiverHwTest, + givenImmediateFlushTaskWhenBlockingCallSelectedThenDispatchPipeControlPostSyncToImmediateBatchBuffer, + IsAtLeastXeHpCore) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END; + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + bool additionalSyncCmd = NEO::MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(commandStreamReceiver.peekRootDeviceEnvironment()) > 0; + commandStreamReceiver.storeMakeResidentAllocations = true; + + auto startOffset = commandStream.getUsed(); + auto immediateListCmdBufferAllocation = commandStream.getGraphicsAllocation(); + *commandStream.getSpaceForCmd() = FamilyType::cmdInitGpgpuWalker; + auto csrTagAllocation = commandStreamReceiver.getTagAllocation(); + uint64_t postsyncAddress = csrTagAllocation->getGpuAddress(); + + immediateFlushTaskFlags.blockingAppend = true; + auto completionStamp = commandStreamReceiver.flushImmediateTask(commandStream, startOffset, immediateFlushTaskFlags, *pDevice); + EXPECT_EQ(1u, completionStamp.taskCount); + EXPECT_EQ(1u, commandStreamReceiver.taskCount); + EXPECT_EQ(1u, commandStreamReceiver.latestSentTaskCount); + EXPECT_EQ(1u, commandStreamReceiver.latestFlushedTaskCount); + + HardwareParse hwParserCsr; + hwParserCsr.parseCommands(commandStream, 0); + auto cmdItor = hwParserCsr.getCommandItor(); + ASSERT_NE(hwParserCsr.cmdList.end(), cmdItor); + auto pipeControlCmd = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, pipeControlCmd); + EXPECT_EQ(postsyncAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*pipeControlCmd)); + EXPECT_EQ(1u, pipeControlCmd->getImmediateData()); + + cmdItor++; + ASSERT_NE(hwParserCsr.cmdList.end(), cmdItor); + if (additionalSyncCmd) { + cmdItor++; + ASSERT_NE(hwParserCsr.cmdList.end(), cmdItor); + } + auto bbEndCmd = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, bbEndCmd); + + EXPECT_TRUE(commandStreamReceiver.isMadeResident(csrTagAllocation)); + EXPECT_TRUE(commandStreamReceiver.isMadeResident(immediateListCmdBufferAllocation)); + + startOffset = commandStream.getUsed(); + EXPECT_EQ(0u, (startOffset % MemoryConstants::cacheLineSize)); + + *commandStream.getSpaceForCmd() = FamilyType::cmdInitGpgpuWalker; + + completionStamp = commandStreamReceiver.flushImmediateTask(commandStream, startOffset, immediateFlushTaskFlags, *pDevice); + EXPECT_EQ(2u, completionStamp.taskCount); + EXPECT_EQ(2u, commandStreamReceiver.taskCount); + EXPECT_EQ(2u, commandStreamReceiver.latestSentTaskCount); + EXPECT_EQ(2u, commandStreamReceiver.latestFlushedTaskCount); + + hwParserCsr.tearDown(); + hwParserCsr.parseCommands(commandStream, startOffset); + cmdItor = hwParserCsr.getCommandItor(); + ASSERT_NE(hwParserCsr.cmdList.end(), cmdItor); + pipeControlCmd = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, pipeControlCmd); + EXPECT_EQ(postsyncAddress, NEO::UnitTestHelper::getPipeControlPostSyncAddress(*pipeControlCmd)); + EXPECT_EQ(2u, pipeControlCmd->getImmediateData()); + + cmdItor++; + ASSERT_NE(hwParserCsr.cmdList.end(), cmdItor); + if (additionalSyncCmd) { + cmdItor++; + ASSERT_NE(hwParserCsr.cmdList.end(), cmdItor); + } + bbEndCmd = genCmdCast(*cmdItor); + ASSERT_NE(nullptr, bbEndCmd); + + EXPECT_TRUE(commandStreamReceiver.isMadeResident(csrTagAllocation)); + EXPECT_TRUE(commandStreamReceiver.isMadeResident(immediateListCmdBufferAllocation)); + + startOffset = commandStream.getUsed(); + EXPECT_EQ(0u, (startOffset % MemoryConstants::cacheLineSize)); +}