From eb4e7fb2a69ff3313300708f50037b778f6ea1c6 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Tue, 27 Jun 2023 19:54:20 +0000 Subject: [PATCH] performance: immediate flush add flushing mechanism to gpu Related-To: NEO-7808 Signed-off-by: Zbigniew Zdanowicz --- .../command_stream_receiver_hw.h | 12 +- .../command_stream_receiver_hw_base.inl | 78 +++++++++++-- .../source/command_stream/csr_definitions.h | 1 + .../command_stream_receiver_tests.cpp | 105 ++++++++++++++++++ 4 files changed, 184 insertions(+), 12 deletions(-) diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index b94a39eff4..276d39e9ea 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -27,6 +27,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { PipelineSelectArgs pipelineSelectArgs{}; size_t estimatedSize = 0; void *endPtr = nullptr; + size_t csrStartOffset = 0; bool pipelineSelectFullConfigurationNeeded = false; bool pipelineSelectDirty = false; @@ -262,7 +263,16 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { LinearStream &immediateCommandStream, ImmediateFlushData &flushData); - inline void handleImmediateFlushAllocationsResidency(Device &device); + inline void handleImmediateFlushAllocationsResidency(Device &device, + LinearStream &immediateCommandStream, + ImmediateFlushData &flushData, + LinearStream &csrStream); + + inline CompletionStamp handleImmediateFlushSendBatchBuffer(LinearStream &immediateCommandStream, + size_t immediateCommandStreamStart, + ImmediateDispatchFlags &dispatchFlags, + ImmediateFlushData &flushData, + LinearStream &csrStream); HeapDirtyState dshState; HeapDirtyState iohState; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index a46e28bbbb..fb03929396 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -308,6 +308,7 @@ CompletionStamp CommandStreamReceiverHw::flushImmediateTask( handleImmediateFlushJumpToImmediate(flushData); auto &csrCommandStream = getCS(flushData.estimatedSize); + flushData.csrStartOffset = csrCommandStream.getUsed(); dispatchImmediateFlushPipelineSelectCommand(flushData, csrCommandStream); dispatchImmediateFlushFrontEndCommand(scratchAddress, flushData, device, csrCommandStream); @@ -318,16 +319,17 @@ CompletionStamp CommandStreamReceiverHw::flushImmediateTask( dispatchImmediateFlushJumpToImmediateCommand(immediateCommandStream, immediateCommandStreamStart, flushData, csrCommandStream); dispatchImmediateFlushClientBufferCommands(dispatchFlags, immediateCommandStream, flushData); - this->latestSentTaskCount = taskCount + 1; - handleImmediateFlushAllocationsResidency(device); + handleImmediateFlushAllocationsResidency(device, + immediateCommandStream, + flushData, + csrCommandStream); - ++taskCount; - CompletionStamp completionStamp = { - this->taskCount, - this->taskLevel, - flushStamp->peekStamp()}; - return completionStamp; + return handleImmediateFlushSendBatchBuffer(immediateCommandStream, + immediateCommandStreamStart, + dispatchFlags, + flushData, + csrCommandStream); } template @@ -2044,7 +2046,10 @@ void CommandStreamReceiverHw::dispatchImmediateFlushOneTimeContextIni } template -void CommandStreamReceiverHw::handleImmediateFlushAllocationsResidency(Device &device) { +void CommandStreamReceiverHw::handleImmediateFlushAllocationsResidency(Device &device, + LinearStream &immediateCommandStream, + ImmediateFlushData &flushData, + LinearStream &csrStream) { this->makeResident(*tagAllocation); if (globalFenceAllocation) { @@ -2058,6 +2063,10 @@ void CommandStreamReceiverHw::handleImmediateFlushAllocationsResidenc if (device.getRTMemoryBackedBuffer()) { makeResident(*device.getRTMemoryBackedBuffer()); } + + if (flushData.estimatedSize > 0) { + makeResident(*csrStream.getGraphicsAllocation()); + } } template @@ -2099,8 +2108,6 @@ void CommandStreamReceiverHw::dispatchImmediateFlushClientBufferComma this->taskCount + 1, peekRootDeviceEnvironment(), args); - - this->latestFlushedTaskCount = this->taskCount + 1; } makeResident(*immediateCommandStream.getGraphicsAllocation()); @@ -2109,4 +2116,53 @@ void CommandStreamReceiverHw::dispatchImmediateFlushClientBufferComma EncodeNoop::alignToCacheLine(immediateCommandStream); } +template +CompletionStamp CommandStreamReceiverHw::handleImmediateFlushSendBatchBuffer(LinearStream &immediateCommandStream, + size_t immediateCommandStreamStart, + ImmediateDispatchFlags &dispatchFlags, + ImmediateFlushData &flushData, + LinearStream &csrStream) { + this->latestSentTaskCount = taskCount + 1; + + bool startFromCsr = flushData.estimatedSize > 0; + size_t startOffset = startFromCsr ? flushData.csrStartOffset : immediateCommandStreamStart; + auto &streamToSubmit = startFromCsr ? csrStream : immediateCommandStream; + GraphicsAllocation *chainedBatchBuffer = startFromCsr ? immediateCommandStream.getGraphicsAllocation() : nullptr; + size_t chainedBatchBufferStartOffset = startFromCsr ? csrStream.getUsed() : 0; + uint64_t taskStartAddress = immediateCommandStream.getGpuBase() + immediateCommandStreamStart; + bool hasStallingCmds = (startFromCsr || dispatchFlags.blockingAppend || dispatchFlags.hasStallingCmds); + + constexpr bool immediateRequiresCoherency = false; + constexpr bool immediateLowPriority = false; + constexpr QueueThrottle immediateThrottle = QueueThrottle::MEDIUM; + constexpr uint64_t immediateSliceCount = QueueSliceCount::defaultSliceCount; + + BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, chainedBatchBufferStartOffset, taskStartAddress, chainedBatchBuffer, + immediateRequiresCoherency, immediateLowPriority, immediateThrottle, immediateSliceCount, + streamToSubmit.getUsed(), &streamToSubmit, flushData.endPtr, this->getNumClients(), hasStallingCmds, + dispatchFlags.hasRelaxedOrderingDependencies}; + updateStreamTaskCount(streamToSubmit, taskCount + 1); + + auto submissionStatus = flushHandler(batchBuffer, this->getResidencyAllocations()); + if (submissionStatus != SubmissionStatus::SUCCESS) { + --this->latestSentTaskCount; + updateStreamTaskCount(streamToSubmit, taskCount); + + CompletionStamp completionStamp = {CompletionStamp::getTaskCountFromSubmissionStatusError(submissionStatus)}; + return completionStamp; + } else { + if (dispatchFlags.blockingAppend) { + this->latestFlushedTaskCount = this->taskCount + 1; + } + + ++taskCount; + CompletionStamp completionStamp = { + this->taskCount, + this->taskLevel, + flushStamp->peekStamp()}; + + return completionStamp; + } +} + } // namespace NEO diff --git a/shared/source/command_stream/csr_definitions.h b/shared/source/command_stream/csr_definitions.h index 9dbb234e8e..2b6c87f575 100644 --- a/shared/source/command_stream/csr_definitions.h +++ b/shared/source/command_stream/csr_definitions.h @@ -141,6 +141,7 @@ struct ImmediateDispatchFlags { void *sshCpuBase = nullptr; bool blockingAppend = false; bool hasRelaxedOrderingDependencies = false; + bool hasStallingCmds = false; }; } // namespace NEO diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index a9dd2ceec8..1e0a1f606c 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -3894,3 +3894,108 @@ HWTEST2_F(CommandStreamReceiverHwTest, startOffset = commandStream.getUsed(); EXPECT_EQ(0u, (startOffset % MemoryConstants::cacheLineSize)); } + +HWTEST2_F(CommandStreamReceiverHwTest, + givenImmediateFlushTaskWhenPreambleIsUsedOrNotThenCsrBufferIsUsedOrImmediateBufferIsUsed, + IsAtLeastXeHpCore) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + commandStreamReceiver.storeMakeResidentAllocations = true; + commandStreamReceiver.recordFlusheBatchBuffer = true; + + auto startOffset = commandStream.getUsed(); + auto immediateListCmdBufferAllocation = commandStream.getGraphicsAllocation(); + + *commandStream.getSpaceForCmd() = FamilyType::cmdInitGpgpuWalker; + + immediateFlushTaskFlags.hasStallingCmds = true; + auto completionStamp = commandStreamReceiver.flushImmediateTask(commandStream, startOffset, immediateFlushTaskFlags, *pDevice); + + auto csrCmdBufferAllocation = commandStreamReceiver.commandStream.getGraphicsAllocation(); + + TaskCountType currentTaskCountType = 1u; + + EXPECT_EQ(currentTaskCountType, completionStamp.taskCount); + EXPECT_EQ(currentTaskCountType, commandStreamReceiver.taskCount); + EXPECT_EQ(currentTaskCountType, commandStreamReceiver.latestSentTaskCount); + EXPECT_EQ(0u, commandStreamReceiver.latestFlushedTaskCount); + + EXPECT_TRUE(commandStreamReceiver.isMadeResident(csrCmdBufferAllocation, currentTaskCountType)); + EXPECT_TRUE(commandStreamReceiver.isMadeResident(immediateListCmdBufferAllocation, currentTaskCountType)); + + BatchBuffer &recordedBatchBuffer = commandStreamReceiver.latestFlushedBatchBuffer; + EXPECT_EQ(csrCmdBufferAllocation, recordedBatchBuffer.commandBufferAllocation); + EXPECT_EQ(0u, recordedBatchBuffer.startOffset); + EXPECT_EQ(true, recordedBatchBuffer.hasStallingCmds); + EXPECT_EQ(false, recordedBatchBuffer.hasRelaxedOrderingDependencies); + + startOffset = commandStream.getUsed(); + + *commandStream.getSpaceForCmd() = FamilyType::cmdInitGpgpuWalker; + + immediateFlushTaskFlags.hasRelaxedOrderingDependencies = true; + completionStamp = commandStreamReceiver.flushImmediateTask(commandStream, startOffset, immediateFlushTaskFlags, *pDevice); + + currentTaskCountType = 2u; + + EXPECT_EQ(currentTaskCountType, completionStamp.taskCount); + EXPECT_EQ(currentTaskCountType, commandStreamReceiver.taskCount); + EXPECT_EQ(currentTaskCountType, commandStreamReceiver.latestSentTaskCount); + EXPECT_EQ(0u, commandStreamReceiver.latestFlushedTaskCount); + + EXPECT_FALSE(commandStreamReceiver.isMadeResident(csrCmdBufferAllocation, currentTaskCountType)); + EXPECT_TRUE(commandStreamReceiver.isMadeResident(immediateListCmdBufferAllocation, currentTaskCountType)); + + recordedBatchBuffer = commandStreamReceiver.latestFlushedBatchBuffer; + EXPECT_EQ(immediateListCmdBufferAllocation, recordedBatchBuffer.commandBufferAllocation); + EXPECT_EQ(startOffset, recordedBatchBuffer.startOffset); + EXPECT_EQ(true, recordedBatchBuffer.hasStallingCmds); + EXPECT_EQ(true, recordedBatchBuffer.hasRelaxedOrderingDependencies); +} + +HWTEST2_F(CommandStreamReceiverHwTest, + givenImmediateFlushTaskWhenFlushOperationFailsThenExpectNoBatchBufferSentAndCorrectFailCompletionReturned, + IsAtLeastXeHpCore) { + using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER; + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + commandStreamReceiver.storeMakeResidentAllocations = true; + commandStreamReceiver.recordFlusheBatchBuffer = true; + + auto startOffset = commandStream.getUsed(); + auto immediateListCmdBufferAllocation = commandStream.getGraphicsAllocation(); + + *commandStream.getSpaceForCmd() = FamilyType::cmdInitGpgpuWalker; + + immediateFlushTaskFlags.blockingAppend = true; + commandStreamReceiver.flushReturnValue = NEO::SubmissionStatus::FAILED; + auto completionStamp = commandStreamReceiver.flushImmediateTask(commandStream, startOffset, immediateFlushTaskFlags, *pDevice); + + auto csrCmdBufferAllocation = commandStreamReceiver.commandStream.getGraphicsAllocation(); + + TaskCountType currentTaskCountType = 1u; + + EXPECT_EQ(NEO::CompletionStamp::failed, completionStamp.taskCount); + EXPECT_EQ(0u, commandStreamReceiver.taskCount); + EXPECT_EQ(0u, commandStreamReceiver.latestSentTaskCount); + EXPECT_EQ(0u, commandStreamReceiver.latestFlushedTaskCount); + + EXPECT_FALSE(commandStreamReceiver.isMadeResident(csrCmdBufferAllocation, currentTaskCountType)); + EXPECT_TRUE(commandStreamReceiver.isMadeResident(immediateListCmdBufferAllocation, currentTaskCountType)); + + BatchBuffer &recordedBatchBuffer = commandStreamReceiver.latestFlushedBatchBuffer; + EXPECT_EQ(nullptr, recordedBatchBuffer.commandBufferAllocation); + EXPECT_EQ(0u, recordedBatchBuffer.startOffset); + EXPECT_EQ(false, recordedBatchBuffer.hasStallingCmds); + EXPECT_EQ(false, recordedBatchBuffer.hasRelaxedOrderingDependencies); + + completionStamp = commandStreamReceiver.flushImmediateTask(commandStream, startOffset, immediateFlushTaskFlags, *pDevice); + + EXPECT_EQ(NEO::CompletionStamp::failed, completionStamp.taskCount); + EXPECT_EQ(0u, commandStreamReceiver.taskCount); + EXPECT_EQ(0u, commandStreamReceiver.latestSentTaskCount); + EXPECT_EQ(0u, commandStreamReceiver.latestFlushedTaskCount); + + EXPECT_FALSE(commandStreamReceiver.isMadeResident(immediateListCmdBufferAllocation, currentTaskCountType)); +}