diff --git a/runtime/command_stream/command_stream_receiver_hw.h b/runtime/command_stream/command_stream_receiver_hw.h index 83de05d50c..953012ca02 100644 --- a/runtime/command_stream/command_stream_receiver_hw.h +++ b/runtime/command_stream/command_stream_receiver_hw.h @@ -47,6 +47,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { size_t getRequiredCmdStreamSizeAligned(const DispatchFlags &dispatchFlags, Device &device); size_t getRequiredCmdSizeForPreamble(Device &device) const; size_t getCmdSizeForPreemption(const DispatchFlags &dispatchFlags) const; + size_t getCmdSizeForEpilogue(const DispatchFlags &dispatchFlags) const; size_t getCmdSizeForL3Config() const; size_t getCmdSizeForPipelineSelect() const; size_t getCmdSizeForComputeMode(); @@ -77,6 +78,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { void programL3(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t &newL3Config); void programPreamble(LinearStream &csr, Device &device, DispatchFlags &dispatchFlags, uint32_t &newL3Config); void programPipelineSelect(LinearStream &csr, DispatchFlags &dispatchFlags); + void programEpilogue(LinearStream &csr, void **batchBufferEndLocation, DispatchFlags &dispatchFlags); void programMediaSampler(LinearStream &csr, DispatchFlags &dispatchFlags); void programStateSip(LinearStream &cmdStream, Device &device); void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads); diff --git a/runtime/command_stream/command_stream_receiver_hw_base.inl b/runtime/command_stream/command_stream_receiver_hw_base.inl index ab132e58ca..fc3b5f1357 100644 --- a/runtime/command_stream/command_stream_receiver_hw_base.inl +++ b/runtime/command_stream/command_stream_receiver_hw_base.inl @@ -415,7 +415,11 @@ CompletionStamp CommandStreamReceiverHw::flushTask( this->makeResident(*commandStreamAllocation); this->alignToCacheLine(commandStreamCSR); submitCommandStreamFromCsr = true; + } else if 
(dispatchFlags.epilogueRequired) { + this->makeResident(*commandStreamCSR.getGraphicsAllocation()); } + this->programEpilogue(commandStreamCSR, &bbEndLocation, dispatchFlags); + } else if (submitCSR) { this->addBatchBufferEnd(commandStreamCSR, &bbEndLocation); this->emitNoop(commandStreamCSR, bbEndPaddingSize); @@ -589,6 +593,7 @@ size_t CommandStreamReceiverHw::getRequiredCmdStreamSize(const Dispat size += getCmdSizeForMediaSampler(dispatchFlags.mediaSamplerRequired); size += getCmdSizeForPipelineSelect(); size += getCmdSizeForPreemption(dispatchFlags); + size += getCmdSizeForEpilogue(dispatchFlags); if (device.getWaTable()->waSamplerCacheFlushBetweenRedescribedSurfaceReads) { if (this->samplerCacheFlushRequired != SamplerCacheFlushState::samplerCacheFlushNotRequired) { @@ -779,4 +784,24 @@ void CommandStreamReceiverHw::blitBuffer(const BlitProperties &blitPr } } +template +inline void CommandStreamReceiverHw::programEpilogue(LinearStream &csr, void **batchBufferEndLocation, DispatchFlags &dispatchFlags) { /* Emits the epilogue into csr; does nothing unless dispatchFlags.epilogueRequired is set. Calls addBatchBufferStart on the location held in *batchBufferEndLocation with the GPU address of csr's current write position, then calls addBatchBufferEnd on csr (forwarding batchBufferEndLocation) and aligns csr to a cache line. NOTE(review): presumably this replaces a previously emitted batch-buffer-end with a jump into csr - confirm against addBatchBufferStart. NOTE(review): *batchBufferEndLocation is used without a null check - confirm every caller sets it first. */ + if (dispatchFlags.epilogueRequired) { + auto currentOffset = ptrDiff(csr.getSpace(0u), csr.getCpuBase()); + auto gpuAddress = ptrOffset(csr.getGraphicsAllocation()->getGpuAddress(), currentOffset); + /* gpuAddress = GPU address of csr's allocation plus the byte distance between csr.getSpace(0u) and csr.getCpuBase(); assumes getSpace(0u) returns the current write pointer without reserving anything - TODO confirm. */ + addBatchBufferStart(reinterpret_cast(*batchBufferEndLocation), gpuAddress, false); + this->addBatchBufferEnd(csr, batchBufferEndLocation); + this->alignToCacheLine(csr); + } +} + /* Returns the number of bytes to reserve for the epilogue: sizeof(MI_BATCH_BUFFER_END) rounded up to MemoryConstants::cacheLineSize when dispatchFlags.epilogueRequired is set, otherwise 0. */ +template +inline size_t CommandStreamReceiverHw::getCmdSizeForEpilogue(const DispatchFlags &dispatchFlags) const { + if (dispatchFlags.epilogueRequired) { + return alignUp(sizeof(typename GfxFamily::MI_BATCH_BUFFER_END), MemoryConstants::cacheLineSize); + } + return 0u; +} + } // namespace NEO diff --git a/runtime/command_stream/csr_definitions.h b/runtime/command_stream/csr_definitions.h index c4a8f90a4b..daf1d41b50 100644 --- a/runtime/command_stream/csr_definitions.h +++ b/runtime/command_stream/csr_definitions.h @@ -49,6 +49,7 @@ struct DispatchFlags { bool 
outOfOrderExecutionAllowed = false; bool specialPipelineSelectMode = false; bool multiEngineQueue = false; + bool epilogueRequired = false; }; struct CsrSizeRequestFlags { diff --git a/unit_tests/command_stream/command_stream_receiver_flush_task_3_tests.cpp b/unit_tests/command_stream/command_stream_receiver_flush_task_3_tests.cpp index b9b0f21f7e..4817e844c9 100644 --- a/unit_tests/command_stream/command_stream_receiver_flush_task_3_tests.cpp +++ b/unit_tests/command_stream/command_stream_receiver_flush_task_3_tests.cpp @@ -16,6 +16,7 @@ #include "unit_tests/mocks/mock_command_queue.h" #include "unit_tests/mocks/mock_context.h" #include "unit_tests/mocks/mock_csr.h" +#include "unit_tests/mocks/mock_device.h" #include "unit_tests/mocks/mock_event.h" #include "unit_tests/mocks/mock_kernel.h" #include "unit_tests/mocks/mock_program.h" @@ -1413,6 +1414,57 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, givenDispatchFlagsWithThrottleSetT EXPECT_EQ(cmdBuffer->batchBuffer.throttle, QueueThrottle::HIGH); } +HWTEST_F(CommandStreamReceiverFlushTaskTests, givenEpilogueRequiredFlagWhenTaskIsSubmittedDirectlyThenItPointsBackToCsr) { /* Checks the epilogue path of flushTask: getCmdSizeForEpilogue is 0 without the flag and one cache line with it; after the flush the task stream holds two cache lines and the CSR stream one, the batch-buffer-start found in the task stream targets the GPU address of the CSR stream's allocation, the command found in the CSR stream sits at its CPU base, and the CSR stream's allocation was made resident. */ + configureCSRtoNonDirtyState(); + auto &commandStreamReceiver = this->pDevice->getUltCommandStreamReceiver(); + + DispatchFlags dispatchFlags; + + EXPECT_EQ(0u, commandStreamReceiver.getCmdSizeForEpilogue(dispatchFlags)); + + dispatchFlags.epilogueRequired = true; + dispatchFlags.preemptionMode = PreemptionHelper::getDefaultPreemptionMode(pDevice->getHardwareInfo()); + + EXPECT_EQ(MemoryConstants::cacheLineSize, commandStreamReceiver.getCmdSizeForEpilogue(dispatchFlags)); + + auto data = commandStream.getSpace(MemoryConstants::cacheLineSize); + memset(data, 0, MemoryConstants::cacheLineSize); + commandStreamReceiver.storeMakeResidentAllocations = true; + commandStreamReceiver.flushTask(commandStream, + 0, + dsh, + ioh, + ssh, + taskLevel, + dispatchFlags, + *pDevice); + auto &commandStreamReceiverStream = commandStreamReceiver.getCS(0u); + + 
EXPECT_EQ(MemoryConstants::cacheLineSize * 2, commandStream.getUsed()); + EXPECT_EQ(MemoryConstants::cacheLineSize, commandStreamReceiverStream.getUsed()); + + parseCommands(commandStream, 0); + + auto itBBend = find(cmdList.begin(), cmdList.end()); + EXPECT_EQ(itBBend, cmdList.end()); + + auto itBatchBufferStart = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(itBatchBufferStart, cmdList.end()); /* fatal on purpose: the iterator is dereferenced on the next line */ + + auto batchBufferStart = genCmdCast(*itBatchBufferStart); + ASSERT_NE(nullptr, batchBufferStart); /* stop here instead of calling through a null command pointer */ + EXPECT_EQ(batchBufferStart->getBatchBufferStartAddressGraphicsaddress472(), commandStreamReceiverStream.getGraphicsAllocation()->getGpuAddress()); + + parseCommands(commandStreamReceiverStream, 0); + + itBBend = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(itBBend, cmdList.end()); /* dereferencing cmdList.end() would be undefined behaviour, so fail the test cleanly first */ + void *bbEndAddress = *itBBend; + + EXPECT_EQ(commandStreamReceiverStream.getCpuBase(), bbEndAddress); + + EXPECT_TRUE(commandStreamReceiver.isMadeResident(commandStreamReceiverStream.getGraphicsAllocation())); +} template class UltCommandStreamReceiverForDispatchFlags : public UltCommandStreamReceiver {