From a969636b6a293877b797c1979ce11b4d664854fd Mon Sep 17 00:00:00 2001
From: "Dunajski, Bartosz"
Date: Thu, 24 Nov 2022 15:00:51 +0000
Subject: [PATCH] RelaxedOrdering: Optimize GPU Queue stall by adding early
 return

Related-To: NEO-7458

Signed-off-by: Dunajski, Bartosz
---
 shared/source/command_stream/linear_stream.h  |  7 +++++
 .../direct_submission/direct_submission_hw.h  |  2 --
 .../direct_submission_hw.inl                  | 30 +++++++++----------
 ...efetch_mitigation_xe_hp_core_and_later.inl |  5 ++--
 .../common/mocks/mock_direct_submission_hw.h  |  1 -
 .../direct_submission_tests_2.cpp             | 18 +++++++++--
 6 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/shared/source/command_stream/linear_stream.h b/shared/source/command_stream/linear_stream.h
index 27a9466cbb..36efc0d973 100644
--- a/shared/source/command_stream/linear_stream.h
+++ b/shared/source/command_stream/linear_stream.h
@@ -37,6 +37,8 @@ class LinearStream {
     uint64_t getGpuBase() const;
     void setGpuBase(uint64_t gpuAddress);
 
+    uint64_t getCurrentGpuAddressPosition() const;
+
     void overrideMaxSize(size_t newMaxSize);
     void replaceBuffer(void *buffer, size_t bufferSize);
     GraphicsAllocation *getGraphicsAllocation() const;
@@ -112,4 +114,9 @@ inline GraphicsAllocation *LinearStream::getGraphicsAllocation() const {
 inline void LinearStream::replaceGraphicsAllocation(GraphicsAllocation *gfxAllocation) {
     graphicsAllocation = gfxAllocation;
 }
+
+inline uint64_t LinearStream::getCurrentGpuAddressPosition() const {
+    return (getGpuBase() + getUsed());
+}
+
 } // namespace NEO
diff --git a/shared/source/direct_submission/direct_submission_hw.h b/shared/source/direct_submission/direct_submission_hw.h
index 11f649830a..b5280224dc 100644
--- a/shared/source/direct_submission/direct_submission_hw.h
+++ b/shared/source/direct_submission/direct_submission_hw.h
@@ -148,8 +148,6 @@ class DirectSubmissionHw {
 
     size_t getSizeEnd();
 
-    uint64_t getCommandBufferPositionGpuAddress(void *position);
-
     void dispatchPartitionRegisterConfiguration();
     size_t getSizePartitionRegisterConfigurationSection();
 
diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl
index 603fbca6e7..4f209157f9 100644
--- a/shared/source/direct_submission/direct_submission_hw.inl
+++ b/shared/source/direct_submission/direct_submission_hw.inl
@@ -82,7 +82,7 @@ DirectSubmissionHw<GfxFamily, Dispatcher>::DirectSubmissionHw(const DirectSubmis
 
 template <typename GfxFamily, typename Dispatcher>
 void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingSchedulerSection(uint32_t value) {
-    uint64_t schedulerStartAddress = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
+    uint64_t schedulerStartAddress = ringCommandStream.getCurrentGpuAddressPosition();
     uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress();
 
     // 1. Init section
@@ -409,7 +409,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
     if (ringCommandStream.getAvailableSpace() < requiredSize) {
         switchRingBuffers();
     }
-    uint64_t gpuStartVa = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
+    uint64_t gpuStartVa = ringCommandStream.getCurrentGpuAddressPosition();
 
     if (!this->partitionConfigSet) {
         dispatchPartitionRegisterConfiguration();
@@ -558,14 +558,6 @@ inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd() {
     return size;
 }
 
-template <typename GfxFamily, typename Dispatcher>
-inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::getCommandBufferPositionGpuAddress(void *position) {
-    void *currentBase = ringCommandStream.getCpuBase();
-
-    size_t offset = ptrDiff(position, currentBase);
-    return ringCommandStream.getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(offset);
-}
-
 template <typename GfxFamily, typename Dispatcher>
 inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch() {
     size_t size = getSizeSemaphoreSection(false);
@@ -624,8 +616,8 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
     }
 
     dispatchStartSection(commandStreamAddress);
-    void *returnPosition = ringCommandStream.getSpace(0);
-    uint64_t returnGpuPointer = getCommandBufferPositionGpuAddress(returnPosition);
+
+    uint64_t returnGpuPointer = ringCommandStream.getCurrentGpuAddressPosition();
 
     if (this->relaxedOrderingEnabled) {
         dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer);
@@ -659,13 +651,21 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
 
 template <typename GfxFamily, typename Dispatcher>
 void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingQueueStall() {
+    LinearStream bbStartStream(ringCommandStream.getSpace(EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart()),
+                               EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart());
+
     LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R5, 1, true);
     dispatchSemaphoreSection(currentQueueWorkCount, false);
+
+    // patch conditional bb_start with current GPU address
+    EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(bbStartStream, ringCommandStream.getCurrentGpuAddressPosition(),
+                                                                                      CS_GPR_R1, 0, CompareOperation::Equal, false);
 }
 
 template <typename GfxFamily, typename Dispatcher>
 size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatchRelaxedOrderingQueueStall() {
-    return getSizeSemaphoreSection(false) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM);
+    return getSizeSemaphoreSection(false) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) +
+           EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart();
 }
 
 template <typename GfxFamily, typename Dispatcher>
@@ -762,8 +762,6 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
         requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>() + RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>();
     }
 
-    getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
-
     if (ringCommandStream.getAvailableSpace() < requiredMinimalSize) {
         switchRingBuffers();
     }
@@ -828,7 +826,7 @@ template <typename GfxFamily, typename Dispatcher>
 inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::switchRingBuffers() {
     GraphicsAllocation *nextRingBuffer = switchRingBuffersAllocations();
     void *flushPtr = ringCommandStream.getSpace(0);
-    uint64_t currentBufferGpuVa = getCommandBufferPositionGpuAddress(flushPtr);
+    uint64_t currentBufferGpuVa = ringCommandStream.getCurrentGpuAddressPosition();
 
     if (ringStart) {
         dispatchSwitchRingBufferSection(nextRingBuffer->getGpuAddress());
diff --git a/shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl b/shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl
index 60bf11a8ec..9b2125d9ae 100644
--- a/shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl
+++ b/shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -11,8 +11,7 @@ namespace NEO {
 
 template <typename GfxFamily, typename Dispatcher>
 inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchPrefetchMitigation() {
-    auto addressToJump = ptrOffset(ringCommandStream.getSpace(0u), getSizeStartSection());
-    dispatchStartSection(getCommandBufferPositionGpuAddress(addressToJump));
+    dispatchStartSection(ringCommandStream.getCurrentGpuAddressPosition() + getSizeStartSection());
 }
 
 template <typename GfxFamily, typename Dispatcher>
diff --git a/shared/test/common/mocks/mock_direct_submission_hw.h b/shared/test/common/mocks/mock_direct_submission_hw.h
index 31712758aa..08e15b9a85 100644
--- a/shared/test/common/mocks/mock_direct_submission_hw.h
+++ b/shared/test/common/mocks/mock_direct_submission_hw.h
@@ -36,7 +36,6 @@ struct MockDirectSubmissionHw : public DirectSubmissionHw<GfxFamily, Dispatcher>
     using BaseClass::dispatchStartSection;
     using BaseClass::dispatchSwitchRingBufferSection;
     using BaseClass::dispatchWorkloadSection;
-    using BaseClass::getCommandBufferPositionGpuAddress;
     using BaseClass::getDiagnosticModeSection;
     using BaseClass::getSizeDisablePrefetcher;
     using BaseClass::getSizeDispatch;
diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp
index 6643dd95e7..e0dc8abfad 100644
--- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp
+++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp
@@ -1761,8 +1761,15 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBbWithStallingCmdsWhenDispa
     batchBuffer.hasStallingCmds = true;
     directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp);
 
+    auto startAddress = ptrOffset(directSubmission.ringCommandStream.getCpuBase(), offset);
+    auto jumpOffset = directSubmission.getSizeSemaphoreSection(false) + sizeof(typename FamilyType::MI_LOAD_REGISTER_IMM) +
+                      EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataRegBatchBufferStart();
+    uint64_t expectedJumpAddress = directSubmission.ringCommandStream.getGpuBase() + offset + jumpOffset;
+
+    EXPECT_TRUE(verifyConditionalDataRegBbStart<FamilyType>(startAddress, expectedJumpAddress, CS_GPR_R1, 0, CompareOperation::Equal, false));
+
     HardwareParse hwParse;
-    hwParse.parseCommands<FamilyType>(directSubmission.ringCommandStream, offset);
+    hwParse.parseCommands<FamilyType>(directSubmission.ringCommandStream, offset + EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataRegBatchBufferStart());
     hwParse.findHardwareCommands<FamilyType>();
 
     bool success = false;
@@ -1856,8 +1863,15 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenStoppingRingThenProgramSched
 
     directSubmission.stopRingBuffer();
 
+    auto startAddress = ptrOffset(directSubmission.ringCommandStream.getCpuBase(), offset);
+    auto jumpOffset = directSubmission.getSizeSemaphoreSection(false) + sizeof(typename FamilyType::MI_LOAD_REGISTER_IMM) +
+                      EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataRegBatchBufferStart();
+    uint64_t expectedJumpAddress = directSubmission.ringCommandStream.getGpuBase() + offset + jumpOffset;
+
+    EXPECT_TRUE(verifyConditionalDataRegBbStart<FamilyType>(startAddress, expectedJumpAddress, CS_GPR_R1, 0, CompareOperation::Equal, false));
+
    HardwareParse hwParse;
-    hwParse.parseCommands<FamilyType>(directSubmission.ringCommandStream, offset);
+    hwParse.parseCommands<FamilyType>(directSubmission.ringCommandStream, offset + EncodeBatchBufferStartOrEnd<FamilyType>::getCmdSizeConditionalDataRegBatchBufferStart());
     hwParse.findHardwareCommands<FamilyType>();
 
     bool success = false;
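
Reviewer note (not part of the patch): the queue-stall change above reserves space for a
conditional BB_START up front, emits the LRI and semaphore wait, and only afterwards
patches the reserved slot with the ring stream's current GPU address, so that a satisfied
condition on CS_GPR_R1 jumps straight past the stall (the "early return"). Below is a
minimal, self-contained sketch of that reserve-then-patch pattern under simplified
assumptions; Stream, conditionalBbStartSize, and dispatchQueueStall are illustrative
placeholders, not NEO APIs, and the memcpy stands in for the real command encoding.

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    // Toy linear stream: a CPU buffer mirrored at a fixed GPU virtual address.
    struct Stream {
        std::vector<uint8_t> cpu = std::vector<uint8_t>(4096);
        uint64_t gpuBase = 0x1000;
        size_t used = 0;

        void *getSpace(size_t size) {        // reserve bytes, return CPU pointer
            void *ptr = cpu.data() + used;
            used += size;
            return ptr;
        }
        uint64_t currentGpuAddress() const { // GPU VA of the next free byte
            return gpuBase + used;
        }
    };

    constexpr size_t conditionalBbStartSize = 16; // placeholder command size

    void dispatchQueueStall(Stream &ring) {
        // 1. Reserve the conditional jump slot before emitting the stall.
        auto *bbStartSlot = static_cast<uint8_t *>(ring.getSpace(conditionalBbStartSize));

        // 2. Emit the stalling commands (LRI + semaphore wait in the real code).
        ring.getSpace(32); // stand-in for the LRI + semaphore section

        // 3. Patch the reserved slot with the address *after* the stall, so the
        //    conditional jump can skip the wait entirely when it is taken.
        uint64_t jumpTarget = ring.currentGpuAddress();
        std::memcpy(bbStartSlot, &jumpTarget, sizeof(jumpTarget)); // stand-in encode
    }

    int main() {
        Stream ring;
        dispatchQueueStall(ring);
        uint64_t patched = 0;
        std::memcpy(&patched, ring.cpu.data(), sizeof(patched));
        std::cout << std::hex << patched << "\n"; // 0x1000 + 16 + 32 = 0x1030
        return 0;
    }

The same ordering constraint explains the new unit-test expectations: the jump target is
the semaphore-section size plus the LRI and conditional BB_START sizes past the dispatch
offset, which is exactly what getSizeDispatchRelaxedOrderingQueueStall() now accounts for.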