From 4c7bc2ca983f49462aad6ef3f9ce00bc5ea64c04 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Wed, 5 Apr 2023 13:21:45 +0000 Subject: [PATCH] [feature, perf] add algorithm to chain command buffers in container This feature is part of a performance improvement to dispatch and start command buffers as primary batch buffers. When an exhausted command buffer is closed, reserve the exact space for a chained batch buffer start and bind it to the next command buffer. When closing a command buffer, save the ending pointer and reserve aligned space. Related-To: NEO-7807 Signed-off-by: Zbigniew Zdanowicz --- .../source/command_container/cmdcontainer.cpp | 38 ++++++-- .../source/command_container/cmdcontainer.h | 23 ++++- .../command_container/command_encoder.h | 1 + .../command_container/command_encoder.inl | 10 ++- shared/source/helpers/gfx_core_helper.h | 4 + .../source/helpers/gfx_core_helper_base.inl | 17 +++- .../command_container_tests.cpp | 87 +++++++++++++++++++ 7 files changed, 170 insertions(+), 10 deletions(-) diff --git a/shared/source/command_container/cmdcontainer.cpp b/shared/source/command_container/cmdcontainer.cpp index e52b48bd32..7287ccb583 100644 --- a/shared/source/command_container/cmdcontainer.cpp +++ b/shared/source/command_container/cmdcontainer.cpp @@ -83,8 +83,15 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat cmdBufferAllocations.push_back(cmdBufferAllocation); auto &gfxCoreHelper = device->getGfxCoreHelper(); + if (this->usingPrimaryBuffer) { + this->selectedBbCmdSize = gfxCoreHelper.getBatchBufferStartSize(); + } else { + this->selectedBbCmdSize = gfxCoreHelper.getBatchBufferEndSize(); + this->bbEndReference = gfxCoreHelper.getBatchBufferEndReference(); + } + commandStream = std::make_unique(cmdBufferAllocation->getUnderlyingBuffer(), - usableSize, this, gfxCoreHelper.getBatchBufferEndSize()); + usableSize, this, this->selectedBbCmdSize); commandStream->replaceGraphicsAllocation(cmdBufferAllocation); @@ 
-96,7 +103,7 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat return ErrorCode::OUT_OF_DEVICE_MEMORY; } secondaryCommandStreamForImmediateCmdList = std::make_unique(cmdBufferAllocationHost->getUnderlyingBuffer(), - usableSize, this, gfxCoreHelper.getBatchBufferEndSize()); + usableSize, this, this->selectedBbCmdSize); secondaryCommandStreamForImmediateCmdList->replaceGraphicsAllocation(cmdBufferAllocationHost); cmdBufferAllocations.push_back(cmdBufferAllocationHost); addToResidencyContainer(cmdBufferAllocationHost); @@ -192,6 +199,7 @@ void CommandContainer::reset() { nextIddInBlock = this->getNumIddPerBlock(); lastPipelineSelectModeRequired = false; lastSentUseGlobalAtomics = false; + endCmdPtr = nullptr; } size_t CommandContainer::getAlignedCmdBufferSize() const { @@ -316,14 +324,34 @@ void CommandContainer::allocateNextCommandBuffer() { } void CommandContainer::closeAndAllocateNextCommandBuffer() { - auto &gfxCoreHelper = device->getGfxCoreHelper(); - auto bbEndSize = gfxCoreHelper.getBatchBufferEndSize(); auto ptr = commandStream->getSpace(0u); - memcpy_s(ptr, bbEndSize, gfxCoreHelper.getBatchBufferEndReference(), bbEndSize); + size_t usedSize = commandStream->getUsed(); allocateNextCommandBuffer(); + if (this->usingPrimaryBuffer) { + auto nextChainedBuffer = commandStream->getGraphicsAllocation(); + auto &gfxCoreHelper = device->getGfxCoreHelper(); + gfxCoreHelper.encodeBatchBufferStart(ptr, nextChainedBuffer->getGpuAddress(), false, false, false); + alignPrimaryEnding(ptr, usedSize); + } else { + memcpy_s(ptr, this->selectedBbCmdSize, this->bbEndReference, this->selectedBbCmdSize); + } currentLinearStreamStartOffset = 0u; } +void CommandContainer::alignPrimaryEnding(void *endPtr, size_t exactUsedSize) { + exactUsedSize += this->selectedBbCmdSize; + this->alignedPrimarySize = alignUp(exactUsedSize, minCmdBufferPtrAlign); + if (this->alignedPrimarySize > exactUsedSize) { + endPtr = ptrOffset(endPtr, this->selectedBbCmdSize); + 
memset(endPtr, 0, this->alignedPrimarySize - exactUsedSize); + } +} + +void CommandContainer::endAlignedPrimaryBuffer() { + this->endCmdPtr = commandStream->getSpace(0u); + alignPrimaryEnding(this->endCmdPtr, commandStream->getUsed()); +} + void CommandContainer::prepareBindfulSsh() { if (ApiSpecificConfig::getBindlessConfiguration()) { if (allocationIndirectHeaps[IndirectHeap::Type::SURFACE_STATE] == nullptr) { diff --git a/shared/source/command_container/cmdcontainer.h b/shared/source/command_container/cmdcontainer.h index 2d5b1626b5..f62322219a 100644 --- a/shared/source/command_container/cmdcontainer.h +++ b/shared/source/command_container/cmdcontainer.h @@ -72,6 +72,7 @@ class CommandContainer : public NonCopyableOrMovableClass { static constexpr size_t startingResidencyContainerSize = 128; static constexpr size_t defaultCmdBufferAllocationAlignment = MemoryConstants::pageSize64k; static constexpr size_t defaultHeapAllocationAlignment = MemoryConstants::pageSize64k; + static constexpr size_t minCmdBufferPtrAlign = 8; CommandContainer(); @@ -188,6 +189,20 @@ class CommandContainer : public NonCopyableOrMovableClass { HeapContainer &getSshAllocations() { return sshAllocations; } uint64_t &currentLinearStreamStartOffsetRef() { return currentLinearStreamStartOffset; } + void setUsingPrimaryBuffer(bool value) { + usingPrimaryBuffer = value; + } + bool isUsingPrimaryBuffer() const { + return usingPrimaryBuffer; + } + void *getEndCmdPtr() const { + return endCmdPtr; + } + size_t getEndAlignedSize() const { + return this->alignedPrimarySize; + } + void endAlignedPrimaryBuffer(); + protected: size_t getAlignedCmdBufferSize() const; size_t getMaxUsableSpace() const { @@ -198,9 +213,9 @@ class CommandContainer : public NonCopyableOrMovableClass { IndirectHeap *initIndirectHeapReservation(ReservedIndirectHeap *indirectHeapReservation, size_t size, size_t alignment, HeapType heapType); inline bool skipHeapAllocationCreation(HeapType heapType); size_t getHeapSize(HeapType 
heapType); + void alignPrimaryEnding(void *endPtr, size_t exactUsedSize); GraphicsAllocation *allocationIndirectHeaps[HeapType::NUM_TYPES] = {}; - std::unique_ptr indirectHeaps[HeapType::NUM_TYPES]; CmdBufferContainer cmdBufferAllocations; ResidencyContainer residencyContainer; @@ -210,6 +225,7 @@ class CommandContainer : public NonCopyableOrMovableClass { HeapReserveData dynamicStateHeapReserveData; HeapReserveData surfaceStateHeapReserveData; + std::unique_ptr indirectHeaps[HeapType::NUM_TYPES]; std::unique_ptr heapHelper; std::unique_ptr commandStream; std::unique_ptr secondaryCommandStreamForImmediateCmdList; @@ -228,6 +244,10 @@ class CommandContainer : public NonCopyableOrMovableClass { IndirectHeap *sharedDshCsrHeap = nullptr; size_t defaultSshSize = 0; L1CachePolicy *l1CachePolicyData = nullptr; + size_t selectedBbCmdSize = 0; + const void *bbEndReference = nullptr; + void *endCmdPtr = nullptr; + size_t alignedPrimarySize = 0; uint32_t dirtyHeaps = std::numeric_limits::max(); uint32_t numIddsPerBlock = 64; @@ -245,6 +265,7 @@ class CommandContainer : public NonCopyableOrMovableClass { bool lastSentUseGlobalAtomics = false; bool systolicModeSupport = false; bool doubleSbaWa = false; + bool usingPrimaryBuffer = false; }; } // namespace NEO diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index faedf4b02e..65c651722c 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -432,6 +432,7 @@ struct EncodeBatchBufferStartOrEnd { return sizeof(MI_BATCH_BUFFER_END); } + static void programBatchBufferStart(MI_BATCH_BUFFER_START *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate); static void programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool secondLevel, bool indirect, bool predicate); static void programBatchBufferEnd(CommandContainer &container); static void 
programBatchBufferEnd(LinearStream &commandStream); diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 22b99da407..87da59c820 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -933,7 +933,7 @@ void EncodeBatchBufferStartOrEnd::programConditionalBatchBufferStartBase } template -void EncodeBatchBufferStartOrEnd::programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool secondLevel, bool indirect, bool predicate) { +void EncodeBatchBufferStartOrEnd::programBatchBufferStart(MI_BATCH_BUFFER_START *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) { MI_BATCH_BUFFER_START cmd = Family::cmdInitBatchBufferStart; if (secondLevel) { cmd.setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH); @@ -943,8 +943,12 @@ void EncodeBatchBufferStartOrEnd::programBatchBufferStart(LinearStream * appendBatchBufferStart(cmd, indirect, predicate); - auto buffer = commandStream->getSpaceForCmd(); - *buffer = cmd; + *cmdBuffer = cmd; +} + +template +void EncodeBatchBufferStartOrEnd::programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool secondLevel, bool indirect, bool predicate) { + programBatchBufferStart(commandStream->getSpaceForCmd(), address, secondLevel, indirect, predicate); } template diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index 9281562ec3..c54f4b3bb3 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -162,6 +162,8 @@ class GfxCoreHelper { virtual bool platformSupportsImplicitScaling(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) const = 0; virtual size_t getBatchBufferEndSize() const = 0; virtual const void *getBatchBufferEndReference() const = 0; + virtual size_t getBatchBufferStartSize() const = 0; + virtual void 
encodeBatchBufferStart(void *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) const = 0; virtual bool isPlatformFlushTaskEnabled(const NEO::ProductHelper &productHelper) const = 0; virtual uint32_t getMinimalScratchSpaceSize() const = 0; virtual bool copyThroughLockedPtrEnabled(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const = 0; @@ -378,6 +380,8 @@ class GfxCoreHelperHw : public GfxCoreHelper { bool platformSupportsImplicitScaling(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) const override; size_t getBatchBufferEndSize() const override; const void *getBatchBufferEndReference() const override; + size_t getBatchBufferStartSize() const override; + void encodeBatchBufferStart(void *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) const override; bool isPlatformFlushTaskEnabled(const NEO::ProductHelper &productHelper) const override; uint32_t getMinimalScratchSpaceSize() const override; bool copyThroughLockedPtrEnabled(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const override; diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index 4229458868..55a2fa4e3c 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -6,6 +6,7 @@ */ #include "shared/source/aub_mem_dump/aub_mem_dump.h" +#include "shared/source/command_container/command_encoder.h" #include "shared/source/command_container/encode_surface_state.h" #include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/gmm_helper/gmm.h" @@ -627,12 +628,26 @@ bool GfxCoreHelperHw::forceNonGpuCoherencyWA(bool requiresCoherency) } template size_t GfxCoreHelperHw::getBatchBufferEndSize() const { - return sizeof(typename GfxFamily::MI_BATCH_BUFFER_END); + return EncodeBatchBufferStartOrEnd::getBatchBufferEndSize(); } template const void 
*GfxCoreHelperHw::getBatchBufferEndReference() const { return reinterpret_cast(&GfxFamily::cmdInitBatchBufferEnd); } + +template +size_t GfxCoreHelperHw::getBatchBufferStartSize() const { + return EncodeBatchBufferStartOrEnd::getBatchBufferStartSize(); +} + +template +void GfxCoreHelperHw::encodeBatchBufferStart(void *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) const { + using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START; + + MI_BATCH_BUFFER_START *bbBuffer = reinterpret_cast(cmdBuffer); + EncodeBatchBufferStartOrEnd::programBatchBufferStart(bbBuffer, address, secondLevel, indirect, predicate); +} + template bool GfxCoreHelperHw::isPlatformFlushTaskEnabled(const ProductHelper &productHelper) const { return productHelper.isFlushTaskAllowed(); diff --git a/shared/test/unit_test/command_container/command_container_tests.cpp b/shared/test/unit_test/command_container/command_container_tests.cpp index 8fbf7898a1..dc958a9f25 100644 --- a/shared/test/unit_test/command_container/command_container_tests.cpp +++ b/shared/test/unit_test/command_container/command_container_tests.cpp @@ -13,6 +13,7 @@ #include "shared/source/indirect_heap/indirect_heap.h" #include "shared/source/memory_manager/allocations_list.h" #include "shared/source/memory_manager/internal_allocation_storage.h" +#include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/fixtures/device_fixture.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/libult/ult_command_stream_receiver.h" @@ -1654,3 +1655,89 @@ TEST_F(CommandContainerTest, givenCmdContainerSetToSbaTrackingWhenContainerIsIni cmdContainer->initialize(pDevice, nullptr, sshDefaultSize, true, false); EXPECT_EQ(2 * HeapSize::defaultHeapSize, cmdContainer->defaultSshSize); } + +HWTEST_F(CommandContainerTest, + 
givenCmdContainerUsingPrimaryBatchBufferWhenCloseAndAllocateNextCommandBufferCalledThenNewCmdBufferAllocationCreatedAndChainedWithOldCmdBuffer) { + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + CommandContainer cmdContainer; + cmdContainer.setUsingPrimaryBuffer(true); + EXPECT_TRUE(cmdContainer.isUsingPrimaryBuffer()); + cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false); + + ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u); + auto firstCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[0]; + + cmdContainer.closeAndAllocateNextCommandBuffer(); + + ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 2u); + auto chainedCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[1]; + auto bbStartGpuAddress = chainedCmdBufferAllocation->getGpuAddress(); + + auto bbStart = genCmdCast(firstCmdBufferAllocation->getUnderlyingBuffer()); + ASSERT_NE(nullptr, bbStart); + EXPECT_EQ(bbStartGpuAddress, bbStart->getBatchBufferStartAddress()); + EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer()); + + size_t expectedEndSize = alignUp(sizeof(MI_BATCH_BUFFER_START), CommandContainer::minCmdBufferPtrAlign); + cmdContainer.endAlignedPrimaryBuffer(); + + void *endPtr = cmdContainer.getEndCmdPtr(); + size_t alignedSize = cmdContainer.getEndAlignedSize(); + + EXPECT_EQ(chainedCmdBufferAllocation->getUnderlyingBuffer(), endPtr); + EXPECT_EQ(expectedEndSize, alignedSize); +} + +HWTEST_F(CommandContainerTest, + givenCmdContainerUsingPrimaryBatchBufferWhenCloseAndAllocateMultipleCommandBuffersThenNewCmdBufferAllocationsCreatedAndChainedWithOldCmdBuffers) { + using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + CommandContainer cmdContainer; + cmdContainer.setUsingPrimaryBuffer(true); + EXPECT_TRUE(cmdContainer.isUsingPrimaryBuffer()); + cmdContainer.initialize(pDevice, nullptr, 
HeapSize::defaultHeapSize, true, false); + + ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u); + auto firstCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[0]; + + size_t consumedSize = sizeof(int); + cmdContainer.getCommandStream()->getSpace(consumedSize); + + cmdContainer.closeAndAllocateNextCommandBuffer(); + + ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 2u); + auto chainedCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[1]; + auto bbStartGpuAddress = chainedCmdBufferAllocation->getGpuAddress(); + + auto bbStart = genCmdCast(ptrOffset(firstCmdBufferAllocation->getUnderlyingBuffer(), consumedSize)); + ASSERT_NE(nullptr, bbStart); + EXPECT_EQ(bbStartGpuAddress, bbStart->getBatchBufferStartAddress()); + EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer()); + + consumedSize *= 2; + cmdContainer.getCommandStream()->getSpace(consumedSize); + cmdContainer.closeAndAllocateNextCommandBuffer(); + + ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 3u); + auto closingCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[2]; + + bbStartGpuAddress = closingCmdBufferAllocation->getGpuAddress(); + + bbStart = genCmdCast(ptrOffset(chainedCmdBufferAllocation->getUnderlyingBuffer(), consumedSize)); + ASSERT_NE(nullptr, bbStart); + EXPECT_EQ(bbStartGpuAddress, bbStart->getBatchBufferStartAddress()); + EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer()); + + consumedSize = alignUp(sizeof(MI_BATCH_BUFFER_START), CommandContainer::minCmdBufferPtrAlign) - sizeof(MI_BATCH_BUFFER_START); + cmdContainer.getCommandStream()->getSpace(consumedSize); + + size_t expectedEndSize = alignUp((sizeof(MI_BATCH_BUFFER_START) + consumedSize), CommandContainer::minCmdBufferPtrAlign); + cmdContainer.endAlignedPrimaryBuffer(); + + void *endPtr = 
cmdContainer.getEndCmdPtr(); + size_t alignedSize = cmdContainer.getEndAlignedSize(); + + EXPECT_EQ(ptrOffset(closingCmdBufferAllocation->getUnderlyingBuffer(), consumedSize), endPtr); + EXPECT_EQ(expectedEndSize, alignedSize); +}