[feature, perf] add algorithm to chain command buffers in container

This feature is part of a performance improvement: dispatching and starting
command buffers as primary batch buffers.
When an exhausted command buffer is closed, reserve the exact space for a chained
batch buffer start and bind it to the next command buffer.
When closing a command buffer, save the ending pointer and
reserve aligned space.

Related-To: NEO-7807

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2023-04-05 13:21:45 +00:00 committed by Compute-Runtime-Automation
parent 88fe17e50a
commit 4c7bc2ca98
7 changed files with 170 additions and 10 deletions

View File

@ -83,8 +83,15 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat
cmdBufferAllocations.push_back(cmdBufferAllocation);
auto &gfxCoreHelper = device->getGfxCoreHelper();
if (this->usingPrimaryBuffer) {
this->selectedBbCmdSize = gfxCoreHelper.getBatchBufferStartSize();
} else {
this->selectedBbCmdSize = gfxCoreHelper.getBatchBufferEndSize();
this->bbEndReference = gfxCoreHelper.getBatchBufferEndReference();
}
commandStream = std::make_unique<LinearStream>(cmdBufferAllocation->getUnderlyingBuffer(),
usableSize, this, gfxCoreHelper.getBatchBufferEndSize());
usableSize, this, this->selectedBbCmdSize);
commandStream->replaceGraphicsAllocation(cmdBufferAllocation);
@ -96,7 +103,7 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat
return ErrorCode::OUT_OF_DEVICE_MEMORY;
}
secondaryCommandStreamForImmediateCmdList = std::make_unique<LinearStream>(cmdBufferAllocationHost->getUnderlyingBuffer(),
usableSize, this, gfxCoreHelper.getBatchBufferEndSize());
usableSize, this, this->selectedBbCmdSize);
secondaryCommandStreamForImmediateCmdList->replaceGraphicsAllocation(cmdBufferAllocationHost);
cmdBufferAllocations.push_back(cmdBufferAllocationHost);
addToResidencyContainer(cmdBufferAllocationHost);
@ -192,6 +199,7 @@ void CommandContainer::reset() {
nextIddInBlock = this->getNumIddPerBlock();
lastPipelineSelectModeRequired = false;
lastSentUseGlobalAtomics = false;
endCmdPtr = nullptr;
}
size_t CommandContainer::getAlignedCmdBufferSize() const {
@ -316,14 +324,34 @@ void CommandContainer::allocateNextCommandBuffer() {
}
void CommandContainer::closeAndAllocateNextCommandBuffer() {
auto &gfxCoreHelper = device->getGfxCoreHelper();
auto bbEndSize = gfxCoreHelper.getBatchBufferEndSize();
auto ptr = commandStream->getSpace(0u);
memcpy_s(ptr, bbEndSize, gfxCoreHelper.getBatchBufferEndReference(), bbEndSize);
size_t usedSize = commandStream->getUsed();
allocateNextCommandBuffer();
if (this->usingPrimaryBuffer) {
auto nextChainedBuffer = commandStream->getGraphicsAllocation();
auto &gfxCoreHelper = device->getGfxCoreHelper();
gfxCoreHelper.encodeBatchBufferStart(ptr, nextChainedBuffer->getGpuAddress(), false, false, false);
alignPrimaryEnding(ptr, usedSize);
} else {
memcpy_s(ptr, this->selectedBbCmdSize, this->bbEndReference, this->selectedBbCmdSize);
}
currentLinearStreamStartOffset = 0u;
}
// Computes the aligned size of a primary command buffer including its ending
// command and zero-fills the padding that follows the ending command.
//   endPtr        - position in the buffer where the used commands end; the
//                   ending command slot (selectedBbCmdSize bytes) starts here.
//   exactUsedSize - number of bytes used in the buffer before the ending command.
// The result is stored in alignedPrimarySize; nothing is returned.
void CommandContainer::alignPrimaryEnding(void *endPtr, size_t exactUsedSize) {
// Include the ending command itself in the size that gets aligned.
exactUsedSize += this->selectedBbCmdSize;
this->alignedPrimarySize = alignUp(exactUsedSize, minCmdBufferPtrAlign);
if (this->alignedPrimarySize > exactUsedSize) {
// Skip over the ending command slot so that only the padding bytes after it
// are cleared; the slot itself is left untouched.
endPtr = ptrOffset(endPtr, this->selectedBbCmdSize);
memset(endPtr, 0, this->alignedPrimarySize - exactUsedSize);
}
}
// Saves the current position of the command stream as the ending pointer
// (endCmdPtr) and computes the aligned ending size for the bytes used so far.
// This function does not write an ending command at endCmdPtr.
void CommandContainer::endAlignedPrimaryBuffer() {
// A zero-sized getSpace request is used to read the current stream position.
this->endCmdPtr = commandStream->getSpace(0u);
alignPrimaryEnding(this->endCmdPtr, commandStream->getUsed());
}
void CommandContainer::prepareBindfulSsh() {
if (ApiSpecificConfig::getBindlessConfiguration()) {
if (allocationIndirectHeaps[IndirectHeap::Type::SURFACE_STATE] == nullptr) {

View File

@ -72,6 +72,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
static constexpr size_t startingResidencyContainerSize = 128;
static constexpr size_t defaultCmdBufferAllocationAlignment = MemoryConstants::pageSize64k;
static constexpr size_t defaultHeapAllocationAlignment = MemoryConstants::pageSize64k;
static constexpr size_t minCmdBufferPtrAlign = 8;
CommandContainer();
@ -188,6 +189,20 @@ class CommandContainer : public NonCopyableOrMovableClass {
HeapContainer &getSshAllocations() { return sshAllocations; }
uint64_t &currentLinearStreamStartOffsetRef() { return currentLinearStreamStartOffset; }
// Sets the usingPrimaryBuffer flag. initialize() reads this flag to choose the
// size reserved at the end of each command stream, so set it before initialize().
void setUsingPrimaryBuffer(bool value) {
usingPrimaryBuffer = value;
}
// Returns the current value of the usingPrimaryBuffer flag.
bool isUsingPrimaryBuffer() const {
return usingPrimaryBuffer;
}
// Returns the ending pointer saved by endAlignedPrimaryBuffer(); nullptr if it
// has not been saved (the member defaults to nullptr and reset() clears it).
void *getEndCmdPtr() const {
return endCmdPtr;
}
// Returns alignedPrimarySize, the aligned buffer size (used bytes plus ending
// command) most recently computed by alignPrimaryEnding().
size_t getEndAlignedSize() const {
return this->alignedPrimarySize;
}
// Saves the current command stream position as the ending pointer and computes
// the aligned ending size; results are read via getEndCmdPtr() and getEndAlignedSize().
void endAlignedPrimaryBuffer();
protected:
size_t getAlignedCmdBufferSize() const;
size_t getMaxUsableSpace() const {
@ -198,9 +213,9 @@ class CommandContainer : public NonCopyableOrMovableClass {
IndirectHeap *initIndirectHeapReservation(ReservedIndirectHeap *indirectHeapReservation, size_t size, size_t alignment, HeapType heapType);
inline bool skipHeapAllocationCreation(HeapType heapType);
size_t getHeapSize(HeapType heapType);
void alignPrimaryEnding(void *endPtr, size_t exactUsedSize);
GraphicsAllocation *allocationIndirectHeaps[HeapType::NUM_TYPES] = {};
std::unique_ptr<IndirectHeap> indirectHeaps[HeapType::NUM_TYPES];
CmdBufferContainer cmdBufferAllocations;
ResidencyContainer residencyContainer;
@ -210,6 +225,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
HeapReserveData dynamicStateHeapReserveData;
HeapReserveData surfaceStateHeapReserveData;
std::unique_ptr<IndirectHeap> indirectHeaps[HeapType::NUM_TYPES];
std::unique_ptr<HeapHelper> heapHelper;
std::unique_ptr<LinearStream> commandStream;
std::unique_ptr<LinearStream> secondaryCommandStreamForImmediateCmdList;
@ -228,6 +244,10 @@ class CommandContainer : public NonCopyableOrMovableClass {
IndirectHeap *sharedDshCsrHeap = nullptr;
size_t defaultSshSize = 0;
L1CachePolicy *l1CachePolicyData = nullptr;
size_t selectedBbCmdSize = 0;
const void *bbEndReference = nullptr;
void *endCmdPtr = nullptr;
size_t alignedPrimarySize = 0;
uint32_t dirtyHeaps = std::numeric_limits<uint32_t>::max();
uint32_t numIddsPerBlock = 64;
@ -245,6 +265,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
bool lastSentUseGlobalAtomics = false;
bool systolicModeSupport = false;
bool doubleSbaWa = false;
bool usingPrimaryBuffer = false;
};
} // namespace NEO

View File

@ -432,6 +432,7 @@ struct EncodeBatchBufferStartOrEnd {
return sizeof(MI_BATCH_BUFFER_END);
}
static void programBatchBufferStart(MI_BATCH_BUFFER_START *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate);
static void programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool secondLevel, bool indirect, bool predicate);
static void programBatchBufferEnd(CommandContainer &container);
static void programBatchBufferEnd(LinearStream &commandStream);

View File

@ -933,7 +933,7 @@ void EncodeBatchBufferStartOrEnd<Family>::programConditionalBatchBufferStartBase
}
template <typename Family>
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool secondLevel, bool indirect, bool predicate) {
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(MI_BATCH_BUFFER_START *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) {
MI_BATCH_BUFFER_START cmd = Family::cmdInitBatchBufferStart;
if (secondLevel) {
cmd.setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH);
@ -943,8 +943,12 @@ void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(LinearStream *
appendBatchBufferStart(cmd, indirect, predicate);
auto buffer = commandStream->getSpaceForCmd<MI_BATCH_BUFFER_START>();
*buffer = cmd;
*cmdBuffer = cmd;
}
// Stream-based overload: obtains the destination for one MI_BATCH_BUFFER_START
// from commandStream via getSpaceForCmd and forwards all arguments to the
// pointer-based overload, which encodes the command into that destination.
template <typename Family>
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool secondLevel, bool indirect, bool predicate) {
programBatchBufferStart(commandStream->getSpaceForCmd<MI_BATCH_BUFFER_START>(), address, secondLevel, indirect, predicate);
}
template <typename Family>

View File

@ -162,6 +162,8 @@ class GfxCoreHelper {
virtual bool platformSupportsImplicitScaling(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
virtual size_t getBatchBufferEndSize() const = 0;
virtual const void *getBatchBufferEndReference() const = 0;
virtual size_t getBatchBufferStartSize() const = 0;
virtual void encodeBatchBufferStart(void *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) const = 0;
virtual bool isPlatformFlushTaskEnabled(const NEO::ProductHelper &productHelper) const = 0;
virtual uint32_t getMinimalScratchSpaceSize() const = 0;
virtual bool copyThroughLockedPtrEnabled(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const = 0;
@ -378,6 +380,8 @@ class GfxCoreHelperHw : public GfxCoreHelper {
bool platformSupportsImplicitScaling(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) const override;
size_t getBatchBufferEndSize() const override;
const void *getBatchBufferEndReference() const override;
size_t getBatchBufferStartSize() const override;
void encodeBatchBufferStart(void *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) const override;
bool isPlatformFlushTaskEnabled(const NEO::ProductHelper &productHelper) const override;
uint32_t getMinimalScratchSpaceSize() const override;
bool copyThroughLockedPtrEnabled(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const override;

View File

@ -6,6 +6,7 @@
*/
#include "shared/source/aub_mem_dump/aub_mem_dump.h"
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/encode_surface_state.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/gmm.h"
@ -627,12 +628,26 @@ bool GfxCoreHelperHw<GfxFamily>::forceNonGpuCoherencyWA(bool requiresCoherency)
}
template <typename GfxFamily>
size_t GfxCoreHelperHw<GfxFamily>::getBatchBufferEndSize() const {
return sizeof(typename GfxFamily::MI_BATCH_BUFFER_END);
return EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferEndSize();
}
template <typename GfxFamily>
const void *GfxCoreHelperHw<GfxFamily>::getBatchBufferEndReference() const {
return reinterpret_cast<const void *>(&GfxFamily::cmdInitBatchBufferEnd);
}
// Returns the batch buffer start command size for GfxFamily, as reported by
// EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize().
template <typename GfxFamily>
size_t GfxCoreHelperHw<GfxFamily>::getBatchBufferStartSize() const {
return EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize();
}
// Family-agnostic entry point for encoding a batch buffer start command into
// caller-provided memory.
//   cmdBuffer - destination; it is reinterpreted as MI_BATCH_BUFFER_START* and a
//               whole command is stored through it, so it must point to writable
//               memory large enough for one MI_BATCH_BUFFER_START.
//   address, secondLevel, indirect, predicate - forwarded unchanged to
//               EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart.
template <typename GfxFamily>
void GfxCoreHelperHw<GfxFamily>::encodeBatchBufferStart(void *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) const {
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
MI_BATCH_BUFFER_START *bbBuffer = reinterpret_cast<MI_BATCH_BUFFER_START *>(cmdBuffer);
EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(bbBuffer, address, secondLevel, indirect, predicate);
}
template <typename GfxFamily>
bool GfxCoreHelperHw<GfxFamily>::isPlatformFlushTaskEnabled(const ProductHelper &productHelper) const {
return productHelper.isFlushTaskAllowed();

View File

@ -13,6 +13,7 @@
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/source/memory_manager/allocations_list.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/fixtures/device_fixture.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
@ -1654,3 +1655,89 @@ TEST_F(CommandContainerTest, givenCmdContainerSetToSbaTrackingWhenContainerIsIni
cmdContainer->initialize(pDevice, nullptr, sshDefaultSize, true, false);
EXPECT_EQ(2 * HeapSize::defaultHeapSize, cmdContainer->defaultSshSize);
}
// Verifies chaining of an unused first command buffer in primary-buffer mode:
// closing it must place an MI_BATCH_BUFFER_START at the very start of the first
// allocation that targets the GPU address of the newly created allocation, and
// ending the fresh second buffer must report an aligned size that covers only
// the ending command.
HWTEST_F(CommandContainerTest,
givenCmdContainerUsingPrimaryBatchBufferWhenCloseAndAllocateNextCommandBufferCalledThenNewCmdBufferAllocationCreatedAndChainedWithOldCmdBuffer) {
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
CommandContainer cmdContainer;
// The flag is set before initialize(), which reads it.
cmdContainer.setUsingPrimaryBuffer(true);
EXPECT_TRUE(cmdContainer.isUsingPrimaryBuffer());
cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false);
ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u);
auto firstCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[0];
cmdContainer.closeAndAllocateNextCommandBuffer();
ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 2u);
auto chainedCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[1];
auto bbStartGpuAddress = chainedCmdBufferAllocation->getGpuAddress();
// Nothing was consumed in the first buffer, so the chaining command is expected at offset 0.
auto bbStart = genCmdCast<MI_BATCH_BUFFER_START *>(firstCmdBufferAllocation->getUnderlyingBuffer());
ASSERT_NE(nullptr, bbStart);
EXPECT_EQ(bbStartGpuAddress, bbStart->getBatchBufferStartAddress());
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
// With zero bytes used in the second buffer, the aligned size is the ending command size rounded up.
size_t expectedEndSize = alignUp(sizeof(MI_BATCH_BUFFER_START), CommandContainer::minCmdBufferPtrAlign);
cmdContainer.endAlignedPrimaryBuffer();
void *endPtr = cmdContainer.getEndCmdPtr();
size_t alignedSize = cmdContainer.getEndAlignedSize();
EXPECT_EQ(chainedCmdBufferAllocation->getUnderlyingBuffer(), endPtr);
EXPECT_EQ(expectedEndSize, alignedSize);
}
// Verifies chaining across three command buffers in primary-buffer mode when
// each buffer has some space consumed before it is closed: every chaining
// MI_BATCH_BUFFER_START must sit right after the consumed bytes of its buffer
// and target the GPU address of the next allocation. Finally checks the ending
// pointer and aligned size reported for the last buffer.
HWTEST_F(CommandContainerTest,
givenCmdContainerUsingPrimaryBatchBufferWhenCloseAndAllocateMultipleCommandBuffersThenNewCmdBufferAllocationsCreatedAndChainedWithOldCmdBuffers) {
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
CommandContainer cmdContainer;
// The flag is set before initialize(), which reads it.
cmdContainer.setUsingPrimaryBuffer(true);
EXPECT_TRUE(cmdContainer.isUsingPrimaryBuffer());
cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false);
ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u);
auto firstCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[0];
// Consume a few bytes in the first buffer so the chaining command lands at a non-zero offset.
size_t consumedSize = sizeof(int);
cmdContainer.getCommandStream()->getSpace(consumedSize);
cmdContainer.closeAndAllocateNextCommandBuffer();
ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 2u);
auto chainedCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[1];
auto bbStartGpuAddress = chainedCmdBufferAllocation->getGpuAddress();
auto bbStart = genCmdCast<MI_BATCH_BUFFER_START *>(ptrOffset(firstCmdBufferAllocation->getUnderlyingBuffer(), consumedSize));
ASSERT_NE(nullptr, bbStart);
EXPECT_EQ(bbStartGpuAddress, bbStart->getBatchBufferStartAddress());
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
// Use a different consumed size in the second buffer before closing it.
consumedSize *= 2;
cmdContainer.getCommandStream()->getSpace(consumedSize);
cmdContainer.closeAndAllocateNextCommandBuffer();
ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 3u);
auto closingCmdBufferAllocation = cmdContainer.getCmdBufferAllocations()[2];
bbStartGpuAddress = closingCmdBufferAllocation->getGpuAddress();
bbStart = genCmdCast<MI_BATCH_BUFFER_START *>(ptrOffset(chainedCmdBufferAllocation->getUnderlyingBuffer(), consumedSize));
ASSERT_NE(nullptr, bbStart);
EXPECT_EQ(bbStartGpuAddress, bbStart->getBatchBufferStartAddress());
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
// In the last buffer consume exactly the number of bytes by which the command size falls short of the alignment.
consumedSize = alignUp(sizeof(MI_BATCH_BUFFER_START), CommandContainer::minCmdBufferPtrAlign) - sizeof(MI_BATCH_BUFFER_START);
cmdContainer.getCommandStream()->getSpace(consumedSize);
size_t expectedEndSize = alignUp((sizeof(MI_BATCH_BUFFER_START) + consumedSize), CommandContainer::minCmdBufferPtrAlign);
cmdContainer.endAlignedPrimaryBuffer();
void *endPtr = cmdContainer.getEndCmdPtr();
size_t alignedSize = cmdContainer.getEndAlignedSize();
// The ending pointer is expected right after the bytes consumed in the last buffer.
EXPECT_EQ(ptrOffset(closingCmdBufferAllocation->getUnderlyingBuffer(), consumedSize), endPtr);
EXPECT_EQ(expectedEndSize, alignedSize);
}