[feature, perf] add algorithm to chain command buffers in container
This feature is part of a performance improvement to dispatch and start command buffers as primary batch buffers. When an exhausted command buffer is closed, reserve the exact space for a chained batch buffer start and bind it to the next command buffer. When closing a command buffer, save the ending pointer and reserve aligned space. Related-To: NEO-7807 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
parent
88fe17e50a
commit
4c7bc2ca98
|
@ -83,8 +83,15 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat
|
|||
cmdBufferAllocations.push_back(cmdBufferAllocation);
|
||||
|
||||
auto &gfxCoreHelper = device->getGfxCoreHelper();
|
||||
if (this->usingPrimaryBuffer) {
|
||||
this->selectedBbCmdSize = gfxCoreHelper.getBatchBufferStartSize();
|
||||
} else {
|
||||
this->selectedBbCmdSize = gfxCoreHelper.getBatchBufferEndSize();
|
||||
this->bbEndReference = gfxCoreHelper.getBatchBufferEndReference();
|
||||
}
|
||||
|
||||
commandStream = std::make_unique<LinearStream>(cmdBufferAllocation->getUnderlyingBuffer(),
|
||||
usableSize, this, gfxCoreHelper.getBatchBufferEndSize());
|
||||
usableSize, this, this->selectedBbCmdSize);
|
||||
|
||||
commandStream->replaceGraphicsAllocation(cmdBufferAllocation);
|
||||
|
||||
|
@ -96,7 +103,7 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat
|
|||
return ErrorCode::OUT_OF_DEVICE_MEMORY;
|
||||
}
|
||||
secondaryCommandStreamForImmediateCmdList = std::make_unique<LinearStream>(cmdBufferAllocationHost->getUnderlyingBuffer(),
|
||||
usableSize, this, gfxCoreHelper.getBatchBufferEndSize());
|
||||
usableSize, this, this->selectedBbCmdSize);
|
||||
secondaryCommandStreamForImmediateCmdList->replaceGraphicsAllocation(cmdBufferAllocationHost);
|
||||
cmdBufferAllocations.push_back(cmdBufferAllocationHost);
|
||||
addToResidencyContainer(cmdBufferAllocationHost);
|
||||
|
@ -192,6 +199,7 @@ void CommandContainer::reset() {
|
|||
nextIddInBlock = this->getNumIddPerBlock();
|
||||
lastPipelineSelectModeRequired = false;
|
||||
lastSentUseGlobalAtomics = false;
|
||||
endCmdPtr = nullptr;
|
||||
}
|
||||
|
||||
size_t CommandContainer::getAlignedCmdBufferSize() const {
|
||||
|
@ -316,14 +324,34 @@ void CommandContainer::allocateNextCommandBuffer() {
|
|||
}
|
||||
|
||||
void CommandContainer::closeAndAllocateNextCommandBuffer() {
|
||||
auto &gfxCoreHelper = device->getGfxCoreHelper();
|
||||
auto bbEndSize = gfxCoreHelper.getBatchBufferEndSize();
|
||||
auto ptr = commandStream->getSpace(0u);
|
||||
memcpy_s(ptr, bbEndSize, gfxCoreHelper.getBatchBufferEndReference(), bbEndSize);
|
||||
size_t usedSize = commandStream->getUsed();
|
||||
allocateNextCommandBuffer();
|
||||
if (this->usingPrimaryBuffer) {
|
||||
auto nextChainedBuffer = commandStream->getGraphicsAllocation();
|
||||
auto &gfxCoreHelper = device->getGfxCoreHelper();
|
||||
gfxCoreHelper.encodeBatchBufferStart(ptr, nextChainedBuffer->getGpuAddress(), false, false, false);
|
||||
alignPrimaryEnding(ptr, usedSize);
|
||||
} else {
|
||||
memcpy_s(ptr, this->selectedBbCmdSize, this->bbEndReference, this->selectedBbCmdSize);
|
||||
}
|
||||
currentLinearStreamStartOffset = 0u;
|
||||
}
|
||||
|
||||
void CommandContainer::alignPrimaryEnding(void *endPtr, size_t exactUsedSize) {
|
||||
exactUsedSize += this->selectedBbCmdSize;
|
||||
this->alignedPrimarySize = alignUp(exactUsedSize, minCmdBufferPtrAlign);
|
||||
if (this->alignedPrimarySize > exactUsedSize) {
|
||||
endPtr = ptrOffset(endPtr, this->selectedBbCmdSize);
|
||||
memset(endPtr, 0, this->alignedPrimarySize - exactUsedSize);
|
||||
}
|
||||
}
|
||||
|
||||
// Saves the current end of the command stream in endCmdPtr and computes the
// aligned ending for that position (see alignPrimaryEnding).
void CommandContainer::endAlignedPrimaryBuffer() {
    // getSpace(0u) yields the current stream position without consuming space.
    void *streamEnd = commandStream->getSpace(0u);
    this->endCmdPtr = streamEnd;

    const size_t bytesUsed = commandStream->getUsed();
    alignPrimaryEnding(streamEnd, bytesUsed);
}
|
||||
|
||||
void CommandContainer::prepareBindfulSsh() {
|
||||
if (ApiSpecificConfig::getBindlessConfiguration()) {
|
||||
if (allocationIndirectHeaps[IndirectHeap::Type::SURFACE_STATE] == nullptr) {
|
||||
|
|
|
@ -72,6 +72,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
|||
static constexpr size_t startingResidencyContainerSize = 128;
|
||||
static constexpr size_t defaultCmdBufferAllocationAlignment = MemoryConstants::pageSize64k;
|
||||
static constexpr size_t defaultHeapAllocationAlignment = MemoryConstants::pageSize64k;
|
||||
static constexpr size_t minCmdBufferPtrAlign = 8;
|
||||
|
||||
CommandContainer();
|
||||
|
||||
|
@ -188,6 +189,20 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
|||
HeapContainer &getSshAllocations() { return sshAllocations; }
|
||||
uint64_t &currentLinearStreamStartOffsetRef() { return currentLinearStreamStartOffset; }
|
||||
|
||||
void setUsingPrimaryBuffer(bool value) {
|
||||
usingPrimaryBuffer = value;
|
||||
}
|
||||
bool isUsingPrimaryBuffer() const {
|
||||
return usingPrimaryBuffer;
|
||||
}
|
||||
void *getEndCmdPtr() const {
|
||||
return endCmdPtr;
|
||||
}
|
||||
size_t getEndAlignedSize() const {
|
||||
return this->alignedPrimarySize;
|
||||
}
|
||||
void endAlignedPrimaryBuffer();
|
||||
|
||||
protected:
|
||||
size_t getAlignedCmdBufferSize() const;
|
||||
size_t getMaxUsableSpace() const {
|
||||
|
@ -198,9 +213,9 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
|||
IndirectHeap *initIndirectHeapReservation(ReservedIndirectHeap *indirectHeapReservation, size_t size, size_t alignment, HeapType heapType);
|
||||
inline bool skipHeapAllocationCreation(HeapType heapType);
|
||||
size_t getHeapSize(HeapType heapType);
|
||||
void alignPrimaryEnding(void *endPtr, size_t exactUsedSize);
|
||||
|
||||
GraphicsAllocation *allocationIndirectHeaps[HeapType::NUM_TYPES] = {};
|
||||
std::unique_ptr<IndirectHeap> indirectHeaps[HeapType::NUM_TYPES];
|
||||
|
||||
CmdBufferContainer cmdBufferAllocations;
|
||||
ResidencyContainer residencyContainer;
|
||||
|
@ -210,6 +225,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
|||
HeapReserveData dynamicStateHeapReserveData;
|
||||
HeapReserveData surfaceStateHeapReserveData;
|
||||
|
||||
std::unique_ptr<IndirectHeap> indirectHeaps[HeapType::NUM_TYPES];
|
||||
std::unique_ptr<HeapHelper> heapHelper;
|
||||
std::unique_ptr<LinearStream> commandStream;
|
||||
std::unique_ptr<LinearStream> secondaryCommandStreamForImmediateCmdList;
|
||||
|
@ -228,6 +244,10 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
|||
IndirectHeap *sharedDshCsrHeap = nullptr;
|
||||
size_t defaultSshSize = 0;
|
||||
L1CachePolicy *l1CachePolicyData = nullptr;
|
||||
size_t selectedBbCmdSize = 0;
|
||||
const void *bbEndReference = nullptr;
|
||||
void *endCmdPtr = nullptr;
|
||||
size_t alignedPrimarySize = 0;
|
||||
|
||||
uint32_t dirtyHeaps = std::numeric_limits<uint32_t>::max();
|
||||
uint32_t numIddsPerBlock = 64;
|
||||
|
@ -245,6 +265,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
|||
bool lastSentUseGlobalAtomics = false;
|
||||
bool systolicModeSupport = false;
|
||||
bool doubleSbaWa = false;
|
||||
bool usingPrimaryBuffer = false;
|
||||
};
|
||||
|
||||
} // namespace NEO
|
||||
|
|
|
@ -432,6 +432,7 @@ struct EncodeBatchBufferStartOrEnd {
|
|||
return sizeof(MI_BATCH_BUFFER_END);
|
||||
}
|
||||
|
||||
static void programBatchBufferStart(MI_BATCH_BUFFER_START *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate);
|
||||
static void programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool secondLevel, bool indirect, bool predicate);
|
||||
static void programBatchBufferEnd(CommandContainer &container);
|
||||
static void programBatchBufferEnd(LinearStream &commandStream);
|
||||
|
|
|
@ -933,7 +933,7 @@ void EncodeBatchBufferStartOrEnd<Family>::programConditionalBatchBufferStartBase
|
|||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool secondLevel, bool indirect, bool predicate) {
|
||||
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(MI_BATCH_BUFFER_START *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) {
|
||||
MI_BATCH_BUFFER_START cmd = Family::cmdInitBatchBufferStart;
|
||||
if (secondLevel) {
|
||||
cmd.setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH);
|
||||
|
@ -943,8 +943,12 @@ void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(LinearStream *
|
|||
|
||||
appendBatchBufferStart(cmd, indirect, predicate);
|
||||
|
||||
auto buffer = commandStream->getSpaceForCmd<MI_BATCH_BUFFER_START>();
|
||||
*buffer = cmd;
|
||||
*cmdBuffer = cmd;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferStart(LinearStream *commandStream, uint64_t address, bool secondLevel, bool indirect, bool predicate) {
|
||||
programBatchBufferStart(commandStream->getSpaceForCmd<MI_BATCH_BUFFER_START>(), address, secondLevel, indirect, predicate);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
|
|
|
@ -162,6 +162,8 @@ class GfxCoreHelper {
|
|||
virtual bool platformSupportsImplicitScaling(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
|
||||
virtual size_t getBatchBufferEndSize() const = 0;
|
||||
virtual const void *getBatchBufferEndReference() const = 0;
|
||||
virtual size_t getBatchBufferStartSize() const = 0;
|
||||
virtual void encodeBatchBufferStart(void *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) const = 0;
|
||||
virtual bool isPlatformFlushTaskEnabled(const NEO::ProductHelper &productHelper) const = 0;
|
||||
virtual uint32_t getMinimalScratchSpaceSize() const = 0;
|
||||
virtual bool copyThroughLockedPtrEnabled(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const = 0;
|
||||
|
@ -378,6 +380,8 @@ class GfxCoreHelperHw : public GfxCoreHelper {
|
|||
bool platformSupportsImplicitScaling(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) const override;
|
||||
size_t getBatchBufferEndSize() const override;
|
||||
const void *getBatchBufferEndReference() const override;
|
||||
size_t getBatchBufferStartSize() const override;
|
||||
void encodeBatchBufferStart(void *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) const override;
|
||||
bool isPlatformFlushTaskEnabled(const NEO::ProductHelper &productHelper) const override;
|
||||
uint32_t getMinimalScratchSpaceSize() const override;
|
||||
bool copyThroughLockedPtrEnabled(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const override;
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
*/
|
||||
|
||||
#include "shared/source/aub_mem_dump/aub_mem_dump.h"
|
||||
#include "shared/source/command_container/command_encoder.h"
|
||||
#include "shared/source/command_container/encode_surface_state.h"
|
||||
#include "shared/source/execution_environment/root_device_environment.h"
|
||||
#include "shared/source/gmm_helper/gmm.h"
|
||||
|
@ -627,12 +628,26 @@ bool GfxCoreHelperHw<GfxFamily>::forceNonGpuCoherencyWA(bool requiresCoherency)
|
|||
}
|
||||
template <typename GfxFamily>
|
||||
size_t GfxCoreHelperHw<GfxFamily>::getBatchBufferEndSize() const {
|
||||
return sizeof(typename GfxFamily::MI_BATCH_BUFFER_END);
|
||||
return EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferEndSize();
|
||||
}
|
||||
template <typename GfxFamily>
|
||||
const void *GfxCoreHelperHw<GfxFamily>::getBatchBufferEndReference() const {
|
||||
return reinterpret_cast<const void *>(&GfxFamily::cmdInitBatchBufferEnd);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t GfxCoreHelperHw<GfxFamily>::getBatchBufferStartSize() const {
|
||||
return EncodeBatchBufferStartOrEnd<GfxFamily>::getBatchBufferStartSize();
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GfxCoreHelperHw<GfxFamily>::encodeBatchBufferStart(void *cmdBuffer, uint64_t address, bool secondLevel, bool indirect, bool predicate) const {
|
||||
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
|
||||
|
||||
MI_BATCH_BUFFER_START *bbBuffer = reinterpret_cast<MI_BATCH_BUFFER_START *>(cmdBuffer);
|
||||
EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(bbBuffer, address, secondLevel, indirect, predicate);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
bool GfxCoreHelperHw<GfxFamily>::isPlatformFlushTaskEnabled(const ProductHelper &productHelper) const {
|
||||
return productHelper.isFlushTaskAllowed();
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include "shared/source/indirect_heap/indirect_heap.h"
|
||||
#include "shared/source/memory_manager/allocations_list.h"
|
||||
#include "shared/source/memory_manager/internal_allocation_storage.h"
|
||||
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
|
||||
#include "shared/test/common/fixtures/device_fixture.h"
|
||||
#include "shared/test/common/helpers/debug_manager_state_restore.h"
|
||||
#include "shared/test/common/libult/ult_command_stream_receiver.h"
|
||||
|
@ -1654,3 +1655,89 @@ TEST_F(CommandContainerTest, givenCmdContainerSetToSbaTrackingWhenContainerIsIni
|
|||
cmdContainer->initialize(pDevice, nullptr, sshDefaultSize, true, false);
|
||||
EXPECT_EQ(2 * HeapSize::defaultHeapSize, cmdContainer->defaultSshSize);
|
||||
}
|
||||
|
||||
// Closes an untouched primary command buffer once and verifies that it is
// chained to the newly allocated buffer, then verifies the aligned ending
// reported for the new buffer.
HWTEST_F(CommandContainerTest,
         givenCmdContainerUsingPrimaryBatchBufferWhenCloseAndAllocateNextCommandBufferCalledThenNewCmdBufferAllocationCreatedAndChainedWithOldCmdBuffer) {
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;

    CommandContainer cmdContainer;
    cmdContainer.setUsingPrimaryBuffer(true);
    EXPECT_TRUE(cmdContainer.isUsingPrimaryBuffer());
    cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false);

    ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u);
    auto initialAllocation = cmdContainer.getCmdBufferAllocations()[0];

    cmdContainer.closeAndAllocateNextCommandBuffer();

    ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 2u);
    auto nextAllocation = cmdContainer.getCmdBufferAllocations()[1];
    auto expectedChainAddress = nextAllocation->getGpuAddress();

    // Nothing was consumed before closing, so the chaining command is expected
    // at the very start of the first buffer.
    auto chainCmd = genCmdCast<MI_BATCH_BUFFER_START *>(initialAllocation->getUnderlyingBuffer());
    ASSERT_NE(nullptr, chainCmd);
    EXPECT_EQ(expectedChainAddress, chainCmd->getBatchBufferStartAddress());
    EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, chainCmd->getSecondLevelBatchBuffer());

    size_t expectedAlignedSize = alignUp(sizeof(MI_BATCH_BUFFER_START), CommandContainer::minCmdBufferPtrAlign);
    cmdContainer.endAlignedPrimaryBuffer();

    void *reportedEndPtr = cmdContainer.getEndCmdPtr();
    size_t reportedAlignedSize = cmdContainer.getEndAlignedSize();

    // The new buffer is empty, so its ending pointer is the buffer start.
    EXPECT_EQ(nextAllocation->getUnderlyingBuffer(), reportedEndPtr);
    EXPECT_EQ(expectedAlignedSize, reportedAlignedSize);
}
|
||||
|
||||
// Closes two partially filled primary command buffers in a row and verifies
// that each one is chained to its successor at the consumed offset, then
// verifies the aligned ending of the third (last) buffer.
HWTEST_F(CommandContainerTest,
         givenCmdContainerUsingPrimaryBatchBufferWhenCloseAndAllocateMultipleCommandBuffersThenNewCmdBufferAllocationsCreatedAndChainedWithOldCmdBuffers) {
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;

    CommandContainer cmdContainer;
    cmdContainer.setUsingPrimaryBuffer(true);
    EXPECT_TRUE(cmdContainer.isUsingPrimaryBuffer());
    cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false);

    ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u);
    auto initialAllocation = cmdContainer.getCmdBufferAllocations()[0];

    // First buffer: consume a few bytes, then close it.
    size_t bytesConsumed = sizeof(int);
    cmdContainer.getCommandStream()->getSpace(bytesConsumed);
    cmdContainer.closeAndAllocateNextCommandBuffer();

    ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 2u);
    auto middleAllocation = cmdContainer.getCmdBufferAllocations()[1];
    auto expectedChainAddress = middleAllocation->getGpuAddress();

    auto chainCmd = genCmdCast<MI_BATCH_BUFFER_START *>(ptrOffset(initialAllocation->getUnderlyingBuffer(), bytesConsumed));
    ASSERT_NE(nullptr, chainCmd);
    EXPECT_EQ(expectedChainAddress, chainCmd->getBatchBufferStartAddress());
    EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, chainCmd->getSecondLevelBatchBuffer());

    // Second buffer: consume twice as much, then close it as well.
    bytesConsumed *= 2;
    cmdContainer.getCommandStream()->getSpace(bytesConsumed);
    cmdContainer.closeAndAllocateNextCommandBuffer();

    ASSERT_EQ(cmdContainer.getCmdBufferAllocations().size(), 3u);
    auto lastAllocation = cmdContainer.getCmdBufferAllocations()[2];
    expectedChainAddress = lastAllocation->getGpuAddress();

    chainCmd = genCmdCast<MI_BATCH_BUFFER_START *>(ptrOffset(middleAllocation->getUnderlyingBuffer(), bytesConsumed));
    ASSERT_NE(nullptr, chainCmd);
    EXPECT_EQ(expectedChainAddress, chainCmd->getBatchBufferStartAddress());
    EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, chainCmd->getSecondLevelBatchBuffer());

    // Third buffer: consume exactly the gap between the command size and its
    // aligned size before ending the primary buffer.
    bytesConsumed = alignUp(sizeof(MI_BATCH_BUFFER_START), CommandContainer::minCmdBufferPtrAlign) - sizeof(MI_BATCH_BUFFER_START);
    cmdContainer.getCommandStream()->getSpace(bytesConsumed);

    size_t expectedAlignedSize = alignUp((sizeof(MI_BATCH_BUFFER_START) + bytesConsumed), CommandContainer::minCmdBufferPtrAlign);
    cmdContainer.endAlignedPrimaryBuffer();

    void *reportedEndPtr = cmdContainer.getEndCmdPtr();
    size_t reportedAlignedSize = cmdContainer.getEndAlignedSize();

    EXPECT_EQ(ptrOffset(lastAllocation->getUnderlyingBuffer(), bytesConsumed), reportedEndPtr);
    EXPECT_EQ(expectedAlignedSize, reportedAlignedSize);
}
|
||||
|
|
Loading…
Reference in New Issue