Reuse heaps for immediate cmd lists

Signed-off-by: Szymon Morek <szymon.morek@intel.com>
This commit is contained in:
Szymon Morek
2022-10-05 16:27:50 +00:00
committed by Compute-Runtime-Automation
parent 54faccb53e
commit 3f5ac0b4d0
18 changed files with 314 additions and 13 deletions

View File

@@ -125,7 +125,6 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat
iddBlock = nullptr;
nextIddInBlock = this->getNumIddPerBlock();
}
return ErrorCode::SUCCESS;
}
@@ -237,7 +236,11 @@ void CommandContainer::createAndAssignNewHeap(HeapType heapType, size_t size) {
newAlloc->getUnderlyingBufferSize());
auto newBase = indirectHeap->getHeapGpuBase();
getResidencyContainer().push_back(newAlloc);
getDeallocationContainer().push_back(oldAlloc);
if (this->immediateCmdListCsr) {
this->storeAllocationAndFlushTagUpdate(oldAlloc);
} else {
getDeallocationContainer().push_back(oldAlloc);
}
setIndirectHeapAllocation(heapType, newAlloc);
if (oldBase != newBase) {
setHeapDirty(heapType);
@@ -334,11 +337,7 @@ GraphicsAllocation *CommandContainer::reuseExistingCmdBuffer() {
}
// Hands the allocation backing the current command stream over for reuse.
//
// All bookkeeping is delegated to storeAllocationAndFlushTagUpdate(), which
// stamps the allocation with the next task count, stores it in the matching
// reuse container and flushes a tag update while holding the CSR ownership
// object. Repeating the stamping and the pushTailOne() here as well would
// insert the same allocation into the reusable list a second time, and would
// do the first insertion outside of that ownership scope.
void CommandContainer::addCurrentCommandBufferToReusableAllocationList() {
    this->storeAllocationAndFlushTagUpdate(this->commandStream->getGraphicsAllocation());
}
void CommandContainer::setCmdBuffer(GraphicsAllocation *cmdBuffer) {
@@ -364,4 +363,58 @@ GraphicsAllocation *CommandContainer::allocateCommandBuffer() {
return device->getMemoryManager()->allocateGraphicsMemoryWithProperties(properties);
}
// Pre-populates the reuse containers so that later requests can be served
// from them instead of allocating.
//
// The number of allocations to create comes from
// HwHelper::getAmountOfAllocationsToFill(); zero means nothing is done.
// For every requested unit:
//   - one command buffer is allocated, appended to reusableAllocationList and
//     added to the residency container;
//   - when a heap helper exists, one heap per eligible heap type is obtained
//     from it and immediately handed back via storeHeapAllocation().
// Heap types are skipped when bindless configuration is active (all but
// INDIRECT_OBJECT), when the device reports no image support (DYNAMIC_STATE),
// or when immediateCmdListSharedHeap() reports the type as shared.
void CommandContainer::fillReusableAllocationLists() {
    const auto &hardwareInfo = device->getHardwareInfo();
    auto &hwHelper = NEO::HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
    const auto amountToFill = hwHelper.getAmountOfAllocationsToFill();
    if (amountToFill == 0u) {
        return;
    }

    for (auto i = 0u; i < amountToFill; i++) {
        auto allocToReuse = this->allocateCommandBuffer();
        if (allocToReuse == nullptr) {
            // Pre-filling is only an optimization: if memory cannot be obtained,
            // stop here instead of dereferencing a null allocation.
            return;
        }
        this->reusableAllocationList->pushTailOne(*allocToReuse);
        this->getResidencyContainer().push_back(allocToReuse);
    }

    if (!this->heapHelper) {
        return;
    }

    constexpr size_t heapSize = 65536u;
    const size_t alignedSize = alignUp<size_t>(this->getTotalCmdBufferSize(), MemoryConstants::pageSize64k);

    // These conditions do not depend on the loop counters, so evaluate them once.
    const bool bindlessEnabled = NEO::ApiSpecificConfig::getBindlessConfiguration();
    const bool imagesSupported = hardwareInfo.capabilityTable.supportsImages;

    for (auto i = 0u; i < amountToFill; i++) {
        for (auto heapType = 0u; heapType < IndirectHeap::Type::NUM_TYPES; heapType++) {
            if (bindlessEnabled && heapType != IndirectHeap::Type::INDIRECT_OBJECT) {
                continue;
            }
            if (!imagesSupported && IndirectHeap::Type::DYNAMIC_STATE == heapType) {
                continue;
            }
            if (immediateCmdListSharedHeap(static_cast<HeapType>(heapType))) {
                continue;
            }
            auto heapToReuse = heapHelper->getHeapAllocation(heapType,
                                                             heapSize,
                                                             alignedSize,
                                                             device->getRootDeviceIndex());
            this->heapHelper->storeHeapAllocation(heapToReuse);
        }
    }
}
// Marks `allocation` as used by the next task of the immediate command list
// CSR, puts it into the matching reuse container and flushes a tag update.
//
// COMMAND_BUFFER allocations go to reusableAllocationList; every other
// allocation type is handed to the heap helper. The object returned by
// obtainUniqueOwnership() is kept alive until the function returns.
// NOTE(review): dereferences immediateCmdListCsr unconditionally - callers are
// expected to have checked that it is set.
void CommandContainer::storeAllocationAndFlushTagUpdate(GraphicsAllocation *allocation) {
    auto csrOwnership = immediateCmdListCsr->obtainUniqueOwnership();

    // The allocation is busy until the task following the current one completes.
    auto nextTaskCount = immediateCmdListCsr->peekTaskCount() + 1;
    auto contextId = immediateCmdListCsr->getOsContext().getContextId();
    allocation->updateTaskCount(nextTaskCount, contextId);
    allocation->updateResidencyTaskCount(nextTaskCount, contextId);

    const bool isCommandBuffer = (allocation->getAllocationType() == AllocationType::COMMAND_BUFFER);
    if (!isCommandBuffer) {
        getHeapHelper()->storeHeapAllocation(allocation);
    } else {
        reusableAllocationList->pushTailOne(*allocation);
    }

    immediateCmdListCsr->flushTagUpdate();
}
} // namespace NEO

View File

@@ -79,7 +79,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
Device *getDevice() const { return device; }
IndirectHeap *getHeapWithRequiredSizeAndAlignment(HeapType heapType, size_t sizeRequired, size_t alignment);
MOCKABLE_VIRTUAL IndirectHeap *getHeapWithRequiredSizeAndAlignment(HeapType heapType, size_t sizeRequired, size_t alignment);
void allocateNextCommandBuffer();
void closeAndAllocateNextCommandBuffer();
@@ -116,6 +116,9 @@ class CommandContainer : public NonCopyableOrMovableClass {
void setCmdBuffer(GraphicsAllocation *cmdBuffer);
void addCurrentCommandBufferToReusableAllocationList();
void fillReusableAllocationLists();
void storeAllocationAndFlushTagUpdate(GraphicsAllocation *allocation);
HeapContainer sshAllocations;
uint64_t currentLinearStreamStartOffset = 0u;
uint32_t slmSize = std::numeric_limits<uint32_t>::max();

View File

@@ -135,8 +135,12 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
auto heapIndirect = container.getIndirectHeap(HeapType::INDIRECT_OBJECT);
UNRECOVERABLE_IF(!(heapIndirect));
heapIndirect->align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
auto ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, iohRequiredSize);
void *ptr = nullptr;
if (args.isKernelDispatchedFromImmediateCmdList) {
ptr = container.getHeapWithRequiredSizeAndAlignment(HeapType::INDIRECT_OBJECT, iohRequiredSize, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE)->getSpace(iohRequiredSize);
} else {
ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, iohRequiredSize);
}
UNRECOVERABLE_IF(!(ptr));
offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast<uint64_t>(heapIndirect->getUsed() - sizeThreadData);

View File

@@ -175,8 +175,12 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
auto heap = container.getIndirectHeap(HeapType::INDIRECT_OBJECT);
UNRECOVERABLE_IF(!heap);
heap->align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
auto ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, iohRequiredSize);
void *ptr = nullptr;
if (args.isKernelDispatchedFromImmediateCmdList) {
ptr = container.getHeapWithRequiredSizeAndAlignment(HeapType::INDIRECT_OBJECT, iohRequiredSize, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE)->getSpace(iohRequiredSize);
} else {
ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, iohRequiredSize);
}
UNRECOVERABLE_IF(!ptr);
offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData);

View File

@@ -298,6 +298,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, PreferInternalBcsEngine, -1, "-1: default, 0:dis
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsCopy, -1, "-1: default, 0:disabled, 1: enabled. When enqueues copy to main copy engine then split between even linked copy engines")
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMask, 0, "0: default, >0: bitmask: indicates bcs engines for split")
DECLARE_DEBUG_VARIABLE(int32_t, ReuseKernelBinaries, -1, "-1: default, 0:disabled, 1: enabled. If enabled, driver reuses kernel binaries.")
DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocations, -1, "-1: default, 0:disabled, > 0: enabled. If enabled, driver will fill reusable allocation lists with given amount of command buffers and heaps at initialization of immediate command list.")
/*DIRECT SUBMISSION FLAGS*/
DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD")

View File

@@ -160,6 +160,7 @@ class HwHelper {
virtual bool isPatIndexFallbackWaRequired() const = 0;
virtual uint32_t getMinimalScratchSpaceSize() const = 0;
virtual bool copyThroughLockedPtrEnabled() const = 0;
virtual uint32_t getAmountOfAllocationsToFill() const = 0;
protected:
HwHelper() = default;
@@ -402,6 +403,7 @@ class HwHelperHw : public HwHelper {
bool isPatIndexFallbackWaRequired() const override;
uint32_t getMinimalScratchSpaceSize() const override;
bool copyThroughLockedPtrEnabled() const override;
uint32_t getAmountOfAllocationsToFill() const override;
protected:
static const AuxTranslationMode defaultAuxTranslationMode;

View File

@@ -726,4 +726,12 @@ bool HwHelperHw<gfxProduct>::copyThroughLockedPtrEnabled() const {
return false;
}
// Returns how many reusable allocations should be pre-created.
//
// A non-negative SetAmountOfReusableAllocations debug flag overrides the
// default; any negative value (including the -1 "default" marker) selects the
// default of 0 for this implementation. Negative values are never converted
// to the unsigned return type, so they cannot wrap into a huge count.
template <typename gfxProduct>
uint32_t HwHelperHw<gfxProduct>::getAmountOfAllocationsToFill() const {
    const auto amountFromDebugFlag = DebugManager.flags.SetAmountOfReusableAllocations.get();
    if (amountFromDebugFlag >= 0) {
        return static_cast<uint32_t>(amountFromDebugFlag);
    }
    return 0u;
}
} // namespace NEO

View File

@@ -450,6 +450,14 @@ bool HwHelperHw<Family>::copyThroughLockedPtrEnabled() const {
return true;
}
// Returns how many reusable allocations should be pre-created for this family.
//
// A non-negative SetAmountOfReusableAllocations debug flag overrides the
// default; any negative value (including the -1 "default" marker) selects the
// family default of 1. Negative values are never converted to the unsigned
// return type, so they cannot wrap into a huge count.
template <>
uint32_t HwHelperHw<Family>::getAmountOfAllocationsToFill() const {
    const auto amountFromDebugFlag = DebugManager.flags.SetAmountOfReusableAllocations.get();
    if (amountFromDebugFlag >= 0) {
        return static_cast<uint32_t>(amountFromDebugFlag);
    }
    return 1u;
}
} // namespace NEO
#include "shared/source/helpers/hw_helper_pvc_and_later.inl"

View File

@@ -467,3 +467,4 @@ ExperimentalCopyThroughLock = -1
ExperimentalH2DCpuCopyThreshold = -1
ExperimentalD2HCpuCopyThreshold = -1
CopyHostPtrOnCpu = -1
SetAmountOfReusableAllocations = -1

View File

@@ -8,6 +8,7 @@
#include "shared/source/command_container/cmdcontainer.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/memory_manager/allocations_list.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/test/common/fixtures/device_fixture.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
@@ -985,3 +986,133 @@ HWTEST_F(CommandContainerTest, givenCmdContainerHasImmediateCsrWhenGettingHeapWi
EXPECT_THROW(cmdContainer.getHeapSpaceAllowGrow(HeapType::SURFACE_STATE, 64), std::exception);
EXPECT_THROW(cmdContainer.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, 64, 64), std::exception);
}
// Test-only view of HeapHelper that makes its reuse storage accessible.
struct MockHeapHelper : HeapHelper {
    using HeapHelper::storageForReuse;
};
// With one reusable allocation requested through the debug flag, filling must
// populate both the command buffer list and the heap reuse storage.
TEST_F(CommandContainerTest, givenCmdContainerWhenFillReusableAllocationListsThenAllocListsNotEmpty) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SetAmountOfReusableAllocations.set(1);

    auto container = std::make_unique<CommandContainer>();
    AllocationsList reusableCmdBuffers;
    container->initialize(pDevice, &reusableCmdBuffers, true);

    auto &reusableHeaps = reinterpret_cast<MockHeapHelper *>(container->getHeapHelper())->storageForReuse->getAllocationsForReuse();

    // Nothing is pre-filled by initialize() itself.
    EXPECT_TRUE(reusableCmdBuffers.peekIsEmpty());
    EXPECT_TRUE(reusableHeaps.peekIsEmpty());

    container->fillReusableAllocationLists();

    EXPECT_FALSE(reusableCmdBuffers.peekIsEmpty());
    EXPECT_FALSE(reusableHeaps.peekIsEmpty());

    container.reset();
    reusableCmdBuffers.freeAllGraphicsAllocations(pDevice);
}
// With heap sharing enabled before initialization, filling with an amount of
// one must leave exactly one heap in the reuse storage.
TEST_F(CommandContainerTest, givenCmdContainerWhenFillReusableAllocationListsWithSharedHeapsEnabledThenOnlyOneHeapFilled) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SetAmountOfReusableAllocations.set(1);

    auto container = std::make_unique<CommandContainer>();
    AllocationsList reusableCmdBuffers;
    container->enableHeapSharing();
    container->initialize(pDevice, &reusableCmdBuffers, true);

    auto *mockHeapHelper = reinterpret_cast<MockHeapHelper *>(container->getHeapHelper());
    auto &reusableHeaps = mockHeapHelper->storageForReuse->getAllocationsForReuse();
    EXPECT_TRUE(reusableHeaps.peekIsEmpty());

    container->fillReusableAllocationLists();

    EXPECT_FALSE(reusableHeaps.peekIsEmpty());
    EXPECT_EQ(reusableHeaps.peekHead()->countThisAndAllConnected(), 1u);

    container.reset();
    reusableCmdBuffers.freeAllGraphicsAllocations(pDevice);
}
// With bindless mode switched on after initialization, filling with an amount
// of one must leave exactly one heap in the reuse storage.
TEST_F(CommandContainerTest, givenCmdContainerWhenFillReusableAllocationListsWithBindlessModeEnabledThenOnlyOneHeapFilled) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SetAmountOfReusableAllocations.set(1);

    auto container = std::make_unique<CommandContainer>();
    AllocationsList reusableCmdBuffers;
    container->initialize(pDevice, &reusableCmdBuffers, true);

    auto *mockHeapHelper = reinterpret_cast<MockHeapHelper *>(container->getHeapHelper());
    auto &reusableHeaps = mockHeapHelper->storageForReuse->getAllocationsForReuse();
    EXPECT_TRUE(reusableHeaps.peekIsEmpty());

    // The flag is flipped only now, i.e. after the container was initialized.
    DebugManager.flags.UseBindlessMode.set(true);
    container->fillReusableAllocationLists();

    EXPECT_FALSE(reusableHeaps.peekIsEmpty());
    EXPECT_EQ(reusableHeaps.peekHead()->countThisAndAllConnected(), 1u);

    container.reset();
    reusableCmdBuffers.freeAllGraphicsAllocations(pDevice);
}
// A container initialized with `false` as the third argument (no heaps) must
// still get its command buffer list filled.
TEST_F(CommandContainerTest, givenCmdContainerWhenFillReusableAllocationListsWithoutHeapsThenAllocListNotEmpty) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SetAmountOfReusableAllocations.set(1);

    auto container = std::make_unique<CommandContainer>();
    AllocationsList reusableCmdBuffers;
    container->initialize(pDevice, &reusableCmdBuffers, false);
    EXPECT_TRUE(reusableCmdBuffers.peekIsEmpty());

    container->fillReusableAllocationLists();
    EXPECT_FALSE(reusableCmdBuffers.peekIsEmpty());

    container.reset();
    reusableCmdBuffers.freeAllGraphicsAllocations(pDevice);
}
// The debug flag value dictates how many command buffers end up in the list:
// requesting 10 must yield 10 connected list entries.
TEST_F(CommandContainerTest, givenCmdContainerWhenFillReusableAllocationListsWithSpecifiedAmountThenAllocationsCreated) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SetAmountOfReusableAllocations.set(10);

    auto container = std::make_unique<CommandContainer>();
    AllocationsList reusableCmdBuffers;
    container->initialize(pDevice, &reusableCmdBuffers, false);
    EXPECT_TRUE(reusableCmdBuffers.peekIsEmpty());

    container->fillReusableAllocationLists();
    EXPECT_EQ(reusableCmdBuffers.peekHead()->countThisAndAllConnected(), 10u);

    container.reset();
    reusableCmdBuffers.freeAllGraphicsAllocations(pDevice);
}
// After the indirect object heap is fully consumed, requesting more space must
// switch the heap to the pre-filled allocation, and the previously used
// allocation must show up in the heap reuse storage.
TEST_F(CommandContainerTest, givenCmdContainerAndCsrWhenGetHeapWithRequiredSizeAndAlignmentThenReuseAllocationIfAvailable) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SetAmountOfReusableAllocations.set(1);

    auto container = std::make_unique<CommandContainer>();
    auto csr = pDevice->getDefaultEngine().commandStreamReceiver;
    AllocationsList reusableCmdBuffers;
    container->initialize(pDevice, &reusableCmdBuffers, true);
    container->setImmediateCmdListCsr(csr);
    container->fillReusableAllocationLists();

    auto *mockHeapHelper = reinterpret_cast<MockHeapHelper *>(container->getHeapHelper());
    auto &reusableHeaps = mockHeapHelper->storageForReuse->getAllocationsForReuse();
    auto initialAlloc = container->getIndirectHeapAllocation(HeapType::INDIRECT_OBJECT);
    auto prefilledAlloc = reusableHeaps.peekHead();

    // Use up all remaining space so the next request cannot be served in place.
    auto indirectObjectHeap = container->getIndirectHeap(HeapType::INDIRECT_OBJECT);
    indirectObjectHeap->getSpace(indirectObjectHeap->getMaxAvailableSpace());

    auto heap = container->getHeapWithRequiredSizeAndAlignment(HeapType::INDIRECT_OBJECT, 1024, 1024);
    EXPECT_EQ(heap->getGraphicsAllocation(), prefilledAlloc);
    EXPECT_TRUE(reusableHeaps.peekContains(*initialAlloc));

    container.reset();
    reusableCmdBuffers.freeAllGraphicsAllocations(pDevice);
}
// A debug flag value of 0 disables pre-filling: the command buffer list must
// stay empty after the fill call.
TEST_F(CommandContainerTest, givenCmdContainerWhenFillReusableAllocationListsAndFlagDisabledThenAllocListEmpty) {
    DebugManagerStateRestore restorer;
    DebugManager.flags.SetAmountOfReusableAllocations.set(0);

    auto container = std::make_unique<CommandContainer>();
    AllocationsList reusableCmdBuffers;
    container->initialize(pDevice, &reusableCmdBuffers, false);
    EXPECT_TRUE(reusableCmdBuffers.peekIsEmpty());

    container->fillReusableAllocationLists();
    EXPECT_TRUE(reusableCmdBuffers.peekIsEmpty());

    container.reset();
    reusableCmdBuffers.freeAllGraphicsAllocations(pDevice);
}

View File

@@ -1422,3 +1422,14 @@ HWTEST_F(CommandEncodeStatesTest, givenKernelInfoWhenGettingRequiredSshSpaceThen
size = EncodeDispatchKernel<FamilyType>::getSizeRequiredSsh(kernelInfo);
EXPECT_EQ(expectedSize, size);
}
// Encoding a kernel flagged as dispatched from an immediate command list must
// go through getHeapWithRequiredSizeAndAlignment() at least once; the mock
// container counts those calls.
HWTEST_F(CommandEncodeStatesTest, givenCommandContainerWhenIsKernelDispatchedFromImmediateCmdListTrueThenGetHeapWithRequiredSizeAndAlignmentCalled) {
    auto dispatchInterface = std::make_unique<MockDispatchKernelEncoder>();
    uint32_t dims[] = {1, 1, 1};
    bool requiresUncachedMocs = false;

    EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
    dispatchArgs.isKernelDispatchedFromImmediateCmdList = true;

    EncodeDispatchKernel<FamilyType>::encode(*cmdContainer, dispatchArgs, nullptr);

    EXPECT_NE(0u, cmdContainer->getHeapWithRequiredSizeAndAlignmentCalled);
}

View File

@@ -19,6 +19,12 @@ class CommandEncodeStatesFixture : public DeviceFixture {
// Command container used by the encode tests: exposes dirtyHeaps and counts
// how often getHeapWithRequiredSizeAndAlignment() is invoked while still
// forwarding every call to the real implementation.
class MyMockCommandContainer : public CommandContainer {
  public:
    using CommandContainer::dirtyHeaps;

    // Number of getHeapWithRequiredSizeAndAlignment() calls seen so far.
    uint32_t getHeapWithRequiredSizeAndAlignmentCalled = 0u;

    IndirectHeap *getHeapWithRequiredSizeAndAlignment(HeapType heapType, size_t sizeRequired, size_t alignment) override {
        ++getHeapWithRequiredSizeAndAlignmentCalled;
        return CommandContainer::getHeapWithRequiredSizeAndAlignment(heapType, sizeRequired, alignment);
    }
};
void setUp();

View File

@@ -211,4 +211,19 @@ HWTEST_F(HwInfoConfigTest, givenHwHelperWhenFlagSetAndCallCopyThroughLockedPtrEn
DebugManager.flags.ExperimentalCopyThroughLock.set(1);
EXPECT_TRUE(hwHelper.copyThroughLockedPtrEnabled());
}
// On every core except XE_HPC the helper must report 0 allocations to
// pre-fill when the debug flag is left untouched.
HWTEST2_F(HwInfoConfigTest, givenHwHelperWhenCallGetAmountOfAllocationsToFillThenReturnFalse, IsNotXeHpcCore) {
    const auto renderCoreFamily = defaultHwInfo->platform.eRenderCoreFamily;
    HwHelper &hwHelper = HwHelper::get(renderCoreFamily);

    const auto amountToFill = hwHelper.getAmountOfAllocationsToFill();
    EXPECT_EQ(amountToFill, 0u);
}
// The SetAmountOfReusableAllocations debug flag must be returned verbatim by
// the helper for the values 0 and 1.
HWTEST_F(HwInfoConfigTest, givenHwHelperWhenFlagSetAndCallGetAmountOfAllocationsToFillThenReturnCorrectValue) {
    DebugManagerStateRestore restorer;
    const auto renderCoreFamily = defaultHwInfo->platform.eRenderCoreFamily;
    HwHelper &hwHelper = HwHelper::get(renderCoreFamily);

    DebugManager.flags.SetAmountOfReusableAllocations.set(0);
    const auto amountWhenDisabled = hwHelper.getAmountOfAllocationsToFill();
    EXPECT_EQ(amountWhenDisabled, 0u);

    DebugManager.flags.SetAmountOfReusableAllocations.set(1);
    const auto amountWhenSetToOne = hwHelper.getAmountOfAllocationsToFill();
    EXPECT_EQ(amountWhenSetToOne, 1u);
}

View File

@@ -78,3 +78,8 @@ XE_HPC_CORETEST_F(HwHelperTest, givenHwHelperWhenCallCopyThroughLockedPtrEnabled
auto &hwHelper = HwHelperHw<FamilyType>::get();
EXPECT_TRUE(hwHelper.copyThroughLockedPtrEnabled());
}
// On XE_HPC the helper must report 1 allocation to pre-fill when the debug
// flag is left untouched.
XE_HPC_CORETEST_F(HwHelperTest, givenHwHelperWhenCallGetAmountOfAllocationsToFillThenReturnTrue) {
    const auto amountToFill = HwHelperHw<FamilyType>::get().getAmountOfAllocationsToFill();
    EXPECT_EQ(amountToFill, 1u);
}