Add heap sharing to immediate command lists

This change is intended to be used in immediate command lists that are
using flush task functionality.
With this change, all immediate command lists using the same CSR will consume
shared allocations for the DSH and SSH heaps. This decreases the number of SBA
commands dispatched when multiple command lists coexist and dispatch kernels.
With this change new SBA command should be dispatched only when current heap
allocation is exhausted.
The functionality is currently disabled and is available under a debug key.
It will later be enabled by default for all immediate command lists that have
flush task functionality enabled.

Related-To: NEO-7142

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2022-09-26 22:28:10 +00:00
committed by Compute-Runtime-Automation
parent 71bef6094d
commit 3d92186362
35 changed files with 671 additions and 93 deletions

View File

@@ -96,6 +96,9 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat
if (!hardwareInfo.capabilityTable.supportsImages && IndirectHeap::Type::DYNAMIC_STATE == i) {
continue;
}
if (immediateCmdListSharedHeap(static_cast<HeapType>(i))) {
continue;
}
allocationIndirectHeaps[i] = heapHelper->getHeapAllocation(i,
heapSize,
alignedSize,
@@ -185,32 +188,40 @@ void *CommandContainer::getHeapSpaceAllowGrow(HeapType heapType,
size_t size) {
auto indirectHeap = getIndirectHeap(heapType);
if (indirectHeap->getAvailableSpace() < size) {
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
newSize *= 2;
newSize = std::max(newSize, indirectHeap->getAvailableSpace() + size);
newSize = alignUp(newSize, MemoryConstants::pageSize);
auto oldAlloc = getIndirectHeapAllocation(heapType);
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
UNRECOVERABLE_IF(!oldAlloc);
UNRECOVERABLE_IF(!newAlloc);
auto oldBase = indirectHeap->getHeapGpuBase();
indirectHeap->replaceGraphicsAllocation(newAlloc);
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
newAlloc->getUnderlyingBufferSize());
auto newBase = indirectHeap->getHeapGpuBase();
getResidencyContainer().push_back(newAlloc);
getDeallocationContainer().push_back(oldAlloc);
setIndirectHeapAllocation(heapType, newAlloc);
if (oldBase != newBase) {
setHeapDirty(heapType);
if (immediateCmdListSharedHeap(heapType)) {
UNRECOVERABLE_IF(indirectHeap == nullptr);
UNRECOVERABLE_IF(indirectHeap->getAvailableSpace() < size);
getResidencyContainer().push_back(indirectHeap->getGraphicsAllocation());
} else {
if (indirectHeap->getAvailableSpace() < size) {
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
newSize *= 2;
newSize = std::max(newSize, indirectHeap->getAvailableSpace() + size);
newSize = alignUp(newSize, MemoryConstants::pageSize);
auto oldAlloc = getIndirectHeapAllocation(heapType);
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
UNRECOVERABLE_IF(!oldAlloc);
UNRECOVERABLE_IF(!newAlloc);
auto oldBase = indirectHeap->getHeapGpuBase();
indirectHeap->replaceGraphicsAllocation(newAlloc);
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
newAlloc->getUnderlyingBufferSize());
auto newBase = indirectHeap->getHeapGpuBase();
getResidencyContainer().push_back(newAlloc);
getDeallocationContainer().push_back(oldAlloc);
setIndirectHeapAllocation(heapType, newAlloc);
if (oldBase != newBase) {
setHeapDirty(heapType);
}
}
}
return indirectHeap->getSpace(size);
}
// Returns the heap of the given type with at least sizeRequired bytes free at
// the requested alignment, growing the container-owned heap when needed.
// NOTE(review): this span is a diff view — the pre-change grow body and the
// post-change body are both present below, and a hunk header hides the
// alignment condition; annotated in place, not reconstructed.
IndirectHeap *CommandContainer::getHeapWithRequiredSizeAndAlignment(HeapType heapType, size_t sizeRequired, size_t alignment) {
auto indirectHeap = getIndirectHeap(heapType);
UNRECOVERABLE_IF(indirectHeap == nullptr);
auto sizeRequested = sizeRequired;
auto heapBuffer = indirectHeap->getSpace(0);
@@ -218,27 +229,32 @@ IndirectHeap *CommandContainer::getHeapWithRequiredSizeAndAlignment(HeapType hea
// Pad the request so the returned pointer can be aligned. The guarding `if`
// (checking whether heapBuffer is misaligned) is hidden by the hunk header
// above — TODO confirm against the full source.
sizeRequested += alignment;
}
// --- PRE-change body (removed by this commit) begins here ---
if (indirectHeap->getAvailableSpace() < sizeRequested) {
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
newSize = alignUp(newSize, MemoryConstants::pageSize);
auto oldAlloc = getIndirectHeapAllocation(heapType);
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
UNRECOVERABLE_IF(!oldAlloc);
UNRECOVERABLE_IF(!newAlloc);
auto oldBase = indirectHeap->getHeapGpuBase();
indirectHeap->replaceGraphicsAllocation(newAlloc);
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
newAlloc->getUnderlyingBufferSize());
auto newBase = indirectHeap->getHeapGpuBase();
getResidencyContainer().push_back(newAlloc);
getDeallocationContainer().push_back(oldAlloc);
setIndirectHeapAllocation(heapType, newAlloc);
if (oldBase != newBase) {
// Heap GPU base changed: SBA must be re-programmed.
setHeapDirty(heapType);
}
if (heapType == HeapType::SURFACE_STATE) {
indirectHeap->getSpace(reservedSshSize);
sshAllocations.push_back(oldAlloc);
// --- POST-change body (added by this commit) begins here ---
if (immediateCmdListSharedHeap(heapType)) {
// Shared CSR heap is pre-sized; exhaustion is a logic error.
UNRECOVERABLE_IF(indirectHeap->getAvailableSpace() < sizeRequested);
getResidencyContainer().push_back(indirectHeap->getGraphicsAllocation());
} else {
if (indirectHeap->getAvailableSpace() < sizeRequested) {
// Grow: keep current total size, rounded up to a page boundary.
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
newSize = alignUp(newSize, MemoryConstants::pageSize);
auto oldAlloc = getIndirectHeapAllocation(heapType);
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
UNRECOVERABLE_IF(!oldAlloc);
UNRECOVERABLE_IF(!newAlloc);
auto oldBase = indirectHeap->getHeapGpuBase();
indirectHeap->replaceGraphicsAllocation(newAlloc);
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
newAlloc->getUnderlyingBufferSize());
auto newBase = indirectHeap->getHeapGpuBase();
getResidencyContainer().push_back(newAlloc);
// Old allocation is parked for deferred release.
getDeallocationContainer().push_back(oldAlloc);
setIndirectHeapAllocation(heapType, newAlloc);
if (oldBase != newBase) {
setHeapDirty(heapType);
}
if (heapType == HeapType::SURFACE_STATE) {
// Re-reserve the SSH prefix and keep the old SSH alive for
// previously dispatched kernels.
indirectHeap->getSpace(reservedSshSize);
sshAllocations.push_back(oldAlloc);
}
}
}
@@ -329,7 +345,19 @@ void CommandContainer::prepareBindfulSsh() {
}
IndirectHeap *CommandContainer::getIndirectHeap(HeapType heapType) {
    // For immediate command lists that share heaps with the CSR, hand out the
    // CSR-owned shared SSH/DSH; otherwise use this container's own heap.
    // Fix: the diff view left the stale pre-change `return` before the new
    // body, which made all of the added logic unreachable — removed.
    if (immediateCmdListSharedHeap(heapType)) {
        return heapType == HeapType::SURFACE_STATE ? sharedSshCsrHeap : sharedDshCsrHeap;
    } else {
        return indirectHeaps[heapType].get();
    }
}
void CommandContainer::ensureHeapSizePrepared(size_t sshRequiredSize, size_t dshRequiredSize) {
    // Refresh the shared heap pointers from the CSR so that at least the
    // requested number of bytes is available before encoding starts.
    auto *csr = immediateCmdListCsr;
    sharedSshCsrHeap = &csr->getIndirectHeap(HeapType::SURFACE_STATE, sshRequiredSize);
    if (dshRequiredSize > 0u) {
        // DSH is requested only when the kernel actually needs dynamic state.
        sharedDshCsrHeap = &csr->getIndirectHeap(HeapType::DYNAMIC_STATE, dshRequiredSize);
    }
}
} // namespace NEO

View File

@@ -17,6 +17,7 @@
#include <vector>
namespace NEO {
class CommandStreamReceiver;
class Device;
class GraphicsAllocation;
class LinearStream;
@@ -94,13 +95,20 @@ class CommandContainer : public NonCopyableOrMovableClass {
void setIddBlock(void *iddBlock) { this->iddBlock = iddBlock; }
void *getIddBlock() { return iddBlock; }
uint32_t getNumIddPerBlock() const { return numIddsPerBlock; }
void setNumIddPerBlock(uint32_t value) { numIddsPerBlock = value; }
void setReservedSshSize(size_t reserveSize) {
reservedSshSize = reserveSize;
}
bool getFlushTaskUsedForImmediate() const { return isFlushTaskUsedForImmediate; }
void setFlushTaskUsedForImmediate(bool flushTaskUsedForImmediate) { isFlushTaskUsedForImmediate = flushTaskUsedForImmediate; }
// Attaches the CSR whose SSH/DSH heaps this container may share; a non-null
// value enables the shared-heap path (see immediateCmdListSharedHeap).
void setImmediateCmdListCsr(CommandStreamReceiver *newValue) {
this->immediateCmdListCsr = newValue;
}
// True when this container should use the CSR-shared heap for the given type.
// Sharing requires an attached CSR and applies only to DSH and SSH.
bool immediateCmdListSharedHeap(HeapType heapType) {
    if (this->immediateCmdListCsr == nullptr) {
        return false;
    }
    return (heapType == HeapType::DYNAMIC_STATE) || (heapType == HeapType::SURFACE_STATE);
}
void ensureHeapSizePrepared(size_t sshRequiredSize, size_t dshRequiredSize);
HeapContainer sshAllocations;
uint64_t currentLinearStreamStartOffset = 0u;
uint32_t slmSize = std::numeric_limits<uint32_t>::max();
@@ -129,6 +137,9 @@ class CommandContainer : public NonCopyableOrMovableClass {
Device *device = nullptr;
AllocationsList *reusableAllocationList = nullptr;
size_t reservedSshSize = 0;
CommandStreamReceiver *immediateCmdListCsr = nullptr;
IndirectHeap *sharedSshCsrHeap = nullptr;
IndirectHeap *sharedDshCsrHeap = nullptr;
uint32_t dirtyHeaps = std::numeric_limits<uint32_t>::max();
uint32_t numIddsPerBlock = 64;

View File

@@ -30,6 +30,7 @@ class IndirectHeap;
class LogicalStateHelper;
class Gmm;
struct HardwareInfo;
struct KernelInfo;
struct StateComputeModeProperties;
struct EncodeDispatchKernelArgs {
@@ -112,6 +113,10 @@ struct EncodeDispatchKernel {
static void adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const HardwareInfo &hwInfo);
static constexpr bool shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool updateCurrent);
static size_t getSizeRequiredDsh(const KernelInfo &kernelInfo);
static size_t getSizeRequiredSsh(const KernelInfo &kernelInfo);
inline static uint32_t additionalSizeRequiredDsh();
};
template <typename GfxFamily>
@@ -121,8 +126,8 @@ struct EncodeStates {
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
using SAMPLER_BORDER_COLOR_STATE = typename GfxFamily::SAMPLER_BORDER_COLOR_STATE;
static const uint32_t alignIndirectStatePointer = MemoryConstants::cacheLineSize;
static const size_t alignInterfaceDescriptorData = MemoryConstants::cacheLineSize;
static constexpr uint32_t alignIndirectStatePointer = MemoryConstants::cacheLineSize;
static constexpr size_t alignInterfaceDescriptorData = MemoryConstants::cacheLineSize;
static uint32_t copySamplerState(IndirectHeap *dsh,
uint32_t samplerStateOffset,

View File

@@ -27,6 +27,7 @@
#include "shared/source/kernel/implicit_args.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/source/program/kernel_info.h"
#include "encode_surface_state.inl"
@@ -698,6 +699,39 @@ void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCR
template <typename Family>
// Default implementation: global atomics never require an update on this
// family; all arguments are intentionally ignored.
constexpr bool EncodeDispatchKernel<Family>::shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool updateCurrent) { return false; }
template <typename Family>
// Computes the DSH bytes a kernel needs: border color + sampler states (when
// samplers are present) plus any family-specific extra (additionalSizeRequiredDsh),
// with each section aligned to its required boundary.
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelInfo &kernelInfo) {
    using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;

    const auto &samplerTable = kernelInfo.kernelDescriptor.payloadMappings.samplerTable;
    const auto extraDshSize = additionalSizeRequiredDsh();

    if (samplerTable.numSamplers == 0U) {
        // No samplers: only the family-specific extra space is needed.
        return alignUp(extraDshSize, EncodeStates<Family>::alignInterfaceDescriptorData);
    }

    // Border color region spans from its offset up to the sampler table.
    size_t dshSize = samplerTable.tableOffset - samplerTable.borderColor;
    dshSize = alignUp(dshSize, EncodeStates<Family>::alignIndirectStatePointer);
    dshSize += samplerTable.numSamplers * sizeof(typename Family::SAMPLER_STATE);
    dshSize = alignUp(dshSize, INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);

    if (extraDshSize > 0) {
        dshSize += extraDshSize;
        dshSize = alignUp(dshSize, EncodeStates<Family>::alignInterfaceDescriptorData);
    }
    return dshSize;
}
template <typename Family>
// Returns the SSH bytes a kernel needs: its surface state heap size, aligned
// to the binding table's surface-state-pointer boundary.
size_t EncodeDispatchKernel<Family>::getSizeRequiredSsh(const KernelInfo &kernelInfo) {
    using BINDING_TABLE_STATE = typename Family::BINDING_TABLE_STATE;
    const size_t sshHeapSize = kernelInfo.heapInfo.SurfaceStateHeapSize;
    return alignUp(sshHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
}
template <typename Family>
void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws) {
for (int i = 0; i < 3; ++i) {

View File

@@ -104,13 +104,13 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, args.preemptionMode);
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
UNRECOVERABLE_IF(!heap);
uint32_t samplerStateOffset = 0;
uint32_t samplerCount = 0;
if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
UNRECOVERABLE_IF(!heap);
samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
samplerStateOffset = EncodeStates<Family>::copySamplerState(heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
kernelDescriptor.payloadMappings.samplerTable.numSamplers,
@@ -539,4 +539,9 @@ void EncodeDispatchKernel<Family>::setupPostSyncMocs(WALKER_TYPE &walkerCmd, con
template <typename Family>
// Intentional no-op: this family leaves the walker's work-group order unchanged.
void EncodeDispatchKernel<Family>::adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const HardwareInfo &hwInfo) {}
template <typename Family>
// Extra DSH bytes this family needs on top of sampler state: room for one
// interface descriptor (consumed by getSizeRequiredDsh).
uint32_t EncodeDispatchKernel<Family>::additionalSizeRequiredDsh() {
    using InterfaceDescriptorType = typename Family::INTERFACE_DESCRIPTOR_DATA;
    return static_cast<uint32_t>(sizeof(InterfaceDescriptorType));
}
} // namespace NEO

View File

@@ -127,13 +127,13 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, args.preemptionMode);
if constexpr (Family::supportsSampler) {
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
UNRECOVERABLE_IF(!heap);
uint32_t samplerStateOffset = 0;
uint32_t samplerCount = 0;
if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
UNRECOVERABLE_IF(!heap);
samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
samplerStateOffset = EncodeStates<Family>::copySamplerState(
heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
@@ -768,4 +768,9 @@ inline void EncodeStoreMMIO<Family>::appendFlags(MI_STORE_REGISTER_MEM *storeReg
template <typename Family>
// Intentional no-op: this family leaves the walker's work-group order unchanged.
void EncodeDispatchKernel<Family>::adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const HardwareInfo &hwInfo) {}
template <typename Family>
// This family requires no extra DSH space beyond what getSizeRequiredDsh
// already accounts for.
uint32_t EncodeDispatchKernel<Family>::additionalSizeRequiredDsh() {
    constexpr uint32_t extraDshBytes = 0u;
    return extraDshBytes;
}
} // namespace NEO