mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-21 09:14:47 +08:00
Add heap sharing to immediate command lists
This change is intended to be used in immediate command lists that use the flush task functionality. With this change, all immediate command lists using the same CSR will consume shared allocations for the DSH and SSH heaps. This decreases the number of SBA commands dispatched when multiple command lists coexist and dispatch kernels. With this change, a new SBA command should be dispatched only when the current heap allocation is exhausted. The functionality is currently disabled and available under a debug key. It will be enabled by default for all immediate command lists with flush task functionality enabled. Related-To: NEO-7142 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
71bef6094d
commit
3d92186362
@@ -96,6 +96,9 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat
|
||||
if (!hardwareInfo.capabilityTable.supportsImages && IndirectHeap::Type::DYNAMIC_STATE == i) {
|
||||
continue;
|
||||
}
|
||||
if (immediateCmdListSharedHeap(static_cast<HeapType>(i))) {
|
||||
continue;
|
||||
}
|
||||
allocationIndirectHeaps[i] = heapHelper->getHeapAllocation(i,
|
||||
heapSize,
|
||||
alignedSize,
|
||||
@@ -185,32 +188,40 @@ void *CommandContainer::getHeapSpaceAllowGrow(HeapType heapType,
|
||||
size_t size) {
|
||||
auto indirectHeap = getIndirectHeap(heapType);
|
||||
|
||||
if (indirectHeap->getAvailableSpace() < size) {
|
||||
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
|
||||
newSize *= 2;
|
||||
newSize = std::max(newSize, indirectHeap->getAvailableSpace() + size);
|
||||
newSize = alignUp(newSize, MemoryConstants::pageSize);
|
||||
auto oldAlloc = getIndirectHeapAllocation(heapType);
|
||||
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
|
||||
UNRECOVERABLE_IF(!oldAlloc);
|
||||
UNRECOVERABLE_IF(!newAlloc);
|
||||
auto oldBase = indirectHeap->getHeapGpuBase();
|
||||
indirectHeap->replaceGraphicsAllocation(newAlloc);
|
||||
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
|
||||
newAlloc->getUnderlyingBufferSize());
|
||||
auto newBase = indirectHeap->getHeapGpuBase();
|
||||
getResidencyContainer().push_back(newAlloc);
|
||||
getDeallocationContainer().push_back(oldAlloc);
|
||||
setIndirectHeapAllocation(heapType, newAlloc);
|
||||
if (oldBase != newBase) {
|
||||
setHeapDirty(heapType);
|
||||
if (immediateCmdListSharedHeap(heapType)) {
|
||||
UNRECOVERABLE_IF(indirectHeap == nullptr);
|
||||
UNRECOVERABLE_IF(indirectHeap->getAvailableSpace() < size);
|
||||
getResidencyContainer().push_back(indirectHeap->getGraphicsAllocation());
|
||||
} else {
|
||||
if (indirectHeap->getAvailableSpace() < size) {
|
||||
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
|
||||
newSize *= 2;
|
||||
newSize = std::max(newSize, indirectHeap->getAvailableSpace() + size);
|
||||
newSize = alignUp(newSize, MemoryConstants::pageSize);
|
||||
auto oldAlloc = getIndirectHeapAllocation(heapType);
|
||||
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
|
||||
UNRECOVERABLE_IF(!oldAlloc);
|
||||
UNRECOVERABLE_IF(!newAlloc);
|
||||
auto oldBase = indirectHeap->getHeapGpuBase();
|
||||
indirectHeap->replaceGraphicsAllocation(newAlloc);
|
||||
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
|
||||
newAlloc->getUnderlyingBufferSize());
|
||||
auto newBase = indirectHeap->getHeapGpuBase();
|
||||
getResidencyContainer().push_back(newAlloc);
|
||||
getDeallocationContainer().push_back(oldAlloc);
|
||||
setIndirectHeapAllocation(heapType, newAlloc);
|
||||
if (oldBase != newBase) {
|
||||
setHeapDirty(heapType);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return indirectHeap->getSpace(size);
|
||||
}
|
||||
|
||||
IndirectHeap *CommandContainer::getHeapWithRequiredSizeAndAlignment(HeapType heapType, size_t sizeRequired, size_t alignment) {
|
||||
auto indirectHeap = getIndirectHeap(heapType);
|
||||
UNRECOVERABLE_IF(indirectHeap == nullptr);
|
||||
auto sizeRequested = sizeRequired;
|
||||
|
||||
auto heapBuffer = indirectHeap->getSpace(0);
|
||||
@@ -218,27 +229,32 @@ IndirectHeap *CommandContainer::getHeapWithRequiredSizeAndAlignment(HeapType hea
|
||||
sizeRequested += alignment;
|
||||
}
|
||||
|
||||
if (indirectHeap->getAvailableSpace() < sizeRequested) {
|
||||
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
|
||||
newSize = alignUp(newSize, MemoryConstants::pageSize);
|
||||
auto oldAlloc = getIndirectHeapAllocation(heapType);
|
||||
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
|
||||
UNRECOVERABLE_IF(!oldAlloc);
|
||||
UNRECOVERABLE_IF(!newAlloc);
|
||||
auto oldBase = indirectHeap->getHeapGpuBase();
|
||||
indirectHeap->replaceGraphicsAllocation(newAlloc);
|
||||
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
|
||||
newAlloc->getUnderlyingBufferSize());
|
||||
auto newBase = indirectHeap->getHeapGpuBase();
|
||||
getResidencyContainer().push_back(newAlloc);
|
||||
getDeallocationContainer().push_back(oldAlloc);
|
||||
setIndirectHeapAllocation(heapType, newAlloc);
|
||||
if (oldBase != newBase) {
|
||||
setHeapDirty(heapType);
|
||||
}
|
||||
if (heapType == HeapType::SURFACE_STATE) {
|
||||
indirectHeap->getSpace(reservedSshSize);
|
||||
sshAllocations.push_back(oldAlloc);
|
||||
if (immediateCmdListSharedHeap(heapType)) {
|
||||
UNRECOVERABLE_IF(indirectHeap->getAvailableSpace() < sizeRequested);
|
||||
getResidencyContainer().push_back(indirectHeap->getGraphicsAllocation());
|
||||
} else {
|
||||
if (indirectHeap->getAvailableSpace() < sizeRequested) {
|
||||
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
|
||||
newSize = alignUp(newSize, MemoryConstants::pageSize);
|
||||
auto oldAlloc = getIndirectHeapAllocation(heapType);
|
||||
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
|
||||
UNRECOVERABLE_IF(!oldAlloc);
|
||||
UNRECOVERABLE_IF(!newAlloc);
|
||||
auto oldBase = indirectHeap->getHeapGpuBase();
|
||||
indirectHeap->replaceGraphicsAllocation(newAlloc);
|
||||
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
|
||||
newAlloc->getUnderlyingBufferSize());
|
||||
auto newBase = indirectHeap->getHeapGpuBase();
|
||||
getResidencyContainer().push_back(newAlloc);
|
||||
getDeallocationContainer().push_back(oldAlloc);
|
||||
setIndirectHeapAllocation(heapType, newAlloc);
|
||||
if (oldBase != newBase) {
|
||||
setHeapDirty(heapType);
|
||||
}
|
||||
if (heapType == HeapType::SURFACE_STATE) {
|
||||
indirectHeap->getSpace(reservedSshSize);
|
||||
sshAllocations.push_back(oldAlloc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -329,7 +345,19 @@ void CommandContainer::prepareBindfulSsh() {
|
||||
}
|
||||
|
||||
IndirectHeap *CommandContainer::getIndirectHeap(HeapType heapType) {
    // Immediate command lists sharing CSR heaps (DSH/SSH) must use the
    // CSR-owned heap instead of the container-owned one; see
    // immediateCmdListSharedHeap() for the gating condition.
    // NOTE: a leftover unconditional `return indirectHeaps[heapType].get();`
    // preceding this check made the shared-heap branch unreachable — removed.
    if (immediateCmdListSharedHeap(heapType)) {
        return heapType == HeapType::SURFACE_STATE ? sharedSshCsrHeap : sharedDshCsrHeap;
    }
    // Default path: the heap allocated and owned by this command container.
    return indirectHeaps[heapType].get();
}
|
||||
|
||||
void CommandContainer::ensureHeapSizePrepared(size_t sshRequiredSize, size_t dshRequiredSize) {
    // Refresh the shared heap pointers from the CSR, letting the CSR grow the
    // heaps when the requested sizes do not fit the current allocations.
    auto csr = this->immediateCmdListCsr;
    sharedSshCsrHeap = &csr->getIndirectHeap(HeapType::SURFACE_STATE, sshRequiredSize);
    if (dshRequiredSize > 0u) {
        // DSH is requested only when the dispatch actually consumes dynamic state.
        sharedDshCsrHeap = &csr->getIndirectHeap(HeapType::DYNAMIC_STATE, dshRequiredSize);
    }
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include <vector>
|
||||
|
||||
namespace NEO {
|
||||
class CommandStreamReceiver;
|
||||
class Device;
|
||||
class GraphicsAllocation;
|
||||
class LinearStream;
|
||||
@@ -94,13 +95,20 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
||||
void setIddBlock(void *iddBlock) { this->iddBlock = iddBlock; }
|
||||
void *getIddBlock() { return iddBlock; }
|
||||
uint32_t getNumIddPerBlock() const { return numIddsPerBlock; }
|
||||
void setNumIddPerBlock(uint32_t value) { numIddsPerBlock = value; }
|
||||
void setReservedSshSize(size_t reserveSize) {
|
||||
reservedSshSize = reserveSize;
|
||||
}
|
||||
|
||||
bool getFlushTaskUsedForImmediate() const { return isFlushTaskUsedForImmediate; }
|
||||
void setFlushTaskUsedForImmediate(bool flushTaskUsedForImmediate) { isFlushTaskUsedForImmediate = flushTaskUsedForImmediate; }
|
||||
|
||||
void setImmediateCmdListCsr(CommandStreamReceiver *newValue) {
    // CSR providing the shared DSH/SSH heaps; nullptr disables heap sharing.
    immediateCmdListCsr = newValue;
}
|
||||
bool immediateCmdListSharedHeap(HeapType heapType) {
    // Heap sharing applies only to DSH and SSH, and only when an immediate
    // command list CSR has been attached to this container.
    const bool sharableHeapType = (heapType == HeapType::DYNAMIC_STATE) || (heapType == HeapType::SURFACE_STATE);
    return sharableHeapType && (this->immediateCmdListCsr != nullptr);
}
|
||||
void ensureHeapSizePrepared(size_t sshRequiredSize, size_t dshRequiredSize);
|
||||
HeapContainer sshAllocations;
|
||||
uint64_t currentLinearStreamStartOffset = 0u;
|
||||
uint32_t slmSize = std::numeric_limits<uint32_t>::max();
|
||||
@@ -129,6 +137,9 @@ class CommandContainer : public NonCopyableOrMovableClass {
|
||||
Device *device = nullptr;
|
||||
AllocationsList *reusableAllocationList = nullptr;
|
||||
size_t reservedSshSize = 0;
|
||||
CommandStreamReceiver *immediateCmdListCsr = nullptr;
|
||||
IndirectHeap *sharedSshCsrHeap = nullptr;
|
||||
IndirectHeap *sharedDshCsrHeap = nullptr;
|
||||
|
||||
uint32_t dirtyHeaps = std::numeric_limits<uint32_t>::max();
|
||||
uint32_t numIddsPerBlock = 64;
|
||||
|
||||
@@ -30,6 +30,7 @@ class IndirectHeap;
|
||||
class LogicalStateHelper;
|
||||
class Gmm;
|
||||
struct HardwareInfo;
|
||||
struct KernelInfo;
|
||||
struct StateComputeModeProperties;
|
||||
|
||||
struct EncodeDispatchKernelArgs {
|
||||
@@ -112,6 +113,10 @@ struct EncodeDispatchKernel {
|
||||
static void adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const HardwareInfo &hwInfo);
|
||||
|
||||
static constexpr bool shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool updateCurrent);
|
||||
|
||||
static size_t getSizeRequiredDsh(const KernelInfo &kernelInfo);
|
||||
static size_t getSizeRequiredSsh(const KernelInfo &kernelInfo);
|
||||
inline static uint32_t additionalSizeRequiredDsh();
|
||||
};
|
||||
|
||||
template <typename GfxFamily>
|
||||
@@ -121,8 +126,8 @@ struct EncodeStates {
|
||||
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
|
||||
using SAMPLER_BORDER_COLOR_STATE = typename GfxFamily::SAMPLER_BORDER_COLOR_STATE;
|
||||
|
||||
static const uint32_t alignIndirectStatePointer = MemoryConstants::cacheLineSize;
|
||||
static const size_t alignInterfaceDescriptorData = MemoryConstants::cacheLineSize;
|
||||
static constexpr uint32_t alignIndirectStatePointer = MemoryConstants::cacheLineSize;
|
||||
static constexpr size_t alignInterfaceDescriptorData = MemoryConstants::cacheLineSize;
|
||||
|
||||
static uint32_t copySamplerState(IndirectHeap *dsh,
|
||||
uint32_t samplerStateOffset,
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "shared/source/kernel/implicit_args.h"
|
||||
#include "shared/source/kernel/kernel_descriptor.h"
|
||||
#include "shared/source/os_interface/hw_info_config.h"
|
||||
#include "shared/source/program/kernel_info.h"
|
||||
|
||||
#include "encode_surface_state.inl"
|
||||
|
||||
@@ -698,6 +699,39 @@ void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCR
|
||||
template <typename Family>
|
||||
constexpr bool EncodeDispatchKernel<Family>::shouldUpdateGlobalAtomics(bool ¤tVal, bool refVal, bool updateCurrent) { return false; }
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelInfo &kernelInfo) {
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;
|
||||
constexpr auto samplerStateSize = sizeof(typename Family::SAMPLER_STATE);
|
||||
const auto numSamplers = kernelInfo.kernelDescriptor.payloadMappings.samplerTable.numSamplers;
|
||||
const auto additionalDshSize = additionalSizeRequiredDsh();
|
||||
if (numSamplers == 0U) {
|
||||
return alignUp(additionalDshSize, EncodeStates<Family>::alignInterfaceDescriptorData);
|
||||
}
|
||||
|
||||
size_t size = kernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset -
|
||||
kernelInfo.kernelDescriptor.payloadMappings.samplerTable.borderColor;
|
||||
size = alignUp(size, EncodeStates<Family>::alignIndirectStatePointer);
|
||||
|
||||
size += numSamplers * samplerStateSize;
|
||||
size = alignUp(size, INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
|
||||
|
||||
if (additionalDshSize > 0) {
|
||||
size += additionalDshSize;
|
||||
size = alignUp(size, EncodeStates<Family>::alignInterfaceDescriptorData);
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::getSizeRequiredSsh(const KernelInfo &kernelInfo) {
|
||||
using BINDING_TABLE_STATE = typename Family::BINDING_TABLE_STATE;
|
||||
size_t requiredSshSize = kernelInfo.heapInfo.SurfaceStateHeapSize;
|
||||
requiredSshSize = alignUp(requiredSshSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
|
||||
return requiredSshSize;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws) {
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
|
||||
@@ -104,13 +104,13 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
|
||||
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, args.preemptionMode);
|
||||
|
||||
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
|
||||
UNRECOVERABLE_IF(!heap);
|
||||
|
||||
uint32_t samplerStateOffset = 0;
|
||||
uint32_t samplerCount = 0;
|
||||
|
||||
if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
|
||||
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
|
||||
UNRECOVERABLE_IF(!heap);
|
||||
|
||||
samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
|
||||
samplerStateOffset = EncodeStates<Family>::copySamplerState(heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
|
||||
kernelDescriptor.payloadMappings.samplerTable.numSamplers,
|
||||
@@ -539,4 +539,9 @@ void EncodeDispatchKernel<Family>::setupPostSyncMocs(WALKER_TYPE &walkerCmd, con
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const HardwareInfo &hwInfo) {}
|
||||
|
||||
template <typename Family>
|
||||
uint32_t EncodeDispatchKernel<Family>::additionalSizeRequiredDsh() {
|
||||
return sizeof(typename Family::INTERFACE_DESCRIPTOR_DATA);
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -127,13 +127,13 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, args.preemptionMode);
|
||||
|
||||
if constexpr (Family::supportsSampler) {
|
||||
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
|
||||
UNRECOVERABLE_IF(!heap);
|
||||
|
||||
uint32_t samplerStateOffset = 0;
|
||||
uint32_t samplerCount = 0;
|
||||
|
||||
if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
|
||||
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
|
||||
UNRECOVERABLE_IF(!heap);
|
||||
|
||||
samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
|
||||
samplerStateOffset = EncodeStates<Family>::copySamplerState(
|
||||
heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
|
||||
@@ -768,4 +768,9 @@ inline void EncodeStoreMMIO<Family>::appendFlags(MI_STORE_REGISTER_MEM *storeReg
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const HardwareInfo &hwInfo) {}
|
||||
|
||||
template <typename Family>
|
||||
uint32_t EncodeDispatchKernel<Family>::additionalSizeRequiredDsh() {
|
||||
return 0u;
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
Reference in New Issue
Block a user