Encode dispatch kernel with global bindless heaps

Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
Maciej Plewka
2020-11-26 09:04:26 +00:00
committed by Compute-Runtime-Automation
parent be90b9ff93
commit 7a5c9d39b5
14 changed files with 478 additions and 149 deletions

View File

@@ -21,6 +21,7 @@ namespace NEO {
class GmmHelper;
struct HardwareInfo;
class IndirectHeap;
class BindlessHeapsHelper;
template <typename GfxFamily>
struct EncodeDispatchKernel {
@@ -46,8 +47,6 @@ struct EncodeDispatchKernel {
static size_t estimateEncodeDispatchKernelCmdsSize(Device *device);
static void patchBindlessSurfaceStateOffsets(const size_t sshOffset, const KernelDescriptor &kernelDesc, uint8_t *crossThread);
static bool isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
size_t *lws,
std::array<uint8_t, 3> walkOrder,
@@ -84,6 +83,7 @@ struct EncodeStates {
using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
using SAMPLER_BORDER_COLOR_STATE = typename GfxFamily::SAMPLER_BORDER_COLOR_STATE;
static const uint32_t alignIndirectStatePointer = MemoryConstants::cacheLineSize;
static const size_t alignInterfaceDescriptorData = MemoryConstants::cacheLineSize;
@@ -92,7 +92,8 @@ struct EncodeStates {
uint32_t samplerStateOffset,
uint32_t samplerCount,
uint32_t borderColorOffset,
const void *fnDynamicStateHeap);
const void *fnDynamicStateHeap,
BindlessHeapsHelper *bindlessHeapHelper);
static void adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency);

View File

@@ -12,6 +12,8 @@
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/gmm_helper/gmm.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/bindless_heaps_helper.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/preamble.h"
@@ -30,22 +32,44 @@ uint32_t EncodeStates<Family>::copySamplerState(IndirectHeap *dsh,
uint32_t samplerStateOffset,
uint32_t samplerCount,
uint32_t borderColorOffset,
const void *fnDynamicStateHeap) {
const void *fnDynamicStateHeap,
BindlessHeapsHelper *bindlessHeapHelper) {
auto sizeSamplerState = sizeof(SAMPLER_STATE) * samplerCount;
auto borderColorSize = samplerStateOffset - borderColorOffset;
SAMPLER_STATE *dstSamplerState = nullptr;
uint32_t samplerStateOffsetInDsh = 0;
dsh->align(EncodeStates<Family>::alignIndirectStatePointer);
auto borderColorOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());
uint32_t borderColorOffsetInDsh = 0;
if (!ApiSpecificConfig::getBindlessConfiguration()) {
borderColorOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());
auto borderColor = dsh->getSpace(borderColorSize);
auto borderColor = dsh->getSpace(borderColorSize);
memcpy_s(borderColor, borderColorSize, ptrOffset(fnDynamicStateHeap, borderColorOffset),
borderColorSize);
memcpy_s(borderColor, borderColorSize, ptrOffset(fnDynamicStateHeap, borderColorOffset),
borderColorSize);
dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
samplerStateOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());
dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
auto samplerStateOffsetInDsh = static_cast<uint32_t>(dsh->getUsed());
auto dstSamplerState = reinterpret_cast<SAMPLER_STATE *>(dsh->getSpace(sizeSamplerState));
dstSamplerState = reinterpret_cast<SAMPLER_STATE *>(dsh->getSpace(sizeSamplerState));
} else {
auto borderColor = reinterpret_cast<const SAMPLER_BORDER_COLOR_STATE *>(ptrOffset(fnDynamicStateHeap, borderColorOffset));
if (borderColor->getBorderColorRed() != 0.0f ||
borderColor->getBorderColorGreen() != 0.0f ||
borderColor->getBorderColorBlue() != 0.0f ||
(borderColor->getBorderColorAlpha() != 0.0f && borderColor->getBorderColorAlpha() != 1.0f)) {
UNRECOVERABLE_IF(true);
} else if (borderColor->getBorderColorAlpha() == 0.0f) {
borderColorOffsetInDsh = bindlessHeapHelper->getDefaultBorderColorOffset();
} else {
borderColorOffsetInDsh = bindlessHeapHelper->getAlphaBorderColorOffset();
}
dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
auto samplerStateInDsh = bindlessHeapHelper->allocateSSInHeap(sizeSamplerState, nullptr, BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH);
dstSamplerState = reinterpret_cast<SAMPLER_STATE *>(samplerStateInDsh.ssPtr);
samplerStateOffsetInDsh = static_cast<uint32_t>(samplerStateInDsh.surfaceStateOffset);
}
auto srcSamplerState = reinterpret_cast<const SAMPLER_STATE *>(ptrOffset(fnDynamicStateHeap, samplerStateOffset));
SAMPLER_STATE state = {};
@@ -56,7 +80,7 @@ uint32_t EncodeStates<Family>::copySamplerState(IndirectHeap *dsh,
}
return samplerStateOffsetInDsh;
}
} // namespace NEO
template <typename Family>
size_t EncodeStates<Family>::getAdjustStateComputeModeSize() {
@@ -382,40 +406,6 @@ void *EncodeDispatchKernel<Family>::getInterfaceDescriptor(CommandContainer &con
return &interfaceDescriptorData[container.nextIddInBlock++];
}
// Patches bindless surface-state descriptors into a kernel's cross-thread data.
//
// Walks every explicit kernel argument. For pointer and image arguments that
// carry a valid bindless cross-thread offset, the value returned by
// getBindlessSurfaceExtendedMessageDescriptorValue(sshOffset + bindful) is
// written at crossThread + bindless. Arguments of any other type, or with an
// invalid bindless offset, are left untouched.
//
// sshOffset   - offset added to each argument's bindful offset (narrowed to 32 bits).
// kernelDesc  - descriptor whose payloadMappings.explicitArgs are inspected.
// crossThread - base of the cross-thread data buffer that receives the patches.
template <typename Family>
void EncodeDispatchKernel<Family>::patchBindlessSurfaceStateOffsets(const size_t sshOffset, const KernelDescriptor &kernelDesc, uint8_t *crossThread) {
    auto &helper = HwHelperHw<Family>::get();

    for (const auto &explicitArg : kernelDesc.payloadMappings.explicitArgs) {
        // Both slots stay "undefined" unless the argument is a pointer or an image.
        CrossThreadDataOffset crossThreadSlot = undefined<CrossThreadDataOffset>;
        SurfaceStateHeapOffset surfaceStateSlot = undefined<SurfaceStateHeapOffset>;

        if (explicitArg.type == ArgDescriptor::ArgTPointer) {
            auto &pointerArg = explicitArg.as<NEO::ArgDescPointer>();
            crossThreadSlot = pointerArg.bindless;
            surfaceStateSlot = pointerArg.bindful;
        } else if (explicitArg.type == ArgDescriptor::ArgTImage) {
            auto &imageArg = explicitArg.as<NEO::ArgDescImage>();
            crossThreadSlot = imageArg.bindless;
            surfaceStateSlot = imageArg.bindful;
        }

        // Nothing to patch when the argument has no bindless slot.
        if (!NEO::isValidOffset(crossThreadSlot)) {
            continue;
        }

        auto destination = ptrOffset(crossThread, crossThreadSlot);
        auto surfaceStateOffset = static_cast<uint32_t>(sshOffset) + surfaceStateSlot;
        auto descriptor = helper.getBindlessSurfaceExtendedMessageDescriptorValue(surfaceStateOffset);
        patchWithRequiredSize(destination, sizeof(descriptor), descriptor);
    }
}
template <typename Family>
bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDescriptor &kernelDesc) {
auto checkKernelForInlineData = true;

View File

@@ -11,6 +11,7 @@
#include "shared/source/command_stream/preemption.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/simd_helper.h"
#include "shared/source/helpers/state_base_address.h"
@@ -79,22 +80,25 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
uint32_t bindingTablePointer = 0u;
bool isBindlessKernel = kernelDescriptor.kernelAttributes.bufferAddressingMode == KernelDescriptor::BindlessAndStateless;
if (!isBindlessKernel) {
if (bindingTableStateCount > 0u) {
auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
sshOffset = ssh->getUsed();
bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
*ssh, bindingTableStateCount,
dispatchInterface->getSurfaceStateHeapData(),
dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
kernelDescriptor.payloadMappings.bindingTable.tableOffset));
if (bindingTableStateCount > 0u) {
auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
sshOffset = ssh->getUsed();
bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
*ssh, bindingTableStateCount,
dispatchInterface->getSurfaceStateHeapData(),
dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
kernelDescriptor.payloadMappings.bindingTable.tableOffset));
}
idd.setBindingTablePointer(bindingTablePointer);
}
idd.setBindingTablePointer(bindingTablePointer);
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, preemptionMode);
auto heap = container.getIndirectHeap(HeapType::DYNAMIC_STATE);
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
UNRECOVERABLE_IF(!heap);
uint32_t samplerStateOffset = 0;
@@ -105,7 +109,11 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
samplerStateOffset = EncodeStates<Family>::copySamplerState(heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
kernelDescriptor.payloadMappings.samplerTable.numSamplers,
kernelDescriptor.payloadMappings.samplerTable.borderColor,
dispatchInterface->getDynamicStateHeapData());
dispatchInterface->getDynamicStateHeapData(),
device->getBindlessHeapsHelper());
if (ApiSpecificConfig::getBindlessConfiguration()) {
container.getResidencyContainer().push_back(device->getBindlessHeapsHelper()->getHeap(NEO::BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH)->getGraphicsAllocation());
}
}
idd.setSamplerStatePointer(samplerStateOffset);
@@ -139,10 +147,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
}
if (kernelDescriptor.payloadMappings.bindingTable.numEntries > 0) {
patchBindlessSurfaceStateOffsets(sshOffset, dispatchInterface->getKernelDescriptor(), reinterpret_cast<uint8_t *>(ptr));
}
ptr = ptrOffset(ptr, sizeCrossThreadData);
memcpy_s(ptr, sizePerThreadDataForWholeGroup,
dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup);