Improve EncodeDispatchKernel

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2020-11-27 09:22:59 +00:00
committed by Compute-Runtime-Automation
parent baea633bdd
commit 93ba4e646b
10 changed files with 155 additions and 35 deletions

View File

@ -66,6 +66,8 @@ struct EncodeDispatchKernel {
static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo);
static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
static void adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo);
};

View File

@ -441,6 +441,25 @@ void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &conta
}
}
template <typename Family>
void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount) {
auto enablePrefetch = EncodeSurfaceState<Family>::doBindingTablePrefetch();
if (DebugManager.flags.ForceBtpPrefetchMode.get() != -1) {
enablePrefetch = static_cast<bool>(DebugManager.flags.ForceBtpPrefetchMode.get());
}
if (enablePrefetch) {
interfaceDescriptor.setSamplerCount(static_cast<typename INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT>((samplerCount + 3) / 4));
interfaceDescriptor.setBindingTableEntryCount(std::min(bindingTableEntryCount, 31u));
} else {
interfaceDescriptor.setSamplerCount(INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT::SAMPLER_COUNT_NO_SAMPLERS_USED);
interfaceDescriptor.setBindingTableEntryCount(0u);
}
}
template <typename Family>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo) {}
template <typename Family>
void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws) {
for (int i = 0; i < 3; ++i) {

View File

@ -77,28 +77,21 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
? slmSize
: INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K);
{
uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
uint32_t bindingTablePointer = 0u;
uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
uint32_t bindingTablePointer = 0u;
if (bindingTableStateCount > 0u) {
auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
sshOffset = ssh->getUsed();
bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
*ssh, bindingTableStateCount,
dispatchInterface->getSurfaceStateHeapData(),
dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
kernelDescriptor.payloadMappings.bindingTable.tableOffset));
}
idd.setBindingTablePointer(bindingTablePointer);
uint32_t bindingTableStatePrefetchCount = 0;
if (EncodeSurfaceState<Family>::doBindingTablePrefetch()) {
bindingTableStatePrefetchCount = std::min(31u, bindingTableStateCount);
}
idd.setBindingTableEntryCount(bindingTableStatePrefetchCount);
if (bindingTableStateCount > 0u) {
auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
sshOffset = ssh->getUsed();
bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
*ssh, bindingTableStateCount,
dispatchInterface->getSurfaceStateHeapData(),
dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
kernelDescriptor.payloadMappings.bindingTable.tableOffset));
}
idd.setBindingTablePointer(bindingTablePointer);
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, preemptionMode);
auto heap = container.getIndirectHeap(HeapType::DYNAMIC_STATE);
@ -116,9 +109,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
}
idd.setSamplerStatePointer(samplerStateOffset);
auto samplerCountState =
static_cast<typename INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT>((samplerCount + 3) / 4);
idd.setSamplerCount(samplerCountState);
EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(idd, samplerCount, bindingTableStateCount);
auto numGrfCrossThreadData = static_cast<uint32_t>(sizeCrossThreadData / sizeof(float[8]));
idd.setCrossThreadConstantDataReadLength(numGrfCrossThreadData);
@ -310,9 +302,6 @@ void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const HardwareIn
template <typename Family>
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize) {}
template <typename Family>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo) {}
template <typename Family>
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device) {
using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;