refactor: Exclude thread dispatch algorithm for overdispatch to function

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk 2024-03-01 12:33:10 +00:00 committed by Compute-Runtime-Automation
parent 8840b6d02f
commit bd6925d51a
4 changed files with 121 additions and 109 deletions

View File

@ -158,6 +158,9 @@ struct EncodeDispatchKernel {
template <typename WalkerType, typename InterfaceDescriptorType>
static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
template <typename WalkerType, typename InterfaceDescriptorType>
static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
template <typename WalkerType>

View File

@ -7,6 +7,7 @@
#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/debugger/debugger_l0.h"
#include "shared/source/device/device.h"
@ -18,6 +19,7 @@
#include "shared/source/helpers/blit_commands_helper.h"
#include "shared/source/helpers/definitions/command_encoder_args.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/preamble.h"
#include "shared/source/helpers/register_offsets.h"
@ -755,6 +757,119 @@ size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &
return size;
}
template <typename GfxFamily>
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
const auto &productHelper = device.getProductHelper();
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
bool adjustTGDispatchSize = true;
if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) {
adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
}
// apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) {
algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
}
if (algorithmVersion == 2) {
auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
if (numGrf == 256) {
threadsPerXeCore /= 2;
}
auto tgDispatchSizeSelected = 8;
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
}
auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;
// make sure we fit all xe core
while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
tgDispatchSizeSelected /= 2;
}
auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup;
// make sure we do not use more threads then present on each xe core
while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
tgDispatchSizeSelected /= 2;
threadCountPerGrouping /= 2;
}
if (tgDispatchSizeSelected == 8) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
} else if (tgDispatchSizeSelected == 1) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
} else if (tgDispatchSizeSelected == 2) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
} else {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
}
} else {
if (adjustTGDispatchSize) {
UNRECOVERABLE_IF(numGrf == 0u);
constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
auto &gfxCoreHelper = device.getGfxCoreHelper();
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf);
if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
const uint32_t tilesCount = device.getNumSubDevices();
availableThreadCount *= tilesCount;
}
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
auto tgDispatchSizeSelected = 1u;
if (dispatchedTotalThreadCount <= availableThreadCount) {
tgDispatchSizeSelected = 1;
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
tgDispatchSizeSelected = 8;
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
tgDispatchSizeSelected = 4;
} else {
tgDispatchSizeSelected = 2;
}
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
}
if (tgDispatchSizeSelected == 8) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
} else if (tgDispatchSizeSelected == 1) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
} else if (tgDispatchSizeSelected == 2) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
} else {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
}
}
}
}
if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
interfaceDescriptor.setThreadGroupDispatchSize(static_cast<typename INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE>(
debugManager.flags.ForceThreadGroupDispatchSize.get()));
}
}
template <typename Family>
size_t EncodeDispatchKernel<Family>::getSizeRequiredSsh(const KernelInfo &kernelInfo) {
size_t requiredSshSize = kernelInfo.heapInfo.surfaceStateHeapSize;

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2023 Intel Corporation
* Copyright (C) 2021-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -29,113 +29,7 @@ namespace NEO {
template <>
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
const auto &productHelper = device.getProductHelper();
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
bool adjustTGDispatchSize = true;
if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) {
adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
}
// apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) {
algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
}
if (algorithmVersion == 2) {
auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
if (numGrf == 256) {
threadsPerXeCore /= 2;
}
auto tgDispatchSizeSelected = 8;
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
}
auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;
// make sure we fit all xe core
while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
tgDispatchSizeSelected /= 2;
}
auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup;
// make sure we do not use more threads then present on each xe core
while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
tgDispatchSizeSelected /= 2;
threadCountPerGrouping /= 2;
}
if (tgDispatchSizeSelected == 8) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
} else if (tgDispatchSizeSelected == 1) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
} else if (tgDispatchSizeSelected == 2) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
} else {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
}
} else {
if (adjustTGDispatchSize) {
UNRECOVERABLE_IF(numGrf == 0u);
constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
auto &gfxCoreHelper = device.getGfxCoreHelper();
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf);
if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
const uint32_t tilesCount = device.getNumSubDevices();
availableThreadCount *= tilesCount;
}
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
auto tgDispatchSizeSelected = 1u;
if (dispatchedTotalThreadCount <= availableThreadCount) {
tgDispatchSizeSelected = 1;
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
tgDispatchSizeSelected = 8;
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
tgDispatchSizeSelected = 4;
} else {
tgDispatchSizeSelected = 2;
}
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
}
if (tgDispatchSizeSelected == 8) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
} else if (tgDispatchSizeSelected == 1) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
} else if (tgDispatchSizeSelected == 2) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
} else {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
}
}
}
}
if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
interfaceDescriptor.setThreadGroupDispatchSize(static_cast<INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE>(
debugManager.flags.ForceThreadGroupDispatchSize.get()));
}
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorDataForOverdispatch(interfaceDescriptor, device, hwInfo, threadGroupCount, numGrf, walkerCmd);
}
template <>

View File

@ -685,7 +685,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInlineDataRequiredAnd
EXPECT_EQ(0u, cmd->getLocalZMaximum());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGroupDispatchSizeVariableIsDefaultThenThreadGroupDispatchSizeIsNotChanged) {
HWTEST2_F(CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGroupDispatchSizeVariableIsDefaultThenThreadGroupDispatchSizeIsNotChanged, IsXeLpg) {
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
using DefaultWalkerType = typename FamilyType::DefaultWalkerType;
INTERFACE_DESCRIPTOR_DATA iddArg;