refactor: Exclude thread dispatch algorithm for overdispatch to function
Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
parent
8840b6d02f
commit
bd6925d51a
|
@ -158,6 +158,9 @@ struct EncodeDispatchKernel {
|
|||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
|
||||
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd);
|
||||
|
||||
static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
|
||||
|
||||
template <typename WalkerType>
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
#pragma once
|
||||
#include "shared/source/command_container/command_encoder.h"
|
||||
#include "shared/source/command_container/implicit_scaling.h"
|
||||
#include "shared/source/command_stream/linear_stream.h"
|
||||
#include "shared/source/debugger/debugger_l0.h"
|
||||
#include "shared/source/device/device.h"
|
||||
|
@ -18,6 +19,7 @@
|
|||
#include "shared/source/helpers/blit_commands_helper.h"
|
||||
#include "shared/source/helpers/definitions/command_encoder_args.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/helpers/local_id_gen.h"
|
||||
#include "shared/source/helpers/preamble.h"
|
||||
#include "shared/source/helpers/register_offsets.h"
|
||||
|
@ -755,6 +757,119 @@ size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &
|
|||
return size;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
|
||||
const auto &productHelper = device.getProductHelper();
|
||||
|
||||
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
||||
|
||||
bool adjustTGDispatchSize = true;
|
||||
if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) {
|
||||
adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
|
||||
}
|
||||
// apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
|
||||
auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
|
||||
if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) {
|
||||
algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
|
||||
}
|
||||
|
||||
if (algorithmVersion == 2) {
|
||||
auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
|
||||
if (numGrf == 256) {
|
||||
threadsPerXeCore /= 2;
|
||||
}
|
||||
auto tgDispatchSizeSelected = 8;
|
||||
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
|
||||
|
||||
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
|
||||
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
|
||||
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
}
|
||||
|
||||
auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
|
||||
auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;
|
||||
|
||||
// make sure we fit all xe core
|
||||
while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
|
||||
auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup;
|
||||
// make sure we do not use more threads then present on each xe core
|
||||
while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
threadCountPerGrouping /= 2;
|
||||
}
|
||||
|
||||
if (tgDispatchSizeSelected == 8) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
|
||||
} else if (tgDispatchSizeSelected == 1) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
||||
} else if (tgDispatchSizeSelected == 2) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
|
||||
} else {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
|
||||
}
|
||||
} else {
|
||||
if (adjustTGDispatchSize) {
|
||||
UNRECOVERABLE_IF(numGrf == 0u);
|
||||
constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
|
||||
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf);
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
|
||||
const uint32_t tilesCount = device.getNumSubDevices();
|
||||
availableThreadCount *= tilesCount;
|
||||
}
|
||||
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
|
||||
uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
|
||||
UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
|
||||
auto tgDispatchSizeSelected = 1u;
|
||||
|
||||
if (dispatchedTotalThreadCount <= availableThreadCount) {
|
||||
tgDispatchSizeSelected = 1;
|
||||
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
|
||||
tgDispatchSizeSelected = 8;
|
||||
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
|
||||
tgDispatchSizeSelected = 4;
|
||||
} else {
|
||||
tgDispatchSizeSelected = 2;
|
||||
}
|
||||
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
|
||||
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
|
||||
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
}
|
||||
if (tgDispatchSizeSelected == 8) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
|
||||
} else if (tgDispatchSizeSelected == 1) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
||||
} else if (tgDispatchSizeSelected == 2) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
|
||||
} else {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(static_cast<typename INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE>(
|
||||
debugManager.flags.ForceThreadGroupDispatchSize.get()));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::getSizeRequiredSsh(const KernelInfo &kernelInfo) {
|
||||
size_t requiredSshSize = kernelInfo.heapInfo.surfaceStateHeapSize;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (C) 2021-2023 Intel Corporation
|
||||
* Copyright (C) 2021-2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
|
@ -29,113 +29,7 @@ namespace NEO {
|
|||
template <>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) {
|
||||
const auto &productHelper = device.getProductHelper();
|
||||
|
||||
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
||||
bool adjustTGDispatchSize = true;
|
||||
if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) {
|
||||
adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
|
||||
}
|
||||
// apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
|
||||
auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
|
||||
if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) {
|
||||
algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
|
||||
}
|
||||
|
||||
if (algorithmVersion == 2) {
|
||||
auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
|
||||
if (numGrf == 256) {
|
||||
threadsPerXeCore /= 2;
|
||||
}
|
||||
auto tgDispatchSizeSelected = 8;
|
||||
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
|
||||
|
||||
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
|
||||
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
|
||||
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
}
|
||||
|
||||
auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
|
||||
auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;
|
||||
|
||||
// make sure we fit all xe core
|
||||
while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
|
||||
auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup;
|
||||
// make sure we do not use more threads then present on each xe core
|
||||
while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
threadCountPerGrouping /= 2;
|
||||
}
|
||||
|
||||
if (tgDispatchSizeSelected == 8) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
|
||||
} else if (tgDispatchSizeSelected == 1) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
||||
} else if (tgDispatchSizeSelected == 2) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
|
||||
} else {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
|
||||
}
|
||||
} else {
|
||||
if (adjustTGDispatchSize) {
|
||||
UNRECOVERABLE_IF(numGrf == 0u);
|
||||
constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
|
||||
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf);
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
|
||||
const uint32_t tilesCount = device.getNumSubDevices();
|
||||
availableThreadCount *= tilesCount;
|
||||
}
|
||||
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
|
||||
uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
|
||||
UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
|
||||
auto tgDispatchSizeSelected = 1u;
|
||||
|
||||
if (dispatchedTotalThreadCount <= availableThreadCount) {
|
||||
tgDispatchSizeSelected = 1;
|
||||
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
|
||||
tgDispatchSizeSelected = 8;
|
||||
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
|
||||
tgDispatchSizeSelected = 4;
|
||||
} else {
|
||||
tgDispatchSizeSelected = 2;
|
||||
}
|
||||
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
|
||||
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
|
||||
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
}
|
||||
if (tgDispatchSizeSelected == 8) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
|
||||
} else if (tgDispatchSizeSelected == 1) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
||||
} else if (tgDispatchSizeSelected == 2) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
|
||||
} else {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(static_cast<INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE>(
|
||||
debugManager.flags.ForceThreadGroupDispatchSize.get()));
|
||||
}
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorDataForOverdispatch(interfaceDescriptor, device, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
}
|
||||
|
||||
template <>
|
||||
|
|
|
@ -685,7 +685,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInlineDataRequiredAnd
|
|||
EXPECT_EQ(0u, cmd->getLocalZMaximum());
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGroupDispatchSizeVariableIsDefaultThenThreadGroupDispatchSizeIsNotChanged) {
|
||||
HWTEST2_F(CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGroupDispatchSizeVariableIsDefaultThenThreadGroupDispatchSizeIsNotChanged, IsXeLpg) {
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
|
||||
using DefaultWalkerType = typename FamilyType::DefaultWalkerType;
|
||||
INTERFACE_DESCRIPTOR_DATA iddArg;
|
||||
|
|
Loading…
Reference in New Issue