Revert "performance: Change thread group dispatch size algorithm"

This reverts commit ac7cd9c4c5.

Signed-off-by: Mrozek, Michal <michal.mrozek@intel.com>
This commit is contained in:
Mrozek, Michal 2023-10-04 18:12:09 +00:00 committed by Compute-Runtime-Automation
parent e676ac49bb
commit 90e24a433d
2 changed files with 86 additions and 43 deletions

View File

@ -42,30 +42,39 @@ void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCR
adjustTGDispatchSize = !!DebugManager.flags.AdjustThreadGroupDispatchSize.get();
}
if (adjustTGDispatchSize) {
UNRECOVERABLE_IF(numGrf == 0u);
auto tgDispatchSizeSelected = 8u;
auto dispatchDimension = 1u;
constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
auto &gfxCoreHelper = device.getGfxCoreHelper();
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf);
if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
const uint32_t tilesCount = device.getNumSubDevices();
availableThreadCount *= tilesCount;
}
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
auto tgDispatchSizeSelected = 1u;
if (walkerCmd.getThreadGroupIdXDimension() > 1) {
dispatchDimension = walkerCmd.getThreadGroupIdXDimension();
if (walkerCmd.getPartitionType() == WALKER_TYPE::PARTITION_TYPE_X) {
dispatchDimension = dispatchDimension / 2;
if (dispatchedTotalThreadCount <= availableThreadCount) {
tgDispatchSizeSelected = 1;
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
tgDispatchSizeSelected = 8;
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
tgDispatchSizeSelected = 4;
} else {
tgDispatchSizeSelected = 2;
}
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
} else if (walkerCmd.getThreadGroupIdYDimension() > 1) {
dispatchDimension = walkerCmd.getThreadGroupIdYDimension();
if (walkerCmd.getPartitionType() == WALKER_TYPE::PARTITION_TYPE_Y) {
dispatchDimension = dispatchDimension / 2;
}
} else if (walkerCmd.getThreadGroupIdZDimension() > 1) {
dispatchDimension = walkerCmd.getThreadGroupIdZDimension();
if (walkerCmd.getPartitionType() == WALKER_TYPE::PARTITION_TYPE_Z) {
dispatchDimension = dispatchDimension / 2;
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
}
while (dispatchDimension % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
if (tgDispatchSizeSelected == 8) {
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
} else if (tgDispatchSizeSelected == 1) {

View File

@ -508,7 +508,57 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDispatchSizeSmallerOrEqualToAv
}
}
XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCountWhenInterfaceDescriptorIsBeingProgrammedThenCorrectValueOfThreadGroupDispatchSizeIsSelected) {
// Programs the interface descriptor for a dispatch of 64-thread groups twice: first with
// EnableWalkerPartition cleared, then with it set and the device reporting two sub-devices,
// and checks the thread group dispatch size selected in each case.
XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenMultipleTilesAndImplicitScalingWhenAdjustInterfaceDescriptorDataIsCalledThenThreadGroupDispatchSizeIsCorrectlySet) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;

    WALKER_TYPE walkerCmd{};
    DebugManagerStateRestore restorer;
    DebugManager.flags.EnableWalkerPartition.set(0);

    // Work on a private copy of the hardware info with a fixed EU configuration.
    auto hardwareInfo = pDevice->getHardwareInfo();
    hardwareInfo.platform.usRevId = pDevice->getProductHelper().getHwRevIdFromStepping(REVISION_B, hardwareInfo);
    hardwareInfo.gtSystemInfo.EUCount = 32;
    hardwareInfo.gtSystemInfo.DualSubSliceCount = hardwareInfo.gtSystemInfo.MaxDualSubSlicesSupported;

    const uint32_t numGrf = GrfConfig::DefaultGrfNumber;
    // With 64 threads per group, (available / 32) groups request roughly twice the
    // thread count reported as available for this hardware info.
    const uint32_t threadGroupCount = pDevice->getGfxCoreHelper().calculateAvailableThreadCount(hardwareInfo, numGrf) / 32u;

    INTERFACE_DESCRIPTOR_DATA interfaceDescriptor = FamilyType::cmdInitInterfaceDescriptorData;
    interfaceDescriptor.setNumberOfThreadsInGpgpuThreadGroup(64u);

    // Walker partitioning disabled: TG_SIZE_2 is expected.
    EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(interfaceDescriptor, *pDevice, hardwareInfo, threadGroupCount, numGrf, walkerCmd);
    ASSERT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, interfaceDescriptor.getThreadGroupDispatchSize());

    // Walker partitioning enabled with two sub-devices: TG_SIZE_1 is expected.
    // NOTE(review): presumably the available thread count is scaled by the tile count here - confirm against adjustInterfaceDescriptorData.
    DebugManager.flags.EnableWalkerPartition.set(1);
    pDevice->numSubDevices = 2;
    EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(interfaceDescriptor, *pDevice, hardwareInfo, threadGroupCount, numGrf, walkerCmd);
    EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, interfaceDescriptor.getThreadGroupDispatchSize());
}
// Table-driven check: for a fixed dispatch of 512 thread groups, each thread-group size
// listed below must yield the paired thread group dispatch size.
XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenNumberOfThreadsInThreadGroupWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;

    WALKER_TYPE walkerCmd{};

    auto hardwareInfo = pDevice->getHardwareInfo();
    hardwareInfo.platform.usRevId = pDevice->getProductHelper().getHwRevIdFromStepping(REVISION_B, hardwareInfo);

    INTERFACE_DESCRIPTOR_DATA interfaceDescriptor = FamilyType::cmdInitInterfaceDescriptorData;
    const uint32_t threadGroupCount = 512u;
    const uint32_t numGrf = GrfConfig::DefaultGrfNumber;

    // first: number of threads in the thread group, second: expected dispatch size encoding.
    const std::pair<uint32_t, uint32_t> expectations[] = {
        {16u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8},
        {32u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4},
        {64u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2}};

    // The same descriptor is reused across iterations; only the thread count changes.
    for (const auto &expectation : expectations) {
        interfaceDescriptor.setNumberOfThreadsInGpgpuThreadGroup(expectation.first);
        EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(interfaceDescriptor, *pDevice, hardwareInfo, threadGroupCount, numGrf, walkerCmd);
        EXPECT_EQ(expectation.second, interfaceDescriptor.getThreadGroupDispatchSize());
    }
}
XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenNumberOfThreadsInThreadGroupAndDimensionsWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet) {
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
WALKER_TYPE walkerCmd{};
@ -520,18 +570,12 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun
const uint32_t threadGroupCount = 512u;
const uint32_t numGrf = GrfConfig::DefaultGrfNumber;
walkerCmd.setThreadGroupIdXDimension(8);
iddArg.setNumberOfThreadsInGpgpuThreadGroup(16u);
{
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_X);
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize());
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_DISABLED);
}
walkerCmd.setThreadGroupIdYDimension(2);
walkerCmd.setThreadGroupIdZDimension(1);
{
@ -547,12 +591,7 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun
{
walkerCmd.setThreadGroupIdXDimension(1);
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_Y);
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_DISABLED);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
walkerCmd.setThreadGroupIdYDimension(1);
walkerCmd.setThreadGroupIdZDimension(2);
@ -569,24 +608,24 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun
{
walkerCmd.setThreadGroupIdXDimension(1);
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
walkerCmd.setThreadGroupIdYDimension(1);
walkerCmd.setThreadGroupIdZDimension(1);
{
walkerCmd.setThreadGroupIdXDimension(4);
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize());
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
{
walkerCmd.setThreadGroupIdXDimension(2);
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
{
walkerCmd.setThreadGroupIdXDimension(1);
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
walkerCmd.setThreadGroupIdXDimension(1);
walkerCmd.setThreadGroupIdZDimension(2);
@ -603,12 +642,7 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun
{
walkerCmd.setThreadGroupIdYDimension(1);
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_Z);
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_DISABLED);
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
}