From 90e24a433df631782a2b5c7912785224b78704ba Mon Sep 17 00:00:00 2001 From: "Mrozek, Michal" Date: Wed, 4 Oct 2023 18:12:09 +0000 Subject: [PATCH] Revert "performance: Change thread group dispatch size algorithm" This reverts commit ac7cd9c4c5dc48efc7fb55b740044660712d5bfb. Signed-off-by: Mrozek, Michal --- .../command_encoder_xe_hpc_core.cpp | 47 ++++++----- .../xe_hpc_core/test_encode_xe_hpc_core.cpp | 82 +++++++++++++------ 2 files changed, 86 insertions(+), 43 deletions(-) diff --git a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp index c903bb597d..ba43d7f1dc 100644 --- a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp @@ -42,30 +42,39 @@ void EncodeDispatchKernel::adjustInterfaceDescriptorData(INTERFACE_DESCR adjustTGDispatchSize = !!DebugManager.flags.AdjustThreadGroupDispatchSize.get(); } if (adjustTGDispatchSize) { + UNRECOVERABLE_IF(numGrf == 0u); - auto tgDispatchSizeSelected = 8u; - auto dispatchDimension = 1u; + constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u; + constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u; + auto &gfxCoreHelper = device.getGfxCoreHelper(); + uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf); + if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) { + const uint32_t tilesCount = device.getNumSubDevices(); + availableThreadCount *= tilesCount; + } + uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup(); + uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount; + UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u); + auto tgDispatchSizeSelected = 1u; - if (walkerCmd.getThreadGroupIdXDimension() > 1) { - dispatchDimension = walkerCmd.getThreadGroupIdXDimension(); - if (walkerCmd.getPartitionType() == WALKER_TYPE::PARTITION_TYPE_X) { - dispatchDimension = dispatchDimension / 2; + if (dispatchedTotalThreadCount <= availableThreadCount) { + tgDispatchSizeSelected = 1; + } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) { + tgDispatchSizeSelected = 8; + } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) { + tgDispatchSizeSelected = 4; + } else { + tgDispatchSizeSelected = 2; + } + if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) { + while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) { + tgDispatchSizeSelected /= 2; } - } else if (walkerCmd.getThreadGroupIdYDimension() > 1) { - dispatchDimension = walkerCmd.getThreadGroupIdYDimension(); - if (walkerCmd.getPartitionType() == WALKER_TYPE::PARTITION_TYPE_Y) { - dispatchDimension = dispatchDimension / 2; - } - } else if (walkerCmd.getThreadGroupIdZDimension() > 1) { - dispatchDimension = walkerCmd.getThreadGroupIdZDimension(); - if (walkerCmd.getPartitionType() == WALKER_TYPE::PARTITION_TYPE_Z) { - dispatchDimension = dispatchDimension / 2; + } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) { + while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) { + tgDispatchSizeSelected /= 2; } } - while (dispatchDimension % tgDispatchSizeSelected != 0) { - tgDispatchSizeSelected /= 2; - } - if (tgDispatchSizeSelected == 8) { interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8); } else if (tgDispatchSizeSelected == 1) { diff --git a/shared/test/unit_test/xe_hpc_core/test_encode_xe_hpc_core.cpp b/shared/test/unit_test/xe_hpc_core/test_encode_xe_hpc_core.cpp index 6fd71be8fc..2100d3297e 100644 --- a/shared/test/unit_test/xe_hpc_core/test_encode_xe_hpc_core.cpp +++ b/shared/test/unit_test/xe_hpc_core/test_encode_xe_hpc_core.cpp @@ -508,7 +508,57 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDispatchSizeSmallerOrEqualToAv } } -XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCountWhenInterfaceDescriptorIsBeingProgrammedThenCorrectValueOfThreadGroupDispatchSizeIsSelected) { +XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenMultipleTilesAndImplicitScalingWhenAdjustInterfaceDescriptorDataIsCalledThenThreadGroupDispatchSizeIsCorrectlySet) { + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + WALKER_TYPE walkerCmd{}; + DebugManagerStateRestore restorer; + DebugManager.flags.EnableWalkerPartition.set(0); + const auto &productHelper = pDevice->getProductHelper(); + auto hwInfo = pDevice->getHardwareInfo(); + hwInfo.platform.usRevId = productHelper.getHwRevIdFromStepping(REVISION_B, hwInfo); + hwInfo.gtSystemInfo.EUCount = 32; + hwInfo.gtSystemInfo.DualSubSliceCount = hwInfo.gtSystemInfo.MaxDualSubSlicesSupported; + INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData; + const uint32_t numGrf = GrfConfig::DefaultGrfNumber; + auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); + const uint32_t threadGroupCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf) / 32u; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(64u); + + EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + ASSERT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + + DebugManager.flags.EnableWalkerPartition.set(1); + pDevice->numSubDevices = 2; + EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); +} + +XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenNumberOfThreadsInThreadGroupWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet) { + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + WALKER_TYPE walkerCmd{}; + const auto &productHelper = pDevice->getProductHelper(); + auto hwInfo = pDevice->getHardwareInfo(); + hwInfo.platform.usRevId = productHelper.getHwRevIdFromStepping(REVISION_B, hwInfo); + + INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData; + const uint32_t threadGroupCount = 512u; + const uint32_t numGrf = GrfConfig::DefaultGrfNumber; + std::array, 3> testParams = {{{16u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8}, + {32u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4}, + {64u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2}}}; + + for (const auto &[numberOfThreadsInThreadGroup, expectedThreadGroupDispatchSize] : testParams) { + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); + + EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + + EXPECT_EQ(expectedThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize()); + } +} + +XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenNumberOfThreadsInThreadGroupAndDimensionsWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet) { using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; using WALKER_TYPE = typename FamilyType::WALKER_TYPE; WALKER_TYPE walkerCmd{}; @@ -520,18 +570,12 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun const uint32_t threadGroupCount = 512u; const uint32_t numGrf = GrfConfig::DefaultGrfNumber; - walkerCmd.setThreadGroupIdXDimension(8); + iddArg.setNumberOfThreadsInGpgpuThreadGroup(16u); { EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); - - walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_X); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize()); - walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_DISABLED); } - walkerCmd.setThreadGroupIdYDimension(2); walkerCmd.setThreadGroupIdZDimension(1); { @@ -547,12 +591,7 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun { walkerCmd.setThreadGroupIdXDimension(1); EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); - - walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_Y); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); - walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_DISABLED); + EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } walkerCmd.setThreadGroupIdYDimension(1); walkerCmd.setThreadGroupIdZDimension(2); @@ -569,24 +608,24 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun { walkerCmd.setThreadGroupIdXDimension(1); EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } walkerCmd.setThreadGroupIdYDimension(1); walkerCmd.setThreadGroupIdZDimension(1); { walkerCmd.setThreadGroupIdXDimension(4); EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize()); + EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } { walkerCmd.setThreadGroupIdXDimension(2); EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } { walkerCmd.setThreadGroupIdXDimension(1); EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); + EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } walkerCmd.setThreadGroupIdXDimension(1); walkerCmd.setThreadGroupIdZDimension(2); @@ -603,12 +642,7 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun { walkerCmd.setThreadGroupIdYDimension(1); EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); - - walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_Z); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); - walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_DISABLED); + EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } }