Revert "performance: Change thread group dispatch size algorithm"
This reverts commit ac7cd9c4c5.
Signed-off-by: Mrozek, Michal <michal.mrozek@intel.com>
This commit is contained in:
parent
e676ac49bb
commit
90e24a433d
|
@ -42,30 +42,39 @@ void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCR
|
|||
adjustTGDispatchSize = !!DebugManager.flags.AdjustThreadGroupDispatchSize.get();
|
||||
}
|
||||
if (adjustTGDispatchSize) {
|
||||
UNRECOVERABLE_IF(numGrf == 0u);
|
||||
|
||||
auto tgDispatchSizeSelected = 8u;
|
||||
auto dispatchDimension = 1u;
|
||||
constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
|
||||
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf);
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
|
||||
const uint32_t tilesCount = device.getNumSubDevices();
|
||||
availableThreadCount *= tilesCount;
|
||||
}
|
||||
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
|
||||
uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
|
||||
UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
|
||||
auto tgDispatchSizeSelected = 1u;
|
||||
|
||||
if (walkerCmd.getThreadGroupIdXDimension() > 1) {
|
||||
dispatchDimension = walkerCmd.getThreadGroupIdXDimension();
|
||||
if (walkerCmd.getPartitionType() == WALKER_TYPE::PARTITION_TYPE_X) {
|
||||
dispatchDimension = dispatchDimension / 2;
|
||||
if (dispatchedTotalThreadCount <= availableThreadCount) {
|
||||
tgDispatchSizeSelected = 1;
|
||||
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
|
||||
tgDispatchSizeSelected = 8;
|
||||
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
|
||||
tgDispatchSizeSelected = 4;
|
||||
} else {
|
||||
tgDispatchSizeSelected = 2;
|
||||
}
|
||||
if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) {
|
||||
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
} else if (walkerCmd.getThreadGroupIdYDimension() > 1) {
|
||||
dispatchDimension = walkerCmd.getThreadGroupIdYDimension();
|
||||
if (walkerCmd.getPartitionType() == WALKER_TYPE::PARTITION_TYPE_Y) {
|
||||
dispatchDimension = dispatchDimension / 2;
|
||||
}
|
||||
} else if (walkerCmd.getThreadGroupIdZDimension() > 1) {
|
||||
dispatchDimension = walkerCmd.getThreadGroupIdZDimension();
|
||||
if (walkerCmd.getPartitionType() == WALKER_TYPE::PARTITION_TYPE_Z) {
|
||||
dispatchDimension = dispatchDimension / 2;
|
||||
} else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) {
|
||||
while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
}
|
||||
while (dispatchDimension % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
|
||||
if (tgDispatchSizeSelected == 8) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
|
||||
} else if (tgDispatchSizeSelected == 1) {
|
||||
|
|
|
@ -508,7 +508,57 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDispatchSizeSmallerOrEqualToAv
|
|||
}
|
||||
}
|
||||
|
||||
XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCountWhenInterfaceDescriptorIsBeingProgrammedThenCorrectValueOfThreadGroupDispatchSizeIsSelected) {
|
||||
XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenMultipleTilesAndImplicitScalingWhenAdjustInterfaceDescriptorDataIsCalledThenThreadGroupDispatchSizeIsCorrectlySet) {
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
WALKER_TYPE walkerCmd{};
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.EnableWalkerPartition.set(0);
|
||||
const auto &productHelper = pDevice->getProductHelper();
|
||||
auto hwInfo = pDevice->getHardwareInfo();
|
||||
hwInfo.platform.usRevId = productHelper.getHwRevIdFromStepping(REVISION_B, hwInfo);
|
||||
hwInfo.gtSystemInfo.EUCount = 32;
|
||||
hwInfo.gtSystemInfo.DualSubSliceCount = hwInfo.gtSystemInfo.MaxDualSubSlicesSupported;
|
||||
INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData;
|
||||
const uint32_t numGrf = GrfConfig::DefaultGrfNumber;
|
||||
auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
|
||||
const uint32_t threadGroupCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf) / 32u;
|
||||
iddArg.setNumberOfThreadsInGpgpuThreadGroup(64u);
|
||||
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
ASSERT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
|
||||
|
||||
DebugManager.flags.EnableWalkerPartition.set(1);
|
||||
pDevice->numSubDevices = 2;
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
|
||||
XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenNumberOfThreadsInThreadGroupWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet) {
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
WALKER_TYPE walkerCmd{};
|
||||
const auto &productHelper = pDevice->getProductHelper();
|
||||
auto hwInfo = pDevice->getHardwareInfo();
|
||||
hwInfo.platform.usRevId = productHelper.getHwRevIdFromStepping(REVISION_B, hwInfo);
|
||||
|
||||
INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData;
|
||||
const uint32_t threadGroupCount = 512u;
|
||||
const uint32_t numGrf = GrfConfig::DefaultGrfNumber;
|
||||
std::array<std::pair<uint32_t, uint32_t>, 3> testParams = {{{16u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8},
|
||||
{32u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4},
|
||||
{64u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2}}};
|
||||
|
||||
for (const auto &[numberOfThreadsInThreadGroup, expectedThreadGroupDispatchSize] : testParams) {
|
||||
iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup);
|
||||
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
|
||||
EXPECT_EQ(expectedThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
}
|
||||
|
||||
XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenNumberOfThreadsInThreadGroupAndDimensionsWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet) {
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
|
||||
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
|
||||
WALKER_TYPE walkerCmd{};
|
||||
|
@ -520,18 +570,12 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun
|
|||
const uint32_t threadGroupCount = 512u;
|
||||
const uint32_t numGrf = GrfConfig::DefaultGrfNumber;
|
||||
|
||||
walkerCmd.setThreadGroupIdXDimension(8);
|
||||
iddArg.setNumberOfThreadsInGpgpuThreadGroup(16u);
|
||||
|
||||
{
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
|
||||
|
||||
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_X);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize());
|
||||
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_DISABLED);
|
||||
}
|
||||
|
||||
walkerCmd.setThreadGroupIdYDimension(2);
|
||||
walkerCmd.setThreadGroupIdZDimension(1);
|
||||
{
|
||||
|
@ -547,12 +591,7 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun
|
|||
{
|
||||
walkerCmd.setThreadGroupIdXDimension(1);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
|
||||
|
||||
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_Y);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
|
||||
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_DISABLED);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
walkerCmd.setThreadGroupIdYDimension(1);
|
||||
walkerCmd.setThreadGroupIdZDimension(2);
|
||||
|
@ -569,24 +608,24 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun
|
|||
{
|
||||
walkerCmd.setThreadGroupIdXDimension(1);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
walkerCmd.setThreadGroupIdYDimension(1);
|
||||
walkerCmd.setThreadGroupIdZDimension(1);
|
||||
{
|
||||
walkerCmd.setThreadGroupIdXDimension(4);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize());
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
{
|
||||
walkerCmd.setThreadGroupIdXDimension(2);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
{
|
||||
walkerCmd.setThreadGroupIdXDimension(1);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
walkerCmd.setThreadGroupIdXDimension(1);
|
||||
walkerCmd.setThreadGroupIdZDimension(2);
|
||||
|
@ -603,12 +642,7 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenVariousSettingsOfWorkgroupCoun
|
|||
{
|
||||
walkerCmd.setThreadGroupIdYDimension(1);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
|
||||
|
||||
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_Z);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
|
||||
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE_DISABLED);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue