mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 14:55:24 +08:00
Correct tg dispatch size heuristic
Multiply available thread count by tile count if implicit scaling is used Related-To: NEO-6989 Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
e0c7e2e130
commit
6cf8b4daca
@@ -103,7 +103,7 @@ struct EncodeDispatchKernel {
|
||||
|
||||
static void programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
|
||||
|
||||
static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf);
|
||||
static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf);
|
||||
|
||||
static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
|
||||
|
||||
|
||||
@@ -714,7 +714,7 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {}
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {}
|
||||
|
||||
template <typename Family>
|
||||
constexpr bool EncodeDispatchKernel<Family>::shouldUpdateGlobalAtomics(bool ¤tVal, bool refVal, bool updateCurrent) { return false; }
|
||||
|
||||
@@ -234,7 +234,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
}
|
||||
|
||||
auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension();
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired);
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired);
|
||||
|
||||
memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd));
|
||||
|
||||
|
||||
@@ -282,7 +282,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
walkerCmd.setPredicateEnable(args.isPredicate);
|
||||
|
||||
auto threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired);
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired);
|
||||
|
||||
EncodeDispatchKernel<Family>::appendAdditionalIDDFields(&idd, hwInfo, threadsPerThreadGroup,
|
||||
args.dispatchInterface->getSlmTotalSize(),
|
||||
|
||||
@@ -61,7 +61,7 @@ void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DAT
|
||||
}
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {
|
||||
const auto &hwInfoConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily);
|
||||
if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
||||
|
||||
@@ -31,7 +31,7 @@ void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd,
|
||||
}
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {
|
||||
const auto &hwInfoConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily);
|
||||
|
||||
if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
@@ -47,6 +47,10 @@ void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCR
|
||||
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
|
||||
auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
|
||||
uint32_t availableThreadCount = hwHelper.calculateAvailableThreadCount(hwInfo, numGrf);
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) {
|
||||
const uint32_t tilesCount = device.getNumSubDevices();
|
||||
availableThreadCount *= tilesCount;
|
||||
}
|
||||
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
|
||||
uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
|
||||
UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
|
||||
@@ -63,6 +67,7 @@ void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCR
|
||||
|
||||
uint32_t exponent = INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1 - interfaceDescriptor.getThreadGroupDispatchSize();
|
||||
uint32_t threadGroupDispatchSize = 1u << exponent;
|
||||
|
||||
if ((dispatchedTotalThreadCount % (numberOfThreadsInThreadGroup * threadGroupDispatchSize)) != 0) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
||||
}
|
||||
|
||||
@@ -86,7 +86,7 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTO
|
||||
}
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {
|
||||
const auto &hwInfoConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily);
|
||||
if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
if (interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup() == 1) {
|
||||
|
||||
@@ -60,7 +60,7 @@ HWTEST2_F(DG2CommandEncoderTest, givenInterfaceDescriptorDataWhenForceThreadGrou
|
||||
|
||||
for (auto numberOfThreadsInGroup : {1u, 4u, 16u}) {
|
||||
iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInGroup);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, 0, 0);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, 0, 0);
|
||||
|
||||
if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
if (numberOfThreadsInGroup == 1) {
|
||||
|
||||
@@ -628,7 +628,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInterfaceDescriptorDa
|
||||
uint32_t revisions[] = {REVISION_A0, REVISION_B};
|
||||
for (auto revision : revisions) {
|
||||
hwInfo.platform.usRevId = hwInfoConfig.getHwRevIdFromStepping(revision, hwInfo);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, 0, 0);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, 0, 0);
|
||||
|
||||
if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
|
||||
@@ -652,7 +652,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInterfaceDescriptorDa
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.ForceThreadGroupDispatchSize.set(forceThreadGroupDispatchSize);
|
||||
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, pDevice->getHardwareInfo(), threadGroupCount, 1);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, pDevice->getHardwareInfo(), threadGroupCount, 1);
|
||||
|
||||
EXPECT_NE(defaultThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());
|
||||
EXPECT_EQ(forceThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());
|
||||
|
||||
@@ -522,7 +522,7 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenInterfaceDescriptorDataAndNonS
|
||||
for (const auto &revision : {REVISION_A0, REVISION_B}) {
|
||||
hwInfo.platform.usRevId = hwInfoConfig.getHwRevIdFromStepping(revision, hwInfo);
|
||||
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, 0, 0);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, 0, 0);
|
||||
|
||||
if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
|
||||
@@ -545,12 +545,35 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDispatchSizeSmallerOrEqualToAv
|
||||
iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u);
|
||||
|
||||
for (const auto threadGroupCount : {1u, 2u}) {
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, threadGroupCount, numGrf);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf);
|
||||
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
}
|
||||
|
||||
XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenMultipleTilesAndImplicitScalingWhenAdjustInterfaceDescriptorDataIsCalledThenThreadGroupDispatchSizeIsCorrectlySet) {
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.EnableWalkerPartition.set(0);
|
||||
const auto &hwInfoConfig = *HwInfoConfig::get(productFamily);
|
||||
auto hwInfo = pDevice->getHardwareInfo();
|
||||
hwInfo.platform.usRevId = hwInfoConfig.getHwRevIdFromStepping(REVISION_B, hwInfo);
|
||||
hwInfo.gtSystemInfo.EUCount = 32;
|
||||
hwInfo.gtSystemInfo.DualSubSliceCount = hwInfo.gtSystemInfo.MaxDualSubSlicesSupported;
|
||||
INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData;
|
||||
const uint32_t threadGroupCount = 32u;
|
||||
const uint32_t numGrf = 32;
|
||||
iddArg.setNumberOfThreadsInGpgpuThreadGroup(64u);
|
||||
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
|
||||
|
||||
DebugManager.flags.EnableWalkerPartition.set(1);
|
||||
pDevice->numSubDevices = 2;
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf);
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
|
||||
XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenNumberOfThreadsInThreadGroupWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet) {
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
|
||||
|
||||
@@ -568,7 +591,7 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenNumberOfThreadsInThreadGroupWh
|
||||
for (const auto &[numberOfThreadsInThreadGroup, expectedThreadGroupDispatchSize] : testParams) {
|
||||
iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup);
|
||||
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, threadGroupCount, numGrf);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf);
|
||||
|
||||
if (hwInfo.gtSystemInfo.MaxDualSubSlicesSupported == hwInfo.gtSystemInfo.DualSubSliceCount) {
|
||||
EXPECT_EQ(expectedThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());
|
||||
@@ -596,7 +619,7 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenNumberOfThreadsInThreadGroupAn
|
||||
for (const auto &[numberOfThreadsInThreadGroup, expectedThreadGroupDispatchSize] : testParams) {
|
||||
iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup);
|
||||
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, threadGroupCount, numGrf);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf);
|
||||
|
||||
EXPECT_EQ(expectedThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
@@ -614,7 +637,7 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenIndivisibleDispatchSizeWhenCal
|
||||
INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData;
|
||||
iddArg.setNumberOfThreadsInGpgpuThreadGroup(18u);
|
||||
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, threadGroupCount, numGrf);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf);
|
||||
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
@@ -631,7 +654,7 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenThreadGroupCountZeroWhenCallin
|
||||
INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData;
|
||||
iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u);
|
||||
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, threadGroupCount, numGrf);
|
||||
EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf);
|
||||
|
||||
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user