fix: correct limitation for num threads per thread group

Take the maximum work group size limit into account when limiting the number of threads per thread group.

Resolves: NEO-14922
Related-To: NEO-11881
Signed-off-by: Katarzyna Cencelewska <katarzyna.cencelewska@intel.com>
This commit is contained in:
Katarzyna Cencelewska
2025-05-20 14:41:49 +00:00
committed by Compute-Runtime-Automation
parent 5f80490385
commit 6ad4ad41b1
26 changed files with 171 additions and 228 deletions

View File

@@ -1649,18 +1649,16 @@ HWTEST_F(GfxCoreHelperTest, givenNumGrfAndSimdSizeWhenAdjustingMaxWorkGroupSizeT
constexpr auto defaultMaxGroupSize = 1024u;
uint32_t simdSize = 16u;
uint32_t isHwLocalIdGeneration = true;
uint32_t numGrfRequired = GrfConfig::largeGrfNumber;
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment));
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize, rootDeviceEnvironment));
simdSize = 32u;
numGrfRequired = GrfConfig::largeGrfNumber;
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment));
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize, rootDeviceEnvironment));
simdSize = 16u;
isHwLocalIdGeneration = false;
numGrfRequired = GrfConfig::defaultGrfNumber;
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, isHwLocalIdGeneration, defaultMaxGroupSize, rootDeviceEnvironment));
EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize, rootDeviceEnvironment));
}
HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue, IsAtMostXeHpcCore) {
@@ -1678,7 +1676,7 @@ HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThe
}};
for (auto &[simtSize, totalWgSize, expectedNumThreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, true, rootDeviceEnvironment));
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, 32u, rootDeviceEnvironment));
}
}
@@ -1688,19 +1686,19 @@ HWTEST_F(GfxCoreHelperTest, givenFlagRemoveRestrictionsOnNumberOfThreadsInGpgpuT
const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
std::array<std::array<uint32_t, 5>, 8> values = {{
{32u, 32u, 128u, 1, 1u}, // SIMT Size, totalWorkItems, Max Num of threads, Grf size, Hw local id generation
{32u, 64u, 32u, 1, 2u},
{32u, 128u, 256u, 1, 4u},
{32u, 1024u, 128u, 1, 32u},
{16u, 32u, 32u, 0, 2u},
{16u, 64u, 256u, 0, 4u},
{16u, 128u, 128u, 0, 8u},
{16u, 1024u, 256u, 0, 64u},
std::array<std::array<uint32_t, 4>, 8> values = {{
{32u, 32u, 128u, 1u}, // SIMT Size, totalWorkItems, Grf size, Max Num of threads
{32u, 64u, 32u, 2u},
{32u, 128u, 256u, 4u},
{32u, 1024u, 128u, 32u},
{16u, 32u, 32u, 2u},
{16u, 64u, 256u, 4u},
{16u, 128u, 128u, 8u},
{16u, 1024u, 256u, 64u},
}};
for (auto &[simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, rootDeviceEnvironment));
for (auto &[simtSize, totalWgSize, grfsize, expectedNumThreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, rootDeviceEnvironment));
}
}

View File

@@ -81,7 +81,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.v0.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.v0.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
EXPECT_EQ(localIdsSize + ImplicitArgsV0::getAlignedSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
}

View File

@@ -807,23 +807,17 @@ XE2_HPG_CORETEST_F(GfxCoreHelperTestsXe2HpgCore, givenNumGrfAndSimdSizeWhenAdjus
auto defaultMaxWorkGroupSize = 2048u;
const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
std::array<std::array<uint32_t, 4>, 12> values = {{
{GrfConfig::defaultGrfNumber, 16u, 0u, 1024u}, // Grf Size, SIMT Size, HW local-id generation, Max Num of threads
{GrfConfig::defaultGrfNumber, 16u, 1u, 1024u},
{GrfConfig::defaultGrfNumber, 32u, 1u, 1024u},
{GrfConfig::defaultGrfNumber, 32u, 0u, 2048u},
{GrfConfig::largeGrfNumber, 16u, 0u, 512u},
{GrfConfig::largeGrfNumber, 16u, 1u, 512u},
{GrfConfig::largeGrfNumber, 32u, 0u, 1024u},
{GrfConfig::largeGrfNumber, 32u, 1u, 1024u},
{GrfConfig::defaultGrfNumber, 1u, 1u, 32u},
{GrfConfig::defaultGrfNumber, 1u, 0u, 64u},
{GrfConfig::largeGrfNumber, 1u, 0u, 32u},
{GrfConfig::largeGrfNumber, 1u, 1u, 32u},
std::array<std::array<uint32_t, 3>, 6> values = {{
{GrfConfig::defaultGrfNumber, 16u, 1024u}, // Grf Size, SIMT Size, Max Num of threads
{GrfConfig::defaultGrfNumber, 32u, 1024u},
{GrfConfig::largeGrfNumber, 16u, 512u},
{GrfConfig::largeGrfNumber, 32u, 1024u},
{GrfConfig::defaultGrfNumber, 1u, 32u},
{GrfConfig::largeGrfNumber, 1u, 32u},
}};
for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, isHwLocalIdGeneration, defaultMaxWorkGroupSize, rootDeviceEnvironment));
for (auto &[grfSize, simtSize, expectedNumThreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, defaultMaxWorkGroupSize, rootDeviceEnvironment));
}
}
@@ -831,23 +825,17 @@ XE2_HPG_CORETEST_F(GfxCoreHelperTestsXe2HpgCore, givenParamsWhenCalculateNumThre
auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
auto totalWgSize = 2048u;
std::array<std::array<uint32_t, 4>, 12> values = {{
{GrfConfig::defaultGrfNumber, 16u, 0u, 64u}, // Grf Size, SIMT Size, HW local-id generation, Max Num of threads
{GrfConfig::defaultGrfNumber, 16u, 1u, 64u},
{GrfConfig::defaultGrfNumber, 32u, 1u, 32u},
{GrfConfig::defaultGrfNumber, 32u, 0u, 64u},
{GrfConfig::defaultGrfNumber, 1u, 1u, 32u},
{GrfConfig::defaultGrfNumber, 1u, 0u, 64u},
{GrfConfig::largeGrfNumber, 16u, 0u, 32u},
{GrfConfig::largeGrfNumber, 16u, 1u, 32u},
{GrfConfig::largeGrfNumber, 32u, 0u, 32u},
{GrfConfig::largeGrfNumber, 32u, 1u, 32u},
{GrfConfig::largeGrfNumber, 1u, 0u, 32u},
{GrfConfig::largeGrfNumber, 1u, 1u, 32u},
std::array<std::array<uint32_t, 3>, 6> values = {{
{GrfConfig::defaultGrfNumber, 16u, 64u}, // Grf Size, SIMT Size, Max Num of threads
{GrfConfig::defaultGrfNumber, 32u, 32u},
{GrfConfig::defaultGrfNumber, 1u, 32u},
{GrfConfig::largeGrfNumber, 16u, 32u},
{GrfConfig::largeGrfNumber, 32u, 32u},
{GrfConfig::largeGrfNumber, 1u, 32u},
}};
for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThdreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThdreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, isHwLocalIdGeneration, rootDeviceEnvironment));
for (auto &[grfSize, simtSize, expectedNumThdreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThdreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, rootDeviceEnvironment));
}
}

View File

@@ -754,41 +754,26 @@ XE3_CORETEST_F(GfxCoreHelperTestsXe3Core, givenNumGrfAndSimdSizeWhenAdjustingMax
auto defaultMaxWorkGroupSize = 2048u;
const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
std::array<std::array<uint32_t, 4>, 30> values = {{
{128u, 16u, 0u, 1024u}, // Grf Size, SIMT Size, HW local-id generation, Max Num of threads
{128u, 16u, 1u, 1024u},
{128u, 32u, 1u, 1024u},
{128u, 32u, 0u, 2048u},
{160u, 16u, 0u, 768u},
{160u, 16u, 1u, 768u},
{160u, 32u, 1u, 1024u},
{160u, 32u, 0u, 1536u},
{192u, 16u, 0u, 640u},
{192u, 16u, 1u, 640u},
{192u, 32u, 1u, 1024u},
{192u, 32u, 0u, 1280u},
{256u, 16u, 0u, 512u},
{256u, 16u, 1u, 512u},
{256u, 32u, 1u, 1024u},
{256u, 32u, 0u, 1024u},
{512u, 16u, 0u, 256u},
{512u, 16u, 1u, 256u},
{512u, 32u, 1u, 512u},
{512u, 32u, 0u, 512u},
{128u, 1u, 1u, 32u},
{128u, 1u, 0u, 64u},
{160u, 1u, 1u, 32u},
{160u, 1u, 0u, 48u},
{192u, 1u, 1u, 32u},
{192u, 1u, 0u, 40u},
{256u, 1u, 1u, 32u},
{256u, 1u, 0u, 32u},
{512u, 1u, 1u, 16u},
{512u, 1u, 0u, 16u},
std::array<std::array<uint32_t, 3>, 15> values = {{
{128u, 16u, 1024u}, // Grf Size, SIMT Size, Max Num of threads
{128u, 32u, 1024u},
{160u, 16u, 768u},
{160u, 32u, 1024u},
{192u, 16u, 640u},
{192u, 32u, 1024u},
{256u, 16u, 512u},
{256u, 32u, 1024u},
{512u, 16u, 256u},
{512u, 32u, 512u},
{128u, 1u, 32u},
{160u, 1u, 32u},
{192u, 1u, 32u},
{256u, 1u, 32u},
{512u, 1u, 16u},
}};
for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, isHwLocalIdGeneration, defaultMaxWorkGroupSize, rootDeviceEnvironment));
for (auto &[grfSize, simtSize, expectedNumThreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.adjustMaxWorkGroupSize(grfSize, simtSize, defaultMaxWorkGroupSize, rootDeviceEnvironment));
}
}
@@ -801,41 +786,26 @@ XE3_CORETEST_F(GfxCoreHelperTestsXe3Core, givenParamsWhenCalculateNumThreadsPerT
auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
auto totalWgSize = 2048u;
std::array<std::array<uint32_t, 4>, 30> values = {{
{128u, 16u, 0u, 64u}, // Grf Size, SIMT Size, HW local-id generation, Max Num of threads
{128u, 16u, 1u, 64u},
{128u, 32u, 1u, 32u},
{128u, 32u, 0u, 64u},
{128u, 1u, 1u, 32u},
{128u, 1u, 0u, 64u},
{160u, 16u, 0u, 48u},
{160u, 16u, 1u, 48u},
{160u, 32u, 1u, 32u},
{160u, 32u, 0u, 48u},
{160u, 1u, 1u, 32u},
{160u, 1u, 0u, 48u},
{192u, 16u, 0u, 40u},
{192u, 16u, 1u, 40u},
{192u, 32u, 1u, 32u},
{192u, 32u, 0u, 40u},
{192u, 1u, 1u, 32u},
{192u, 1u, 0u, 40u},
{256u, 16u, 0u, 32u},
{256u, 16u, 1u, 32u},
{256u, 32u, 1u, 32u},
{256u, 32u, 0u, 32u},
{256u, 1u, 1u, 32u},
{256u, 1u, 0u, 32u},
{512u, 16u, 0u, 16u},
{512u, 16u, 1u, 16u},
{512u, 32u, 1u, 16u},
{512u, 32u, 0u, 16u},
{512u, 1u, 1u, 16u},
{512u, 1u, 0u, 16u},
std::array<std::array<uint32_t, 3>, 15> values = {{
{128u, 16u, 64u}, // Grf Size, SIMT Size, Max Num of threads
{128u, 32u, 32u},
{128u, 1u, 32u},
{160u, 16u, 48u},
{160u, 32u, 32u},
{160u, 1u, 32u},
{192u, 16u, 40u},
{192u, 32u, 32u},
{192u, 1u, 32u},
{256u, 16u, 32u},
{256u, 32u, 32u},
{256u, 1u, 32u},
{512u, 16u, 16u},
{512u, 32u, 16u},
{512u, 1u, 16u},
}};
for (auto &[grfSize, simtSize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, isHwLocalIdGeneration, rootDeviceEnvironment));
for (auto &[grfSize, simtSize, expectedNumThreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfSize, rootDeviceEnvironment));
}
}