From 828d6bafa7e2f0e20b94e9e4ea1a2316807485a5 Mon Sep 17 00:00:00 2001 From: Katarzyna Cencelewska Date: Thu, 29 May 2025 17:14:54 +0000 Subject: [PATCH] fix: return proper value for zeKernelSuggestGroupSize Resolves: HSD-18042274687 Signed-off-by: Katarzyna Cencelewska --- level_zero/core/source/module/module_imp.cpp | 4 +-- opencl/source/kernel/kernel.cpp | 5 ++-- .../gen12lp/gfx_core_helper_gen12lp.cpp | 2 +- shared/source/helpers/gfx_core_helper.h | 4 +-- .../gfx_core_helper_xehp_and_later.inl | 3 ++- .../helpers/gfx_core_helper_tests.cpp | 8 +++--- .../gfx_core_helper_tests_dg2_and_later.cpp | 26 ++++++++++++++++--- 7 files changed, 36 insertions(+), 16 deletions(-) diff --git a/level_zero/core/source/module/module_imp.cpp b/level_zero/core/source/module/module_imp.cpp index e72cec9128..b968a0a977 100644 --- a/level_zero/core/source/module/module_imp.cpp +++ b/level_zero/core/source/module/module_imp.cpp @@ -427,7 +427,7 @@ ze_result_t ModuleTranslationUnit::processUnpackedBinary() { } for (auto &kernelInfo : this->programInfo.kernelInfos) { - deviceInfoConstants.maxWorkGroupSize = gfxCoreHelper.calculateMaxWorkGroupSize(kernelInfo->kernelDescriptor, static_cast(device->getDeviceInfo().maxWorkGroupSize)); + deviceInfoConstants.maxWorkGroupSize = gfxCoreHelper.calculateMaxWorkGroupSize(kernelInfo->kernelDescriptor, static_cast(device->getDeviceInfo().maxWorkGroupSize), device->getNEODevice()->getRootDeviceEnvironment()); kernelInfo->apply(deviceInfoConstants); } @@ -905,7 +905,7 @@ const KernelImmutableData *ModuleImp::getKernelImmutableData(const char *kernelN } uint32_t ModuleImp::getMaxGroupSize(const NEO::KernelDescriptor &kernelDescriptor) const { - return this->device->getGfxCoreHelper().calculateMaxWorkGroupSize(kernelDescriptor, static_cast(this->device->getDeviceInfo().maxWorkGroupSize)); + return this->device->getGfxCoreHelper().calculateMaxWorkGroupSize(kernelDescriptor, static_cast(this->device->getDeviceInfo().maxWorkGroupSize), device->getNEODevice()->getRootDeviceEnvironment()); } void ModuleImp::createBuildOptions(const char *pBuildFlags, std::string &apiOptions, std::string &internalBuildOptions) { diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 4eba8b2c92..259d8c5e01 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -2234,10 +2234,9 @@ bool Kernel::areMultipleSubDevicesInContext() const { void Kernel::reconfigureKernel() { const auto &kernelDescriptor = kernelInfo.kernelDescriptor; const auto &gfxCoreHelper = this->getGfxCoreHelper(); - auto maxWorkGroupSize = gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, this->maxKernelWorkGroupSize); - maxWorkGroupSize = static_cast(kernelInfo.getMaxRequiredWorkGroupSize(maxWorkGroupSize)); + auto &rootDeviceEnvironment = getDevice().getRootDeviceEnvironment(); - this->maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, maxWorkGroupSize, getDevice().getRootDeviceEnvironment()); + this->maxKernelWorkGroupSize = gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, this->maxKernelWorkGroupSize, rootDeviceEnvironment); this->containsStatelessWrites = kernelDescriptor.kernelAttributes.flags.usesStatelessWrites; this->systolicPipelineSelectMode = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode; diff --git a/shared/source/gen12lp/gfx_core_helper_gen12lp.cpp b/shared/source/gen12lp/gfx_core_helper_gen12lp.cpp index 615f8b548d..ccb929e4f3 100644 --- a/shared/source/gen12lp/gfx_core_helper_gen12lp.cpp +++ b/shared/source/gen12lp/gfx_core_helper_gen12lp.cpp @@ -60,7 +60,7 @@ bool GfxCoreHelperHw::makeResidentBeforeLockNeeded(bool precondition) } template -inline uint32_t GfxCoreHelperHw::calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize) const { +inline uint32_t GfxCoreHelperHw::calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const { return std::min(defaultMaxGroupSize, CommonConstants::maxWorkgroupSize); } diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index 3f095dc2c6..634131da9d 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -99,7 +99,7 @@ class GfxCoreHelper { virtual uint32_t getMocsIndex(const GmmHelper &gmmHelper, bool l3enabled, bool l1enabled) const = 0; virtual uint8_t getBarriersCountFromHasBarriers(uint8_t hasBarriers) const = 0; virtual uint32_t calculateAvailableThreadCount(const HardwareInfo &hwInfo, uint32_t grfCount) const = 0; - virtual uint32_t calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize) const = 0; + virtual uint32_t calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0; virtual uint32_t alignSlmSize(uint32_t slmSize) const = 0; virtual uint32_t computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize, ReleaseHelper *releaseHelper, bool isHeapless) const = 0; @@ -320,7 +320,7 @@ class GfxCoreHelperHw : public GfxCoreHelper { void alignThreadGroupCountToDssSize(uint32_t &threadCount, uint32_t dssCount, uint32_t threadsPerDss, uint32_t threadGroupSize) const override; - uint32_t calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize) const override; + uint32_t calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override; uint32_t alignSlmSize(uint32_t slmSize) const override; diff --git a/shared/source/helpers/gfx_core_helper_xehp_and_later.inl b/shared/source/helpers/gfx_core_helper_xehp_and_later.inl index 98cc1a0cc3..657ddcc193 100644 --- a/shared/source/helpers/gfx_core_helper_xehp_and_later.inl +++ b/shared/source/helpers/gfx_core_helper_xehp_and_later.inl @@ -96,10 +96,11 @@ uint32_t GfxCoreHelperHw::getMocsIndex(const GmmHelper &gmmHelper, bo } template -inline uint32_t GfxCoreHelperHw::calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize) const { +inline uint32_t GfxCoreHelperHw::calculateMaxWorkGroupSize(const KernelDescriptor &kernelDescriptor, uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const { if (kernelDescriptor.kernelAttributes.simdSize != 32 && kernelDescriptor.kernelAttributes.numGrfRequired == GrfConfig::largeGrfNumber) { defaultMaxGroupSize >>= 1; } + defaultMaxGroupSize = adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, defaultMaxGroupSize, rootDeviceEnvironment); return std::min(defaultMaxGroupSize, CommonConstants::maxWorkgroupSize); } diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp index eb0b992a04..908a097fd5 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp @@ -1514,18 +1514,20 @@ HWTEST2_F(GfxCoreHelperTest, givenLargeGrfIsNotSupportedWhenCalculatingMaxWorkGr auto defaultMaxGroupSize = 42u; NEO::KernelDescriptor kernelDescriptor{}; + MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; kernelDescriptor.kernelAttributes.simdSize = 16; kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::largeGrfNumber; - EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize, rootDeviceEnvironment)); kernelDescriptor.kernelAttributes.simdSize = 32; kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::largeGrfNumber; - EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize, rootDeviceEnvironment)); kernelDescriptor.kernelAttributes.simdSize = 16; kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::defaultGrfNumber; - EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize, rootDeviceEnvironment)); } HWTEST_F(GfxCoreHelperTest, whenIsDynamicallyPopulatedisTrueThengetHighestEnabledSliceReturnsHighestEnabledSliceInfo) { diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests_dg2_and_later.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests_dg2_and_later.cpp index b7813223ff..bfe753d22b 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests_dg2_and_later.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests_dg2_and_later.cpp @@ -64,22 +64,40 @@ HWTEST2_F(GfxCoreHelperDg2AndLaterTest, GivenUseL1CacheAsFalseWhenCallSetL1Cache using GfxCoreHelperWithLargeGrf = ::testing::Test; HWTEST2_F(GfxCoreHelperWithLargeGrf, givenLargeGrfAndSimdSmallerThan32WhenCalculatingMaxWorkGroupSizeThenReturnHalfOfDeviceDefault, IsWithinXeGfxFamily) { MockExecutionEnvironment mockExecutionEnvironment{}; - auto &gfxCoreHelper = mockExecutionEnvironment.rootDeviceEnvironments[0]->getHelper(); + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); auto defaultMaxGroupSize = 42u; NEO::KernelDescriptor kernelDescriptor{}; kernelDescriptor.kernelAttributes.simdSize = 16; kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::largeGrfNumber; - EXPECT_EQ((defaultMaxGroupSize >> 1), gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); + EXPECT_EQ((defaultMaxGroupSize >> 1), gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize, rootDeviceEnvironment)); kernelDescriptor.kernelAttributes.simdSize = 32; kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::largeGrfNumber; - EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize, rootDeviceEnvironment)); kernelDescriptor.kernelAttributes.simdSize = 16; kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::defaultGrfNumber; - EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize)); + EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize, rootDeviceEnvironment)); +} + +HWTEST2_F(GfxCoreHelperDg2AndLaterTest, givenGfxCoreHelperWhenCallCalculateMaxWorkGroupSizeThenMethodAdjustMaxWorkGroupSizeIsCalled, IsWithinXeGfxFamily) { + static bool isCalledAdjustMaxWorkGroupSize = false; + struct MockGfxCoreHelper : NEO::GfxCoreHelperHw { + uint32_t adjustMaxWorkGroupSize(const uint32_t grfCount, const uint32_t simd, const uint32_t defaultMaxGroupSize, const RootDeviceEnvironment &rootDeviceEnvironment) const override { + isCalledAdjustMaxWorkGroupSize = true; + return defaultMaxGroupSize; + } + }; + MockGfxCoreHelper gfxCoreHelper; + MockExecutionEnvironment mockExecutionEnvironment{}; + auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0]; + NEO::KernelDescriptor kernelDescriptor{}; + auto defaultMaxGroupSize = 42u; + gfxCoreHelper.calculateMaxWorkGroupSize(kernelDescriptor, defaultMaxGroupSize, rootDeviceEnvironment); + EXPECT_TRUE(isCalledAdjustMaxWorkGroupSize); } using PipeControlHelperTestsDg2AndLater = ::testing::Test;