diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 023794273f..ef64193323 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -418,10 +418,7 @@ ze_result_t KernelImp::suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount } auto &hwHelper = NEO::HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); auto &descriptor = kernelImmData->getDescriptor(); - auto availableThreadCount = hwHelper.calculateAvailableThreadCount( - hardwareInfo.platform.eProductFamily, - descriptor.kernelAttributes.numGrfRequired, - hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount); + auto availableThreadCount = hwHelper.calculateAvailableThreadCount(hardwareInfo, descriptor.kernelAttributes.numGrfRequired); auto barrierCount = descriptor.kernelAttributes.barrierCount; const uint32_t workDim = 3; diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_function.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_function.cpp index f5fc568bbf..cd8c08a04c 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_function.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_function.cpp @@ -261,10 +261,7 @@ class KernelImpSuggestMaxCooperativeGroupCountTests : public KernelImp { funcInfo.kernelDescriptor = &kernelDescriptor; auto &hardwareInfo = device->getHwInfo(); auto &hwHelper = device->getHwHelper(); - availableThreadCount = hwHelper.calculateAvailableThreadCount( - hardwareInfo.platform.eProductFamily, - numGrf, - hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount); + availableThreadCount = hwHelper.calculateAvailableThreadCount(hardwareInfo, numGrf); dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount; if (dssCount == 0) { diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index d525b72730..12a149de22 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -1073,10 +1073,7 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local if (dssCount == 0) { dssCount = hardwareInfo.gtSystemInfo.SubSliceCount; } - auto availableThreadCount = hwHelper.calculateAvailableThreadCount( - hardwareInfo.platform.eProductFamily, - kernelDescriptor.kernelAttributes.numGrfRequired, - hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount); + auto availableThreadCount = hwHelper.calculateAvailableThreadCount(hardwareInfo, kernelDescriptor.kernelAttributes.numGrfRequired); auto barrierCount = kernelDescriptor.kernelAttributes.barrierCount; auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(kernelInfo.getMaxSimdSize(), diff --git a/opencl/test/unit_test/helpers/hw_helper_tests.cpp b/opencl/test/unit_test/helpers/hw_helper_tests.cpp index 793608ab52..2d1d8d275c 100644 --- a/opencl/test/unit_test/helpers/hw_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/hw_helper_tests.cpp @@ -1002,14 +1002,20 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HwHelperTest, GivenBarrierEncodingWhenCallingGetBarr HWCMDTEST_F(IGFX_GEN8_CORE, HwHelperTest, GivenVariousValuesWhenCallingCalculateAvailableThreadCountThenCorrectValueIsReturned) { auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); - auto result = hwHelper.calculateAvailableThreadCount( - hardwareInfo.platform.eProductFamily, - 0, - hardwareInfo.gtSystemInfo.EUCount, - hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount); + auto result = hwHelper.calculateAvailableThreadCount(hardwareInfo, 0); EXPECT_EQ(hardwareInfo.gtSystemInfo.ThreadCount, result); } +HWCMDTEST_F(IGFX_GEN8_CORE, HwHelperTest, GivenModifiedGtSystemInfoWhenCallingCalculateAvailableThreadCountThenCorrectValueIsReturned) { + auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); + auto hwInfo = hardwareInfo; + for (auto threadCount : {1u, 5u, 9u}) { + hwInfo.gtSystemInfo.ThreadCount = threadCount; + auto result = hwHelper.calculateAvailableThreadCount(hwInfo, 0); + EXPECT_EQ(threadCount, result); + } +} + HWTEST_F(HwHelperTest, givenDefaultHwHelperHwWhenIsOffsetToSkipSetFFIDGPWARequiredCalledThenFalseIsReturned) { if (hardwareInfo.platform.eRenderCoreFamily == IGFX_GEN12LP_CORE) { GTEST_SKIP(); @@ -1481,3 +1487,30 @@ using LogicalStateHelperTest = ::testing::Test; HWTEST_F(LogicalStateHelperTest, whenCreatingLogicalStateHelperThenReturnNullptr) { EXPECT_EQ(nullptr, LogicalStateHelper::create()); } + +HWTEST2_F(HwHelperTest, GivenVariousValuesAndXeHpAndLaterPlatformsWhenCallingCalculateAvailableThreadCountThenCorrectValueIsReturned, ATSOrDG2) { + std::array, 3> grfTestInputs = {{{64, 8}, + {128, 8}, + {256, 4}}}; + + const auto &hwInfo = *defaultHwInfo; + auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily); + for (const auto &[grfCount, expectedThreadCountPerEu] : grfTestInputs) { + auto expected = expectedThreadCountPerEu * hwInfo.gtSystemInfo.EUCount; + auto result = hwHelper.calculateAvailableThreadCount(hwInfo, grfCount); + EXPECT_EQ(expected, result); + } +} + +HWTEST2_F(HwHelperTest, GivenModifiedGtSystemInfoAndXeHpAndLaterPlatformsWhenCallingCalculateAvailableThreadCountThenCorrectValueIsReturned, ATSOrDG2) { + std::array, 3> testInputs = {{{1, 64, 1}, + {5, 128, 5}, + {8, 256, 4}}}; + auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); + auto hwInfo = hardwareInfo; + for (const auto &[threadCount, grfCount, expectedThreadCount] : testInputs) { + hwInfo.gtSystemInfo.ThreadCount = threadCount; + auto result = hwHelper.calculateAvailableThreadCount(hwInfo, grfCount); + EXPECT_EQ(expectedThreadCount, result); + } +} diff --git a/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp b/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp index 174a296c4b..4740965bd3 100644 --- a/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp +++ b/opencl/test/unit_test/helpers/hw_helper_tests_pvc_and_later.cpp @@ -45,34 +45,34 @@ HWTEST2_F(HwHelperTestPvcAndLater, givenRenderEngineWhenRemapCalledThenUseCccs, EXPECT_EQ(aub_stream::EngineType::ENGINE_BCS, EngineHelpers::remapEngineTypeToHwSpecific(aub_stream::EngineType::ENGINE_BCS, hardwareInfo)); } -HWTEST2_F(HwHelperTestPvcAndLater, GivenVariousValuesWhenCallingCalculateAvailableThreadCountThenCorrectValueIsReturned, IsAtLeastXeHpcCore) { - struct TestInput { - uint32_t grfCount; - uint32_t expectedThreadCountPerEu; - }; - - std::vector grfTestInputs = { - {64, 16}, - {96, 10}, - {128, 8}, - {160, 6}, - {192, 5}, - {256, 4}, - }; - +HWTEST2_F(HwHelperTestPvcAndLater, GivenVariousValuesAndPvcAndLaterPlatformsWhenCallingCalculateAvailableThreadCountThenCorrectValueIsReturned, IsAtLeastXeHpcCore) { + std::array, 6> grfTestInputs = {{{64, 16}, + {96, 10}, + {128, 8}, + {160, 6}, + {192, 5}, + {256, 4}}}; auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); - - for (auto &testInput : grfTestInputs) { - auto expected = testInput.expectedThreadCountPerEu * hardwareInfo.gtSystemInfo.EUCount; - auto result = hwHelper.calculateAvailableThreadCount( - hardwareInfo.platform.eProductFamily, - testInput.grfCount, - hardwareInfo.gtSystemInfo.EUCount, - hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount); + for (const auto &[grfCount, expectedThreadCountPerEu] : grfTestInputs) { + auto expected = expectedThreadCountPerEu * hardwareInfo.gtSystemInfo.EUCount; + auto result = hwHelper.calculateAvailableThreadCount(hardwareInfo, grfCount); EXPECT_EQ(expected, result); } } +HWTEST2_F(HwHelperTestPvcAndLater, GivenModifiedGtSystemInfoAndPvcAndLaterPlatformsWhenCallingCalculateAvailableThreadCountThenCorrectValueIsReturned, IsAtLeastXeHpcCore) { + std::array, 3> testInputs = {{{64, 256}, + {96, 384}, + {128, 512}}}; + auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); + auto hwInfo = hardwareInfo; + for (const auto &[euCount, expectedThreadCount] : testInputs) { + hwInfo.gtSystemInfo.EUCount = euCount; + auto result = hwHelper.calculateAvailableThreadCount(hwInfo, 256); + EXPECT_EQ(expectedThreadCount, result); + } +} + HWTEST2_F(HwHelperTestPvcAndLater, givenHwHelperWhenCheckIsUpdateTaskCountFromWaitSupportedThenReturnsTrue, IsAtLeastXeHpcCore) { auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index 51c6eeef45..2aa3f88a00 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -89,8 +89,7 @@ class HwHelper { virtual uint32_t getMocsIndex(const GmmHelper &gmmHelper, bool l3enabled, bool l1enabled) const = 0; virtual bool isLinearStoragePreferred(bool isSharedContext, bool isImage1d, bool forceLinearStorage) = 0; virtual uint8_t getBarriersCountFromHasBarriers(uint8_t hasBarriers) const = 0; - virtual uint32_t calculateAvailableThreadCount(PRODUCT_FAMILY family, uint32_t grfCount, uint32_t euCount, - uint32_t threadsPerEu) = 0; + virtual uint32_t calculateAvailableThreadCount(const HardwareInfo &hwInfo, uint32_t grfCount) = 0; virtual uint32_t alignSlmSize(uint32_t slmSize) = 0; virtual uint32_t computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) = 0; @@ -292,7 +291,7 @@ class HwHelperHw : public HwHelper { uint8_t getBarriersCountFromHasBarriers(uint8_t hasBarriers) const override; - uint32_t calculateAvailableThreadCount(PRODUCT_FAMILY family, uint32_t grfCount, uint32_t euCount, uint32_t threadsPerEu) override; + uint32_t calculateAvailableThreadCount(const HardwareInfo &hwInfo, uint32_t grfCount) override; uint32_t alignSlmSize(uint32_t slmSize) override; diff --git a/shared/source/helpers/hw_helper_bdw_and_later.inl b/shared/source/helpers/hw_helper_bdw_and_later.inl index 4005c42e4e..26a3e7a6c3 100644 --- a/shared/source/helpers/hw_helper_bdw_and_later.inl +++ b/shared/source/helpers/hw_helper_bdw_and_later.inl @@ -93,9 +93,8 @@ uint32_t HwHelperHw::getMocsIndex(const GmmHelper &gmmHelper, bool l3 } template -uint32_t HwHelperHw::calculateAvailableThreadCount(PRODUCT_FAMILY family, uint32_t grfCount, uint32_t euCount, - uint32_t threadsPerEu) { - return threadsPerEu * euCount; +uint32_t HwHelperHw::calculateAvailableThreadCount(const HardwareInfo &hwInfo, uint32_t grfCount) { + return hwInfo.gtSystemInfo.ThreadCount; } template diff --git a/shared/source/helpers/hw_helper_pvc_and_later.inl b/shared/source/helpers/hw_helper_pvc_and_later.inl index 6114bf930c..1983f499cf 100644 --- a/shared/source/helpers/hw_helper_pvc_and_later.inl +++ b/shared/source/helpers/hw_helper_pvc_and_later.inl @@ -72,4 +72,10 @@ size_t HwHelperHw::getPaddingForISAAllocation() const { return 0xE00; } +template <> +uint32_t HwHelperHw::calculateAvailableThreadCount(const HardwareInfo &hwInfo, uint32_t grfCount) { + auto maxThreadsPerEuCount = 1024u / grfCount; + return maxThreadsPerEuCount * hwInfo.gtSystemInfo.EUCount; +} + } // namespace NEO diff --git a/shared/source/helpers/hw_helper_xehp_and_later.inl b/shared/source/helpers/hw_helper_xehp_and_later.inl index f2942f762c..5182e24597 100644 --- a/shared/source/helpers/hw_helper_xehp_and_later.inl +++ b/shared/source/helpers/hw_helper_xehp_and_later.inl @@ -129,12 +129,11 @@ uint32_t HwHelperHw::getMocsIndex(const GmmHelper &gmmHelper, bool l3 } template -uint32_t HwHelperHw::calculateAvailableThreadCount(PRODUCT_FAMILY family, uint32_t grfCount, uint32_t euCount, - uint32_t threadsPerEu) { +uint32_t HwHelperHw::calculateAvailableThreadCount(const HardwareInfo &hwInfo, uint32_t grfCount) { if (grfCount > GrfConfig::DefaultGrfNumber) { - return threadsPerEu / 2u * euCount; + return hwInfo.gtSystemInfo.ThreadCount / 2u; } - return threadsPerEu * euCount; + return hwInfo.gtSystemInfo.ThreadCount; } template diff --git a/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp b/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp index 0c796a719f..ee0d460cd2 100644 --- a/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/hw_helper_xe_hpc_core.cpp @@ -115,13 +115,6 @@ bool HwHelperHw::isLinearStoragePreferred(bool isSharedContext, bool isI return true; } -template <> -uint32_t HwHelperHw::calculateAvailableThreadCount(PRODUCT_FAMILY family, uint32_t grfCount, uint32_t euCount, - uint32_t threadsPerEu) { - auto maxThreadsPerEuCount = 1024u / grfCount; - return maxThreadsPerEuCount * euCount; -} - template <> uint32_t HwHelperHw::getMetricsLibraryGenId() const { return static_cast(MetricsLibraryApi::ClientGen::XeHPC);