From 8ea5bbd3b5df7aa6df5527f66c5222afb99f872f Mon Sep 17 00:00:00 2001 From: Sebastian Luzynski Date: Wed, 15 Jun 2022 13:29:47 +0000 Subject: [PATCH] Use DualSubSliceCount to calculate workgroup size Related-To: NEO-5719 Signed-off-by: Sebastian Luzynski --- level_zero/core/test/unit_tests/main.cpp | 1 + .../sources/kernel/test_function.cpp | 9 +++++++ ...oncurrent_work_group_count_intel_tests.inl | 2 ++ .../unit_test/device/device_caps_tests.cpp | 11 ++++---- opencl/test/unit_test/main.cpp | 1 + shared/source/device/device_caps.cpp | 8 +++--- .../unit_test/device/neo_device_tests.cpp | 25 +++++++++++++++++++ shared/test/unit_test/main.cpp | 1 + 8 files changed, 49 insertions(+), 9 deletions(-) diff --git a/level_zero/core/test/unit_tests/main.cpp b/level_zero/core/test/unit_tests/main.cpp index 2174ca06a0..f75781ba2e 100644 --- a/level_zero/core/test/unit_tests/main.cpp +++ b/level_zero/core/test/unit_tests/main.cpp @@ -299,6 +299,7 @@ int main(int argc, char **argv) { // clang-format off gtSystemInfo.SliceCount = sliceCount; gtSystemInfo.SubSliceCount = gtSystemInfo.SliceCount * subSlicePerSliceCount; + gtSystemInfo.DualSubSliceCount = gtSystemInfo.SubSliceCount; gtSystemInfo.EUCount = gtSystemInfo.SubSliceCount * euPerSubSlice - dieRecovery; gtSystemInfo.ThreadCount = gtSystemInfo.EUCount * threadsPerEu; gtSystemInfo.MaxEuPerSubSlice = std::max(gtSystemInfo.MaxEuPerSubSlice, euPerSubSlice); diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_function.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_function.cpp index f5fc568bbf..a63c69a595 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_function.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_function.cpp @@ -301,6 +301,15 @@ TEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsedWh EXPECT_EQ(expected, getMaxWorkGroupCount()); } +TEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenNoBarriersOrSlmUsedAndDSSCountEqualZeroWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithSimd) { + auto workGroupSize = lws[0] * lws[1] * lws[2]; + auto expected = availableThreadCount / Math::divideAndRoundUp(workGroupSize, simd); + auto mutableHwInfo = neoDevice->getRootDeviceEnvironment().getMutableHardwareInfo(); + mutableHwInfo->gtSystemInfo.DualSubSliceCount = 0; + + EXPECT_EQ(expected, getMaxWorkGroupCount()); +} + TEST_F(KernelImpSuggestMaxCooperativeGroupCountTests, GivenBarriersWhenCalculatingMaxCooperativeGroupCountThenResultIsCalculatedWithRegardToBarriersCount) { usesBarriers = 1; auto expected = dssCount * (maxBarrierCount / usesBarriers); diff --git a/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl b/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl index 1d788e00eb..d05b5b953a 100644 --- a/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl +++ b/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl @@ -59,6 +59,8 @@ TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenVariousInputWhenGetting size_t globalWorkOffset[] = {0, 0, 0}; size_t localWorkSize[] = {8, 8, 8}; size_t maxConcurrentWorkGroupCount = 0; + auto mutableHwInfo = pDevice->getRootDeviceEnvironment().getMutableHardwareInfo(); + mutableHwInfo->gtSystemInfo.DualSubSliceCount = 0; const_cast(pKernel->getKernelInfo()).kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::DefaultGrfNumber; retVal = clGetKernelMaxConcurrentWorkGroupCountINTEL(pCommandQueue, pMultiDeviceKernel, workDim, globalWorkOffset, localWorkSize, diff --git a/opencl/test/unit_test/device/device_caps_tests.cpp b/opencl/test/unit_test/device/device_caps_tests.cpp index 174752463d..9ca2133675 100644 --- a/opencl/test/unit_test/device/device_caps_tests.cpp +++ b/opencl/test/unit_test/device/device_caps_tests.cpp @@ -1096,7 +1096,7 @@ TEST(DeviceGetCaps, givenDebugFlagToUseMaxSimdSizeForWkgCalculationWhenDeviceCap GT_SYSTEM_INFO &mySysInfo = myHwInfo.gtSystemInfo; mySysInfo.EUCount = 24; - mySysInfo.SubSliceCount = 3; + mySysInfo.DualSubSliceCount = 3; mySysInfo.ThreadCount = 24 * 7; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&myHwInfo)); @@ -1112,7 +1112,7 @@ HWTEST_F(DeviceGetCapsTest, givenDeviceThatHasHighNumberOfExecutionUnitsWhenMaxW auto &hwHelper = HwHelper::get(myHwInfo.platform.eRenderCoreFamily); mySysInfo.EUCount = 32; - mySysInfo.SubSliceCount = 2; + mySysInfo.DualSubSliceCount = 2; mySysInfo.ThreadCount = 32 * hwHelper.getMinimalSIMDSize(); // 128 threads per subslice, in simd 8 gives 1024 auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&myHwInfo)); @@ -1504,19 +1504,18 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DeviceGetCapsTest, givenSysInfoWhenDeviceCreatedThen PLATFORM &myPlatform = myHwInfo.platform; mySysInfo.EUCount = 16; - mySysInfo.SubSliceCount = 4; mySysInfo.DualSubSliceCount = 2; mySysInfo.ThreadCount = 16 * 8; myPlatform.usRevId = 0x4; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&myHwInfo)); auto minSimd = 8; - auto expectedWG = (mySysInfo.ThreadCount / mySysInfo.EUCount) * (mySysInfo.EUCount / mySysInfo.SubSliceCount) * minSimd; + auto expectedWG = (mySysInfo.ThreadCount / mySysInfo.EUCount) * (mySysInfo.EUCount / mySysInfo.DualSubSliceCount) * minSimd; EXPECT_EQ(expectedWG, device->sharedDeviceInfo.maxWorkGroupSize); } -HWTEST_F(DeviceGetCapsTest, givenDSSDifferentThanZeroWhenDeviceCreatedThenDualSubSliceCountIsDifferentThanSubSliceCount) { +HWTEST_F(DeviceGetCapsTest, givenDSSDifferentThanZeroAndDifferentThanSubSliceCountWhenDeviceCreatedThenDualSubSliceCountIsSameAsSubSliceCount) { HardwareInfo myHwInfo = *defaultHwInfo; GT_SYSTEM_INFO &mySysInfo = myHwInfo.gtSystemInfo; PLATFORM &myPlatform = myHwInfo.platform; @@ -1528,7 +1527,7 @@ HWTEST_F(DeviceGetCapsTest, givenDSSDifferentThanZeroWhenDeviceCreatedThenDualSu myPlatform.usRevId = 0x4; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&myHwInfo)); - EXPECT_NE(device->sharedDeviceInfo.maxNumEUsPerSubSlice, device->sharedDeviceInfo.maxNumEUsPerDualSubSlice); + EXPECT_EQ(device->sharedDeviceInfo.maxNumEUsPerSubSlice, device->sharedDeviceInfo.maxNumEUsPerDualSubSlice); } HWTEST_F(DeviceGetCapsTest, givenDSSCountEqualZeroWhenDeviceCreatedThenMaxEuPerDSSEqualMaxEuPerSS) { diff --git a/opencl/test/unit_test/main.cpp b/opencl/test/unit_test/main.cpp index be34436f7e..67ce6384a3 100644 --- a/opencl/test/unit_test/main.cpp +++ b/opencl/test/unit_test/main.cpp @@ -317,6 +317,7 @@ int main(int argc, char **argv) { // clang-format off gtSystemInfo.SliceCount = sliceCount; gtSystemInfo.SubSliceCount = gtSystemInfo.SliceCount * subSlicePerSliceCount; + gtSystemInfo.DualSubSliceCount = gtSystemInfo.SubSliceCount; gtSystemInfo.EUCount = gtSystemInfo.SubSliceCount * euPerSubSlice - dieRecovery; gtSystemInfo.ThreadCount = gtSystemInfo.EUCount * threadsPerEu; gtSystemInfo.MaxEuPerSubSlice = std::max(gtSystemInfo.MaxEuPerSubSlice, euPerSubSlice); diff --git a/shared/source/device/device_caps.cpp b/shared/source/device/device_caps.cpp index 99de6c00d3..14a7fbcb01 100644 --- a/shared/source/device/device_caps.cpp +++ b/shared/source/device/device_caps.cpp @@ -118,12 +118,14 @@ void Device::initializeCaps() { ? CommonConstants::maximalSimdSize : hwHelper.getMinimalSIMDSize(); + uint32_t dualSubsliceCount = systemInfo.DualSubSliceCount == 0 ? systemInfo.SubSliceCount : systemInfo.DualSubSliceCount; + deviceInfo.maxNumEUsPerSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.featureTable.flags.ftrPooledEuEnabled == 0) - ? (systemInfo.EUCount / systemInfo.SubSliceCount) + ? (systemInfo.EUCount / dualSubsliceCount) : systemInfo.EuCountPerPoolMin; - if (systemInfo.DualSubSliceCount != 0) { + if (dualSubsliceCount != 0) { deviceInfo.maxNumEUsPerDualSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.featureTable.flags.ftrPooledEuEnabled == 0) - ? (systemInfo.EUCount / systemInfo.DualSubSliceCount) + ? (systemInfo.EUCount / dualSubsliceCount) : systemInfo.EuCountPerPoolMin; } else { diff --git a/shared/test/unit_test/device/neo_device_tests.cpp b/shared/test/unit_test/device/neo_device_tests.cpp index 5975270cee..9e470914d1 100644 --- a/shared/test/unit_test/device/neo_device_tests.cpp +++ b/shared/test/unit_test/device/neo_device_tests.cpp @@ -118,6 +118,31 @@ TEST_F(DeviceGetCapsTest, givenMockCompilerInterfaceWhenInitializeCapsIsCalledTh EXPECT_EQ(1u, pDevice->getDeviceInfo().maxParameterSize); } +TEST_F(DeviceGetCapsTest, whenInitializeCapsIsCalledWithDSSCountSetToZeroThenMaxWorkGroupSizeIsTheSame) { + pDevice->initializeCaps(); + auto maxWorkGroupSizeBefore = pDevice->getDeviceInfo().maxWorkGroupSize; + + auto mutableHwInfo = pDevice->getRootDeviceEnvironment().getMutableHardwareInfo(); + mutableHwInfo->gtSystemInfo.DualSubSliceCount = 0; + pDevice->initializeCaps(); + auto maxWorkGroupSizeAfter = pDevice->getDeviceInfo().maxWorkGroupSize; + + EXPECT_EQ(maxWorkGroupSizeBefore, maxWorkGroupSizeAfter); +} + +TEST_F(DeviceGetCapsTest, givenSSCountAndDSSCountEqualToZeroAndEuCountPerPoolMinIsSetThenMaxNumEUsPerSliceIsSetAccordingly) { + auto mutableHwInfo = pDevice->getRootDeviceEnvironment().getMutableHardwareInfo(); + mutableHwInfo->gtSystemInfo.EuCountPerPoolMin = 16; + mutableHwInfo->featureTable.flags.ftrPooledEuEnabled = 1; + mutableHwInfo->gtSystemInfo.SubSliceCount = 0; + mutableHwInfo->gtSystemInfo.DualSubSliceCount = 0; + + pDevice->initializeCaps(); + + EXPECT_EQ(pDevice->getDeviceInfo().maxNumEUsPerSubSlice, 16ul); + EXPECT_EQ(pDevice->getDeviceInfo().maxNumEUsPerDualSubSlice, 16ul); +} + TEST_F(DeviceGetCapsTest, givenImplicitScalingWhenInitializeCapsIsCalledThenMaxMemAllocSizeIsSetCorrectly) { DebugManagerStateRestore dbgRestorer; diff --git a/shared/test/unit_test/main.cpp b/shared/test/unit_test/main.cpp index 02fc2a70b7..8d8f787c26 100644 --- a/shared/test/unit_test/main.cpp +++ b/shared/test/unit_test/main.cpp @@ -302,6 +302,7 @@ int main(int argc, char **argv) { // clang-format off gtSystemInfo.SliceCount = sliceCount; gtSystemInfo.SubSliceCount = gtSystemInfo.SliceCount * subSlicePerSliceCount; + gtSystemInfo.DualSubSliceCount = gtSystemInfo.SubSliceCount; gtSystemInfo.EUCount = gtSystemInfo.SubSliceCount * euPerSubSlice - dieRecovery; gtSystemInfo.ThreadCount = gtSystemInfo.EUCount * threadsPerEu; gtSystemInfo.MaxEuPerSubSlice = std::max(gtSystemInfo.MaxEuPerSubSlice, euPerSubSlice);