diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp index 3d9826055a..9a68065b41 100644 --- a/opencl/source/kernel/kernel.cpp +++ b/opencl/source/kernel/kernel.cpp @@ -77,7 +77,8 @@ Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, ClDevice &c program->retainForKernel(); imageTransformer.reset(new ImageTransformer); if (kernelInfoArg.kernelDescriptor.kernelAttributes.simdSize == 1u) { - maxKernelWorkGroupSize = HwHelper::get(getHardwareInfo().platform.eRenderCoreFamily).getMaxThreadsForWorkgroup(getHardwareInfo(), static_cast(getDevice().getDevice().getDeviceInfo().maxNumEUsPerSubSlice)); + auto deviceInfo = getDevice().getDevice().getDeviceInfo(); + maxKernelWorkGroupSize = HwHelper::get(getHardwareInfo().platform.eRenderCoreFamily).getMaxThreadsForWorkgroupInDSSOrSS(getHardwareInfo(), static_cast(deviceInfo.maxNumEUsPerSubSlice), static_cast(deviceInfo.maxNumEUsPerDualSubSlice)); } else { maxKernelWorkGroupSize = static_cast(clDevice.getSharedDeviceInfo().maxWorkGroupSize); } diff --git a/opencl/test/unit_test/device/device_caps_tests.cpp b/opencl/test/unit_test/device/device_caps_tests.cpp index 92fe1c9e0b..b4d7557c4b 100644 --- a/opencl/test/unit_test/device/device_caps_tests.cpp +++ b/opencl/test/unit_test/device/device_caps_tests.cpp @@ -1685,3 +1685,50 @@ HWTEST_F(QueueFamilyNameTest, givenBcsWhenGettingQueueFamilyNameThenReturnProper HWTEST_F(QueueFamilyNameTest, givenInvalidEngineGroupWhenGettingQueueFamilyNameThenReturnEmptyName) { verify(EngineGroupType::MaxEngineGroups, ""); } +HWCMDTEST_F(IGFX_GEN8_CORE, DeviceGetCapsTest, givenSysInfoWhenDeviceCreatedThenMaxWorkGroupCalculatedCorrectly) { + HardwareInfo myHwInfo = *defaultHwInfo; + GT_SYSTEM_INFO &mySysInfo = myHwInfo.gtSystemInfo; + PLATFORM &myPlatform = myHwInfo.platform; + + mySysInfo.EUCount = 16; + mySysInfo.SubSliceCount = 4; + mySysInfo.DualSubSliceCount = 2; + mySysInfo.ThreadCount = 16 * 8; + myPlatform.usRevId = 0x4; + auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&myHwInfo)); + auto minSimd = 8; + + auto expectedWG = (mySysInfo.ThreadCount / mySysInfo.EUCount) * (mySysInfo.EUCount / mySysInfo.SubSliceCount) * minSimd; + + EXPECT_EQ(expectedWG, device->sharedDeviceInfo.maxWorkGroupSize); +} + +HWTEST_F(DeviceGetCapsTest, givenDSSDifferentThanZeroWhenDeviceCreatedThenDualSubSliceCountIsDifferentThanSubSliceCount) { + HardwareInfo myHwInfo = *defaultHwInfo; + GT_SYSTEM_INFO &mySysInfo = myHwInfo.gtSystemInfo; + PLATFORM &myPlatform = myHwInfo.platform; + + mySysInfo.EUCount = 16; + mySysInfo.SubSliceCount = 4; + mySysInfo.DualSubSliceCount = 2; + mySysInfo.ThreadCount = 16 * 8; + myPlatform.usRevId = 0x4; + auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&myHwInfo)); + + EXPECT_NE(device->sharedDeviceInfo.maxNumEUsPerSubSlice, device->sharedDeviceInfo.maxNumEUsPerDualSubSlice); +} + +HWTEST_F(DeviceGetCapsTest, givenDSSCountEqualZeroWhenDeviceCreatedThenMaxEuPerDSSEqualMaxEuPerSS) { + HardwareInfo myHwInfo = *defaultHwInfo; + GT_SYSTEM_INFO &mySysInfo = myHwInfo.gtSystemInfo; + PLATFORM &myPlatform = myHwInfo.platform; + + mySysInfo.EUCount = 16; + mySysInfo.SubSliceCount = 4; + mySysInfo.DualSubSliceCount = 0; + mySysInfo.ThreadCount = 16 * 8; + myPlatform.usRevId = 0x4; + auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&myHwInfo)); + + EXPECT_EQ(device->sharedDeviceInfo.maxNumEUsPerSubSlice, device->sharedDeviceInfo.maxNumEUsPerDualSubSlice); +} diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index e718689da1..9e33d231b6 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -2515,6 +2515,7 @@ HWTEST_F(KernelTest, givenKernelWhenDebugFlagToUseMaxSimdForCalculationsIsUsedTh mySysInfo.EUCount = 24; mySysInfo.SubSliceCount = 3; + mySysInfo.DualSubSliceCount = 3; mySysInfo.ThreadCount = 24 * 7; auto device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(&myHwInfo)); @@ -3166,7 +3167,8 @@ TEST_F(KernelTests, givenKernelWithSimdEqual1WhenKernelCreatedThenMaxWorgGroupSi std::unique_ptr pKernel(new MockKernel(pProgram, *pKernelInfo, *pClDevice)); auto deviceMaxWorkGroupSize = pDevice->getDeviceInfo().maxWorkGroupSize; - auto maxThreadsPerWG = HwHelper::get(pKernel->getHardwareInfo().platform.eRenderCoreFamily).getMaxThreadsForWorkgroup(pKernel->getHardwareInfo(), static_cast(pClDevice->getDevice().getDeviceInfo().maxNumEUsPerSubSlice)); + auto deviceInfo = pClDevice->getDevice().getDeviceInfo(); + auto maxThreadsPerWG = HwHelper::get(pKernel->getHardwareInfo().platform.eRenderCoreFamily).getMaxThreadsForWorkgroupInDSSOrSS(pKernel->getHardwareInfo(), static_cast(deviceInfo.maxNumEUsPerSubSlice), static_cast(deviceInfo.maxNumEUsPerDualSubSlice)); EXPECT_LT(pKernel->getMaxKernelWorkGroupSize(), deviceMaxWorkGroupSize); EXPECT_EQ(pKernel->getMaxKernelWorkGroupSize(), maxThreadsPerWG); diff --git a/shared/source/device/device_caps.cpp b/shared/source/device/device_caps.cpp index 188335aa97..0349ee45da 100644 --- a/shared/source/device/device_caps.cpp +++ b/shared/source/device/device_caps.cpp @@ -107,9 +107,17 @@ void Device::initializeCaps() { deviceInfo.maxNumEUsPerSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.featureTable.ftrPooledEuEnabled == 0) ? (systemInfo.EUCount / systemInfo.SubSliceCount) : systemInfo.EuCountPerPoolMin; + if (systemInfo.DualSubSliceCount != 0) { + deviceInfo.maxNumEUsPerDualSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.featureTable.ftrPooledEuEnabled == 0) + ? (systemInfo.EUCount / systemInfo.DualSubSliceCount) + : systemInfo.EuCountPerPoolMin; + + } else { + deviceInfo.maxNumEUsPerDualSubSlice = deviceInfo.maxNumEUsPerSubSlice; + } deviceInfo.numThreadsPerEU = systemInfo.ThreadCount / systemInfo.EUCount; deviceInfo.threadsPerEUConfigs = hwHelper.getThreadsPerEUConfigs(); - auto maxWS = hwHelper.getMaxThreadsForWorkgroup(hwInfo, static_cast(deviceInfo.maxNumEUsPerSubSlice)) * simdSizeUsed; + auto maxWS = hwHelper.getMaxThreadsForWorkgroupInDSSOrSS(hwInfo, static_cast(deviceInfo.maxNumEUsPerSubSlice), static_cast(deviceInfo.maxNumEUsPerDualSubSlice)) * simdSizeUsed; maxWS = Math::prevPowerOfTwo(maxWS); deviceInfo.maxWorkGroupSize = std::min(maxWS, 1024u); diff --git a/shared/source/device/device_info.h b/shared/source/device/device_info.h index 8670f69856..901d16a375 100644 --- a/shared/source/device/device_info.h +++ b/shared/source/device/device_info.h @@ -26,6 +26,7 @@ struct DeviceInfo { size_t imageMaxArraySize; size_t imageMaxBufferSize; size_t maxNumEUsPerSubSlice; + size_t maxNumEUsPerDualSubSlice; size_t maxParameterSize; size_t maxWorkGroupSize; size_t maxWorkItemSizes[3]; diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index f8c7d5a468..211f55ca40 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -97,6 +97,7 @@ class HwHelper { virtual std::string getExtensions() const = 0; static uint32_t getMaxThreadsForVfe(const HardwareInfo &hwInfo); virtual uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const; + virtual uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const = 0; virtual uint32_t getMetricsLibraryGenId() const = 0; virtual uint32_t getMocsIndex(const GmmHelper &gmmHelper, bool l3enabled, bool l1enabled) const = 0; virtual bool tilingAllowed(bool isSharedContext, bool isImage1d, bool forceLinearStorage) = 0; @@ -211,7 +212,7 @@ class HwHelperHw : public HwHelper { size_t getPaddingForISAAllocation() const override; - uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const override; + uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const override; uint32_t getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const override; diff --git a/shared/source/helpers/hw_helper_bdw_plus.inl b/shared/source/helpers/hw_helper_bdw_plus.inl index 36e5bcb187..a9bbf7f02d 100644 --- a/shared/source/helpers/hw_helper_bdw_plus.inl +++ b/shared/source/helpers/hw_helper_bdw_plus.inl @@ -111,7 +111,7 @@ uint32_t HwHelperHw::getPlanarYuvMaxHeight() const { } template -uint32_t HwHelperHw::getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const { +uint32_t HwHelperHw::getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const { return HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerSubSlice); } diff --git a/shared/source/helpers/hw_helper_xehp_plus.inl b/shared/source/helpers/hw_helper_xehp_plus.inl index bee46219b9..0e685d7706 100644 --- a/shared/source/helpers/hw_helper_xehp_plus.inl +++ b/shared/source/helpers/hw_helper_xehp_plus.inl @@ -194,11 +194,11 @@ inline bool HwHelperHw::preferSmallWorkgroupSizeForKernel(const size_ } template -inline uint32_t HwHelperHw::getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const { +inline uint32_t HwHelperHw::getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const { if (isWorkaroundRequired(REVISION_A0, REVISION_B, hwInfo)) { - return std::min(HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerSubSlice), 64u); + return std::min(HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerDualSubSlice), 64u); } - return HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerSubSlice); + return HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerDualSubSlice); } } // namespace NEO