Selectively enable getMaxThreadsForWorkgroup WA

Related-To: NEO-6022

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek
2021-07-06 10:14:23 +00:00
committed by Compute-Runtime-Automation
parent 0b8b7000d5
commit bbe599aa95
21 changed files with 149 additions and 26 deletions

View File

@ -24,6 +24,7 @@
#include "shared/source/kernel/kernel_arg_descriptor_extended_vme.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "opencl/source/accelerators/intel_accelerator.h"
#include "opencl/source/accelerators/intel_motion_estimation.h"
@ -78,7 +79,8 @@ Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, ClDevice &c
imageTransformer.reset(new ImageTransformer);
if (kernelInfoArg.kernelDescriptor.kernelAttributes.simdSize == 1u) {
auto deviceInfo = getDevice().getDevice().getDeviceInfo();
maxKernelWorkGroupSize = HwHelper::get(getHardwareInfo().platform.eRenderCoreFamily).getMaxThreadsForWorkgroupInDSSOrSS(getHardwareInfo(), static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice));
auto &hwInfoConfig = *HwInfoConfig::get(getHardwareInfo().platform.eProductFamily);
maxKernelWorkGroupSize = hwInfoConfig.getMaxThreadsForWorkgroupInDSSOrSS(getHardwareInfo(), static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice));
} else {
maxKernelWorkGroupSize = static_cast<uint32_t>(clDevice.getSharedDeviceInfo().maxWorkGroupSize);
}

View File

@ -13,6 +13,7 @@
#include "shared/source/memory_manager/allocations_list.h"
#include "shared/source/memory_manager/os_agnostic_memory_manager.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h"
@ -3168,7 +3169,9 @@ TEST_F(KernelTests, givenKernelWithSimdEqual1WhenKernelCreatedThenMaxWorgGroupSi
auto deviceMaxWorkGroupSize = pDevice->getDeviceInfo().maxWorkGroupSize;
auto deviceInfo = pClDevice->getDevice().getDeviceInfo();
auto maxThreadsPerWG = HwHelper::get(pKernel->getHardwareInfo().platform.eRenderCoreFamily).getMaxThreadsForWorkgroupInDSSOrSS(pKernel->getHardwareInfo(), static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice));
auto &hwInfoConfig = *HwInfoConfig::get(pKernel->getHardwareInfo().platform.eProductFamily);
auto maxThreadsPerWG = hwInfoConfig.getMaxThreadsForWorkgroupInDSSOrSS(pKernel->getHardwareInfo(), static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice));
EXPECT_LT(pKernel->getMaxKernelWorkGroupSize(), deviceMaxWorkGroupSize);
EXPECT_EQ(pKernel->getMaxKernelWorkGroupSize(), maxThreadsPerWG);

View File

@ -140,9 +140,16 @@ HWTEST_F(HwInfoConfigTest, givenSamplerStateWhenAdjustSamplerStateThenNothingIsC
EXPECT_EQ(0, memcmp(&initialState, &state, sizeof(SAMPLER_STATE)));
}
HWTEST_F(HwInfoConfigTest, whenCallingIsAdditionalStateBaseAddressWARequiredThenFalseIsReturned) {
HWTEST_F(HwInfoConfigTest, givenHardwareInfoWhenCallingIsAdditionalStateBaseAddressWARequiredThenFalseIsReturned) {
    // Query the config registered for the product under test and expect the
    // additional state-base-address workaround to be reported as not required.
    auto *configForProduct = HwInfoConfig::get(pInHwInfo.platform.eProductFamily);
    EXPECT_FALSE(configForProduct->isAdditionalStateBaseAddressWARequired(pInHwInfo));
}
HWTEST_F(HwInfoConfigTest, givenHardwareInfoWhenCallingIsMaxThreadsForWorkgroupWARequiredThenFalseIsReturned) {
    // Query the config registered for the product under test and expect the
    // max-threads-for-workgroup workaround to be reported as not required.
    auto *configForProduct = HwInfoConfig::get(pInHwInfo.platform.eProductFamily);
    EXPECT_FALSE(configForProduct->isMaxThreadsForWorkgroupWARequired(pInHwInfo));
}

View File

@ -94,6 +94,21 @@ bool HwInfoConfigHw<IGFX_UNKNOWN>::isAdditionalStateBaseAddressWARequired(const
return false;
}
// IGFX_UNKNOWN specialization: always reports that the max-threads-for-workgroup
// workaround is not required. The hwInfo argument is not inspected.
template <>
bool HwInfoConfigHw<IGFX_UNKNOWN>::isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const {
return false;
}
// IGFX_UNKNOWN specialization: returns 0 regardless of the hardware info or the
// EU counts passed in.
template <>
uint32_t HwInfoConfigHw<IGFX_UNKNOWN>::getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const {
return 0;
}
// IGFX_UNKNOWN specialization: returns 0 regardless of the hardware info or the
// EU count passed in.
template <>
uint32_t HwInfoConfigHw<IGFX_UNKNOWN>::getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const {
return 0;
}
} // namespace NEO
struct DummyHwConfig : HwInfoConfigHw<IGFX_UNKNOWN> {
@ -550,6 +565,11 @@ HWTEST_F(HwInfoConfigTestLinuxDummy, givenHardwareInfoWhenCallingIsAdditionalSta
EXPECT_FALSE(ret);
}
HWTEST_F(HwInfoConfigTestLinuxDummy, givenHardwareInfoWhenCallingIsMaxThreadsForWorkgroupWARequiredThenFalseIsReturned) {
    // The fixture's config is expected to report the workaround as not required.
    EXPECT_FALSE(hwConfig.isMaxThreadsForWorkgroupWARequired(outHwInfo));
}
using HwConfigLinux = ::testing::Test;
HWTEST2_F(HwConfigLinux, GivenDifferentValuesFromTopologyQueryWhenConfiguringHwInfoThenMaxSlicesSupportedSetToAvailableCountInGtSystemInfo, MatchAny) {

View File

@ -72,6 +72,21 @@ bool HwInfoConfigHw<IGFX_UNKNOWN>::isAdditionalStateBaseAddressWARequired(const
return false;
}
// IGFX_UNKNOWN specialization: always reports that the max-threads-for-workgroup
// workaround is not required. The hwInfo argument is not inspected.
template <>
bool HwInfoConfigHw<IGFX_UNKNOWN>::isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const {
return false;
}
// IGFX_UNKNOWN specialization: returns 0 regardless of the hardware info or the
// EU counts passed in.
template <>
uint32_t HwInfoConfigHw<IGFX_UNKNOWN>::getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const {
return 0;
}
// IGFX_UNKNOWN specialization: returns 0 regardless of the hardware info or the
// EU count passed in.
template <>
uint32_t HwInfoConfigHw<IGFX_UNKNOWN>::getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const {
return 0;
}
HwInfoConfigTestWindows::HwInfoConfigTestWindows() {
this->executionEnvironment = std::make_unique<MockExecutionEnvironment>();
this->rootDeviceEnvironment = std::make_unique<RootDeviceEnvironment>(*executionEnvironment);
@ -127,6 +142,11 @@ HWTEST_F(HwInfoConfigTestWindows, givenHardwareInfoWhenCallingIsAdditionalStateB
EXPECT_FALSE(ret);
}
HWTEST_F(HwInfoConfigTestWindows, givenHardwareInfoWhenCallingIsMaxThreadsForWorkgroupWARequiredThenFalseIsReturned) {
    // The fixture's config is expected to report the workaround as not required.
    EXPECT_FALSE(hwConfig.isMaxThreadsForWorkgroupWARequired(outHwInfo));
}
HWTEST_F(HwInfoConfigTestWindows, givenFtrIaCoherencyFlagWhenConfiguringHwInfoThenSetCoherencySupportCorrectly) {
HardwareInfo initialHwInfo = *defaultHwInfo;
auto &hwHelper = HwHelper::get(initialHwInfo.platform.eRenderCoreFamily);

View File

@ -106,6 +106,17 @@ XE_HP_CORE_TEST_F(HwHelperTestXE_HP_CORE, givenRevisionEnumAndPlatformFamilyType
}
}
XE_HP_CORE_TEST_F(HwHelperTestXE_HP_CORE, givenRevisionEnumThenProperMaxThreadsForWorkgroupIsReturned) {
    // Same EU count is passed for both the sub-slice and dual-sub-slice arguments.
    constexpr uint32_t numEUsUnderTest = 64u;

    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
    auto &hwInfoConfig = *HwInfoConfig::get(hardwareInfo.platform.eProductFamily);

    // A0 stepping: the expected result is exactly 64 threads.
    hardwareInfo.platform.usRevId = hwHelper.getHwRevIdFromStepping(REVISION_A0, hardwareInfo);
    EXPECT_EQ(64u, hwInfoConfig.getMaxThreadsForWorkgroupInDSSOrSS(hardwareInfo, numEUsUnderTest, numEUsUnderTest));

    // B stepping: the expected result is the EU count times the per-EU thread count.
    hardwareInfo.platform.usRevId = hwHelper.getHwRevIdFromStepping(REVISION_B, hardwareInfo);
    const uint32_t threadsPerSingleEU = hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount;
    EXPECT_EQ(numEUsUnderTest * threadsPerSingleEU, hwInfoConfig.getMaxThreadsForWorkgroupInDSSOrSS(hardwareInfo, numEUsUnderTest, numEUsUnderTest));
}
XE_HP_CORE_TEST_F(HwHelperTestXE_HP_CORE, givenDisablePipeControlFlagIsDefaultWhenLocalMemoryIsEnabledThenReturnFalseAndDoNotProgramPipeControl) {
hardwareInfo.featureTable.ftrLocalMemory = true;

View File

@ -15,6 +15,7 @@
#include "test.h"
HWTEST_EXCLUDE_PRODUCT(HwHelperTest, WhenAllowRenderCompressionIsCalledThenTrueIsReturned, IGFX_XE_HP_SDV);
HWTEST_EXCLUDE_PRODUCT(HwInfoConfigTest, givenHardwareInfoWhenCallingIsMaxThreadsForWorkgroupWARequiredThenFalseIsReturned, IGFX_XE_HP_SDV);
using namespace NEO;

View File

@ -117,7 +117,7 @@ void Device::initializeCaps() {
}
deviceInfo.numThreadsPerEU = systemInfo.ThreadCount / systemInfo.EUCount;
deviceInfo.threadsPerEUConfigs = hwHelper.getThreadsPerEUConfigs();
auto maxWS = hwHelper.getMaxThreadsForWorkgroupInDSSOrSS(hwInfo, static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice)) * simdSizeUsed;
auto maxWS = hwInfoConfig->getMaxThreadsForWorkgroupInDSSOrSS(hwInfo, static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice)) * simdSizeUsed;
maxWS = Math::prevPowerOfTwo(maxWS);
deviceInfo.maxWorkGroupSize = std::min(maxWS, 1024u);

View File

@ -55,11 +55,6 @@ uint32_t HwHelper::getMaxThreadsForVfe(const HardwareInfo &hwInfo) {
return maxHwThreadsReturned;
}
// Returns maxNumEUsPerSubSlice multiplied by the per-EU thread count, where the
// per-EU thread count is gtSystemInfo.ThreadCount / gtSystemInfo.EUCount
// (integer division).
// NOTE(review): EUCount is used as a divisor without a zero check here - confirm
// that callers always pass hardware info with a non-zero EUCount.
uint32_t HwHelper::getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const {
uint32_t numThreadsPerEU = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.EUCount;
return maxNumEUsPerSubSlice * numThreadsPerEU;
}
uint32_t HwHelper::getSubDevicesCount(const HardwareInfo *pHwInfo) {
if (DebugManager.flags.CreateMultipleSubDevices.get() > 0) {
return DebugManager.flags.CreateMultipleSubDevices.get();

View File

@ -96,8 +96,6 @@ class HwHelper {
virtual bool getEnableLocalMemory(const HardwareInfo &hwInfo) const = 0;
virtual std::string getExtensions() const = 0;
static uint32_t getMaxThreadsForVfe(const HardwareInfo &hwInfo);
virtual uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const;
virtual uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const = 0;
virtual uint32_t getMetricsLibraryGenId() const = 0;
virtual uint32_t getMocsIndex(const GmmHelper &gmmHelper, bool l3enabled, bool l1enabled) const = 0;
virtual bool tilingAllowed(bool isSharedContext, bool isImage1d, bool forceLinearStorage) = 0;
@ -212,8 +210,6 @@ class HwHelperHw : public HwHelper {
size_t getPaddingForISAAllocation() const override;
uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const override;
uint32_t getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const override;
uint32_t getPitchAlignmentForImage(const HardwareInfo *hwInfo) const override;

View File

@ -110,11 +110,6 @@ uint32_t HwHelperHw<GfxFamily>::getPlanarYuvMaxHeight() const {
return planarYuvMaxHeight;
}
// Generic implementation: forwards to HwHelper::getMaxThreadsForWorkgroup using
// the per-sub-slice EU count; maxNumEUsPerDualSubSlice is not used.
template <typename GfxFamily>
uint32_t HwHelperHw<GfxFamily>::getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const {
return HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerSubSlice);
}
template <typename GfxFamily>
aub_stream::MMIOList HwHelperHw<GfxFamily>::getExtraMmioList(const HardwareInfo &hwInfo, const GmmHelper &gmmHelper) const {
return {};

View File

@ -193,12 +193,4 @@ inline bool HwHelperHw<GfxFamily>::preferSmallWorkgroupSizeForKernel(const size_
return true;
}
// Computes the workgroup thread limit from the per-dual-sub-slice EU count
// (maxNumEUsPerSubSlice is not used). When the REVISION_A0..REVISION_B
// workaround applies, the result is additionally limited to 64.
template <typename GfxFamily>
inline uint32_t HwHelperHw<GfxFamily>::getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const {
    // Evaluate the workaround predicate first to keep the original call order.
    const bool limitForWorkaround = isWorkaroundRequired(REVISION_A0, REVISION_B, hwInfo);
    const uint32_t threadsForDualSubSlice = HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerDualSubSlice);
    return limitForWorkaround ? std::min(threadsForDualSubSlice, 64u) : threadsForDualSubSlice;
}
} // namespace NEO

View File

@ -39,6 +39,9 @@ class HwInfoConfig {
virtual void convertTimestampsFromOaToCsDomain(uint64_t &timestampData) = 0;
virtual uint32_t getDeviceMemoryMaxClkRate(const HardwareInfo *hwInfo) = 0;
virtual bool isAdditionalStateBaseAddressWARequired(const HardwareInfo &hwInfo) const = 0;
virtual bool isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const = 0;
virtual uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const = 0;
virtual uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const = 0;
uint32_t threadsPerEu;
};
@ -61,6 +64,9 @@ class HwInfoConfigHw : public HwInfoConfig {
void convertTimestampsFromOaToCsDomain(uint64_t &timestampData) override;
uint32_t getDeviceMemoryMaxClkRate(const HardwareInfo *hwInfo) override;
bool isAdditionalStateBaseAddressWARequired(const HardwareInfo &hwInfo) const override;
bool isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const override;
uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const override;
uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const override;
protected:
HwInfoConfigHw() = default;

View File

@ -85,4 +85,14 @@ bool HwInfoConfigHw<gfxProduct>::isAdditionalStateBaseAddressWARequired(const Ha
return false;
}
// Generic (non-specialized) implementation: always reports that the
// max-threads-for-workgroup workaround is not required. The hwInfo argument is
// not inspected.
template <PRODUCT_FAMILY gfxProduct>
bool HwInfoConfigHw<gfxProduct>::isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const {
return false;
}
// Returns maxNumEUsPerSubSlice multiplied by the per-EU thread count, where the
// per-EU thread count is gtSystemInfo.ThreadCount / gtSystemInfo.EUCount
// (integer division).
// NOTE(review): EUCount is used as a divisor without a zero check here - confirm
// that callers always pass hardware info with a non-zero EUCount.
template <PRODUCT_FAMILY gfxProduct>
uint32_t HwInfoConfigHw<gfxProduct>::getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const {
    const auto &gtInfo = hwInfo.gtSystemInfo;
    const uint32_t threadsPerSingleEU = gtInfo.ThreadCount / gtInfo.EUCount;
    return maxNumEUsPerSubSlice * threadsPerSingleEU;
}
} // namespace NEO

View File

@ -25,4 +25,9 @@ void HwInfoConfigHw<gfxProduct>::enableRenderCompression(HardwareInfo *hwInfo) {
hwInfo->capabilityTable.ftrRenderCompressedBuffers = hwInfo->featureTable.ftrE2ECompression;
}
// Sub-slice variant: forwards to getMaxThreadsForWorkgroup using the
// per-sub-slice EU count; maxNumEUsPerDualSubSlice is not used.
template <PRODUCT_FAMILY gfxProduct>
uint32_t HwInfoConfigHw<gfxProduct>::getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const {
return getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerSubSlice);
}
} // namespace NEO

View File

@ -24,4 +24,12 @@ void HwInfoConfigHw<gfxProduct>::enableRenderCompression(HardwareInfo *hwInfo) {
hwInfo->capabilityTable.ftrRenderCompressedBuffers = hwInfo->featureTable.ftrE2ECompression;
}
// Dual-sub-slice variant: computes the workgroup thread limit from the
// per-dual-sub-slice EU count (maxNumEUsPerSubSlice is not used). When
// isMaxThreadsForWorkgroupWARequired() reports true for hwInfo, the result is
// additionally limited to 64.
template <PRODUCT_FAMILY gfxProduct>
uint32_t HwInfoConfigHw<gfxProduct>::getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const {
    // Evaluate the workaround predicate first to keep the original call order.
    const bool limitForWorkaround = isMaxThreadsForWorkgroupWARequired(hwInfo);
    const uint32_t threadsForDualSubSlice = getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerDualSubSlice);
    return limitForWorkaround ? std::min(threadsForDualSubSlice, 64u) : threadsForDualSubSlice;
}
} // namespace NEO

View File

@ -9,6 +9,7 @@
#include "shared/source/helpers/hw_info.h"
#include "shared/source/kernel/kernel_properties.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/source/xe_hp_core/os_agnostic_hw_info_config_xe_hp_core.inl"
namespace NEO {
template <>

View File

@ -0,0 +1,15 @@
/*
* Copyright (C) 2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
using namespace NEO;
// IGFX_XE_HP_SDV specialization: the workaround is required only when the
// stepping derived from the hardware revision id equals REVISION_A0.
template <>
bool HwInfoConfigHw<IGFX_XE_HP_SDV>::isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const {
    auto &coreHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    const uint32_t currentStepping = coreHelper.getSteppingFromHwRevId(hwInfo);
    return currentStepping == REVISION_A0;
}

View File

@ -16,6 +16,7 @@
namespace NEO {
#ifdef SUPPORT_XEHP
#include "shared/source/xe_hp_core/os_agnostic_hw_info_config_xe_hp_core.inl"
template <>
int HwInfoConfigHw<IGFX_XE_HP_SDV>::configureHardwareCustom(HardwareInfo *hwInfo, OSInterface *osIface) {
auto &hwHelper = HwHelper::get(hwInfo->platform.eRenderCoreFamily);

View File

@ -16,6 +16,7 @@ if(TESTS_XE_HP_CORE)
target_sources(${TARGET_NAME} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/unit_test_helper_xe_hp_core.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_hw_info_config_xe_hp_core.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_preemption_xe_hp_core.cpp
${COMPUTE_RUNTIME_ULT_XE_HP_CORE}
${NEO_CORE_TESTS_XE_HP_CORE}

View File

@ -0,0 +1,34 @@
/*
* Copyright (C) 2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/test/common/fixtures/device_fixture.h"
#include "test.h"
using namespace NEO;
using XeHPHwInfoConfig = Test<DeviceFixture>;
XEHPTEST_F(XeHPHwInfoConfig, givenXEHPWithA0SteppingThenMaxThreadsForWorkgroupWAIsRequired) {
    // Put the device's hardware info into the A0 stepping.
    auto *mutableHwInfo = pDevice->getRootDeviceEnvironment().getMutableHardwareInfo();
    auto &coreHelper = HwHelper::get(mutableHwInfo->platform.eRenderCoreFamily);
    mutableHwInfo->platform.usRevId = coreHelper.getHwRevIdFromStepping(REVISION_A0, *mutableHwInfo);

    // With A0 stepping the product config must request the workaround.
    auto *configForProduct = HwInfoConfig::get(productFamily);
    EXPECT_TRUE(configForProduct->isMaxThreadsForWorkgroupWARequired(pDevice->getHardwareInfo()));
}
XEHPTEST_F(XeHPHwInfoConfig, givenXEHPWithBSteppingThenMaxThreadsForWorkgroupWAIsNotRequired) {
    // Put the device's hardware info into the B stepping.
    auto *mutableHwInfo = pDevice->getRootDeviceEnvironment().getMutableHardwareInfo();
    auto &coreHelper = HwHelper::get(mutableHwInfo->platform.eRenderCoreFamily);
    mutableHwInfo->platform.usRevId = coreHelper.getHwRevIdFromStepping(REVISION_B, *mutableHwInfo);

    // With B stepping the product config must not request the workaround.
    auto *configForProduct = HwInfoConfig::get(productFamily);
    EXPECT_FALSE(configForProduct->isMaxThreadsForWorkgroupWARequired(pDevice->getHardwareInfo()));
}