feature: adjust maxWorkGroupSize value

Related-To: NEO-7357

Signed-off-by: Rafal Maziejuk <rafal.maziejuk@intel.com>
This commit is contained in:
Rafal Maziejuk
2023-03-20 12:41:39 +00:00
committed by Compute-Runtime-Automation
parent 6437c1a91e
commit b9828b543e
14 changed files with 71 additions and 19 deletions

View File

@@ -747,7 +747,9 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties)
memset(pKernelProperties->uuid.kid, 0, ZE_MAX_KERNEL_UUID_SIZE);
memset(pKernelProperties->uuid.mid, 0, ZE_MAX_MODULE_UUID_SIZE);
const auto &gfxCoreHelper = this->module->getDevice()->getGfxCoreHelper();
uint32_t maxKernelWorkGroupSize = static_cast<uint32_t>(this->module->getMaxGroupSize(kernelDescriptor));
maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, maxKernelWorkGroupSize);
pKernelProperties->maxNumSubgroups = maxKernelWorkGroupSize / kernelDescriptor.kernelAttributes.simdSize;
void *pNext = pKernelProperties->pNext;
@@ -758,7 +760,6 @@ ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties)
reinterpret_cast<ze_kernel_preferred_group_size_properties_t *>(extendedProperties);
preferredGroupSizeProperties->preferredMultiple = this->kernelImmData->getKernelInfo()->getMaxSimdSize();
auto &gfxCoreHelper = this->module->getDevice()->getGfxCoreHelper();
if (gfxCoreHelper.isFusedEuDispatchEnabled(this->module->getDevice()->getHwInfo(), kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion)) {
preferredGroupSizeProperties->preferredMultiple *= 2;
}

View File

@@ -319,15 +319,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenSignalEventWhenAppendLaunchIndirec
context->freeMem(alloc);
}
// Matcher used to select the products a test runs on: Broadwell never matches;
// every other product matches when the TestTraits of its core family set
// programOnlyChangedFieldsInComputeStateMode.
struct ProgramChangedFieldsInComputeMode {
    template <PRODUCT_FAMILY productFamily>
    static constexpr bool isMatched() {
        return (productFamily == IGFX_BROADWELL)
                   ? false
                   : TestTraits<NEO::ToGfxCoreFamily<productFamily>::get()>::programOnlyChangedFieldsInComputeStateMode;
    }
};
HWTEST2_F(CommandListAppendLaunchKernel, GivenComputeModePropertiesWhenUpdateStreamPropertiesIsCalledTwiceThenChangedFieldsAreDirty, ProgramChangedFieldsInComputeMode) {
HWTEST2_F(CommandListAppendLaunchKernel, GivenComputeModePropertiesWhenUpdateStreamPropertiesIsCalledTwiceThenChangedFieldsAreDirty, IsAtLeastGen12lp) {
DebugManagerStateRestore restorer;
auto &productHelper = device->getProductHelper();
@@ -360,7 +352,9 @@ HWTEST2_F(CommandListAppendLaunchKernel, GivenComputeModePropertiesWhenUpdateStr
const_cast<NEO::KernelDescriptor *>(&kernel.getKernelDescriptor())->kernelAttributes.numGrfRequired = 0x80;
commandList->updateStreamProperties(kernel, false, &launchKernelArgs, false);
EXPECT_EQ(productHelper.isGrfNumReportedWithScm(), commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty);
if constexpr (TestTraits<gfxCoreFamily>::largeGrfModeInStateComputeModeSupported) {
EXPECT_EQ(productHelper.isGrfNumReportedWithScm(), commandList->finalStreamState.stateComputeMode.largeGrfMode.isDirty);
}
if (productHelper.getScmPropertyCoherencyRequiredSupport()) {
EXPECT_EQ(0, commandList->finalStreamState.stateComputeMode.isCoherencyRequired.value);
} else {

View File

@@ -2252,11 +2252,14 @@ bool Kernel::areMultipleSubDevicesInContext() const {
}
// Refreshes kernel state that is derived from the kernel descriptor:
//  - maxKernelWorkGroupSize is halved for kernels that require GrfConfig::LargeGrfNumber
//    registers with a SIMD size other than 32, and the result is then passed through
//    GfxCoreHelper::adjustMaxWorkGroupSize;
//  - containsStatelessWrites and systolicPipelineSelectMode are copied from the descriptor flags.
void Kernel::reconfigureKernel() {
    const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    const auto &kernelAttributes = kernelDescriptor.kernelAttributes;

    // Halve the limit once for large-GRF kernels that are not compiled for SIMD32.
    if (kernelAttributes.numGrfRequired == GrfConfig::LargeGrfNumber &&
        kernelAttributes.simdSize != 32) {
        this->maxKernelWorkGroupSize >>= 1;
    }

    // Give the core helper a chance to adjust the (possibly halved) limit.
    const auto &gfxCoreHelper = getDevice().getGfxCoreHelper();
    this->maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelAttributes.numGrfRequired, kernelAttributes.simdSize, this->maxKernelWorkGroupSize);

    this->containsStatelessWrites = kernelAttributes.flags.usesStatelessWrites;
    this->systolicPipelineSelectMode = kernelAttributes.flags.usesSystolicPipelineSelectMode;
}

View File

@@ -3303,7 +3303,7 @@ HWTEST_F(KernelLargeGrfTests, GivenLargeGrfAndSimdSizeWhenGettingMaxWorkGroupSiz
{
MockKernel kernel(program.get(), *pKernelInfo, *pClDevice);
pKernelInfo->kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::LargeGrfNumber - 1;
pKernelInfo->kernelDescriptor.kernelAttributes.numGrfRequired = GrfConfig::DefaultGrfNumber;
EXPECT_EQ(CL_SUCCESS, kernel.initialize());
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, *kernel.maxWorkGroupSizeForCrossThreadData);
EXPECT_EQ(pDevice->getDeviceInfo().maxWorkGroupSize, kernel.maxKernelWorkGroupSize);

View File

@@ -143,6 +143,9 @@ void Device::initializeCaps() {
maxWS = Math::prevPowerOfTwo(maxWS);
deviceInfo.maxWorkGroupSize = std::min(maxWS, 1024u);
const auto minGrfSize = gfxCoreHelper.getMinimalGrfSize();
deviceInfo.maxWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(minGrfSize, simdSizeUsed, static_cast<uint32_t>(deviceInfo.maxWorkGroupSize));
if (DebugManager.flags.OverrideMaxWorkgroupSize.get() != -1) {
deviceInfo.maxWorkGroupSize = DebugManager.flags.OverrideMaxWorkgroupSize.get();
}

View File

@@ -103,6 +103,7 @@ class GfxCoreHelper {
virtual bool isWaDisableRccRhwoOptimizationRequired() const = 0;
virtual bool isAdditionalFeatureFlagRequired(const FeatureTable *featureTable) const = 0;
virtual uint32_t getMinimalSIMDSize() const = 0;
virtual uint32_t getMinimalGrfSize() const = 0;
virtual bool isOffsetToSkipSetFFIDGPWARequired(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const = 0;
virtual bool isFusedEuDispatchEnabled(const HardwareInfo &hwInfo, bool disableEUFusionForKernel) const = 0;
virtual uint64_t getGpuTimeStampInNS(uint64_t timeStamp, double frequency) const = 0;
@@ -118,6 +119,7 @@ class GfxCoreHelper {
virtual bool isCooperativeDispatchSupported(const EngineGroupType engineGroupType, const RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
virtual uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const = 0;
virtual uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, const uint32_t defaultMaxGroupSize) const = 0;
virtual size_t getMaxFillPaternSizeForCopyEngine() const = 0;
virtual size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const = 0;
virtual bool isSipWANeeded(const HardwareInfo &hwInfo) const = 0;
@@ -293,6 +295,8 @@ class GfxCoreHelperHw : public GfxCoreHelper {
uint32_t getMinimalSIMDSize() const override;
uint32_t getMinimalGrfSize() const override;
uint64_t getGpuTimeStampInNS(uint64_t timeStamp, double frequency) const override;
uint32_t getGlobalTimeStampBits() const override;
@@ -316,6 +320,8 @@ class GfxCoreHelperHw : public GfxCoreHelper {
uint32_t adjustMaxWorkGroupCount(uint32_t maxWorkGroupCount, const EngineGroupType engineGroupType,
const RootDeviceEnvironment &rootDeviceEnvironment, bool isEngineInstanced) const override;
uint32_t adjustMaxWorkGroupSize(const uint32_t numGrf, const uint32_t simd, const uint32_t defaultMaxGroupSize) const override;
size_t getMaxFillPaternSizeForCopyEngine() const override;
size_t getSipKernelMaxDbgSurfaceSize(const HardwareInfo &hwInfo) const override;

View File

@@ -675,4 +675,14 @@ template <typename gfxProduct>
bool GfxCoreHelperHw<gfxProduct>::isRelaxedOrderingSupported() const {
return false;
}
// Generic GfxCoreHelperHw implementation: applies no adjustment.
// numGrf and simd are accepted but not consulted; defaultMaxGroupSize is
// returned unchanged.
template <typename GfxFamily>
uint32_t GfxCoreHelperHw<GfxFamily>::adjustMaxWorkGroupSize(const uint32_t numGrf,
                                                            const uint32_t simd,
                                                            const uint32_t defaultMaxGroupSize) const {
    return defaultMaxGroupSize;
}
// Generic GfxCoreHelperHw implementation: reports 128 as the minimal GRF size.
template <typename GfxFamily>
uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
    constexpr uint32_t minimalGrfSize = 128u;
    return minimalGrfSize;
}
} // namespace NEO

View File

@@ -19,4 +19,5 @@ struct TestTraits<IGFX_GEN12LP_CORE> {
static constexpr bool implementsPreambleThreadArbitration = false;
static constexpr bool forceGpuNonCoherent = true;
static constexpr bool imagesSupported = true;
static constexpr bool largeGrfModeInStateComputeModeSupported = true;
};

View File

@@ -29,4 +29,5 @@ struct TestTraits<IGFX_XE_HPC_CORE> {
static constexpr bool isUnTypedDataPortCacheFlushSupported = true;
static constexpr bool imagesSupported = false;
static constexpr bool isPipeControlExtendedPriorToNonPipelinedStateCommandSupported = true;
static constexpr bool largeGrfModeInStateComputeModeSupported = true;
};

View File

@@ -29,4 +29,5 @@ struct TestTraits<IGFX_XE_HPG_CORE> {
static constexpr bool isUnTypedDataPortCacheFlushSupported = true;
static constexpr bool imagesSupported = true;
static constexpr bool isPipeControlExtendedPriorToNonPipelinedStateCommandSupported = false;
static constexpr bool largeGrfModeInStateComputeModeSupported = true;
};

View File

@@ -15,8 +15,11 @@
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/test_macros/hw_test.h"
#include "shared/test/common/test_macros/test.h"
#include "test_traits_common.h"
using namespace NEO;
struct MockStateComputeModeProperties : public StateComputeModeProperties {
@@ -40,6 +43,8 @@ struct MockStateBaseAddressProperties : public StateBaseAddressProperties {
using StateBaseAddressProperties::stateBaseAddressPropertiesSupport;
};
// Fixture name for the StreamPropertiesTests suite; it is a plain ::testing::Test alias.
using StreamPropertiesTests = ::testing::Test;
TEST(StreamPropertiesTests, whenPropertyValueIsChangedThenProperStateIsSet) {
NEO::StreamProperty streamProperty;
@@ -108,7 +113,7 @@ TEST(StreamPropertiesTests, whenSettingCooperativeKernelPropertiesThenCorrectVal
}
}
TEST(StreamPropertiesTests, whenSettingStateComputeModePropertiesThenCorrectValuesAreSet) {
HWTEST2_F(StreamPropertiesTests, whenSettingStateComputeModePropertiesThenCorrectValuesAreSet, IsAtLeastGen12lp) {
DebugManagerStateRestore restorer;
DebugManager.flags.ForceGrfNumProgrammingWithScm.set(1);
DebugManager.flags.ForceThreadArbitrationPolicyProgrammingWithScm.set(1);

View File

@@ -26,6 +26,8 @@
#include "shared/test/unit_test/fixtures/command_container_fixture.h"
#include "shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h"
#include "test_traits_common.h"
#include <memory>
using namespace NEO;
@@ -520,7 +522,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenForceBtpPrefetchModeD
}
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenDispatchInterfaceWhenNumRequiredGrfIsNotDefaultThenStateComputeModeCommandAdded) {
HWTEST2_F(CommandEncodeStatesTest, givenDispatchInterfaceWhenNumRequiredGrfIsNotDefaultThenStateComputeModeCommandAdded, IsAtLeastGen12lp) {
DebugManagerStateRestore restorer;
DebugManager.flags.ForceGrfNumProgrammingWithScm.set(1);

View File

@@ -927,10 +927,15 @@ HWCMDTEST_F(IGFX_GEN8_CORE, GfxCoreHelperTest, givenDefaultGfxCoreHelperHwWhenIs
}
// Verifies that the GfxCoreHelper reports 8 as its minimal SIMD size.
HWTEST_F(GfxCoreHelperTest, givenDefaultGfxCoreHelperHwWhenMinimalSIMDSizeIsQueriedThen8IsReturned) {
    // Single read-only handle to the helper under test.
    const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
    EXPECT_EQ(8u, gfxCoreHelper.getMinimalSIMDSize());
}
// Verifies that the GfxCoreHelper reports 128 as its minimal GRF size.
HWTEST_F(GfxCoreHelperTest, givenDefaultGfxCoreHelperHwWhenMinimalGrfSizeIsQueriedThen128IsReturned) {
    const auto &helper = getHelper<GfxCoreHelper>();
    const auto reportedMinimalGrfSize = helper.getMinimalGrfSize();
    EXPECT_EQ(128u, reportedMinimalGrfSize);
}
HWCMDTEST_F(IGFX_GEN8_CORE, GfxCoreHelperTest, WhenIsFusedEuDispatchEnabledIsCalledThenFalseIsReturned) {
if (hardwareInfo.platform.eRenderCoreFamily == IGFX_GEN12LP_CORE) {
GTEST_SKIP();
@@ -1563,3 +1568,20 @@ HWTEST_F(GfxCoreHelperTest, GivenCooperativeEngineSupportedAndNotUsedWhenAdjustM
}
}
}
// Verifies that adjustMaxWorkGroupSize hands back the supplied default (1024)
// for each checked GRF-count / SIMD-size combination.
HWTEST_F(GfxCoreHelperTest, givenNumGrfAndSimdSizeWhenAdjustingMaxWorkGroupSizeThenAlwaysReturnDeviceDefault) {
    const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
    constexpr auto defaultMaxGroupSize = 1024u;

    // Shared check: the helper must return the default untouched for the given pair.
    const auto expectDeviceDefault = [&](uint32_t numGrfRequired, uint32_t simdSize) {
        EXPECT_EQ(defaultMaxGroupSize, gfxCoreHelper.adjustMaxWorkGroupSize(numGrfRequired, simdSize, defaultMaxGroupSize));
    };

    expectDeviceDefault(GrfConfig::LargeGrfNumber, 16u);
    expectDeviceDefault(GrfConfig::LargeGrfNumber, 32u);
    expectDeviceDefault(GrfConfig::DefaultGrfNumber, 16u);
}

View File

@@ -26,6 +26,7 @@
#include "shared/test/common/test_macros/hw_test.h"
#include "gtest/gtest.h"
#include "test_traits_common.h"
using namespace NEO;
@@ -506,7 +507,7 @@ HWTEST_F(ProductHelperTest, givenSamplerStateWhenAdjustSamplerStateThenNothingIs
EXPECT_EQ(0, memcmp(&initialState, &state, sizeof(SAMPLER_STATE)));
}
HWTEST_F(ProductHelperTest, WhenFillingScmPropertiesSupportThenExpectUseCorrectGetters) {
HWTEST2_F(ProductHelperTest, WhenFillingScmPropertiesSupportThenExpectUseCorrectGetters, IsAtLeastGen12lp) {
StateComputeModePropertiesSupport scmPropertiesSupport = {};
productHelper->fillScmPropertiesSupportStructure(scmPropertiesSupport);
@@ -515,8 +516,10 @@ HWTEST_F(ProductHelperTest, WhenFillingScmPropertiesSupportThenExpectUseCorrectG
EXPECT_EQ(productHelper->getScmPropertyCoherencyRequiredSupport(), scmPropertiesSupport.coherencyRequired);
EXPECT_EQ(productHelper->getScmPropertyZPassAsyncComputeThreadLimitSupport(), scmPropertiesSupport.zPassAsyncComputeThreadLimit);
EXPECT_EQ(productHelper->getScmPropertyPixelAsyncComputeThreadLimitSupport(), scmPropertiesSupport.pixelAsyncComputeThreadLimit);
EXPECT_EQ(productHelper->isGrfNumReportedWithScm(), scmPropertiesSupport.largeGrfMode);
EXPECT_EQ(productHelper->getScmPropertyDevicePreemptionModeSupport(), scmPropertiesSupport.devicePreemptionMode);
if constexpr (TestTraits<gfxCoreFamily>::largeGrfModeInStateComputeModeSupported) {
EXPECT_EQ(productHelper->isGrfNumReportedWithScm(), scmPropertiesSupport.largeGrfMode);
}
}
HWTEST_F(ProductHelperTest, WhenFillingFrontEndPropertiesSupportThenExpectUseCorrectGetters) {