refactor: unify programming of preferred slm size 5/n

- remove xe2 hpg encode preferred slm size
- add xe2 release helper preferred slm array
- add dedicated method to calculate thread count per sub slice

Related-To: NEO-12639

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2024-10-08 17:33:33 +00:00
committed by Compute-Runtime-Automation
parent 946e421f77
commit 46a63d3e0e
10 changed files with 117 additions and 92 deletions

View File

@@ -132,6 +132,8 @@ struct EncodeDispatchKernel {
static void setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment,
const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
// Returns the thread count per sub-slice derived from hwInfo.gtSystemInfo.
// Per-family definitions divide gtSystemInfo.ThreadCount by either DualSubSliceCount or SubSliceCount.
static uint32_t getThreadCountPerSubslice(const HardwareInfo &hwInfo);
template <typename InterfaceDescriptorType>
static void encodeEuSchedulingPolicy(InterfaceDescriptorType *pInterfaceDescriptor, const KernelDescriptor &kernelDesc, int32_t defaultPipelinedThreadArbitrationPolicy);

View File

@@ -10,4 +10,10 @@ template <typename Family>
size_t EncodeDispatchKernel<Family>::getDefaultIOHAlignment() {
return 1;
}
// Thread count per sub-slice for this family: total thread count divided by the dual sub-slice count.
// NOTE(review): no guard for DualSubSliceCount == 0 here; presumably guaranteed non-zero by the caller - confirm.
template <typename Family>
uint32_t EncodeDispatchKernel<Family>::getThreadCountPerSubslice(const HardwareInfo &hwInfo) {
    const auto &gtSystemInfo = hwInfo.gtSystemInfo;
    return gtSystemInfo.ThreadCount / gtSystemInfo.DualSubSliceCount;
}
} // namespace NEO

View File

@@ -15,4 +15,9 @@ size_t EncodeDispatchKernel<Family>::getDefaultIOHAlignment() {
return alignment;
}
// Thread count per sub-slice for this family: total thread count divided by the sub-slice count.
// NOTE(review): no guard for SubSliceCount == 0 here; presumably guaranteed non-zero by the caller - confirm.
template <typename Family>
uint32_t EncodeDispatchKernel<Family>::getThreadCountPerSubslice(const HardwareInfo &hwInfo) {
    const auto &gtSystemInfo = hwInfo.gtSystemInfo;
    return gtSystemInfo.ThreadCount / gtSystemInfo.SubSliceCount;
}
} // namespace NEO

View File

@@ -1066,7 +1066,7 @@ template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
using PREFERRED_SLM_ALLOCATION_SIZE = typename InterfaceDescriptorType::PREFERRED_SLM_ALLOCATION_SIZE;
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
const uint32_t threadsPerDssCount = EncodeDispatchKernel<Family>::getThreadCountPerSubslice(hwInfo);
const uint32_t workGroupCountPerDss = static_cast<uint32_t>(Math::divideAndRoundUp(threadsPerDssCount, threadsPerThreadGroup));
uint32_t slmSize = 0u;

View File

@@ -7,6 +7,7 @@
#include "shared/source/release_helper/release_helper.h"
#include "shared/source/release_helper/release_helper_base.inl"
#include "shared/source/xe2_hpg_core/hw_cmds_base.h"
#include "release_definitions.h"
@@ -32,6 +33,22 @@ template <>
bool ReleaseHelperHw<release>::isBindlessAddressingDisabled() const {
return false;
}
// Returns this release's table of {SLM size upper limit, PREFERRED_SLM_ALLOCATION_SIZE value to program}.
// Entries are listed in ascending upper-limit order; the last one uses the maximum uint32_t value as its
// limit and maps to the 160K encoding. The isHeapless argument is not consulted by this specialization.
template <>
const SizeToPreferredSlmValueArray &ReleaseHelperHw<release>::getSizeToPreferredSlmValue(bool isHeapless) const {
    using SlmAllocationSize = typename Xe2HpgCoreFamily::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
    constexpr auto kiloByte = MemoryConstants::kiloByte;

    // function-local static: the same table instance is returned by reference on every call
    static const SizeToPreferredSlmValueArray preferredSlmTable = {{
        // upper limit, value to program
        {0, SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_0K},
        {16 * kiloByte, SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_16K},
        {32 * kiloByte, SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_32K},
        {64 * kiloByte, SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_64K},
        {96 * kiloByte, SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_96K},
        {128 * kiloByte, SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_128K},
        {std::numeric_limits<uint32_t>::max(), SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_160K},
    }};
    return preferredSlmTable;
}
} // namespace NEO
#include "shared/source/release_helper/release_helper_common_xe2_hpg.inl"

View File

@@ -27,12 +27,6 @@ inline bool ReleaseHelperHw<release>::isAuxSurfaceModeOverrideRequired() const {
return true;
}
// Caps the requested preferred-SLM enum value at the PREFERRED_SLM_ALLOCATION_SIZE_128K encoding.
// Values at or below that encoding are returned unchanged.
template <>
int ReleaseHelperHw<release>::getProductMaxPreferredSlmSize(int preferredEnumValue) const {
    using SlmAllocationSize = typename Xe2HpgCoreFamily::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
    constexpr int maxSupportedValue = static_cast<int>(SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_128K);
    return (preferredEnumValue < maxSupportedValue) ? preferredEnumValue : maxSupportedValue;
}
template <>
bool ReleaseHelperHw<release>::isLocalOnlyAllowed() const {
return false;
@@ -43,6 +37,20 @@ bool ReleaseHelperHw<release>::isBindlessAddressingDisabled() const {
return false;
}
// Returns this release's table of {SLM size upper limit, PREFERRED_SLM_ALLOCATION_SIZE value to program}.
// Entries are listed in ascending upper-limit order; the last one uses the maximum uint32_t value as its
// limit and maps to the 128K encoding. The isHeapless argument is not consulted by this specialization.
template <>
const SizeToPreferredSlmValueArray &ReleaseHelperHw<release>::getSizeToPreferredSlmValue(bool isHeapless) const {
    using SlmAllocationSize = typename Xe2HpgCoreFamily::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
    constexpr auto kiloByte = MemoryConstants::kiloByte;

    // function-local static: the same table instance is returned by reference on every call
    static const SizeToPreferredSlmValueArray preferredSlmTable = {{
        // upper limit, value to program
        {0, SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_0K},
        {16 * kiloByte, SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_16K},
        {32 * kiloByte, SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_32K},
        {64 * kiloByte, SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_64K},
        {96 * kiloByte, SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_96K},
        {std::numeric_limits<uint32_t>::max(), SlmAllocationSize::PREFERRED_SLM_ALLOCATION_SIZE_128K},
    }};
    return preferredSlmTable;
}
} // namespace NEO
#include "shared/source/release_helper/release_helper_common_xe2_hpg.inl"

View File

@@ -253,64 +253,6 @@ void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDevice
walkerCmd.setComputeDispatchAllWalkerEnable(computeDispatchAllWalkerEnable);
}
// Picks the preferred SLM allocation size and programs it into the interface descriptor.
//
// pInterfaceDescriptor  - descriptor that receives setPreferredSlmAllocationSize()
// rootDeviceEnvironment - source of the hardware info and the release helper
// threadsPerThreadGroup - divisor used to derive the work group count per sub-slice
// slmTotalSize          - SLM size that the selected policy scales
// slmPolicy             - slmPolicyLargeData uses slmTotalSize as-is; any other value multiplies it
//                         by the work group count per sub-slice
//
// When the OverridePreferredSlmAllocationSizePerDss debug flag differs from -1, its value is programmed
// on top of the computed one.
template <>
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
    using PREFERRED_SLM_ALLOCATION_SIZE = typename InterfaceDescriptorType::PREFERRED_SLM_ALLOCATION_SIZE;

    const auto &gtSystemInfo = rootDeviceEnvironment.getHardwareInfo()->gtSystemInfo;
    const uint32_t threadsPerSubSlice = gtSystemInfo.ThreadCount / gtSystemInfo.SubSliceCount;
    const uint32_t workGroupsPerSubSlice = static_cast<uint32_t>(Math::divideAndRoundUp(threadsPerSubSlice, threadsPerThreadGroup));

    const uint32_t requiredSlmSize = (slmPolicy == SlmPolicy::slmPolicyLargeData)
                                         ? slmTotalSize
                                         : slmTotalSize * workGroupsPerSubSlice;

    struct SlmRange {
        uint32_t upperLimit;
        PREFERRED_SLM_ALLOCATION_SIZE valueToProgram;
    };

    // Only six entries are listed; the remaining array slots are value-initialized.
    const std::array<SlmRange, 10> slmRanges = {{
        // upper limit, value to program
        {0, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_0K},
        {16 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_16K},
        {32 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_32K},
        {64 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_64K},
        {96 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_96K},
        {128 * MemoryConstants::kiloByte, PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_128K},
    }};

    // Fallback when no range matches: the 160K encoding, limited by what the release helper allows.
    auto *releaseHelper = rootDeviceEnvironment.getReleaseHelper();
    auto selectedSlmValue = static_cast<PREFERRED_SLM_ALLOCATION_SIZE>(
        releaseHelper->getProductMaxPreferredSlmSize(PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_160K));

    // The first range whose upper limit covers the required size wins.
    for (const auto &slmRange : slmRanges) {
        if (requiredSlmSize > slmRange.upperLimit) {
            continue;
        }
        selectedSlmValue = slmRange.valueToProgram;
        break;
    }
    pInterfaceDescriptor->setPreferredSlmAllocationSize(selectedSlmValue);

    const auto overrideValue = debugManager.flags.OverridePreferredSlmAllocationSizePerDss.get();
    if (overrideValue != -1) {
        pInterfaceDescriptor->setPreferredSlmAllocationSize(static_cast<PREFERRED_SLM_ALLOCATION_SIZE>(overrideValue));
    }
}
template <>
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount,

View File

@@ -1644,3 +1644,15 @@ HWTEST2_F(CommandEncodeStatesTest, givenEncodeDispatchKernelWhenForcingDifferent
debugManager.flags.ForceIOHAlignment.set(expectedAlignemnt);
EXPECT_EQ(NEO::EncodeDispatchKernel<FamilyType>::getDefaultIOHAlignment(), expectedAlignemnt);
}
// Checks that getThreadCountPerSubslice() equals ThreadCount / DualSubSliceCount for families matched by IsAtMostXeHpcCore.
HWTEST2_F(CommandEncodeStatesTest, givenEncodeDispatchKernelWhenGettingThreadCountPerSubsliceThenUseDualSubSliceAsDenominator, IsAtMostXeHpcCore) {
    const auto &hardwareInfo = pDevice->getHardwareInfo();
    const auto &gtSystemInfo = hardwareInfo.gtSystemInfo;

    const auto threadsPerDualSubSlice = gtSystemInfo.ThreadCount / gtSystemInfo.DualSubSliceCount;
    const auto actualThreadCount = NEO::EncodeDispatchKernel<FamilyType>::getThreadCountPerSubslice(hardwareInfo);

    EXPECT_EQ(threadsPerDualSubSlice, actualThreadCount);
}
// Checks that getThreadCountPerSubslice() equals ThreadCount / SubSliceCount for families matched by IsAtLeastXe2HpgCore.
HWTEST2_F(CommandEncodeStatesTest, givenEncodeDispatchKernelWhenGettingThreadCountPerSubsliceThenUseSubSliceAsDenominator, IsAtLeastXe2HpgCore) {
    const auto &hardwareInfo = pDevice->getHardwareInfo();
    const auto &gtSystemInfo = hardwareInfo.gtSystemInfo;

    const auto threadsPerSubSlice = gtSystemInfo.ThreadCount / gtSystemInfo.SubSliceCount;
    const auto actualThreadCount = NEO::EncodeDispatchKernel<FamilyType>::getThreadCountPerSubslice(hardwareInfo);

    EXPECT_EQ(threadsPerSubSlice, actualThreadCount);
}

View File

@@ -75,6 +75,34 @@ TEST_F(ReleaseHelper2001Tests, whenIsLocalOnlyAllowedCalledThenFalseReturned) {
whenIsLocalOnlyAllowedCalledThenFalseReturned();
}
TEST_F(ReleaseHelper2001Tests, whenGettingPreferredSlmSizeThenAllEntriesEmpty) {
whenGettingPreferredSlmSizeThenAllEntriesEmpty();
// For every revision, verifies the first seven entries of the size-to-preferred-SLM table:
// upper limits 0, 16K, 32K, 64K, 96K, 128K and uint32_t max map to values 0 through 6.
TEST_F(ReleaseHelper2001Tests, whenGettingPreferredSlmSizeThenAllEntriesHaveCorrectValues) {
    constexpr uint32_t kB = 1024;

    struct ExpectedEntry {
        uint32_t upperLimit;
        uint32_t valueToProgram;
    };
    const ExpectedEntry expectedEntries[] = {
        {0u, 0u},
        {16 * kB, 1u},
        {32 * kB, 2u},
        {64 * kB, 3u},
        {96 * kB, 4u},
        {128 * kB, 5u},
        {std::numeric_limits<uint32_t>::max(), 6u},
    };

    for (auto &revision : getRevisions()) {
        ipVersion.revision = revision;
        releaseHelper = ReleaseHelper::create(ipVersion);
        ASSERT_NE(nullptr, releaseHelper);

        auto &preferredSlmValueArray = releaseHelper->getSizeToPreferredSlmValue(false);

        size_t entryIndex = 0;
        for (const auto &expectedEntry : expectedEntries) {
            EXPECT_EQ(expectedEntry.upperLimit, preferredSlmValueArray[entryIndex].upperLimit);
            EXPECT_EQ(expectedEntry.valueToProgram, preferredSlmValueArray[entryIndex].valueToProgram);
            ++entryIndex;
        }
    }
}

View File

@@ -44,28 +44,8 @@ TEST_F(ReleaseHelper2004Tests, whenGettingCapabilitiesThenCorrectPropertiesAreRe
}
}
TEST_F(ReleaseHelper2004Tests, whenGettingMaxPreferredSlmSizeThenSizeSizeIsLimitedBy128K) {
for (auto &revision : getRevisions()) {
ipVersion.revision = revision;
releaseHelper = ReleaseHelper::create(ipVersion);
ASSERT_NE(nullptr, releaseHelper);
using PREFERRED_SLM_ALLOCATION_SIZE = typename Xe2HpgCoreFamily::INTERFACE_DESCRIPTOR_DATA::PREFERRED_SLM_ALLOCATION_SIZE;
for (auto &preferredSlmSize : {PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_0K,
PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_16K,
PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_32K,
PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_64K,
PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_96K,
PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_128K}) {
auto maxPreferredSlmValue = releaseHelper->getProductMaxPreferredSlmSize(preferredSlmSize);
EXPECT_EQ(maxPreferredSlmValue, preferredSlmSize);
}
auto preferredSlmSize128k = PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_128K;
auto preferredSlmSize160k = PREFERRED_SLM_ALLOCATION_SIZE::PREFERRED_SLM_ALLOCATION_SIZE_160K;
auto maxPreferredSlmValue = releaseHelper->getProductMaxPreferredSlmSize(preferredSlmSize160k);
EXPECT_EQ(maxPreferredSlmValue, preferredSlmSize128k);
}
// Runs the whenGettingMaxPreferredSlmSizeThenSizeIsNotModified() check for the 2004 release helper tests.
// NOTE(review): the called helper is not defined in this view; presumably a shared check on the test fixture - confirm.
TEST_F(ReleaseHelper2004Tests, whenGettingMaxPreferredSlmSizeThenSizeIsNotModified) {
whenGettingMaxPreferredSlmSizeThenSizeIsNotModified();
}
TEST_F(ReleaseHelper2004Tests, whenShouldAdjustCalledThenTrueReturned) {
@@ -96,6 +76,31 @@ TEST_F(ReleaseHelper2004Tests, whenIsLocalOnlyAllowedCalledThenFalseReturned) {
whenIsLocalOnlyAllowedCalledThenFalseReturned();
}
TEST_F(ReleaseHelper2004Tests, whenGettingPreferredSlmSizeThenAllEntriesEmpty) {
whenGettingPreferredSlmSizeThenAllEntriesEmpty();
// For every revision, verifies the first six entries of the size-to-preferred-SLM table:
// upper limits 0, 16K, 32K, 64K, 96K and uint32_t max map to values 0 through 5.
TEST_F(ReleaseHelper2004Tests, whenGettingPreferredSlmSizeThenAllEntriesHaveCorrectValues) {
    constexpr uint32_t kB = 1024;

    struct ExpectedEntry {
        uint32_t upperLimit;
        uint32_t valueToProgram;
    };
    const ExpectedEntry expectedEntries[] = {
        {0u, 0u},
        {16 * kB, 1u},
        {32 * kB, 2u},
        {64 * kB, 3u},
        {96 * kB, 4u},
        {std::numeric_limits<uint32_t>::max(), 5u},
    };

    for (auto &revision : getRevisions()) {
        ipVersion.revision = revision;
        releaseHelper = ReleaseHelper::create(ipVersion);
        ASSERT_NE(nullptr, releaseHelper);

        auto &preferredSlmValueArray = releaseHelper->getSizeToPreferredSlmValue(false);

        size_t entryIndex = 0;
        for (const auto &expectedEntry : expectedEntries) {
            EXPECT_EQ(expectedEntry.upperLimit, preferredSlmValueArray[entryIndex].upperLimit);
            EXPECT_EQ(expectedEntry.valueToProgram, preferredSlmValueArray[entryIndex].valueToProgram);
            ++entryIndex;
        }
    }
}