feature: Define thread group dispatch size according to kernel metadata

Related-To: NEO-10945

Signed-off-by: Vysochyn, Illia <illia.vysochyn@intel.com>
This commit is contained in:
Vysochyn, Illia 2025-05-16 16:19:31 +00:00 committed by Compute-Runtime-Automation
parent e0362a7c39
commit f99a4c2193
17 changed files with 104 additions and 58 deletions

View File

@ -197,7 +197,8 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
}
EncodeDispatchKernel<GfxFamily>::encodeEuSchedulingPolicy(&interfaceDescriptor, kernelDescriptor, defaultPipelinedThreadArbitrationPolicy);
// Thread-group counts per dimension in X, Y, Z order, as read back from the walker command.
// The third entry must come from the Z getter; reading X twice would drop the Z dimension.
const uint32_t threadGroupDimensions[] = {walkerCmd->getThreadGroupIdXDimension(), walkerCmd->getThreadGroupIdYDimension(), walkerCmd->getThreadGroupIdZDimension()};
EncodeDispatchKernel<GfxFamily>::encodeThreadGroupDispatch(interfaceDescriptor, device, hardwareInfo, threadGroupDimensions, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, threadsPerThreadGroup, *walkerCmd);
EncodeDispatchKernel<GfxFamily>::encodeThreadGroupDispatch(interfaceDescriptor, device, hardwareInfo, threadGroupDimensions, threadGroupCount, kernelDescriptor.kernelMetadata.requiredThreadGroupDispatchSize,
kernelDescriptor.kernelAttributes.numGrfRequired, threadsPerThreadGroup, *walkerCmd);
*pInterfaceDescriptor = interfaceDescriptor;
return (size_t)offsetInterfaceDescriptor;

View File

@ -271,8 +271,8 @@ struct EncodeDispatchKernel : public EncodeDispatchKernelBase<GfxFamily> {
template <typename WalkerType, typename InterfaceDescriptorType>
static void encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup,
WalkerType &walkerCmd);
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t requiredThreadGroupDispatchSize,
const uint32_t grfCount, const uint32_t threadsPerThreadGroup, WalkerType &walkerCmd);
template <typename WalkerType>
static void setWalkerRegionSettings(WalkerType &walkerCmd, const NEO::Device &device, uint32_t partitionCount, uint32_t workgroupSize, uint32_t threadGroupCount, uint32_t maxWgCountPerTile, bool requiredDispatchWalkOrder);

View File

@ -11,7 +11,7 @@ template struct NEO::EncodeDispatchKernel<Family>;
template void NEO::EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields<Family::DefaultWalkerType>(const RootDeviceEnvironment &rootDeviceEnvironment, Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template void NEO::EncodeDispatchKernel<Family>::setGrfInfo<Family::DefaultWalkerType::InterfaceDescriptorType>(Family::DefaultWalkerType::InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment);
template void NEO::EncodeDispatchKernel<Family>::setupPreferredSlmSize<Family::DefaultWalkerType::InterfaceDescriptorType>(Family::DefaultWalkerType::InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy);
template void NEO::EncodeDispatchKernel<Family>::encodeThreadGroupDispatch<Family::DefaultWalkerType, Family::DefaultWalkerType::InterfaceDescriptorType>(Family::DefaultWalkerType::InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, Family::DefaultWalkerType &walkerCmd);
template void NEO::EncodeDispatchKernel<Family>::encodeThreadGroupDispatch<Family::DefaultWalkerType, Family::DefaultWalkerType::InterfaceDescriptorType>(Family::DefaultWalkerType::InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t requiredThreadGroupDispatchSize, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, Family::DefaultWalkerType &walkerCmd);
template void NEO::EncodeDispatchKernel<Family>::encode<Family::DefaultWalkerType>(CommandContainer &container, EncodeDispatchKernelArgs &args);
template void NEO::EncodeDispatchKernel<Family>::encodeThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const uint32_t *startWorkGroup, const uint32_t *numWorkGroups, const uint32_t *workGroupSizes, uint32_t simd, uint32_t localIdDimensions, uint32_t threadsPerThreadGroup, uint32_t threadExecutionMask, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, bool isIndirect, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
template void NEO::EncodeDispatchKernel<Family>::adjustWalkOrder<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);

View File

@ -387,7 +387,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
walkerCmd.setPredicateEnable(args.isPredicate);
auto threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadDimsVec, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadDimsVec, threadGroupCount,
kernelDescriptor.kernelMetadata.requiredThreadGroupDispatchSize, kernelDescriptor.kernelAttributes.numGrfRequired, threadsPerThreadGroup, walkerCmd);
if (debugManager.flags.PrintKernelDispatchParameters.get()) {
fprintf(stdout, "kernel, %s, grfCount, %d, simdSize, %d, tilesCount, %d, implicitScaling, %s, threadGroupCount, %d, numberOfThreadsInGpgpuThreadGroup, %d, threadGroupDimensions, %d, %d, %d, threadGroupDispatchSize enum, %d\n",
kernelDescriptor.kernelMetadata.kernelName.c_str(),
@ -1064,10 +1065,13 @@ void EncodeDispatchKernel<Family>::overrideDefaultValues(WalkerType &walkerCmd,
template <typename Family>
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, WalkerType &walkerCmd) {
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t requiredThreadGroupDispatchSize,
const uint32_t grfCount, const uint32_t threadsPerThreadGroup, WalkerType &walkerCmd) {
const auto &productHelper = device.getProductHelper();
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
if (requiredThreadGroupDispatchSize != 0) {
interfaceDescriptor.setThreadGroupDispatchSize(static_cast<typename InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE>(requiredThreadGroupDispatchSize));
} else if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
bool adjustTGDispatchSize = true;

View File

@ -80,6 +80,7 @@ inline constexpr ConstStringRef invalidKernel("invalid_kernel");
inline constexpr ConstStringRef vecTypeHint("vec_type_hint");
inline constexpr ConstStringRef workgroupSizeHint("work_group_size_hint");
inline constexpr ConstStringRef hintSuffix("_hint");
inline constexpr ConstStringRef intelReqdThreadgroupDispatchSize("intel_reqd_thread_group_dispatch_size");
} // namespace Attributes
namespace DebugEnv {
@ -459,12 +460,14 @@ using ReqdWorkgroupSizeT = std::array<int32_t, 3>;
using InvalidKernelT = ConstStringRef;
using WorkgroupSizeHint = std::array<int32_t, 3>;
using VecTypeHintT = ConstStringRef;
using IntelReqdThreadgroupDispatchSizeT = int32_t;
// Compile-time default values for the kernel attribute types; every default here is zero
// (scalar 0 or an all-zero three-element array).
namespace Defaults {
inline constexpr IntelReqdSubgroupSizeT intelReqdSubgroupSize = 0;
inline constexpr IntelReqdWorkgroupWalkOrder intelReqdWorkgroupWalkOrder = {0, 0, 0};
inline constexpr ReqdWorkgroupSizeT reqdWorkgroupSize = {0, 0, 0};
inline constexpr WorkgroupSizeHint workgroupSizeHint = {0, 0, 0};
// Zero default for the intel_reqd_thread_group_dispatch_size attribute.
inline constexpr IntelReqdThreadgroupDispatchSizeT intelReqdThreadgroupDispatchSize = 0;
} // namespace Defaults
struct AttributesBaseT {
@ -474,6 +477,7 @@ struct AttributesBaseT {
std::optional<InvalidKernelT> invalidKernel;
std::optional<WorkgroupSizeHint> workgroupSizeHint;
std::optional<VecTypeHintT> vecTypeHint;
std::optional<IntelReqdThreadgroupDispatchSizeT> intelReqdThreadgroupDispatchSize;
std::vector<std::pair<ConstStringRef, ConstStringRef>> otherHints;
};
} // namespace Attributes

View File

@ -801,6 +801,9 @@ DecodeError readZeInfoAttributes(const Yaml::YamlParser &parser, const Yaml::Nod
outAttributes.invalidKernel = parser.readValue(attributesMetadataNd);
} else if (key == Tags::Kernel::Attributes::vecTypeHint) {
outAttributes.vecTypeHint = parser.readValue(attributesMetadataNd);
} else if (key == Tags::Kernel::Attributes::intelReqdThreadgroupDispatchSize) {
outAttributes.intelReqdThreadgroupDispatchSize = AttributeTypes::Defaults::intelReqdThreadgroupDispatchSize;
validAttributes &= readZeInfoValueChecked(parser, attributesMetadataNd, *outAttributes.intelReqdThreadgroupDispatchSize, context, outErrReason);
} else if (key.contains(Tags::Kernel::Attributes::hintSuffix.data())) {
outAttributes.otherHints.push_back({key, parser.readValue(attributesMetadataNd)});
} else {
@ -849,10 +852,12 @@ void populateKernelSourceAttributes(NEO::KernelDescriptor &dst, const KernelAttr
appendAttributeIfSet(languageAttributes, AttributeTags::workgroupSizeHint, attributes.workgroupSizeHint);
appendAttributeIfSet(languageAttributes, AttributeTags::vecTypeHint, attributes.vecTypeHint);
appendAttributeIfSet(languageAttributes, AttributeTags::invalidKernel, attributes.invalidKernel);
appendAttributeIfSet(languageAttributes, AttributeTags::intelReqdThreadgroupDispatchSize, attributes.intelReqdThreadgroupDispatchSize);
dst.kernelAttributes.flags.isInvalid = attributes.invalidKernel.has_value();
dst.kernelAttributes.flags.requiresWorkgroupWalkOrder = attributes.intelReqdWorkgroupWalkOrder.has_value();
dst.kernelMetadata.requiredSubGroupSize = static_cast<uint8_t>(attributes.intelReqdSubgroupSize.value_or(0U));
dst.kernelMetadata.requiredThreadGroupDispatchSize = static_cast<uint8_t>(attributes.intelReqdThreadgroupDispatchSize.value_or(0U));
}
DecodeError decodeZeInfoKernelDebugEnvironment(KernelDescriptor &dst, Yaml::YamlParser &parser, const ZeInfoKernelSections &kernelSections, std::string &outErrReason, std::string &outWarning) {

View File

@ -291,7 +291,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
cmd.setPredicateEnable(args.isPredicate);
auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension();
EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd);
EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, 0, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd);
EncodeWalkerArgs walkerArgs{
.kernelExecutionType = KernelExecutionType::defaultType,
@ -669,8 +669,8 @@ void EncodeDispatchKernel<Family>::overrideDefaultValues(WalkerType &walkerCmd,
// No-op definition of encodeThreadGroupDispatch: the body is empty, so every argument
// (including requiredThreadGroupDispatchSize) is ignored and neither the interface
// descriptor nor the walker command is modified.
template <typename Family>
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup,
WalkerType &walkerCmd) {
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t requiredThreadGroupDispatchSize,
const uint32_t grfCount, const uint32_t threadsPerThreadGroup, WalkerType &walkerCmd) {
}
template <typename Family>

View File

@ -266,6 +266,7 @@ struct KernelDescriptor : NEO::NonCopyableAndNonMovableClass {
uint16_t compiledSubGroupsNumber = 0U;
uint8_t requiredSubGroupSize = 0U;
uint8_t requiredThreadGroupDispatchSize = 0U;
bool isGeneratedByIgc = true;
} kernelMetadata;

View File

@ -126,6 +126,18 @@ void populateKernelDescriptor(KernelDescriptor &dst, const SPatchKernelAttribute
}
}
// Look for "intel_reqd_thread_group_dispatch_size(<digits>" in the attribute string and, when
// present, store the decimal number that follows the opening parenthesis in the kernel metadata.
constexpr ConstStringRef attributeReqdThreadGroupDispatchSizeBeg = "intel_reqd_thread_group_dispatch_size(";
it = attributes.find(attributeReqdThreadGroupDispatchSizeBeg.begin());
if (std::string::npos != it) {
    auto &dispatchSize = dst.kernelMetadata.requiredThreadGroupDispatchSize;
    dispatchSize = 0U;
    // Start right after the prefix and fold each consecutive decimal digit into the value.
    for (it += attributeReqdThreadGroupDispatchSizeBeg.size(); ('0' <= attributes[it]) && ('9' >= attributes[it]); ++it) {
        dispatchSize *= 10;
        dispatchSize += attributes[it] - '0';
    }
}
constexpr ConstStringRef invalidKernelAttrBeg = "invalid_kernel(";
dst.kernelAttributes.flags.isInvalid = (attributes.find(invalidKernelAttrBeg.data()) != std::string::npos);
}

View File

@ -31,9 +31,12 @@ namespace NEO {
template <>
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, WalkerType &walkerCmd) {
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t requiredThreadGroupDispatchSize,
const uint32_t grfCount, const uint32_t threadsPerThreadGroup, WalkerType &walkerCmd) {
const auto &productHelper = device.getProductHelper();
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
if (requiredThreadGroupDispatchSize != 0) {
interfaceDescriptor.setThreadGroupDispatchSize(static_cast<typename InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE>(requiredThreadGroupDispatchSize));
} else if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
if (threadsPerThreadGroup == 1) {
interfaceDescriptor.setThreadGroupDispatchSize(static_cast<INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE>(2u));
} else {

View File

@ -2094,6 +2094,7 @@ kernels:
work_group_size_hint: [256, 2, 1]
new_user_hint: new_user_hint_value
invalid_kernel: invalid_kernel_reason
intel_reqd_thread_group_dispatch_size: 8
...
)===";
@ -2125,6 +2126,7 @@ kernels:
EXPECT_TRUE(equals(attributes.otherHints[0].first, "new_user_hint"));
EXPECT_TRUE(equals(attributes.otherHints[0].second, "new_user_hint_value"));
EXPECT_TRUE(equals(attributes.invalidKernel.value(), "invalid_kernel_reason"));
EXPECT_EQ(8, attributes.intelReqdThreadgroupDispatchSize.value());
}
TEST(ReadZeInfoDebugEnvironment, givenSipSurfaceBtiEntryThenSetProperMembers) {
@ -2366,6 +2368,7 @@ kernels:
intel_reqd_sub_group_size: 16
intel_reqd_workgroup_walk_order: [0, 1, 2]
reqd_work_group_size: [256, 2, 1]
intel_reqd_thread_group_dispatch_size: 8
vec_type_hint: uint
work_group_size_hint: [256, 2, 1]
new_user_hint: new_user_hint_value
@ -2376,8 +2379,9 @@ kernels:
EXPECT_TRUE(warnings.empty()) << warnings;
EXPECT_TRUE(errors.empty()) << errors;
EXPECT_STREQ("new_user_hint(new_user_hint_value) intel_reqd_sub_group_size(16) intel_reqd_workgroup_walk_order(0,1,2) reqd_work_group_size(256,2,1) work_group_size_hint(256,2,1) vec_type_hint(uint)", kernelDescriptor->kernelMetadata.kernelLanguageAttributes.c_str());
EXPECT_STREQ("new_user_hint(new_user_hint_value) intel_reqd_sub_group_size(16) intel_reqd_workgroup_walk_order(0,1,2) reqd_work_group_size(256,2,1) work_group_size_hint(256,2,1) vec_type_hint(uint) intel_reqd_thread_group_dispatch_size(8)", kernelDescriptor->kernelMetadata.kernelLanguageAttributes.c_str());
EXPECT_EQ(16U, kernelDescriptor->kernelMetadata.requiredSubGroupSize);
EXPECT_EQ(8U, kernelDescriptor->kernelMetadata.requiredThreadGroupDispatchSize);
EXPECT_FALSE(kernelDescriptor->kernelAttributes.flags.isInvalid);
}

View File

@ -85,7 +85,7 @@ HWTEST2_F(DG2CommandEncoderTest, givenInterfaceDescriptorDataWhenForceThreadGrou
for (auto numberOfThreadsInGroup : {1u, 4u, 16u}) {
iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInGroup);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, 0, 0, numberOfThreadsInGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, 0, 0, 0, numberOfThreadsInGroup, walkerCmd);
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
if (numberOfThreadsInGroup == 1) {

View File

@ -148,7 +148,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDispatchSizeSmallerOrEqualToA
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
for (const auto threadGroupCount : {1u, 2u}) {
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, 1u, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, 0u, numGrf, 1u, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
}
@ -168,15 +168,16 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenMultipleTilesAndImplicitScali
auto &gfxCoreHelper = pDevice->getGfxCoreHelper();
const uint32_t threadGroupCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf) / 32u;
uint32_t threadsPerThreadGroup = 64u;
const uint32_t requiredThreadGroupDispatchSize = 0u;
iddArg.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
ASSERT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
debugManager.flags.EnableWalkerPartition.set(1);
pDevice->numSubDevices = 2;
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
}
@ -190,6 +191,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupW
InterfaceDescriptorType iddArg = FamilyType::template getInitInterfaceDescriptor<InterfaceDescriptorType>();
const uint32_t threadGroupCount = 512u;
const uint32_t requiredThreadGroupDispatchSize = 0u;
const uint32_t numGrf = GrfConfig::defaultGrfNumber;
std::array<std::pair<uint32_t, uint32_t>, 3> testParams = {{{16u, InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8},
{32u, InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4},
@ -198,7 +200,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupW
for (const auto &[numberOfThreadsInThreadGroup, expectedThreadGroupDispatchSize] : testParams) {
iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(expectedThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());
}
@ -214,13 +216,14 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupA
InterfaceDescriptorType iddArg = FamilyType::template getInitInterfaceDescriptor<InterfaceDescriptorType>();
const uint32_t threadGroupCount = 512u;
const uint32_t requiredThreadGroupDispatchSize = 0u;
const uint32_t numGrf = GrfConfig::defaultGrfNumber;
uint32_t threadsPerThreadGroup = 16;
iddArg.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
{
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
walkerCmd.setThreadGroupIdYDimension(2);
@ -228,19 +231,19 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupA
{
walkerCmd.setThreadGroupIdXDimension(4);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize());
}
{
walkerCmd.setThreadGroupIdXDimension(2);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
}
{
walkerCmd.setThreadGroupIdXDimension(1);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
walkerCmd.setThreadGroupIdYDimension(1);
@ -248,19 +251,19 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupA
{
walkerCmd.setThreadGroupIdXDimension(4);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize());
}
{
walkerCmd.setThreadGroupIdXDimension(2);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
}
{
walkerCmd.setThreadGroupIdXDimension(1);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
walkerCmd.setThreadGroupIdYDimension(1);
@ -268,19 +271,19 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupA
{
walkerCmd.setThreadGroupIdXDimension(4);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
{
walkerCmd.setThreadGroupIdXDimension(2);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
{
walkerCmd.setThreadGroupIdXDimension(1);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
walkerCmd.setThreadGroupIdXDimension(1);
@ -288,19 +291,19 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupA
{
walkerCmd.setThreadGroupIdYDimension(4);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize());
}
{
walkerCmd.setThreadGroupIdYDimension(2);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
}
{
walkerCmd.setThreadGroupIdYDimension(1);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, threadsPerThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
}
@ -313,6 +316,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDifferentNumGrfWhenCallingEnc
InterfaceDescriptorType iddArg = FamilyType::template getInitInterfaceDescriptor<InterfaceDescriptorType>();
const uint32_t numberOfThreadsInThreadGroup = 1u;
const uint32_t requiredThreadGroupDispatchSize = 0u;
walkerCmd.setThreadGroupIdXDimension(1);
walkerCmd.setThreadGroupIdYDimension(1);
@ -323,7 +327,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDifferentNumGrfWhenCallingEnc
const uint32_t threadGroupCount = 1;
iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
ASSERT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
}
@ -332,7 +336,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDifferentNumGrfWhenCallingEnc
const uint32_t threadGroupCount = 1;
iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
}
}
@ -348,6 +352,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
mutableHwInfo->gtSystemInfo.ThreadCount = 4096u;
auto hwInfo = pDevice->getHardwareInfo();
const uint32_t requiredThreadGroupDispatchSize = 0u;
uint32_t numGrf = GrfConfig::defaultGrfNumber;
InterfaceDescriptorType iddArg = FamilyType::template getInitInterfaceDescriptor<InterfaceDescriptorType>();
@ -361,7 +366,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
}
@ -373,7 +378,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize());
}
@ -386,7 +391,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
}
@ -399,7 +404,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
@ -412,7 +417,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
}
@ -425,7 +430,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
}
@ -438,7 +443,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
}
@ -451,7 +456,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
}
@ -464,7 +469,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
}
@ -477,7 +482,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize());
}
@ -490,7 +495,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
}
@ -503,7 +508,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
}
@ -516,7 +521,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
}
@ -529,7 +534,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize());
}
@ -542,7 +547,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA
{
const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
}
}
@ -557,6 +562,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDualSubSliceCountNotEqualToMa
mutableHwInfo->gtSystemInfo.ThreadCount = 2048u;
auto hwInfo = pDevice->getHardwareInfo();
const uint32_t requiredThreadGroupDispatchSize = 0u;
uint32_t numGrf = GrfConfig::defaultGrfNumber;
InterfaceDescriptorType iddArg = FamilyType::template getInitInterfaceDescriptor<InterfaceDescriptorType>();
@ -569,7 +575,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDualSubSliceCountNotEqualToMa
walkerCmd.setThreadGroupIdYDimension(1);
walkerCmd.setThreadGroupIdZDimension(1);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
}
@ -586,6 +592,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupA
InterfaceDescriptorType iddArg = FamilyType::template getInitInterfaceDescriptor<InterfaceDescriptorType>();
const uint32_t threadGroupCount = 1u;
// Zero matches the value the kernel descriptor tests expect when no dispatch-size attribute is present.
const uint32_t requiredThreadGroupDispatchSize = 0u;
const uint32_t numGrf = GrfConfig::defaultGrfNumber;
std::array<std::pair<uint32_t, uint32_t>, 3> testParams = {{{16u, InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1},
{32u, InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1},
@ -594,7 +601,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupA
for (const auto &[numberOfThreadsInThreadGroup, expectedThreadGroupDispatchSize] : testParams) {
iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(expectedThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());
}
@ -610,12 +617,13 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenThreadGroupCountZeroWhenCalli
auto hwInfo = pDevice->getHardwareInfo();
const uint32_t threadGroupCount = 1u;
const uint32_t requiredThreadGroupDispatchSize = 0u;
const uint32_t numGrf = GrfConfig::defaultGrfNumber;
InterfaceDescriptorType iddArg = FamilyType::template getInitInterfaceDescriptor<InterfaceDescriptorType>();
uint32_t numberOfThreadsInThreadGroup = 1;
iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, requiredThreadGroupDispatchSize, numGrf, numberOfThreadsInThreadGroup, walkerCmd);
EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
}

View File

@ -720,7 +720,7 @@ HWTEST2_F(CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGr
uint32_t threadsPerThreadGroup = 4;
for (auto revision : revisions) {
hwInfo.platform.usRevId = productHelper.getHwRevIdFromStepping(revision, hwInfo);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, 0, threadsPerThreadGroup, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, 0, 0, threadsPerThreadGroup, walkerCmd);
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
@ -748,7 +748,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInterfaceDescriptorDa
DebugManagerStateRestore restorer;
debugManager.flags.ForceThreadGroupDispatchSize.set(forceThreadGroupDispatchSize);
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, pDevice->getHardwareInfo(), threadGroups, threadGroupCount, 1, 1, walkerCmd);
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, *pDevice, pDevice->getHardwareInfo(), threadGroups, threadGroupCount, 0, 1, 1, walkerCmd);
EXPECT_NE(defaultThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());
EXPECT_EQ(forceThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2024 Intel Corporation
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -444,6 +444,7 @@ TEST(KernelDescriptorFromPatchtokens, GivenhKernelAttributesThenPopulatesStrings
NEO::populateKernelDescriptor(kernelDescriptor, kernelTokens, 4);
EXPECT_TRUE(kernelDescriptor.kernelMetadata.kernelLanguageAttributes.empty());
EXPECT_EQ(0U, kernelDescriptor.kernelMetadata.requiredSubGroupSize);
EXPECT_EQ(0U, kernelDescriptor.kernelMetadata.requiredThreadGroupDispatchSize);
iOpenCL::SPatchKernelAttributesInfo kernelAttributesToken;
kernelAttributesToken.AttributesSize = 0U;
@ -451,8 +452,9 @@ TEST(KernelDescriptorFromPatchtokens, GivenhKernelAttributesThenPopulatesStrings
NEO::populateKernelDescriptor(kernelDescriptor, kernelTokens, 4);
EXPECT_TRUE(kernelDescriptor.kernelMetadata.kernelLanguageAttributes.empty());
EXPECT_EQ(0U, kernelDescriptor.kernelMetadata.requiredSubGroupSize);
EXPECT_EQ(0U, kernelDescriptor.kernelMetadata.requiredThreadGroupDispatchSize);
std::string attribute = "intel_reqd_sub_group_size(32)";
std::string attribute = "intel_reqd_sub_group_size(32) intel_reqd_thread_group_dispatch_size(8)";
kernelAttributesToken.AttributesSize = static_cast<uint32_t>(attribute.size());
std::vector<uint8_t> tokenStorage;
tokenStorage.insert(tokenStorage.end(), reinterpret_cast<uint8_t *>(&kernelAttributesToken), reinterpret_cast<uint8_t *>(&kernelAttributesToken + 1));
@ -462,6 +464,7 @@ TEST(KernelDescriptorFromPatchtokens, GivenhKernelAttributesThenPopulatesStrings
NEO::populateKernelDescriptor(kernelDescriptor, kernelTokens, 4);
EXPECT_EQ(attribute, kernelDescriptor.kernelMetadata.kernelLanguageAttributes);
EXPECT_EQ(32U, kernelDescriptor.kernelMetadata.requiredSubGroupSize);
EXPECT_EQ(8U, kernelDescriptor.kernelMetadata.requiredThreadGroupDispatchSize);
EXPECT_FALSE(kernelDescriptor.kernelAttributes.flags.isInvalid);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2024 Intel Corporation
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -86,6 +86,7 @@ TEST(KernelDescriptor, WhenDefaultInitializedThenValuesAreCleared) {
EXPECT_TRUE(desc.kernelMetadata.printfStringsMap.empty());
EXPECT_EQ(0U, desc.kernelMetadata.compiledSubGroupsNumber);
EXPECT_EQ(0U, desc.kernelMetadata.requiredSubGroupSize);
EXPECT_EQ(0U, desc.kernelMetadata.requiredThreadGroupDispatchSize);
EXPECT_EQ(nullptr, desc.external.debugData.get());
EXPECT_EQ(nullptr, desc.external.igcInfoForGtpin);
}
@ -271,4 +272,4 @@ TEST(KernelDescriptor, GivenDescriptorWithoutStatefulArgsWhenInitBindlessOffsets
desc.initBindlessOffsetToSurfaceState();
EXPECT_EQ(0u, desc.bindlessArgsMap.size());
}
}

View File

@ -67,7 +67,7 @@ XE3_CORETEST_F(CommandEncodeXe3CoreTest, givenInterfaceDescriptorDataWhenAdjustI
MockDevice mockDevice;
uint32_t threadsPerGroup = 1;
uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()};
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, mockDevice, *defaultHwInfo, threadGroups, 1, 0, threadsPerGroup, walkerCmd);
// Arguments after threadGroups are: threadGroupCount, requiredThreadGroupDispatchSize, grfCount.
// Keep threadGroupCount = 1 and grfCount = 0 as in the previous call; the newly added
// requiredThreadGroupDispatchSize is 0, matching the other updated call sites.
EncodeDispatchKernel<FamilyType>::encodeThreadGroupDispatch(iddArg, mockDevice, *defaultHwInfo, threadGroups, 1, 0, 0, threadsPerGroup, walkerCmd);
EXPECT_EQ(2u, iddArg.getBindingTableEntryCount());
EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT_BETWEEN_1_AND_4_SAMPLERS_USED, iddArg.getSamplerCount());
}