Apply heuristics when setting TG dispatch size on XE_HPC_CORE
The default TG dispatch size can be changed to a better value based on the number of threads in the TG or on the number of threads currently available on the GPU. The decision on which TG dispatch size to use is made by the implemented heuristics; an illustrative sketch of the decision flow follows the commit metadata below.

Signed-off-by: Rafal Maziejuk <rafal.maziejuk@intel.com>
Related-To: NEO-6989
committed by Compute-Runtime-Automation
parent 52133e61ce
commit ed0c36117e
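The heuristic can be read as one standalone decision function. The sketch below is illustrative only and is not driver code: the 16/32-thread thresholds and the divisibility fallback come from the diff that follows, the enum ordering (TG_SIZE_8 = 0 through TG_SIZE_1 = 3) is inferred from the updated unit tests, and availableThreadCount stands in for whatever HwHelper::calculateAvailableThreadCount returns on the target device.

    #include <cstdint>

    // Illustrative sketch of the XE_HPC_CORE TG dispatch size heuristic (not driver code).
    // Values mirror the INTERFACE_DESCRIPTOR_DATA encoding observed in the updated tests:
    // TG_SIZE_8 = 0, TG_SIZE_4 = 1, TG_SIZE_2 = 2, TG_SIZE_1 = 3.
    // In the driver this path is additionally gated on isDisableOverdispatchAvailable()
    // and an even SliceCount.
    enum class TgDispatchSize : uint32_t { TgSize8 = 0, TgSize4 = 1, TgSize2 = 2, TgSize1 = 3 };

    TgDispatchSize selectTgDispatchSize(uint32_t threadsInTg, uint32_t threadGroupCount,
                                        uint32_t availableThreadCount) {
        uint32_t dispatchedTotalThreadCount = threadsInTg * threadGroupCount;

        TgDispatchSize size;
        if (dispatchedTotalThreadCount <= availableThreadCount) {
            size = TgDispatchSize::TgSize1; // everything fits on the GPU at once, no batching needed
        } else if (threadsInTg <= 16u) {
            size = TgDispatchSize::TgSize8; // small thread groups: dispatch 8 TGs per step
        } else if (threadsInTg <= 32u) {
            size = TgDispatchSize::TgSize4;
        } else {
            size = TgDispatchSize::TgSize2;
        }

        // Fall back to TG_SIZE_1 when the total thread count does not split evenly into batches.
        uint32_t batch = 1u << (static_cast<uint32_t>(TgDispatchSize::TgSize1) - static_cast<uint32_t>(size));
        if ((dispatchedTotalThreadCount % (threadsInTg * batch)) != 0) {
            size = TgDispatchSize::TgSize1;
        }
        return size;
    }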
@@ -97,7 +97,7 @@ struct EncodeDispatchKernel {
 
     static void programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
 
-    static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo);
+    static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf);
 
     static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
@@ -672,7 +672,7 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
 }
 
 template <typename Family>
-void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo) {}
+void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {}
 
 template <typename Family>
 constexpr bool EncodeDispatchKernel<Family>::shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool updateCurrent) { return false; }
@@ -221,7 +221,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
         container.getResidencyContainer().push_back(args.device->getBindlessHeapsHelper()->getHeap(NEO::BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH)->getGraphicsAllocation());
     }
 
-    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, hwInfo);
+    auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension();
+    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired);
 
     PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *args.device);
@@ -257,7 +257,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
 
     walkerCmd.setPredicateEnable(args.isPredicate);
 
-    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, hwInfo);
+    auto threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
+    EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired);
 
     EncodeDispatchKernel<Family>::appendAdditionalIDDFields(&idd, hwInfo, threadsPerThreadGroup,
                                                             args.dispatchInterface->getSlmTotalSize(),
@@ -11,6 +11,7 @@
 #include "shared/source/helpers/aligned_memory.h"
 #include "shared/source/helpers/debug_helpers.h"
 #include "shared/source/kernel/debug_data.h"
+#include "shared/source/kernel/grf_config.h"
 #include "shared/source/kernel/kernel_arg_descriptor.h"
 #include "shared/source/kernel/kernel_arg_metadata.h"
 #include "shared/source/utilities/stackvec.h"
@@ -149,7 +150,7 @@ struct KernelDescriptor {
         uint16_t inlineDataPayloadSize = 0U;
         uint16_t perThreadDataSize = 0U;
         uint16_t numArgsToPatch = 0U;
-        uint16_t numGrfRequired = 0U;
+        uint16_t numGrfRequired = GrfConfig::DefaultGrfNumber;
         uint8_t barrierCount = 0u;
         bool hasNonKernelArgLoad = true;
        bool hasNonKernelArgStore = true;
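With this change a default-initialized descriptor already reports a non-zero GRF count: GrfConfig::DefaultGrfNumber evaluates to 128 according to the updated KernelDescriptor test further down, which the UNRECOVERABLE_IF(numGrf == 0u) check in the new heuristic relies on. A minimal illustration of the new default, using a hypothetical stand-in struct rather than the real NEO type:

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-in for the kernelAttributes block, only to show the changed default;
    // the real field lives in NEO::KernelDescriptor and is initialized with GrfConfig::DefaultGrfNumber.
    struct KernelAttributesSketch {
        uint16_t numGrfRequired = 128u; // was 0U before this commit
    };

    int main() {
        KernelAttributesSketch attrs{};
        assert(attrs.numGrfRequired == 128u); // matches the updated KernelDescriptor unit test
        return 0;
    }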
@@ -61,7 +61,7 @@ void EncodeDispatchKernel<Family>::programBarrierEnable(INTERFACE_DESCRIPTOR_DAT
 }
 
 template <>
-void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo) {
+void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {
     const auto &hwInfoConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily);
     if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
         interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
@@ -31,10 +31,40 @@ void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd,
 }
 
 template <>
-void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo) {
+void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {
     const auto &hwInfoConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily);
+    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
 
     if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
         interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
+
+        if (hwInfo.gtSystemInfo.SliceCount % 2 == 0) {
+            UNRECOVERABLE_IF(numGrf == 0u);
+
+            constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
+            constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
+            uint32_t availableThreadCount = hwHelper.calculateAvailableThreadCount(hwInfo, numGrf);
+            uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
+            uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
+
+            UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
+
+            if (dispatchedTotalThreadCount <= availableThreadCount) {
+                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
+            } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
+                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
+            } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
+                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
+            } else {
+                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
+            }
+
+            uint32_t exponent = INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1 - interfaceDescriptor.getThreadGroupDispatchSize();
+            uint32_t threadGroupDispatchSize = 1u << exponent;
+            if ((dispatchedTotalThreadCount % (numberOfThreadsInThreadGroup * threadGroupDispatchSize)) != 0) {
+                interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
+            }
+        }
     }
 
     if (DebugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
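The exponent computation above relies on the descriptor encoding in which TG_SIZE_1 carries the largest numeric value and TG_SIZE_8 the smallest (3 and 0 respectively, as the updated tests below suggest), so 1u << (TG_SIZE_1 - current) recovers the number of thread groups dispatched per step. A worked example under that assumption, mirroring the givenIndivisibleDispatchSize test added further down:

    #include <cassert>
    #include <cstdint>

    int main() {
        // 343 thread groups of 18 threads each, assuming the total exceeds availableThreadCount
        // so the threshold branch (16 < 18 <= 32) selects TG_SIZE_4 (numeric value 1).
        uint32_t numberOfThreadsInThreadGroup = 18u;
        uint32_t threadGroupCount = 343u;
        uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount; // 6174

        uint32_t selected = 1u;                            // TG_SIZE_4
        uint32_t exponent = 3u - selected;                 // TG_SIZE_1 - TG_SIZE_4 == 2
        uint32_t threadGroupDispatchSize = 1u << exponent; // 4 thread groups per dispatch step

        // 6174 is not divisible by 18 * 4 == 72, so the heuristic falls back to TG_SIZE_1,
        // which is what the new givenIndivisibleDispatchSize test expects.
        assert((dispatchedTotalThreadCount % (numberOfThreadsInThreadGroup * threadGroupDispatchSize)) != 0);
        return 0;
    }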
@@ -85,7 +85,7 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTO
 }
 
 template <>
-void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo) {
+void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {
     const auto &hwInfoConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily);
     if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
         if (interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup() == 1) {
@@ -60,16 +60,16 @@ HWTEST2_F(DG2CommandEncoderTest, givenInterfaceDescriptorDataWhenForceThreadGrou
 
     for (auto numberOfThreadsInGroup : {1u, 4u, 16u}) {
         iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInGroup);
-        EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo);
+        EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, 0, 0);
 
         if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
             if (numberOfThreadsInGroup == 1) {
-                EXPECT_EQ(2u, iddArg.getThreadGroupDispatchSize());
+                EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize());
             } else {
-                EXPECT_EQ(3u, iddArg.getThreadGroupDispatchSize());
+                EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
             }
         } else {
-            EXPECT_EQ(0u, iddArg.getThreadGroupDispatchSize());
+            EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
         }
     }
 }
@@ -798,6 +798,7 @@ HWTEST_F(EncodeDispatchKernelTest, givenNonBindlessOrStatelessArgWhenDispatching
 
     dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.numEntries = numBindingTable;
     dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.tableOffset = 0U;
+    dispatchInterface->kernelDescriptor.kernelAttributes.numGrfRequired = 128U;
 
     auto &arg = dispatchInterface->kernelDescriptor.payloadMappings.explicitArgs[0].as<NEO::ArgDescPointer>();
     arg.bindless = NEO::undefined<CrossThreadDataOffset>;
@@ -828,6 +829,7 @@ HWTEST_F(EncodeDispatchKernelTest, givenNonBindlessOrStatelessArgWhenDispatching
 
     dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.numEntries = numBindingTable;
     dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.tableOffset = 0U;
+    dispatchInterface->kernelDescriptor.kernelAttributes.numGrfRequired = 128U;
 
     sshData = reinterpret_cast<uint8_t *>(&bindingTableState);
     dispatchInterface->getSurfaceStateHeapDataResult = const_cast<uint8_t *>(sshData);
@@ -614,12 +614,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInterfaceDescriptorDa
     uint32_t revisions[] = {REVISION_A0, REVISION_B};
     for (auto revision : revisions) {
         hwInfo.platform.usRevId = hwInfoConfig.getHwRevIdFromStepping(revision, hwInfo);
-        EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo);
+        EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, 0, 0);
 
         if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
-            EXPECT_EQ(3u, iddArg.getThreadGroupDispatchSize());
+            EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
         } else {
-            EXPECT_EQ(0u, iddArg.getThreadGroupDispatchSize());
+            EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
         }
     }
 }
@@ -629,13 +629,16 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInterfaceDescriptorDa
 
     INTERFACE_DESCRIPTOR_DATA iddArg;
     iddArg = FamilyType::cmdInitInterfaceDescriptorData;
+    iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u);
 
     const uint32_t forceThreadGroupDispatchSize = 1;
     const uint32_t defaultThreadGroupDispatchSize = iddArg.getThreadGroupDispatchSize();
+    const uint32_t threadGroupCount = 1u;
 
     DebugManagerStateRestore restorer;
     DebugManager.flags.ForceThreadGroupDispatchSize.set(forceThreadGroupDispatchSize);
 
-    EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, pDevice->getHardwareInfo());
+    EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, pDevice->getHardwareInfo(), threadGroupCount, 1);
 
     EXPECT_NE(defaultThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());
     EXPECT_EQ(forceThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());
@@ -25,7 +25,7 @@ TEST(KernelDescriptor, WhenDefaultInitializedThenValuesAreCleared) {
     EXPECT_EQ(0U, desc.kernelAttributes.crossThreadDataSize);
     EXPECT_EQ(0U, desc.kernelAttributes.perThreadDataSize);
     EXPECT_EQ(0U, desc.kernelAttributes.numArgsToPatch);
-    EXPECT_EQ(0U, desc.kernelAttributes.numGrfRequired);
+    EXPECT_EQ(128U, desc.kernelAttributes.numGrfRequired);
     EXPECT_EQ(NEO::KernelDescriptor::BindfulAndStateless, desc.kernelAttributes.bufferAddressingMode);
     EXPECT_EQ(NEO::KernelDescriptor::Bindful, desc.kernelAttributes.imageAddressingMode);
     EXPECT_EQ(NEO::KernelDescriptor::Bindful, desc.kernelAttributes.samplerAddressingMode);
@@ -9,6 +9,7 @@
 
 HWTEST_EXCLUDE_PRODUCT(CommandEncodeStatesTest, givenSlmTotalSizeEqualZeroWhenDispatchingKernelThenSharedMemorySizeIsSetCorrectly, IGFX_XE_HPC_CORE);
 HWTEST_EXCLUDE_PRODUCT(CommandEncodeStatesTest, givenOverrideSlmTotalSizeDebugVariableWhenDispatchingKernelThenSharedMemorySizeIsSetCorrectly, IGFX_XE_HPC_CORE);
+HWTEST_EXCLUDE_PRODUCT(CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGroupDispatchSizeVariableIsDefaultThenThreadGroupDispatchSizeIsNotChanged, IGFX_XE_HPC_CORE);
 HWTEST_EXCLUDE_PRODUCT(DrmMemoryManagerTest, givenDrmAllocationWithHostPtrWhenItIsCreatedWithIncorrectCacheRegionThenReturnNull, IGFX_XE_HPC_CORE);
 HWTEST_EXCLUDE_PRODUCT(DrmMemoryManagerTest, givenDrmAllocationWithWithAlignmentFromUserptrWhenItIsCreatedWithIncorrectCacheRegionThenReturnNull, IGFX_XE_HPC_CORE);
 HWTEST_EXCLUDE_PRODUCT(WalkerPartitionTests, givenMiAtomicWhenItIsProgrammedThenAllFieldsAreSetCorrectly, IGFX_XE_HPC_CORE);
@@ -508,3 +508,100 @@ XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenCleanHeapsAndSlmNotChangedAndU
     EXPECT_EQ(cmdSba->getStatelessDataPortAccessMemoryObjectControlState(),
               (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED)));
 }
+
+XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenInterfaceDescriptorDataAndOddSliceCountWhenAdjustInterfaceDescriptorDataIsCalledThenThreadGroupDispatchSizeIsCorrectlySet) {
+    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
+
+    INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData;
+    const auto &hwInfoConfig = *HwInfoConfig::get(productFamily);
+    auto hwInfo = pDevice->getHardwareInfo();
+    hwInfo.gtSystemInfo.SliceCount = 7u;
+
+    for (const auto &revision : {REVISION_A0, REVISION_B}) {
+        hwInfo.platform.usRevId = hwInfoConfig.getHwRevIdFromStepping(revision, hwInfo);
+
+        EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, 0, 0);
+
+        if (hwInfoConfig.isDisableOverdispatchAvailable(hwInfo)) {
+            EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
+        } else {
+            EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize());
+        }
+    }
+}
+
+XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenDispatchSizeSmallerOrEqualToAvailableThreadCountWhenAdjustInterfaceDescriptorDataIsCalledThenThreadGroupDispatchSizeIsCorrectlySet) {
+    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
+
+    const auto &hwInfoConfig = *HwInfoConfig::get(productFamily);
+    auto hwInfo = pDevice->getHardwareInfo();
+    hwInfo.platform.usRevId = hwInfoConfig.getHwRevIdFromStepping(REVISION_B, hwInfo);
+    hwInfo.gtSystemInfo.EUCount = 2u;
+
+    const uint32_t numGrf = 1024u;
+    INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData;
+    iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u);
+
+    for (const auto threadGroupCount : {1u, 2u}) {
+        EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, threadGroupCount, numGrf);
+
+        EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
+    }
+}
+
+XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenNumberOfThreadsInThreadGroupWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet) {
+    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
+
+    const auto &hwInfoConfig = *HwInfoConfig::get(productFamily);
+    auto hwInfo = pDevice->getHardwareInfo();
+    hwInfo.platform.usRevId = hwInfoConfig.getHwRevIdFromStepping(REVISION_B, hwInfo);
+
+    INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData;
+    const uint32_t threadGroupCount = 512u;
+    const uint32_t numGrf = 256u;
+    std::array<std::pair<uint32_t, uint32_t>, 3> testParams = {{{16u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8},
+                                                                {32u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4},
+                                                                {64u, INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2}}};
+
+    for (const auto &[numberOfThreadsInThreadGroup, expectedThreadGroupDispatchSize] : testParams) {
+        iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup);
+
+        EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, threadGroupCount, numGrf);
+
+        EXPECT_EQ(expectedThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());
+    }
+}
+
+XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenIndivisibleDispatchSizeWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet) {
+    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
+
+    const auto &hwInfoConfig = *HwInfoConfig::get(productFamily);
+    auto hwInfo = pDevice->getHardwareInfo();
+    hwInfo.platform.usRevId = hwInfoConfig.getHwRevIdFromStepping(REVISION_B, hwInfo);
+
+    const uint32_t threadGroupCount = 343u;
+    const uint32_t numGrf = 256u;
+    INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData;
+    iddArg.setNumberOfThreadsInGpgpuThreadGroup(18u);
+
+    EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, threadGroupCount, numGrf);
+
+    EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
+}
+
+XE_HPC_CORETEST_F(EncodeKernelXeHpcCoreTest, givenThreadGroupCountZeroWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsSetToDefault) {
+    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
+
+    const auto &hwInfoConfig = *HwInfoConfig::get(productFamily);
+    auto hwInfo = pDevice->getHardwareInfo();
+    hwInfo.platform.usRevId = hwInfoConfig.getHwRevIdFromStepping(REVISION_B, hwInfo);
+
+    const uint32_t threadGroupCount = 0u;
+    const uint32_t numGrf = 256u;
+    INTERFACE_DESCRIPTOR_DATA iddArg = FamilyType::cmdInitInterfaceDescriptorData;
+    iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u);
+
+    EncodeDispatchKernel<FamilyType>::adjustInterfaceDescriptorData(iddArg, hwInfo, threadGroupCount, numGrf);
+
+    EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize());
+}