mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-10 15:12:56 +08:00
Apply heuristics when setting TG dispatch size on XE_HPC_CORE
The default TG dispatch size can be changed to a better value based on the number of threads in the TG or the number of threads currently available on the GPU. Decisions on what the TG dispatch size should be are based on the implemented heuristics. Signed-off-by: Rafal Maziejuk <rafal.maziejuk@intel.com> Related-To: NEO-6989
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
52133e61ce
commit
ed0c36117e
@@ -65,6 +65,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
|
||||
size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
|
||||
size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
|
||||
auto threadGroupCount = static_cast<uint32_t>(walkerArgs.numberOfWorkgroups->x * walkerArgs.numberOfWorkgroups->y * walkerArgs.numberOfWorkgroups->z);
|
||||
|
||||
if (walkerArgs.currentTimestampPacketNodes && commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNode = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
|
||||
@@ -83,6 +84,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
kernel.getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
||||
simd,
|
||||
walkerArgs.localWorkSizes,
|
||||
threadGroupCount,
|
||||
walkerArgs.offsetInterfaceDescriptorTable,
|
||||
walkerArgs.interfaceDescriptorIndex,
|
||||
walkerArgs.preemptionMode,
|
||||
|
||||
@@ -60,6 +60,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
|
||||
size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
|
||||
size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
|
||||
auto threadGroupCount = static_cast<uint32_t>(walkerArgs.numberOfWorkgroups->x * walkerArgs.numberOfWorkgroups->y * walkerArgs.numberOfWorkgroups->z);
|
||||
uint32_t requiredWalkOrder = 0u;
|
||||
|
||||
bool localIdsGenerationByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
|
||||
@@ -98,6 +99,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
kernel.getKernelStartAddress(localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed, false),
|
||||
simd,
|
||||
walkerArgs.localWorkSizes,
|
||||
threadGroupCount,
|
||||
walkerArgs.offsetInterfaceDescriptorTable,
|
||||
walkerArgs.interfaceDescriptorIndex,
|
||||
walkerArgs.preemptionMode,
|
||||
|
||||
@@ -47,6 +47,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
|
||||
size_t bindingTablePointer,
|
||||
[[maybe_unused]] size_t offsetSamplerState,
|
||||
uint32_t numSamplers,
|
||||
const uint32_t threadGroupCount,
|
||||
uint32_t numThreadsPerThreadGroup,
|
||||
const Kernel &kernel,
|
||||
uint32_t bindingTablePrefetchSize,
|
||||
@@ -79,6 +80,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
|
||||
uint64_t kernelStartOffset,
|
||||
uint32_t simd,
|
||||
const size_t localWorkSize[3],
|
||||
const uint32_t threadGroupCount,
|
||||
const uint64_t offsetInterfaceDescriptorTable,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
PreemptionMode preemptionMode,
|
||||
|
||||
@@ -113,6 +113,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
|
||||
size_t bindingTablePointer,
|
||||
[[maybe_unused]] size_t offsetSamplerState,
|
||||
uint32_t numSamplers,
|
||||
const uint32_t threadGroupCount,
|
||||
uint32_t threadsPerThreadGroup,
|
||||
const Kernel &kernel,
|
||||
uint32_t bindingTablePrefetchSize,
|
||||
@@ -169,7 +170,8 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
|
||||
hardwareInfo);
|
||||
|
||||
PreemptionHelper::programInterfaceDescriptorDataPreemption<GfxFamily>(&interfaceDescriptor, preemptionMode);
|
||||
EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorData(interfaceDescriptor, hardwareInfo);
|
||||
|
||||
EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorData(interfaceDescriptor, hardwareInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired);
|
||||
|
||||
*pInterfaceDescriptor = interfaceDescriptor;
|
||||
return (size_t)offsetInterfaceDescriptor;
|
||||
@@ -185,6 +187,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
uint64_t kernelStartOffset,
|
||||
uint32_t simd,
|
||||
const size_t localWorkSize[3],
|
||||
const uint32_t threadGroupCount,
|
||||
const uint64_t offsetInterfaceDescriptorTable,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
PreemptionMode preemptionMode,
|
||||
@@ -263,6 +266,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
dstBindingTablePointer,
|
||||
samplerStateOffset,
|
||||
samplerCount,
|
||||
threadGroupCount,
|
||||
threadsPerThreadGroup,
|
||||
kernel,
|
||||
bindingTablePrefetchSize,
|
||||
|
||||
@@ -88,9 +88,10 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenProgramInterfaceDescriptor
|
||||
auto usedIndirectHeapBefore = indirectHeap.getUsed();
|
||||
indirectHeap.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));
|
||||
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
size_t crossThreadDataSize = kernel->getCrossThreadDataSize();
|
||||
HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
|
||||
indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, 1, *kernel, 0, pDevice->getPreemptionMode(), nullptr, *pDevice);
|
||||
indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, threadGroupCount, 1, *kernel, 0, pDevice->getPreemptionMode(), nullptr, *pDevice);
|
||||
|
||||
auto usedIndirectHeapAfter = indirectHeap.getUsed();
|
||||
EXPECT_EQ(sizeof(INTERFACE_DESCRIPTOR_DATA), usedIndirectHeapAfter - usedIndirectHeapBefore);
|
||||
@@ -309,6 +310,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
|
||||
|
||||
const size_t localWorkSize = 256;
|
||||
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
|
||||
auto &commandStream = cmdQ.getCS(1024);
|
||||
auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
|
||||
@@ -343,6 +345,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenAllocatingIndirectStateRes
|
||||
kernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
||||
kernel->getKernelInfo().getMaxSimdSize(),
|
||||
localWorkSizes,
|
||||
threadGroupCount,
|
||||
idToffset,
|
||||
interfaceDescriptorIndex,
|
||||
pDevice->getPreemptionMode(),
|
||||
@@ -385,6 +388,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
|
||||
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192);
|
||||
const size_t localWorkSize = 256;
|
||||
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
uint32_t interfaceDescriptorIndex = 0;
|
||||
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
|
||||
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
|
||||
@@ -398,6 +402,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
|
||||
mockKernelWithInternal->mockKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
||||
mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
|
||||
localWorkSizes,
|
||||
threadGroupCount,
|
||||
0,
|
||||
interfaceDescriptorIndex,
|
||||
pDevice->getPreemptionMode(),
|
||||
@@ -431,6 +436,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
|
||||
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192);
|
||||
const size_t localWorkSize = 256;
|
||||
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
uint32_t interfaceDescriptorIndex = 0;
|
||||
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
|
||||
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
|
||||
@@ -444,6 +450,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
|
||||
mockKernelWithInternal->mockKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
||||
mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
|
||||
localWorkSizes,
|
||||
threadGroupCount,
|
||||
0,
|
||||
interfaceDescriptorIndex,
|
||||
pDevice->getPreemptionMode(),
|
||||
@@ -487,6 +494,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
|
||||
const size_t localWorkSizeY = 3;
|
||||
const size_t localWorkSizeZ = 4;
|
||||
const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
|
||||
auto &commandStream = cmdQ.getCS(1024);
|
||||
auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
|
||||
@@ -523,6 +531,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
|
||||
mockKernel.getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
||||
modifiedKernelInfo.getMaxSimdSize(),
|
||||
localWorkSizes,
|
||||
threadGroupCount,
|
||||
idToffset,
|
||||
interfaceDescriptorIndex,
|
||||
pDevice->getPreemptionMode(),
|
||||
@@ -578,6 +587,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenSendingIndirectStateThenBi
|
||||
ASSERT_NE(nullptr, kernel);
|
||||
|
||||
const size_t localWorkSizes[3]{256, 1, 1};
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
|
||||
auto &commandStream = cmdQ.getCS(1024);
|
||||
auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
|
||||
@@ -613,6 +623,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenSendingIndirectStateThenBi
|
||||
kernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
||||
kernel->getKernelInfo().getMaxSimdSize(),
|
||||
localWorkSizes,
|
||||
threadGroupCount,
|
||||
0,
|
||||
interfaceDescriptorIndex,
|
||||
pDevice->getPreemptionMode(),
|
||||
@@ -701,6 +712,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
|
||||
EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates());
|
||||
|
||||
const size_t localWorkSizes[3]{256, 1, 1};
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
|
||||
dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));
|
||||
|
||||
@@ -722,6 +734,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, WhenGettingBindingTableStateTh
|
||||
pKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
||||
pKernel->getKernelInfo().getMaxSimdSize(),
|
||||
localWorkSizes,
|
||||
threadGroupCount,
|
||||
0,
|
||||
interfaceDescriptorIndex,
|
||||
pDevice->getPreemptionMode(),
|
||||
@@ -847,6 +860,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
|
||||
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::SURFACE_STATE, 8192);
|
||||
const size_t localWorkSize = 256;
|
||||
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
uint32_t interfaceDescriptorIndex = 0;
|
||||
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
|
||||
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel);
|
||||
@@ -863,6 +877,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
|
||||
mockKernelWithInternal->mockKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
||||
mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
|
||||
localWorkSizes,
|
||||
threadGroupCount,
|
||||
0,
|
||||
interfaceDescriptorIndex,
|
||||
pDevice->getPreemptionMode(),
|
||||
@@ -887,6 +902,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithInvalidSamplerS
|
||||
mockKernelWithInternal->mockKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
||||
mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(),
|
||||
localWorkSizes,
|
||||
threadGroupCount,
|
||||
0,
|
||||
interfaceDescriptorIndex,
|
||||
pDevice->getPreemptionMode(),
|
||||
@@ -909,6 +925,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
|
||||
|
||||
CommandQueueHw<FamilyType> cmdQ(nullptr, pClDevice, 0, false);
|
||||
const size_t localWorkSizes[3]{1, 1, 1};
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
|
||||
auto &commandStream = cmdQ.getCS(1024);
|
||||
auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
|
||||
@@ -957,6 +974,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
|
||||
mockKernelWithInternal->mockKernel->getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
||||
8,
|
||||
localWorkSizes,
|
||||
threadGroupCount,
|
||||
interfaceDescriptorTableOffset,
|
||||
interfaceDescriptorIndex,
|
||||
pDevice->getPreemptionMode(),
|
||||
|
||||
@@ -67,6 +67,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, KernelSLMAndBarrierTest, GivenStaticSlmSizeWhenProgr
|
||||
// After creating Mock Kernel now create Indirect Heap
|
||||
auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);
|
||||
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
uint64_t interfaceDescriptorOffset = indirectHeap.getUsed();
|
||||
|
||||
size_t offsetInterfaceDescriptorData = HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
|
||||
@@ -78,6 +79,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, KernelSLMAndBarrierTest, GivenStaticSlmSizeWhenProgr
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
threadGroupCount,
|
||||
1,
|
||||
kernel,
|
||||
4u,
|
||||
@@ -154,6 +156,7 @@ HWTEST_F(KernelSLMAndBarrierTest, GivenInterfaceDescriptorProgrammedWhenOverride
|
||||
CommandQueueHw<FamilyType> cmdQ(nullptr, pClDevice, 0, false);
|
||||
auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);
|
||||
|
||||
const uint32_t threadGroupCount = 1u;
|
||||
uint64_t interfaceDescriptorOffset = indirectHeap.getUsed();
|
||||
INTERFACE_DESCRIPTOR_DATA interfaceDescriptorData;
|
||||
|
||||
@@ -166,6 +169,7 @@ HWTEST_F(KernelSLMAndBarrierTest, GivenInterfaceDescriptorProgrammedWhenOverride
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
threadGroupCount,
|
||||
1,
|
||||
kernel,
|
||||
4u,
|
||||
|
||||
Reference in New Issue
Block a user