Apply heuristics when setting TG dispatch size on XE_HPC_CORE

The default TG dispatch size can be changed
to a better value based on number of threads in TG or
currently available amount of threads on GPU.
Decision on what TG dispatch size should be are based on
implemented heuristics.

Signed-off-by: Rafal Maziejuk <rafal.maziejuk@intel.com>
Related-To: NEO-6989
This commit is contained in:
Rafal Maziejuk
2022-08-03 12:22:30 +00:00
committed by Compute-Runtime-Automation
parent 52133e61ce
commit ed0c36117e
21 changed files with 188 additions and 19 deletions

View File

@@ -65,6 +65,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
auto threadGroupCount = static_cast<uint32_t>(walkerArgs.numberOfWorkgroups->x * walkerArgs.numberOfWorkgroups->y * walkerArgs.numberOfWorkgroups->z);
if (walkerArgs.currentTimestampPacketNodes && commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
auto timestampPacketNode = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
@@ -83,6 +84,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
kernel.getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
simd,
walkerArgs.localWorkSizes,
threadGroupCount,
walkerArgs.offsetInterfaceDescriptorTable,
walkerArgs.interfaceDescriptorIndex,
walkerArgs.preemptionMode,