mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-24 12:23:05 +08:00
performance: adjust thread group dispatch size
Adjust the thread group dispatch size on PVC if the chosen size does not evenly divide the dimension. This is to avoid leftover thread groups.
Related-To: NEO-7927
Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
4f297cf971
commit
c84c7a0c91
@@ -129,7 +129,7 @@ struct EncodeDispatchKernel {
|
||||
|
||||
static void programBarrierEnable(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
|
||||
|
||||
static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf);
|
||||
static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd);
|
||||
|
||||
static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount);
|
||||
|
||||
|
||||
@@ -723,7 +723,7 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {}
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd) {}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) {
|
||||
|
||||
@@ -260,7 +260,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
}
|
||||
|
||||
auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension();
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired);
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, cmd);
|
||||
|
||||
memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd));
|
||||
|
||||
|
||||
@@ -298,7 +298,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
walkerCmd.setPredicateEnable(args.isPredicate);
|
||||
|
||||
auto threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired);
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, walkerCmd);
|
||||
|
||||
EncodeDispatchKernel<Family>::appendAdditionalIDDFields(&idd, rootDeviceEnvironment, threadsPerThreadGroup,
|
||||
args.dispatchInterface->getSlmTotalSize(),
|
||||
|
||||
@@ -31,7 +31,7 @@ void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd,
|
||||
}
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd) {
|
||||
const auto &productHelper = device.getProductHelper();
|
||||
|
||||
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
@@ -54,15 +54,30 @@ void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCR
|
||||
uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup();
|
||||
uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount;
|
||||
UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u);
|
||||
auto tgDispatchSizeSelected = 1u;
|
||||
|
||||
if (dispatchedTotalThreadCount <= availableThreadCount) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
||||
tgDispatchSizeSelected = 1;
|
||||
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
|
||||
tgDispatchSizeSelected = 8;
|
||||
} else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
|
||||
tgDispatchSizeSelected = 4;
|
||||
} else {
|
||||
tgDispatchSizeSelected = 2;
|
||||
}
|
||||
if (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1) {
|
||||
while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) {
|
||||
tgDispatchSizeSelected /= 2;
|
||||
}
|
||||
}
|
||||
if (tgDispatchSizeSelected == 8) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
|
||||
} else if (tgDispatchSizeSelected == 1) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
|
||||
} else if (tgDispatchSizeSelected == 2) {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
|
||||
} else {
|
||||
interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,7 +87,7 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTO
|
||||
}
|
||||
|
||||
template <>
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf) {
|
||||
void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WALKER_TYPE &walkerCmd) {
|
||||
const auto &productHelper = device.getProductHelper();
|
||||
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
|
||||
if (interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup() == 1) {
|
||||
|
||||
Reference in New Issue
Block a user