diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index 9966a7de2d..1df46e642d 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -158,6 +158,9 @@ struct EncodeDispatchKernel { template static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd); + template + static void adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd); + static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount); template diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 715e0b6612..c524f89d35 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -7,6 +7,7 @@ #pragma once #include "shared/source/command_container/command_encoder.h" +#include "shared/source/command_container/implicit_scaling.h" #include "shared/source/command_stream/linear_stream.h" #include "shared/source/debugger/debugger_l0.h" #include "shared/source/device/device.h" @@ -18,6 +19,7 @@ #include "shared/source/helpers/blit_commands_helper.h" #include "shared/source/helpers/definitions/command_encoder_args.h" #include "shared/source/helpers/gfx_core_helper.h" +#include "shared/source/helpers/hw_info.h" #include "shared/source/helpers/local_id_gen.h" #include "shared/source/helpers/preamble.h" #include "shared/source/helpers/register_offsets.h" @@ -755,6 +757,119 @@ size_t EncodeDispatchKernel::getSizeRequiredDsh(const KernelDescriptor & return size; } +template +template +void EncodeDispatchKernel::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) { + const auto &productHelper = device.getProductHelper(); + + if (productHelper.isDisableOverdispatchAvailable(hwInfo)) { + interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); + + bool adjustTGDispatchSize = true; + if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) { + adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get(); + } + // apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount + auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1; + if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) { + algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get(); + } + + if (algorithmVersion == 2) { + auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported; + if (numGrf == 256) { + threadsPerXeCore /= 2; + } + auto tgDispatchSizeSelected = 8; + uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup(); + + if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) { + while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) { + tgDispatchSizeSelected /= 2; + } + } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) { + while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) { + tgDispatchSizeSelected /= 2; + } + } + + auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u; + + // make sure we fit all xe core + while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) { + tgDispatchSizeSelected /= 2; + } + + auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup; + // make sure we do not use more threads then present on each xe core + while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) { + tgDispatchSizeSelected /= 2; + threadCountPerGrouping /= 2; + } + + if (tgDispatchSizeSelected == 8) { + interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8); + } else if (tgDispatchSizeSelected == 1) { + interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); + } else if (tgDispatchSizeSelected == 2) { + interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2); + } else { + interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4); + } + } else { + if (adjustTGDispatchSize) { + UNRECOVERABLE_IF(numGrf == 0u); + constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u; + constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u; + auto &gfxCoreHelper = device.getGfxCoreHelper(); + uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf); + if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) { + const uint32_t tilesCount = device.getNumSubDevices(); + availableThreadCount *= tilesCount; + } + uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup(); + uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount; + UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u); + auto tgDispatchSizeSelected = 1u; + + if (dispatchedTotalThreadCount <= availableThreadCount) { + tgDispatchSizeSelected = 1; + } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) { + tgDispatchSizeSelected = 8; + } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) { + tgDispatchSizeSelected = 4; + } else { + tgDispatchSizeSelected = 2; + } + if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) { + while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) { + tgDispatchSizeSelected /= 2; + } + } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) { + while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) { + tgDispatchSizeSelected /= 2; + } + } + if (tgDispatchSizeSelected == 8) { + interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8); + } else if (tgDispatchSizeSelected == 1) { + interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); + } else if (tgDispatchSizeSelected == 2) { + interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2); + } else { + interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4); + } + } + } + } + + if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) { + interfaceDescriptor.setThreadGroupDispatchSize(static_cast( + debugManager.flags.ForceThreadGroupDispatchSize.get())); + } +} + template size_t EncodeDispatchKernel::getSizeRequiredSsh(const KernelInfo &kernelInfo) { size_t requiredSshSize = kernelInfo.heapInfo.surfaceStateHeapSize; diff --git a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp index 51d8405020..d8bb828989 100644 --- a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,113 +29,7 @@ namespace NEO { template <> template void EncodeDispatchKernel::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t numGrf, WalkerType &walkerCmd) { - const auto &productHelper = device.getProductHelper(); - - if (productHelper.isDisableOverdispatchAvailable(hwInfo)) { - interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); - bool adjustTGDispatchSize = true; - if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) { - adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get(); - } - // apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount - auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1; - if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) { - algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get(); - } - - if (algorithmVersion == 2) { - auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported; - if (numGrf == 256) { - threadsPerXeCore /= 2; - } - auto tgDispatchSizeSelected = 8; - uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup(); - - if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) { - while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) { - tgDispatchSizeSelected /= 2; - } - } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) { - while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) { - tgDispatchSizeSelected /= 2; - } - } - - auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); - auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u; - - // make sure we fit all xe core - while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) { - tgDispatchSizeSelected /= 2; - } - - auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup; - // make sure we do not use more threads then present on each xe core - while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) { - tgDispatchSizeSelected /= 2; - threadCountPerGrouping /= 2; - } - - if (tgDispatchSizeSelected == 8) { - interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8); - } else if (tgDispatchSizeSelected == 1) { - interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); - } else if (tgDispatchSizeSelected == 2) { - interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2); - } else { - interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4); - } - } else { - if (adjustTGDispatchSize) { - UNRECOVERABLE_IF(numGrf == 0u); - constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u; - constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u; - auto &gfxCoreHelper = device.getGfxCoreHelper(); - uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf); - if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) { - const uint32_t tilesCount = device.getNumSubDevices(); - availableThreadCount *= tilesCount; - } - uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup(); - uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount; - UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u); - auto tgDispatchSizeSelected = 1u; - - if (dispatchedTotalThreadCount <= availableThreadCount) { - tgDispatchSizeSelected = 1; - } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) { - tgDispatchSizeSelected = 8; - } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) { - tgDispatchSizeSelected = 4; - } else { - tgDispatchSizeSelected = 2; - } - if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) { - while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) { - tgDispatchSizeSelected /= 2; - } - } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) { - while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) { - tgDispatchSizeSelected /= 2; - } - } - if (tgDispatchSizeSelected == 8) { - interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8); - } else if (tgDispatchSizeSelected == 1) { - interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); - } else if (tgDispatchSizeSelected == 2) { - interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2); - } else { - interfaceDescriptor.setThreadGroupDispatchSize(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4); - } - } - } - } - - if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) { - interfaceDescriptor.setThreadGroupDispatchSize(static_cast( - debugManager.flags.ForceThreadGroupDispatchSize.get())); - } + EncodeDispatchKernel::adjustInterfaceDescriptorDataForOverdispatch(interfaceDescriptor, device, hwInfo, threadGroupCount, numGrf, walkerCmd); } template <> diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp index 66d334d50b..4007c50205 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp @@ -685,7 +685,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInlineDataRequiredAnd EXPECT_EQ(0u, cmd->getLocalZMaximum()); } -HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGroupDispatchSizeVariableIsDefaultThenThreadGroupDispatchSizeIsNotChanged) { +HWTEST2_F(CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGroupDispatchSizeVariableIsDefaultThenThreadGroupDispatchSizeIsNotChanged, IsXeLpg) { using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; using DefaultWalkerType = typename FamilyType::DefaultWalkerType; INTERFACE_DESCRIPTOR_DATA iddArg;