diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index ff88b8e0ce..62a3dcc63c 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -196,8 +196,8 @@ size_t HardwareCommandsHelper::sendInterfaceDescriptorData( defaultPipelinedThreadArbitrationPolicy = NEO::debugManager.flags.OverrideThreadArbitrationPolicy.get(); } EncodeDispatchKernel::encodeEuSchedulingPolicy(&interfaceDescriptor, kernelDescriptor, defaultPipelinedThreadArbitrationPolicy); - - EncodeDispatchKernel::adjustInterfaceDescriptorData(interfaceDescriptor, device, hardwareInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, *walkerCmd); + const uint32_t threadGroupDimensions[] = {walkerCmd->getThreadGroupIdXDimension(), walkerCmd->getThreadGroupIdYDimension(), walkerCmd->getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(interfaceDescriptor, device, hardwareInfo, threadGroupDimensions, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, threadsPerThreadGroup, *walkerCmd); *pInterfaceDescriptor = interfaceDescriptor; return (size_t)offsetInterfaceDescriptor; diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index c8ea787f60..182435de24 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -172,10 +172,9 @@ struct EncodeDispatchKernel { static void programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo); template - static void adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd); - - template - static void
adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd); + static void encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, + const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, + WalkerType &walkerCmd); static void adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount); diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index a15b642293..15e7883726 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -761,10 +761,6 @@ void EncodeDispatchKernel::adjustBindingTablePrefetch(INTERFACE_DESCRIPT } } -template -template -void EncodeDispatchKernel::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) {} - template size_t EncodeDispatchKernel::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t iddCount) { using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA; @@ -791,119 +787,6 @@ size_t EncodeDispatchKernel::getSizeRequiredDsh(const KernelDescriptor & return size; } -template -template -void EncodeDispatchKernel::adjustInterfaceDescriptorDataForOverdispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) { - const auto &productHelper = device.getProductHelper(); - - if (productHelper.isDisableOverdispatchAvailable(hwInfo)) { - 
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); - - bool adjustTGDispatchSize = true; - if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) { - adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get(); - } - // apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount - auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1; - if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) { - algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get(); - } - - if (algorithmVersion == 2) { - auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported; - if (grfCount == 256) { - threadsPerXeCore /= 2; - } - auto tgDispatchSizeSelected = 8; - uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup(); - - if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) { - while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) { - tgDispatchSizeSelected /= 2; - } - } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) { - while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) { - tgDispatchSizeSelected /= 2; - } - } - - auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); - auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? 
device.getNumSubDevices() : 1u; - - // make sure we fit all xe core - while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) { - tgDispatchSizeSelected /= 2; - } - - auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup; - // make sure we do not use more threads then present on each xe core - while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) { - tgDispatchSizeSelected /= 2; - threadCountPerGrouping /= 2; - } - - if (tgDispatchSizeSelected == 8) { - interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8); - } else if (tgDispatchSizeSelected == 1) { - interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); - } else if (tgDispatchSizeSelected == 2) { - interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2); - } else { - interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4); - } - } else { - if (adjustTGDispatchSize) { - UNRECOVERABLE_IF(grfCount == 0u); - constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u; - constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u; - auto &gfxCoreHelper = device.getGfxCoreHelper(); - uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, grfCount); - if (ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true)) { - const uint32_t tilesCount = device.getNumSubDevices(); - availableThreadCount *= tilesCount; - } - uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup(); - uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount; - UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u); - auto tgDispatchSizeSelected = 1u; - - if (dispatchedTotalThreadCount 
<= availableThreadCount) { - tgDispatchSizeSelected = 1; - } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) { - tgDispatchSizeSelected = 8; - } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) { - tgDispatchSizeSelected = 4; - } else { - tgDispatchSizeSelected = 2; - } - if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) { - while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) { - tgDispatchSizeSelected /= 2; - } - } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) { - while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) { - tgDispatchSizeSelected /= 2; - } - } - if (tgDispatchSizeSelected == 8) { - interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8); - } else if (tgDispatchSizeSelected == 1) { - interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); - } else if (tgDispatchSizeSelected == 2) { - interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2); - } else { - interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4); - } - } - } - } - - if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) { - interfaceDescriptor.setThreadGroupDispatchSize(static_cast( - debugManager.flags.ForceThreadGroupDispatchSize.get())); - } -} - template size_t EncodeDispatchKernel::getSizeRequiredSsh(const KernelInfo &kernelInfo) { size_t requiredSshSize = kernelInfo.heapInfo.surfaceStateHeapSize; diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index f783de29b6..0c1796296d 100644 --- 
a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -67,12 +67,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis LinearStream *listCmdBufferStream = container.getCommandStream(); - auto threadDims = static_cast(args.threadGroupDimensions); - const Vec3 threadStartVec{0, 0, 0}; - Vec3 threadDimsVec{0, 0, 0}; - if (!args.isIndirect) { - threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]}; - } + auto threadGroupDims = static_cast(args.threadGroupDimensions); DefaultWalkerType cmd = Family::cmdInitGpgpuWalker; auto idd = Family::cmdInitInterfaceDescriptorData; @@ -267,11 +262,11 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis EncodeDispatchKernel::encodeThreadData(cmd, nullptr, - threadDims, + threadGroupDims, args.dispatchInterface->getGroupSize(), kernelDescriptor.kernelAttributes.simdSize, kernelDescriptor.kernelAttributes.numLocalIdChannels, - args.dispatchInterface->getNumThreadsPerThreadGroup(), + numThreadsPerThreadGroup, args.dispatchInterface->getThreadExecutionMask(), true, false, @@ -282,7 +277,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis cmd.setPredicateEnable(args.isPredicate); auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension(); - EncodeDispatchKernel::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, cmd); + EncodeDispatchKernel::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd); memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd)); @@ -635,4 +630,11 @@ template void EncodeDispatchKernel::overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor) { } +template +template +void 
EncodeDispatchKernel::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, + const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, + WalkerType &walkerCmd) { +} + } // namespace NEO diff --git a/shared/source/command_container/command_encoder_enablers.inl b/shared/source/command_container/command_encoder_enablers.inl index 31d2e5686a..b0df4c5baa 100644 --- a/shared/source/command_container/command_encoder_enablers.inl +++ b/shared/source/command_container/command_encoder_enablers.inl @@ -16,7 +16,7 @@ template void NEO::EncodeDispatchKernel::setupPostSyncForRegularEvent::setupPostSyncForInOrderExec(Family::DefaultWalkerType &walkerCmd, const EncodeDispatchKernelArgs &args); template void NEO::EncodeDispatchKernel::setGrfInfo(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t grfCount, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const RootDeviceEnvironment &rootDeviceEnvironment); template void NEO::EncodeDispatchKernel::setupPreferredSlmSize(Family::INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy); -template void NEO::EncodeDispatchKernel::adjustInterfaceDescriptorData(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, Family::DefaultWalkerType &walkerCmd); +template void NEO::EncodeDispatchKernel::encodeThreadGroupDispatch(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, Family::DefaultWalkerType &walkerCmd); template void 
NEO::EncodeDispatchKernel::setupPostSyncMocs(Family::DefaultWalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush); template void NEO::EncodeDispatchKernel::encode(CommandContainer &container, EncodeDispatchKernelArgs &args); template void NEO::EncodeDispatchKernel::encodeThreadData(Family::DefaultWalkerType &walkerCmd, const uint32_t *startWorkGroup, const uint32_t *numWorkGroups, const uint32_t *workGroupSizes, uint32_t simd, uint32_t localIdDimensions, uint32_t threadsPerThreadGroup, uint32_t threadExecutionMask, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, bool isIndirect, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment); diff --git a/shared/source/command_container/command_encoder_xe_hpc_core_and_later.inl b/shared/source/command_container/command_encoder_xe_hpc_core_and_later.inl index 055879f5ed..f32b31b5bf 100644 --- a/shared/source/command_container/command_encoder_xe_hpc_core_and_later.inl +++ b/shared/source/command_container/command_encoder_xe_hpc_core_and_later.inl @@ -33,10 +33,4 @@ void EncodeBatchBufferStartOrEnd::appendBatchBufferStart(MI_BATCH_BUFFER cmd.setPredicationEnable(predicate); } -template <> -template -void EncodeDispatchKernel::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) { - EncodeDispatchKernel::adjustInterfaceDescriptorDataForOverdispatch(interfaceDescriptor, device, hwInfo, threadGroupCount, grfCount, walkerCmd); -} - } // namespace NEO \ No newline at end of file diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 56892e7171..073068d219 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ 
b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -67,12 +67,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis LinearStream *listCmdBufferStream = container.getCommandStream(); - auto threadDims = static_cast(args.threadGroupDimensions); - const Vec3 threadStartVec{0, 0, 0}; - Vec3 threadDimsVec{0, 0, 0}; - if (!args.isIndirect) { - threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]}; - } + auto threadGroupDims = static_cast(args.threadGroupDimensions); if (!args.makeCommandView) { bool systolicModeRequired = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode; @@ -354,7 +349,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis EncodeDispatchKernel::encodeThreadData(walkerCmd, nullptr, - threadDims, + threadGroupDims, args.dispatchInterface->getGroupSize(), kernelDescriptor.kernelAttributes.simdSize, kernelDescriptor.kernelAttributes.numLocalIdChannels, @@ -383,7 +378,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis walkerCmd.setPredicateEnable(args.isPredicate); auto threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); - EncodeDispatchKernel::adjustInterfaceDescriptorData(idd, *args.device, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, walkerCmd); + EncodeDispatchKernel::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, threadsPerThreadGroup, walkerCmd); if (debugManager.flags.PrintKernelDispatchParameters.get()) { fprintf(stdout, "kernel, %s, grfCount, %d, simdSize, %d, tilesCount, %d, implicitScaling, %s, threadGroupCount, %d, numberOfThreadsInGpgpuThreadGroup, %d, threadGroupDimensions, %d, %d, %d, threadGroupDispatchSize enum, %d\n", kernelDescriptor.kernelMetadata.kernelName.c_str(), @@ -1121,4 +1116,117 @@ template void 
EncodeDispatchKernel::overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor) { } +template +template +void EncodeDispatchKernel::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, + const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, WalkerType &walkerCmd) { + const auto &productHelper = device.getProductHelper(); + + if (productHelper.isDisableOverdispatchAvailable(hwInfo)) { + interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); + + bool adjustTGDispatchSize = true; + if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) { + adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get(); + } + // apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount + auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1; + if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) { + algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get(); + } + + auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? 
device.getNumSubDevices() : 1u; + + if (algorithmVersion == 2) { + auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported; + if (grfCount == 256) { + threadsPerXeCore /= 2; + } + auto tgDispatchSizeSelected = 8; + uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup(); + + if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) { + while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) { + tgDispatchSizeSelected /= 2; + } + } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) { + while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) { + tgDispatchSizeSelected /= 2; + } + } + + auto workgroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + + // make sure we fit all xe core + while (workgroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) { + tgDispatchSizeSelected /= 2; + } + + auto threadCountPerGrouping = tgDispatchSizeSelected * numberOfThreadsInThreadGroup; + // make sure we do not use more threads then present on each xe core + while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) { + tgDispatchSizeSelected /= 2; + threadCountPerGrouping /= 2; + } + + if (tgDispatchSizeSelected == 8) { + interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8); + } else if (tgDispatchSizeSelected == 1) { + interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); + } else if (tgDispatchSizeSelected == 2) { + interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2); + } else { + 
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4); + } + } else { + if (adjustTGDispatchSize) { + UNRECOVERABLE_IF(grfCount == 0u); + constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u; + constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u; + auto &gfxCoreHelper = device.getGfxCoreHelper(); + uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, grfCount); + availableThreadCount *= tileCount; + + uint32_t numberOfThreadsInThreadGroup = interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup(); + uint32_t dispatchedTotalThreadCount = numberOfThreadsInThreadGroup * threadGroupCount; + UNRECOVERABLE_IF(numberOfThreadsInThreadGroup == 0u); + auto tgDispatchSizeSelected = 1u; + + if (dispatchedTotalThreadCount <= availableThreadCount) { + tgDispatchSizeSelected = 1; + } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize8) { + tgDispatchSizeSelected = 8; + } else if (numberOfThreadsInThreadGroup <= maxThreadsInTGForTGDispatchSize4) { + tgDispatchSizeSelected = 4; + } else { + tgDispatchSizeSelected = 2; + } + if (walkerCmd.getThreadGroupIdXDimension() > 1 && (walkerCmd.getThreadGroupIdYDimension() > 1 || walkerCmd.getThreadGroupIdZDimension() > 1)) { + while (walkerCmd.getThreadGroupIdXDimension() % tgDispatchSizeSelected != 0) { + tgDispatchSizeSelected /= 2; + } + } else if (walkerCmd.getThreadGroupIdYDimension() > 1 && walkerCmd.getThreadGroupIdZDimension() > 1) { + while (walkerCmd.getThreadGroupIdYDimension() % tgDispatchSizeSelected != 0) { + tgDispatchSizeSelected /= 2; + } + } + if (tgDispatchSizeSelected == 8) { + interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8); + } else if (tgDispatchSizeSelected == 1) { + interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1); + } else if (tgDispatchSizeSelected == 2) { + 
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2); + } else { + interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4); + } + } + } + } + + if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) { + interfaceDescriptor.setThreadGroupDispatchSize(static_cast( + debugManager.flags.ForceThreadGroupDispatchSize.get())); + } +} + } // namespace NEO diff --git a/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp b/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp index cbc91b62f3..fc9ec22531 100644 --- a/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp +++ b/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp @@ -29,7 +29,8 @@ namespace NEO { template <> template -void EncodeDispatchKernel::adjustInterfaceDescriptorData(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, const uint32_t threadGroupCount, const uint32_t grfCount, WalkerType &walkerCmd) { +void EncodeDispatchKernel::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo, + const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, WalkerType &walkerCmd) { const auto &productHelper = device.getProductHelper(); if (productHelper.isDisableOverdispatchAvailable(hwInfo)) { if (interfaceDescriptor.getNumberOfThreadsInGpgpuThreadGroup() == 1) { diff --git a/shared/test/unit_test/encoders/command_encoder_tests_dg2.cpp b/shared/test/unit_test/encoders/command_encoder_tests_dg2.cpp index 3bfd7b94fc..33e2dc161c 100644 --- a/shared/test/unit_test/encoders/command_encoder_tests_dg2.cpp +++ b/shared/test/unit_test/encoders/command_encoder_tests_dg2.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * 
SPDX-License-Identifier: MIT * @@ -70,6 +70,7 @@ HWTEST2_F(DG2CommandEncoderTest, givenInterfaceDescriptorDataWhenForceThreadGrou using DefaultWalkerType = typename FamilyType::DefaultWalkerType; INTERFACE_DESCRIPTOR_DATA iddArg; DefaultWalkerType walkerCmd{}; + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; iddArg = FamilyType::cmdInitInterfaceDescriptorData; const uint32_t forceThreadGroupDispatchSize = -1; auto hwInfo = pDevice->getHardwareInfo(); @@ -84,7 +85,7 @@ HWTEST2_F(DG2CommandEncoderTest, givenInterfaceDescriptorDataWhenForceThreadGrou for (auto numberOfThreadsInGroup : {1u, 4u, 16u}) { iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInGroup); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, 0, 0, walkerCmd); + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, 0, 0, numberOfThreadsInGroup, walkerCmd); if (productHelper.isDisableOverdispatchAvailable(hwInfo)) { if (numberOfThreadsInGroup == 1) { diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_pvc_and_later.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_pvc_and_later.cpp index e8ef9e98f5..e28539b5fe 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_pvc_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_pvc_and_later.cpp @@ -132,7 +132,7 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDebugVariableWhenPostSyncIsPr EXPECT_FALSE(postSyncData.getDataportSubsliceCacheFlush()); } -HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDispatchSizeSmallerOrEqualToAvailableThreadCountWhenAdjustInterfaceDescriptorDataIsCalledThenThreadGroupDispatchSizeIsCorrectlySet, IsAtLeastXeHpcCore) { +HWTEST2_F(CommandEncodeStatesTestPvcAndLater, 
givenDispatchSizeSmallerOrEqualToAvailableThreadCountWhenEncodeThreadGroupDispatchIsCalledThenThreadGroupDispatchSizeIsCorrectlySet, IsAtLeastXeHpcCore) { using DefaultWalkerType = typename FamilyType::DefaultWalkerType; using InterfaceDescriptorType = typename DefaultWalkerType::InterfaceDescriptorType; DefaultWalkerType walkerCmd{}; @@ -143,14 +143,15 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDispatchSizeSmallerOrEqualToA InterfaceDescriptorType iddArg = FamilyType::template getInitInterfaceDescriptor(); iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; for (const auto threadGroupCount : {1u, 2u}) { - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, 1u, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); } } -HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenMultipleTilesAndImplicitScalingWhenAdjustInterfaceDescriptorDataIsCalledThenThreadGroupDispatchSizeIsCorrectlySet, IsAtLeastXeHpcCore) { +HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenMultipleTilesAndImplicitScalingWhenEncodeThreadGroupDispatchIsCalledThenThreadGroupDispatchSizeIsCorrectlySet, IsAtLeastXeHpcCore) { using DefaultWalkerType = typename FamilyType::DefaultWalkerType; using InterfaceDescriptorType = typename DefaultWalkerType::InterfaceDescriptorType; DefaultWalkerType walkerCmd{}; @@ -163,18 +164,20 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenMultipleTilesAndImplicitScali const uint32_t numGrf = GrfConfig::defaultGrfNumber; auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); const uint32_t threadGroupCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf) / 32u; 
- iddArg.setNumberOfThreadsInGpgpuThreadGroup(64u); + uint32_t threadsPerThreadGroup = 64u; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); ASSERT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); debugManager.flags.EnableWalkerPartition.set(1); pDevice->numSubDevices = 2; - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); } -HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet, IsAtLeastXeHpcCore) { +HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupWhenCallingEncodeThreadGroupDispatchThenThreadGroupDispatchSizeIsCorrectlySet, IsAtLeastXeHpcCore) { DebugManagerStateRestore restorer; debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.set(1u); using DefaultWalkerType = typename FamilyType::DefaultWalkerType; @@ -188,17 +191,17 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupW std::array, 3> testParams = {{{16u, InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8}, {32u, InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4}, {64u, 
InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2}}}; - + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; for (const auto &[numberOfThreadsInThreadGroup, expectedThreadGroupDispatchSize] : testParams) { iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); EXPECT_EQ(expectedThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize()); } } -HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupAndDimensionsWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet, IsAtLeastXeHpcCore) { +HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupAndDimensionsWhenCallingEncodeThreadGroupDispatchThenThreadGroupDispatchSizeIsCorrectlySet, IsAtLeastXeHpcCore) { DebugManagerStateRestore restorer; debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.set(1u); using DefaultWalkerType = typename FamilyType::DefaultWalkerType; @@ -210,105 +213,123 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupA const uint32_t threadGroupCount = 512u; const uint32_t numGrf = GrfConfig::defaultGrfNumber; - iddArg.setNumberOfThreadsInGpgpuThreadGroup(16u); - + uint32_t threadsPerThreadGroup = 16; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup); { - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, 
*pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } walkerCmd.setThreadGroupIdYDimension(2); walkerCmd.setThreadGroupIdZDimension(1); { walkerCmd.setThreadGroupIdXDimension(4); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize()); } { walkerCmd.setThreadGroupIdXDimension(2); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); } { walkerCmd.setThreadGroupIdXDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, 
iddArg.getThreadGroupDispatchSize()); } walkerCmd.setThreadGroupIdYDimension(1); walkerCmd.setThreadGroupIdZDimension(2); { walkerCmd.setThreadGroupIdXDimension(4); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize()); } { walkerCmd.setThreadGroupIdXDimension(2); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); } { walkerCmd.setThreadGroupIdXDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } walkerCmd.setThreadGroupIdYDimension(1); walkerCmd.setThreadGroupIdZDimension(1); { walkerCmd.setThreadGroupIdXDimension(4); - 
EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } { walkerCmd.setThreadGroupIdXDimension(2); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } { walkerCmd.setThreadGroupIdXDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } walkerCmd.setThreadGroupIdXDimension(1); walkerCmd.setThreadGroupIdZDimension(2); { walkerCmd.setThreadGroupIdYDimension(4); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), 
walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize()); } { walkerCmd.setThreadGroupIdYDimension(2); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); } { walkerCmd.setThreadGroupIdYDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, threadsPerThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); } } -HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDifferentNumGrfWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsCorrectlySet, IsAtLeastXeHpcCore) { +HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDifferentNumGrfWhenCallingEncodeThreadGroupDispatchThenThreadGroupDispatchSizeIsCorrectlySet, IsAtLeastXeHpcCore) { using DefaultWalkerType = typename FamilyType::DefaultWalkerType; using InterfaceDescriptorType = typename DefaultWalkerType::InterfaceDescriptorType; 
DefaultWalkerType walkerCmd{}; auto hwInfo = pDevice->getHardwareInfo(); InterfaceDescriptorType iddArg = FamilyType::template getInitInterfaceDescriptor(); - auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); const uint32_t numberOfThreadsInThreadGroup = 1u; + walkerCmd.setThreadGroupIdXDimension(1); + walkerCmd.setThreadGroupIdYDimension(1); + walkerCmd.setThreadGroupIdZDimension(1); + { const uint32_t numGrf = GrfConfig::defaultGrfNumber; - const uint32_t threadGroupCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf); + const uint32_t threadGroupCount = 1; iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); ASSERT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); } { const uint32_t numGrf = GrfConfig::largeGrfNumber; - const uint32_t threadGroupCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf); + const uint32_t threadGroupCount = 1; iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); } 
} @@ -324,131 +345,203 @@ HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenVariousDispatchParamtersWhenA mutableHwInfo->gtSystemInfo.ThreadCount = 4096u; auto hwInfo = pDevice->getHardwareInfo(); - auto &gfxCoreHelper = pDevice->getGfxCoreHelper(); uint32_t numGrf = GrfConfig::defaultGrfNumber; - const uint32_t threadGroupCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, numGrf); InterfaceDescriptorType iddArg = FamilyType::template getInitInterfaceDescriptor(); - iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u); + uint32_t numberOfThreadsInThreadGroup = 1u; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(1); walkerCmd.setThreadGroupIdYDimension(1); walkerCmd.setThreadGroupIdZDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u); + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(256); walkerCmd.setThreadGroupIdYDimension(1); walkerCmd.setThreadGroupIdZDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, 
hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(64u); + numberOfThreadsInThreadGroup = 64; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(64); walkerCmd.setThreadGroupIdYDimension(1); walkerCmd.setThreadGroupIdZDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u); + numberOfThreadsInThreadGroup = 1; + 
iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(512); walkerCmd.setThreadGroupIdYDimension(1); walkerCmd.setThreadGroupIdZDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(32u); + numberOfThreadsInThreadGroup = 32; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(512); walkerCmd.setThreadGroupIdYDimension(1); walkerCmd.setThreadGroupIdZDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, 
hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(8u); + numberOfThreadsInThreadGroup = 8; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(512); walkerCmd.setThreadGroupIdYDimension(1); walkerCmd.setThreadGroupIdZDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(32u); + numberOfThreadsInThreadGroup = 32; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(510); walkerCmd.setThreadGroupIdYDimension(512); walkerCmd.setThreadGroupIdZDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * 
walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(32u); + numberOfThreadsInThreadGroup = 32; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(509); walkerCmd.setThreadGroupIdYDimension(512); walkerCmd.setThreadGroupIdZDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(32u); + numberOfThreadsInThreadGroup = 32; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(508); walkerCmd.setThreadGroupIdYDimension(512); walkerCmd.setThreadGroupIdZDimension(1); - 
EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(16u); + numberOfThreadsInThreadGroup = 16; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(508); walkerCmd.setThreadGroupIdYDimension(512); walkerCmd.setThreadGroupIdZDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize()); + } - 
iddArg.setNumberOfThreadsInGpgpuThreadGroup(16u); + numberOfThreadsInThreadGroup = 16; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::largeGrfNumber; walkerCmd.setThreadGroupIdXDimension(508); walkerCmd.setThreadGroupIdYDimension(512); walkerCmd.setThreadGroupIdZDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(32u); + numberOfThreadsInThreadGroup = 32; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(1); walkerCmd.setThreadGroupIdYDimension(510); walkerCmd.setThreadGroupIdZDimension(512); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), 
walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(32u); + numberOfThreadsInThreadGroup = 32; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(1); walkerCmd.setThreadGroupIdYDimension(509); walkerCmd.setThreadGroupIdZDimension(512); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(16u); + numberOfThreadsInThreadGroup = 16; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(1); walkerCmd.setThreadGroupIdYDimension(508); walkerCmd.setThreadGroupIdZDimension(512); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, 
iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4, iddArg.getThreadGroupDispatchSize()); + } - iddArg.setNumberOfThreadsInGpgpuThreadGroup(32u); + numberOfThreadsInThreadGroup = 32; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; walkerCmd.setThreadGroupIdXDimension(1); walkerCmd.setThreadGroupIdYDimension(508); walkerCmd.setThreadGroupIdZDimension(512); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); - EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + { + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); + EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2, iddArg.getThreadGroupDispatchSize()); + } } HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDualSubSliceCountNotEqualToMaxSubsliceCounteWhenTgDispatchSizeIsSelectedThenAlgorithmV1IsUsed, IsAtLeastXeHpcCore) { @@ -465,51 +558,61 @@ 
HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenDualSubSliceCountNotEqualToMa InterfaceDescriptorType iddArg = FamilyType::template getInitInterfaceDescriptor(); - iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u); + uint32_t numberOfThreadsInThreadGroup = 1; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); numGrf = GrfConfig::defaultGrfNumber; - walkerCmd.setThreadGroupIdXDimension(256); + const uint32_t threadGroupCount = 256; + walkerCmd.setThreadGroupIdXDimension(threadGroupCount); walkerCmd.setThreadGroupIdYDimension(1); walkerCmd.setThreadGroupIdZDimension(1); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, 256u, numGrf, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); } -HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupAndDebugFlagDisabledWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsDefault, IsAtLeastXeHpcCore) { +HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenNumberOfThreadsInThreadGroupAndDebugFlagDisabledWhenCallingEncodeThreadGroupDispatchThenThreadGroupDispatchSizeIsDefault, IsAtLeastXeHpcCore) { using DefaultWalkerType = typename FamilyType::DefaultWalkerType; using InterfaceDescriptorType = typename DefaultWalkerType::InterfaceDescriptorType; DefaultWalkerType walkerCmd{}; + walkerCmd.setThreadGroupIdXDimension(1); + walkerCmd.setThreadGroupIdYDimension(1); + walkerCmd.setThreadGroupIdZDimension(1); DebugManagerStateRestore restorer; debugManager.flags.AdjustThreadGroupDispatchSize.set(0); auto hwInfo = pDevice->getHardwareInfo(); InterfaceDescriptorType 
iddArg = FamilyType::template getInitInterfaceDescriptor(); - const uint32_t threadGroupCount = 512u; + const uint32_t threadGroupCount = 1u; const uint32_t numGrf = GrfConfig::defaultGrfNumber; std::array, 3> testParams = {{{16u, InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1}, {32u, InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1}, {64u, InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1}}}; - + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; for (const auto &[numberOfThreadsInThreadGroup, expectedThreadGroupDispatchSize] : testParams) { iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); EXPECT_EQ(expectedThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize()); } } -HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenThreadGroupCountZeroWhenCallingAdjustInterfaceDescriptorDataThenThreadGroupDispatchSizeIsSetToDefault, IsAtLeastXeHpcCore) { +HWTEST2_F(CommandEncodeStatesTestPvcAndLater, givenThreadGroupCountZeroWhenCallingEncodeThreadGroupDispatchThenThreadGroupDispatchSizeIsSetToDefault, IsAtLeastXeHpcCore) { using DefaultWalkerType = typename FamilyType::DefaultWalkerType; using InterfaceDescriptorType = typename DefaultWalkerType::InterfaceDescriptorType; DefaultWalkerType walkerCmd{}; + walkerCmd.setThreadGroupIdXDimension(1); + walkerCmd.setThreadGroupIdYDimension(1); + walkerCmd.setThreadGroupIdZDimension(1); auto hwInfo = pDevice->getHardwareInfo(); - const uint32_t threadGroupCount = 0u; + const uint32_t threadGroupCount = 1u; const uint32_t numGrf = GrfConfig::defaultGrfNumber; InterfaceDescriptorType iddArg = 
FamilyType::template getInitInterfaceDescriptor(); - iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u); - - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, threadGroupCount, numGrf, walkerCmd); + uint32_t numberOfThreadsInThreadGroup = 1; + iddArg.setNumberOfThreadsInGpgpuThreadGroup(numberOfThreadsInThreadGroup); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, numGrf, numberOfThreadsInThreadGroup, walkerCmd); EXPECT_EQ(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); } diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp index 41dd1475b2..b33c338a7a 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp @@ -689,6 +689,9 @@ HWTEST2_F(CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGr using DefaultWalkerType = typename FamilyType::DefaultWalkerType; INTERFACE_DESCRIPTOR_DATA iddArg; DefaultWalkerType walkerCmd{}; + walkerCmd.setThreadGroupIdXDimension(1); + walkerCmd.setThreadGroupIdYDimension(1); + walkerCmd.setThreadGroupIdZDimension(1); iddArg = FamilyType::cmdInitInterfaceDescriptorData; const uint32_t forceThreadGroupDispatchSize = -1; auto hwInfo = pDevice->getHardwareInfo(); @@ -696,11 +699,13 @@ HWTEST2_F(CommandEncodeStatesTest, givenInterfaceDescriptorDataWhenForceThreadGr DebugManagerStateRestore restorer; debugManager.flags.ForceThreadGroupDispatchSize.set(forceThreadGroupDispatchSize); - + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), 
walkerCmd.getThreadGroupIdZDimension()}; + const uint32_t threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension(); uint32_t revisions[] = {REVISION_A0, REVISION_B}; + uint32_t threadsPerThreadGroup = 4; for (auto revision : revisions) { hwInfo.platform.usRevId = productHelper.getHwRevIdFromStepping(revision, hwInfo); - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, hwInfo, 0, 0, walkerCmd); + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, hwInfo, threadGroups, threadGroupCount, 0, threadsPerThreadGroup, walkerCmd); if (productHelper.isDisableOverdispatchAvailable(hwInfo)) { EXPECT_EQ(INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1, iddArg.getThreadGroupDispatchSize()); @@ -715,6 +720,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInterfaceDescriptorDa using DefaultWalkerType = typename FamilyType::DefaultWalkerType; INTERFACE_DESCRIPTOR_DATA iddArg; DefaultWalkerType walkerCmd{}; + walkerCmd.setThreadGroupIdXDimension(1); + walkerCmd.setThreadGroupIdYDimension(1); + walkerCmd.setThreadGroupIdZDimension(1); iddArg = FamilyType::cmdInitInterfaceDescriptorData; iddArg.setNumberOfThreadsInGpgpuThreadGroup(1u); @@ -724,8 +732,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesTest, givenInterfaceDescriptorDa DebugManagerStateRestore restorer; debugManager.flags.ForceThreadGroupDispatchSize.set(forceThreadGroupDispatchSize); - - EncodeDispatchKernel::adjustInterfaceDescriptorData(iddArg, *pDevice, pDevice->getHardwareInfo(), threadGroupCount, 1, walkerCmd); + uint32_t threadGroups[] = {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()}; + EncodeDispatchKernel::encodeThreadGroupDispatch(iddArg, *pDevice, pDevice->getHardwareInfo(), threadGroups, threadGroupCount, 1, 1, walkerCmd); EXPECT_NE(defaultThreadGroupDispatchSize, 
iddArg.getThreadGroupDispatchSize()); EXPECT_EQ(forceThreadGroupDispatchSize, iddArg.getThreadGroupDispatchSize());