diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 6ced89cdb3..8470ea9e39 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -178,7 +178,7 @@ size_t HardwareCommandsHelper::sendInterfaceDescriptorData( const auto &hardwareInfo = device.getHardwareInfo(); auto &gfxCoreHelper = device.getGfxCoreHelper(); - auto programmableIDSLMSize = static_cast(gfxCoreHelper.computeSlmValues(hardwareInfo, slmTotalSize)); + auto programmableIDSLMSize = EncodeDispatchKernel::computeSlmValues(hardwareInfo, slmTotalSize); if (debugManager.flags.OverrideSlmAllocationSize.get() != -1) { programmableIDSLMSize = static_cast(debugManager.flags.OverrideSlmAllocationSize.get()); diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index 10db2df52b..2d746d2899 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -219,6 +219,11 @@ struct EncodeDispatchKernel { template static void forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd); + + static uint32_t alignSlmSize(uint32_t slmSize); + static uint32_t computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize); + + static bool singleTileExecImplicitScalingRequired(bool cooperativeKernel); }; template diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index 854fecd4da..8c21e834fd 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -63,7 +63,6 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis auto pImplicitArgs = args.dispatchInterface->getImplicitArgs(); auto &hwInfo = args.device->getHardwareInfo(); - auto &gfxCoreHelper = args.device->getGfxCoreHelper(); auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment(); LinearStream *listCmdBufferStream = container.getCommandStream(); @@ -95,8 +94,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis EncodeDispatchKernel::programBarrierEnable(idd, kernelDescriptor.kernelAttributes.barrierCount, hwInfo); - auto slmSize = static_cast( - gfxCoreHelper.computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize())); + auto slmSize = EncodeDispatchKernel::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()); idd.setSharedLocalMemorySize(slmSize); uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries; @@ -643,6 +641,32 @@ template void EncodeDispatchKernel::forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd) { } +template +uint32_t EncodeDispatchKernel::alignSlmSize(uint32_t slmSize) { + if (slmSize == 0u) { + return 0u; + } + slmSize = std::max(slmSize, 1024u); + slmSize = Math::nextPowerOfTwo(slmSize); + UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte); + return slmSize; +} + +template +uint32_t EncodeDispatchKernel::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) { + auto value = std::max(slmSize, 1024u); + value = Math::nextPowerOfTwo(value); + value = Math::getMinLsbSet(value); + value = value - 9; + DEBUG_BREAK_IF(value > 7); + return value * !!slmSize; +} + +template +bool EncodeDispatchKernel::singleTileExecImplicitScalingRequired(bool cooperativeKernel) { + return cooperativeKernel; +} + template size_t EncodeStates::getSshHeapSize() { return 64 * MemoryConstants::kiloByte; diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 63eb0ab880..2ef8c187e0 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -119,9 +119,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis EncodeDispatchKernel::encodeEuSchedulingPolicy(&idd, kernelDescriptor, args.defaultPipelinedThreadArbitrationPolicy); - auto &gfxCoreHelper = args.device->getGfxCoreHelper(); - auto slmSize = static_cast( - gfxCoreHelper.computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize())); + auto slmSize = EncodeDispatchKernel::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()); if (debugManager.flags.OverrideSlmAllocationSize.get() != -1) { slmSize = static_cast(debugManager.flags.OverrideSlmAllocationSize.get()); @@ -262,7 +260,6 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast(heap->getUsed() - sizeThreadData - args.reserveExtraPayloadSpace); - auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment(); if (pImplicitArgs) { offsetThreadData -= sizeForImplicitArgsStruct; pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize; @@ -421,16 +418,16 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress(); ImplicitScalingDispatchCommandArgs implicitScalingArgs{ - workPartitionAllocationGpuVa, // workPartitionAllocationGpuVa - &hwInfo, // hwInfo - &args.outWalkerPtr, // outWalkerPtr - args.requiredPartitionDim, // requiredPartitionDim - args.partitionCount, // partitionCount - !(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer - !args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup - args.dcFlushEnable, // dcFlush - gfxCoreHelper.singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile - args.makeCommandView}; // blockDispatchToCommandBuffer + workPartitionAllocationGpuVa, // workPartitionAllocationGpuVa + &hwInfo, // hwInfo + &args.outWalkerPtr, // outWalkerPtr + args.requiredPartitionDim, // requiredPartitionDim + args.partitionCount, // partitionCount + !(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer + !args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup + args.dcFlushEnable, // dcFlush + EncodeDispatchKernel::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile + args.makeCommandView}; // blockDispatchToCommandBuffer ImplicitScalingDispatch::dispatchCommands(*listCmdBufferStream, walkerCmd, @@ -990,6 +987,82 @@ void EncodeDispatchKernel::forceComputeWalkerPostSyncFlushWithWrite(Walk } } +template +uint32_t EncodeDispatchKernel::alignSlmSize(uint32_t slmSize) { + const uint32_t alignedSlmSizes[] = { + 0u, + 1u * MemoryConstants::kiloByte, + 2u * MemoryConstants::kiloByte, + 4u * MemoryConstants::kiloByte, + 8u * MemoryConstants::kiloByte, + 16u * MemoryConstants::kiloByte, + 24u * MemoryConstants::kiloByte, + 32u * MemoryConstants::kiloByte, + 48u * MemoryConstants::kiloByte, + 64u * MemoryConstants::kiloByte, + 96u * MemoryConstants::kiloByte, + 128u * MemoryConstants::kiloByte, + }; + + for (auto &alignedSlmSize : alignedSlmSizes) { + if (slmSize <= alignedSlmSize) { + return alignedSlmSize; + } + } + + UNRECOVERABLE_IF(true); + return 0; +} + +template +uint32_t EncodeDispatchKernel::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) { + using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE; + auto alignedSlmSize = EncodeDispatchKernel::alignSlmSize(slmSize); + + if (alignedSlmSize == 0u) { + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K; + } + + UNRECOVERABLE_IF(slmSize > 128u * MemoryConstants::kiloByte); + + if (alignedSlmSize > 96u * MemoryConstants::kiloByte) { + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_128K; + } + if (alignedSlmSize > 64u * MemoryConstants::kiloByte) { + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_96K; + } + if (alignedSlmSize > 48u * MemoryConstants::kiloByte) { + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K; + } + if (alignedSlmSize > 32u * MemoryConstants::kiloByte) { + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_48K; + } + if (alignedSlmSize > 24u * MemoryConstants::kiloByte) { + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K; + } + if (alignedSlmSize > 16u * MemoryConstants::kiloByte) { + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_24K; + } + if (alignedSlmSize > 8u * MemoryConstants::kiloByte) { + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K; + } + if (alignedSlmSize > 4u * MemoryConstants::kiloByte) { + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K; + } + if (alignedSlmSize > 2u * MemoryConstants::kiloByte) { + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K; + } + if (alignedSlmSize > 1u * MemoryConstants::kiloByte) { + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K; + } + return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K; +} + +template +bool EncodeDispatchKernel::singleTileExecImplicitScalingRequired(bool cooperativeKernel) { + return cooperativeKernel; +} + template size_t EncodeStates::getSshHeapSize() { return 2 * MemoryConstants::megaByte; diff --git a/shared/source/gen8/command_encoder_gen8.cpp b/shared/source/gen8/command_encoder_gen8.cpp index 9905b6ea0f..e5f1aa8d6d 100644 --- a/shared/source/gen8/command_encoder_gen8.cpp +++ b/shared/source/gen8/command_encoder_gen8.cpp @@ -62,6 +62,27 @@ template <> void EncodeBatchBufferStartOrEnd::appendBatchBufferStart(MI_BATCH_BUFFER_START &cmd, bool indirect, bool predicate) { } +static uint32_t slmSizeId[] = {0, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16}; + +template <> +uint32_t EncodeDispatchKernel::alignSlmSize(uint32_t slmSize) { + if (slmSize == 0u) { + return 0u; + } + slmSize = std::max(slmSize, 4096u); + slmSize = Math::nextPowerOfTwo(slmSize); + return slmSize; +} + +template <> +uint32_t EncodeDispatchKernel::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) { + slmSize += (4 * MemoryConstants::kiloByte - 1); + slmSize = slmSize >> 12; + slmSize = std::min(slmSize, 15u); + slmSize = slmSizeId[slmSize]; + return slmSize; +} + } // namespace NEO #include "shared/source/command_container/command_encoder_enablers.inl" diff --git a/shared/source/gen8/gfx_core_helper_gen8.cpp b/shared/source/gen8/gfx_core_helper_gen8.cpp index be4568d428..9f020bba2d 100644 --- a/shared/source/gen8/gfx_core_helper_gen8.cpp +++ b/shared/source/gen8/gfx_core_helper_gen8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -17,27 +17,6 @@ namespace NEO { typedef Gen8Family Family; -static uint32_t slmSizeId[] = {0, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16}; - -template <> -uint32_t GfxCoreHelperHw::alignSlmSize(uint32_t slmSize) const { - if (slmSize == 0u) { - return 0u; - } - slmSize = std::max(slmSize, 4096u); - slmSize = Math::nextPowerOfTwo(slmSize); - return slmSize; -} - -template <> -uint32_t GfxCoreHelperHw::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const { - slmSize += (4 * MemoryConstants::kiloByte - 1); - slmSize = slmSize >> 12; - slmSize = std::min(slmSize, 15u); - slmSize = slmSizeId[slmSize]; - return slmSize; -} - template <> size_t GfxCoreHelperHw::getMaxBarrierRegisterPerSlice() const { return 16; diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index 4abccb388a..590c52299f 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -402,23 +402,12 @@ uint32_t GfxCoreHelperHw::getMetricsLibraryGenId() const { template uint32_t GfxCoreHelperHw::alignSlmSize(uint32_t slmSize) const { - if (slmSize == 0u) { - return 0u; - } - slmSize = std::max(slmSize, 1024u); - slmSize = Math::nextPowerOfTwo(slmSize); - UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte); - return slmSize; + return EncodeDispatchKernel::alignSlmSize(slmSize); } template uint32_t GfxCoreHelperHw::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const { - auto value = std::max(slmSize, 1024u); - value = Math::nextPowerOfTwo(value); - value = Math::getMinLsbSet(value); - value = value - 9; - DEBUG_BREAK_IF(value > 7); - return value * !!slmSize; + return EncodeDispatchKernel::computeSlmValues(hwInfo, slmSize); } template @@ -756,7 +745,7 @@ bool GfxCoreHelperHw::is48ResourceNeededForCmdBuffer() const { template bool GfxCoreHelperHw::singleTileExecImplicitScalingRequired(bool cooperativeKernel) const { - return cooperativeKernel; + return EncodeDispatchKernel::singleTileExecImplicitScalingRequired(cooperativeKernel); } template diff --git a/shared/source/xe2_hpg_core/gfx_core_helper_xe2_hpg_core.cpp b/shared/source/xe2_hpg_core/gfx_core_helper_xe2_hpg_core.cpp index 334034a84d..81b11120a4 100644 --- a/shared/source/xe2_hpg_core/gfx_core_helper_xe2_hpg_core.cpp +++ b/shared/source/xe2_hpg_core/gfx_core_helper_xe2_hpg_core.cpp @@ -277,77 +277,6 @@ void GfxCoreHelperHw::setExtraAllocationData(AllocationData &allocationD } } -template <> -uint32_t GfxCoreHelperHw::alignSlmSize(uint32_t slmSize) const { - const uint32_t alignedSlmSizes[] = { - 0u, - 1u * MemoryConstants::kiloByte, - 2u * MemoryConstants::kiloByte, - 4u * MemoryConstants::kiloByte, - 8u * MemoryConstants::kiloByte, - 16u * MemoryConstants::kiloByte, - 24u * MemoryConstants::kiloByte, - 32u * MemoryConstants::kiloByte, - 48u * MemoryConstants::kiloByte, - 64u * MemoryConstants::kiloByte, - 96u * MemoryConstants::kiloByte, - 128u * MemoryConstants::kiloByte, - }; - - for (auto &alignedSlmSize : alignedSlmSizes) { - if (slmSize <= alignedSlmSize) { - return alignedSlmSize; - } - } - - UNRECOVERABLE_IF(true); - return 0; -} - -template <> -uint32_t GfxCoreHelperHw::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const { - using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE; - auto alignedSlmSize = alignSlmSize(slmSize); - - if (alignedSlmSize == 0u) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K; - } - - UNRECOVERABLE_IF(slmSize > 128u * MemoryConstants::kiloByte); - - if (alignedSlmSize > 96u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_128K; - } - if (alignedSlmSize > 64u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_96K; - } - if (alignedSlmSize > 48u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K; - } - if (alignedSlmSize > 32u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_48K; - } - if (alignedSlmSize > 24u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K; - } - if (alignedSlmSize > 16u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_24K; - } - if (alignedSlmSize > 8u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K; - } - if (alignedSlmSize > 4u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K; - } - if (alignedSlmSize > 2u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K; - } - if (alignedSlmSize > 1u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K; - } - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K; -} - template <> int32_t GfxCoreHelperHw::getDefaultThreadArbitrationPolicy() const { return ThreadArbitrationPolicy::RoundRobinAfterDependency; diff --git a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp index 9cb7492e4e..01a38f85ec 100644 --- a/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/command_encoder_xe_hpc_core.cpp @@ -193,9 +193,7 @@ void EncodeDispatchKernel::appendAdditionalIDDFields(InterfaceDescriptor const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount; const uint32_t workGroupCountPerDss = static_cast(Math::divideAndRoundUp(threadsPerDssCount, threadsPerThreadGroup)); - auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); - - const uint32_t workgroupSlmSize = gfxCoreHelper.alignSlmSize(slmTotalSize); + const uint32_t workgroupSlmSize = EncodeDispatchKernel::alignSlmSize(slmTotalSize); uint32_t slmSize = 0u; diff --git a/shared/source/xe_hpc_core/gfx_core_helper_xe_hpc_core.cpp b/shared/source/xe_hpc_core/gfx_core_helper_xe_hpc_core.cpp index 6a5f8e6f05..4dd2fa31fc 100644 --- a/shared/source/xe_hpc_core/gfx_core_helper_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/gfx_core_helper_xe_hpc_core.cpp @@ -248,75 +248,6 @@ void GfxCoreHelperHw::setExtraAllocationData(AllocationData &allocationD } } -template <> -uint32_t GfxCoreHelperHw::alignSlmSize(uint32_t slmSize) const { - const uint32_t alignedSlmSizes[] = { - 0u, - 1u * MemoryConstants::kiloByte, - 2u * MemoryConstants::kiloByte, - 4u * MemoryConstants::kiloByte, - 8u * MemoryConstants::kiloByte, - 16u * MemoryConstants::kiloByte, - 24u * MemoryConstants::kiloByte, - 32u * MemoryConstants::kiloByte, - 48u * MemoryConstants::kiloByte, - 64u * MemoryConstants::kiloByte, - 96u * MemoryConstants::kiloByte, - 128u * MemoryConstants::kiloByte, - }; - - for (auto &alignedSlmSize : alignedSlmSizes) { - if (slmSize <= alignedSlmSize) { - return alignedSlmSize; - } - } - - UNRECOVERABLE_IF(true); - return 0; -} - -template <> -uint32_t GfxCoreHelperHw::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const { - using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE; - if (slmSize == 0u) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K; - } - - UNRECOVERABLE_IF(slmSize > 128u * MemoryConstants::kiloByte); - - if (slmSize > 96u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_128K; - } - if (slmSize > 64u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_96K; - } - if (slmSize > 48u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K; - } - if (slmSize > 32u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_48K; - } - if (slmSize > 24u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K; - } - if (slmSize > 16u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_24K; - } - if (slmSize > 8u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K; - } - if (slmSize > 4u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K; - } - if (slmSize > 2u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K; - } - if (slmSize > 1u * MemoryConstants::kiloByte) { - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K; - } - return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K; -} - template <> int32_t GfxCoreHelperHw::getDefaultThreadArbitrationPolicy() const { return ThreadArbitrationPolicy::RoundRobinAfterDependency; diff --git a/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp b/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp index 5330cce939..be7bf0ff02 100644 --- a/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp +++ b/shared/source/xe_hpg_core/command_encoder_xe_hpg_core.cpp @@ -34,8 +34,8 @@ void EncodeDispatchKernel::appendAdditionalIDDFields(InterfaceDescriptor auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo(); const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount; const uint32_t workGroupCountPerDss = threadsPerDssCount / threadsPerThreadGroup; - auto &gfxCoreHelper = rootDeviceEnvironment.getHelper(); - const uint32_t workgroupSlmSize = gfxCoreHelper.alignSlmSize(slmTotalSize); + + const uint32_t workgroupSlmSize = EncodeDispatchKernel::alignSlmSize(slmTotalSize); uint32_t slmSize = 0u; @@ -195,6 +195,30 @@ void EncodeDispatchKernel::adjustWalkOrder(WalkerType &walkerCmd, uint32 } } +template <> +uint32_t EncodeDispatchKernel::alignSlmSize(uint32_t slmSize) { + if (slmSize == 0u) { + return 0u; + } + slmSize = std::max(slmSize, 1024u); + slmSize = Math::nextPowerOfTwo(slmSize); + UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte); + return slmSize; +} + +template <> +uint32_t EncodeDispatchKernel::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) { + using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE; + + auto slmValue = std::max(slmSize, 1024u); + slmValue = Math::nextPowerOfTwo(slmValue); + slmValue = Math::getMinLsbSet(slmValue); + slmValue = slmValue - 9; + DEBUG_BREAK_IF(slmValue > 7); + slmValue *= !!slmSize; + return slmValue; +} + template <> void adjustL3ControlField(void *l3ControlBuffer) { using L3_CONTROL = typename Family::L3_CONTROL; diff --git a/shared/source/xe_hpg_core/gfx_core_helper_xe_hpg_core.cpp b/shared/source/xe_hpg_core/gfx_core_helper_xe_hpg_core.cpp index 79879e772b..c649eef161 100644 --- a/shared/source/xe_hpg_core/gfx_core_helper_xe_hpg_core.cpp +++ b/shared/source/xe_hpg_core/gfx_core_helper_xe_hpg_core.cpp @@ -107,19 +107,6 @@ bool GfxCoreHelperHw::isBufferSizeSuitableForCompression(const size_t si } } -template <> -uint32_t GfxCoreHelperHw::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const { - using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE; - - auto slmValue = std::max(slmSize, 1024u); - slmValue = Math::nextPowerOfTwo(slmValue); - slmValue = Math::getMinLsbSet(slmValue); - slmValue = slmValue - 9; - DEBUG_BREAK_IF(slmValue > 7); - slmValue *= !!slmSize; - return slmValue; -} - template <> bool GfxCoreHelperHw::copyThroughLockedPtrEnabled(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const { if (debugManager.flags.ExperimentalCopyThroughLock.get() != -1) {