performance: replace virtual calls with native class methods
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
parent
b0420fbf40
commit
7ce4a8adc2
|
@ -178,7 +178,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
|
|||
|
||||
const auto &hardwareInfo = device.getHardwareInfo();
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
auto programmableIDSLMSize = static_cast<uint32_t>(gfxCoreHelper.computeSlmValues(hardwareInfo, slmTotalSize));
|
||||
auto programmableIDSLMSize = EncodeDispatchKernel<GfxFamily>::computeSlmValues(hardwareInfo, slmTotalSize);
|
||||
|
||||
if (debugManager.flags.OverrideSlmAllocationSize.get() != -1) {
|
||||
programmableIDSLMSize = static_cast<uint32_t>(debugManager.flags.OverrideSlmAllocationSize.get());
|
||||
|
|
|
@ -219,6 +219,11 @@ struct EncodeDispatchKernel {
|
|||
|
||||
template <typename WalkerType>
|
||||
static void forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd);
|
||||
|
||||
static uint32_t alignSlmSize(uint32_t slmSize);
|
||||
static uint32_t computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize);
|
||||
|
||||
static bool singleTileExecImplicitScalingRequired(bool cooperativeKernel);
|
||||
};
|
||||
|
||||
template <typename GfxFamily>
|
||||
|
|
|
@ -63,7 +63,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
|||
auto pImplicitArgs = args.dispatchInterface->getImplicitArgs();
|
||||
|
||||
auto &hwInfo = args.device->getHardwareInfo();
|
||||
auto &gfxCoreHelper = args.device->getGfxCoreHelper();
|
||||
auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
|
||||
|
||||
LinearStream *listCmdBufferStream = container.getCommandStream();
|
||||
|
@ -95,8 +94,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
|||
EncodeDispatchKernel<Family>::programBarrierEnable(idd,
|
||||
kernelDescriptor.kernelAttributes.barrierCount,
|
||||
hwInfo);
|
||||
auto slmSize = static_cast<uint32_t>(
|
||||
gfxCoreHelper.computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()));
|
||||
auto slmSize = EncodeDispatchKernel<Family>::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize());
|
||||
idd.setSharedLocalMemorySize(slmSize);
|
||||
|
||||
uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
|
||||
|
@ -643,6 +641,32 @@ template <typename WalkerType>
|
|||
void EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd) {
|
||||
}
|
||||
|
||||
// Generic SLM size alignment.
// Returns 0 for a zero size; otherwise raises the size to at least 1024 and
// passes it through Math::nextPowerOfTwo (presumably rounding up to a power of
// two - TODO confirm). UNRECOVERABLE_IF fires when the result exceeds
// 64 * MemoryConstants::kiloByte.
template <typename Family>
|
||||
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
|
||||
if (slmSize == 0u) {
|
||||
return 0u;
|
||||
}
|
||||
slmSize = std::max(slmSize, 1024u); // enforce the 1024-byte minimum before rounding
|
||||
slmSize = Math::nextPowerOfTwo(slmSize);
|
||||
UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte);
|
||||
return slmSize;
|
||||
}
|
||||
|
||||
// Generic conversion of an SLM size in bytes into the value that callers pass to
// setSharedLocalMemorySize on the interface descriptor. hwInfo is not used here.
// Steps: clamp to at least 1024, Math::nextPowerOfTwo, Math::getMinLsbSet, minus 9.
// Assuming getMinLsbSet yields the index of the lowest set bit, 1024 maps to 1,
// 2048 to 2, and so on - TODO confirm.
template <typename Family>
|
||||
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) {
|
||||
auto value = std::max(slmSize, 1024u);
|
||||
value = Math::nextPowerOfTwo(value);
|
||||
value = Math::getMinLsbSet(value);
|
||||
value = value - 9;
|
||||
DEBUG_BREAK_IF(value > 7);
|
||||
return value * !!slmSize; // !!slmSize is 0 for a zero size, so no SLM yields 0
|
||||
}
|
||||
|
||||
// Returns the cooperativeKernel flag unchanged. The encode() call site passes the
// result as the forceExecutionOnSingleTile argument.
template <typename Family>
|
||||
bool EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(bool cooperativeKernel) {
|
||||
return cooperativeKernel;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeStates<Family>::getSshHeapSize() {
|
||||
return 64 * MemoryConstants::kiloByte;
|
||||
|
|
|
@ -119,9 +119,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
|||
|
||||
EncodeDispatchKernel<Family>::encodeEuSchedulingPolicy(&idd, kernelDescriptor, args.defaultPipelinedThreadArbitrationPolicy);
|
||||
|
||||
auto &gfxCoreHelper = args.device->getGfxCoreHelper();
|
||||
auto slmSize = static_cast<uint32_t>(
|
||||
gfxCoreHelper.computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()));
|
||||
auto slmSize = EncodeDispatchKernel<Family>::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize());
|
||||
|
||||
if (debugManager.flags.OverrideSlmAllocationSize.get() != -1) {
|
||||
slmSize = static_cast<uint32_t>(debugManager.flags.OverrideSlmAllocationSize.get());
|
||||
|
@ -262,7 +260,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
|||
}
|
||||
|
||||
offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - args.reserveExtraPayloadSpace);
|
||||
auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
|
||||
if (pImplicitArgs) {
|
||||
offsetThreadData -= sizeForImplicitArgsStruct;
|
||||
pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize;
|
||||
|
@ -429,7 +426,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
|||
!(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
|
||||
!args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup
|
||||
args.dcFlushEnable, // dcFlush
|
||||
gfxCoreHelper.singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
|
||||
EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
|
||||
args.makeCommandView}; // blockDispatchToCommandBuffer
|
||||
|
||||
ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,
|
||||
|
@ -990,6 +987,82 @@ void EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(Walk
|
|||
}
|
||||
}
|
||||
|
||||
// Table-based SLM size alignment.
// Returns the first (smallest) entry of alignedSlmSizes that is >= slmSize; the
// entries are listed in ascending order from 0 up to 128 * MemoryConstants::kiloByte.
// A size larger than the last entry matches nothing and reaches UNRECOVERABLE_IF(true).
template <typename Family>
|
||||
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
|
||||
const uint32_t alignedSlmSizes[] = {
|
||||
0u,
|
||||
1u * MemoryConstants::kiloByte,
|
||||
2u * MemoryConstants::kiloByte,
|
||||
4u * MemoryConstants::kiloByte,
|
||||
8u * MemoryConstants::kiloByte,
|
||||
16u * MemoryConstants::kiloByte,
|
||||
24u * MemoryConstants::kiloByte,
|
||||
32u * MemoryConstants::kiloByte,
|
||||
48u * MemoryConstants::kiloByte,
|
||||
64u * MemoryConstants::kiloByte,
|
||||
96u * MemoryConstants::kiloByte,
|
||||
128u * MemoryConstants::kiloByte,
|
||||
};
|
||||

||||
for (auto &alignedSlmSize : alignedSlmSizes) {
|
||||
if (slmSize <= alignedSlmSize) {
|
||||
return alignedSlmSize;
|
||||
}
|
||||
}
|
||||

||||
UNRECOVERABLE_IF(true);
|
||||
return 0; // only reached if UNRECOVERABLE_IF returns control - presumably it does not; confirm
|
||||
}
|
||||
|
||||
// Maps an SLM size in bytes to a SHARED_LOCAL_MEMORY_SIZE enumerator of the
// family's INTERFACE_DESCRIPTOR_DATA. The size is first aligned with
// alignSlmSize, then compared against descending thresholds so that the
// enumerator for the aligned size is returned. hwInfo is not used here.
template <typename Family>
|
||||
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) {
|
||||
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
|
||||
auto alignedSlmSize = EncodeDispatchKernel<Family>::alignSlmSize(slmSize);
|
||||

||||
if (alignedSlmSize == 0u) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K;
|
||||
}
|
||||

||||
UNRECOVERABLE_IF(slmSize > 128u * MemoryConstants::kiloByte);
|
||||

||||
// Descending threshold chain: each test catches the aligned size just above the threshold.
if (alignedSlmSize > 96u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_128K;
|
||||
}
|
||||
if (alignedSlmSize > 64u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_96K;
|
||||
}
|
||||
if (alignedSlmSize > 48u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K;
|
||||
}
|
||||
if (alignedSlmSize > 32u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_48K;
|
||||
}
|
||||
if (alignedSlmSize > 24u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K;
|
||||
}
|
||||
if (alignedSlmSize > 16u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_24K;
|
||||
}
|
||||
if (alignedSlmSize > 8u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K;
|
||||
}
|
||||
if (alignedSlmSize > 4u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K;
|
||||
}
|
||||
if (alignedSlmSize > 2u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K;
|
||||
}
|
||||
if (alignedSlmSize > 1u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K;
|
||||
}
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K;
|
||||
}
|
||||
|
||||
// Returns the cooperativeKernel flag unchanged. The encode() call site passes the
// result as the forceExecutionOnSingleTile argument.
template <typename Family>
|
||||
bool EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(bool cooperativeKernel) {
|
||||
return cooperativeKernel;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeStates<Family>::getSshHeapSize() {
|
||||
return 2 * MemoryConstants::megaByte;
|
||||
|
|
|
@ -62,6 +62,27 @@ template <>
|
|||
void EncodeBatchBufferStartOrEnd<Family>::appendBatchBufferStart(MI_BATCH_BUFFER_START &cmd, bool indirect, bool predicate) {
|
||||
}
|
||||
|
||||
// Lookup table used by computeSlmValues: the index is the SLM size counted in
// 4096-byte units (rounded up and clamped to 15) and the entry is that count
// rounded up to a power of two (0 stays 0). constexpr because the table is
// only ever read.
static constexpr uint32_t slmSizeId[] = {0, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16};
|
||||
|
||||
// Family-specific SLM size alignment.
// Returns 0 for a zero size; otherwise raises the size to at least 4096 and
// passes it through Math::nextPowerOfTwo (presumably rounding up to a power of
// two - TODO confirm). No upper limit is checked in this specialization.
template <>
|
||||
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
|
||||
if (slmSize == 0u) {
|
||||
return 0u;
|
||||
}
|
||||
slmSize = std::max(slmSize, 4096u); // enforce the 4096-byte minimum before rounding
|
||||
slmSize = Math::nextPowerOfTwo(slmSize);
|
||||
return slmSize;
|
||||
}
|
||||
|
||||
// Family-specific conversion of an SLM size in bytes into a value taken from the
// slmSizeId table. hwInfo is not used here.
// Adding (4 * kiloByte - 1) and shifting right by 12 counts 4096-byte units
// rounded up (assuming MemoryConstants::kiloByte is 1024 - TODO confirm).
template <>
|
||||
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) {
|
||||
slmSize += (4 * MemoryConstants::kiloByte - 1);
|
||||
slmSize = slmSize >> 12;
|
||||
slmSize = std::min(slmSize, 15u); // clamp to the last valid index of the 16-entry table
|
||||
slmSize = slmSizeId[slmSize];
|
||||
return slmSize;
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
#include "shared/source/command_container/command_encoder_enablers.inl"
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (C) 2018-2023 Intel Corporation
|
||||
* Copyright (C) 2018-2024 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
|
@ -17,27 +17,6 @@
|
|||
namespace NEO {
|
||||
typedef Gen8Family Family;
|
||||
|
||||
// Lookup table used by computeSlmValues: the index is the SLM size counted in
// 4096-byte units (rounded up and clamped to 15) and the entry is that count
// rounded up to a power of two (0 stays 0). constexpr because the table is
// only ever read.
static constexpr uint32_t slmSizeId[] = {0, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16};
|
||||
|
||||
// Gen8 (Family is Gen8Family in this file) SLM size alignment on the core helper.
// Returns 0 for a zero size; otherwise raises the size to at least 4096 and
// passes it through Math::nextPowerOfTwo (presumably rounding up to a power of
// two - TODO confirm). No upper limit is checked.
template <>
|
||||
uint32_t GfxCoreHelperHw<Family>::alignSlmSize(uint32_t slmSize) const {
|
||||
if (slmSize == 0u) {
|
||||
return 0u;
|
||||
}
|
||||
slmSize = std::max(slmSize, 4096u); // enforce the 4096-byte minimum before rounding
|
||||
slmSize = Math::nextPowerOfTwo(slmSize);
|
||||
return slmSize;
|
||||
}
|
||||
|
||||
// Gen8 conversion of an SLM size in bytes into a value taken from the slmSizeId
// table. hwInfo is not used here.
// Adding (4 * kiloByte - 1) and shifting right by 12 counts 4096-byte units
// rounded up (assuming MemoryConstants::kiloByte is 1024 - TODO confirm).
template <>
|
||||
uint32_t GfxCoreHelperHw<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const {
|
||||
slmSize += (4 * MemoryConstants::kiloByte - 1);
|
||||
slmSize = slmSize >> 12;
|
||||
slmSize = std::min(slmSize, 15u); // clamp to the last valid index of the 16-entry table
|
||||
slmSize = slmSizeId[slmSize];
|
||||
return slmSize;
|
||||
}
|
||||
|
||||
template <>
|
||||
size_t GfxCoreHelperHw<Family>::getMaxBarrierRegisterPerSlice() const {
|
||||
return 16;
|
||||
|
|
|
@ -402,23 +402,12 @@ uint32_t GfxCoreHelperHw<GfxFamily>::getMetricsLibraryGenId() const {
|
|||
|
||||
// Core-helper entry point for SLM size alignment.
// NOTE(review): this hunk shows two bodies back to back - an inline computation
// ending in `return slmSize;` and a final return that forwards to
// EncodeDispatchKernel<GfxFamily>::alignSlmSize. Read literally the forwarding
// return is unreachable; presumably the diff markers were lost and only the
// forwarding call remains after the commit - confirm against the repository.
template <typename GfxFamily>
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::alignSlmSize(uint32_t slmSize) const {
|
||||
if (slmSize == 0u) {
|
||||
return 0u;
|
||||
}
|
||||
slmSize = std::max(slmSize, 1024u);
|
||||
slmSize = Math::nextPowerOfTwo(slmSize);
|
||||
UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte);
|
||||
return slmSize;
|
||||
return EncodeDispatchKernel<GfxFamily>::alignSlmSize(slmSize);
|
||||
}
|
||||
|
||||
// Core-helper entry point for converting an SLM size in bytes into its encoded value.
// NOTE(review): this hunk shows two bodies back to back - an inline computation
// ending in `return value * !!slmSize;` and a final return that forwards to
// EncodeDispatchKernel<GfxFamily>::computeSlmValues. Read literally the
// forwarding return is unreachable; presumably the diff markers were lost and
// only the forwarding call remains after the commit - confirm against the repository.
template <typename GfxFamily>
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const {
|
||||
auto value = std::max(slmSize, 1024u);
|
||||
value = Math::nextPowerOfTwo(value);
|
||||
value = Math::getMinLsbSet(value);
|
||||
value = value - 9;
|
||||
DEBUG_BREAK_IF(value > 7);
|
||||
return value * !!slmSize;
|
||||
return EncodeDispatchKernel<GfxFamily>::computeSlmValues(hwInfo, slmSize);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
|
@ -756,7 +745,7 @@ bool GfxCoreHelperHw<GfxFamily>::is48ResourceNeededForCmdBuffer() const {
|
|||
|
||||
// Core-helper entry point reporting whether single-tile execution is required.
// NOTE(review): this hunk shows two returns back to back - `return cooperativeKernel;`
// and a return that forwards to
// EncodeDispatchKernel<GfxFamily>::singleTileExecImplicitScalingRequired.
// Presumably the diff markers were lost and only the forwarding call remains
// after the commit - confirm against the repository.
template <typename GfxFamily>
|
||||
bool GfxCoreHelperHw<GfxFamily>::singleTileExecImplicitScalingRequired(bool cooperativeKernel) const {
|
||||
return cooperativeKernel;
|
||||
return EncodeDispatchKernel<GfxFamily>::singleTileExecImplicitScalingRequired(cooperativeKernel);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
|
|
|
@ -277,77 +277,6 @@ void GfxCoreHelperHw<Family>::setExtraAllocationData(AllocationData &allocationD
|
|||
}
|
||||
}
|
||||
|
||||
// Table-based SLM size alignment on the core helper.
// Returns the first (smallest) entry of alignedSlmSizes that is >= slmSize; the
// entries are listed in ascending order from 0 up to 128 * MemoryConstants::kiloByte.
// A size larger than the last entry matches nothing and reaches UNRECOVERABLE_IF(true).
template <>
|
||||
uint32_t GfxCoreHelperHw<Family>::alignSlmSize(uint32_t slmSize) const {
|
||||
const uint32_t alignedSlmSizes[] = {
|
||||
0u,
|
||||
1u * MemoryConstants::kiloByte,
|
||||
2u * MemoryConstants::kiloByte,
|
||||
4u * MemoryConstants::kiloByte,
|
||||
8u * MemoryConstants::kiloByte,
|
||||
16u * MemoryConstants::kiloByte,
|
||||
24u * MemoryConstants::kiloByte,
|
||||
32u * MemoryConstants::kiloByte,
|
||||
48u * MemoryConstants::kiloByte,
|
||||
64u * MemoryConstants::kiloByte,
|
||||
96u * MemoryConstants::kiloByte,
|
||||
128u * MemoryConstants::kiloByte,
|
||||
};
|
||||

||||
for (auto &alignedSlmSize : alignedSlmSizes) {
|
||||
if (slmSize <= alignedSlmSize) {
|
||||
return alignedSlmSize;
|
||||
}
|
||||
}
|
||||

||||
UNRECOVERABLE_IF(true);
|
||||
return 0; // only reached if UNRECOVERABLE_IF returns control - presumably it does not; confirm
|
||||
}
|
||||
|
||||
// Maps an SLM size in bytes to a SHARED_LOCAL_MEMORY_SIZE enumerator of the
// family's INTERFACE_DESCRIPTOR_DATA. The size is first aligned with
// alignSlmSize, then compared against descending thresholds so that the
// enumerator for the aligned size is returned. hwInfo is not used here.
template <>
|
||||
uint32_t GfxCoreHelperHw<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const {
|
||||
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
|
||||
auto alignedSlmSize = alignSlmSize(slmSize);
|
||||

||||
if (alignedSlmSize == 0u) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K;
|
||||
}
|
||||

||||
UNRECOVERABLE_IF(slmSize > 128u * MemoryConstants::kiloByte);
|
||||

||||
// Descending threshold chain: each test catches the aligned size just above the threshold.
if (alignedSlmSize > 96u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_128K;
|
||||
}
|
||||
if (alignedSlmSize > 64u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_96K;
|
||||
}
|
||||
if (alignedSlmSize > 48u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K;
|
||||
}
|
||||
if (alignedSlmSize > 32u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_48K;
|
||||
}
|
||||
if (alignedSlmSize > 24u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K;
|
||||
}
|
||||
if (alignedSlmSize > 16u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_24K;
|
||||
}
|
||||
if (alignedSlmSize > 8u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K;
|
||||
}
|
||||
if (alignedSlmSize > 4u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K;
|
||||
}
|
||||
if (alignedSlmSize > 2u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K;
|
||||
}
|
||||
if (alignedSlmSize > 1u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K;
|
||||
}
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K;
|
||||
}
|
||||
|
||||
template <>
|
||||
int32_t GfxCoreHelperHw<Family>::getDefaultThreadArbitrationPolicy() const {
|
||||
return ThreadArbitrationPolicy::RoundRobinAfterDependency;
|
||||
|
|
|
@ -193,9 +193,7 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptor
|
|||
const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
|
||||
const uint32_t workGroupCountPerDss = static_cast<uint32_t>(Math::divideAndRoundUp(threadsPerDssCount, threadsPerThreadGroup));
|
||||
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
|
||||
|
||||
const uint32_t workgroupSlmSize = gfxCoreHelper.alignSlmSize(slmTotalSize);
|
||||
const uint32_t workgroupSlmSize = EncodeDispatchKernel<Family>::alignSlmSize(slmTotalSize);
|
||||
|
||||
uint32_t slmSize = 0u;
|
||||
|
||||
|
|
|
@ -248,75 +248,6 @@ void GfxCoreHelperHw<Family>::setExtraAllocationData(AllocationData &allocationD
|
|||
}
|
||||
}
|
||||
|
||||
// Table-based SLM size alignment on the core helper.
// Returns the first (smallest) entry of alignedSlmSizes that is >= slmSize; the
// entries are listed in ascending order from 0 up to 128 * MemoryConstants::kiloByte.
// A size larger than the last entry matches nothing and reaches UNRECOVERABLE_IF(true).
template <>
|
||||
uint32_t GfxCoreHelperHw<Family>::alignSlmSize(uint32_t slmSize) const {
|
||||
const uint32_t alignedSlmSizes[] = {
|
||||
0u,
|
||||
1u * MemoryConstants::kiloByte,
|
||||
2u * MemoryConstants::kiloByte,
|
||||
4u * MemoryConstants::kiloByte,
|
||||
8u * MemoryConstants::kiloByte,
|
||||
16u * MemoryConstants::kiloByte,
|
||||
24u * MemoryConstants::kiloByte,
|
||||
32u * MemoryConstants::kiloByte,
|
||||
48u * MemoryConstants::kiloByte,
|
||||
64u * MemoryConstants::kiloByte,
|
||||
96u * MemoryConstants::kiloByte,
|
||||
128u * MemoryConstants::kiloByte,
|
||||
};
|
||||

||||
for (auto &alignedSlmSize : alignedSlmSizes) {
|
||||
if (slmSize <= alignedSlmSize) {
|
||||
return alignedSlmSize;
|
||||
}
|
||||
}
|
||||

||||
UNRECOVERABLE_IF(true);
|
||||
return 0; // only reached if UNRECOVERABLE_IF returns control - presumably it does not; confirm
|
||||
}
|
||||
|
||||
// Maps an SLM size in bytes to a SHARED_LOCAL_MEMORY_SIZE enumerator of the
// family's INTERFACE_DESCRIPTOR_DATA. This variant compares the raw slmSize
// (no prior alignSlmSize call) against descending thresholds, so any size in
// (lower, upper] selects the enumerator for the upper bound. hwInfo is not used here.
template <>
|
||||
uint32_t GfxCoreHelperHw<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const {
|
||||
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
|
||||
if (slmSize == 0u) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K;
|
||||
}
|
||||

||||
UNRECOVERABLE_IF(slmSize > 128u * MemoryConstants::kiloByte);
|
||||

||||
if (slmSize > 96u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_128K;
|
||||
}
|
||||
if (slmSize > 64u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_96K;
|
||||
}
|
||||
if (slmSize > 48u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K;
|
||||
}
|
||||
if (slmSize > 32u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_48K;
|
||||
}
|
||||
if (slmSize > 24u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K;
|
||||
}
|
||||
if (slmSize > 16u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_24K;
|
||||
}
|
||||
if (slmSize > 8u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K;
|
||||
}
|
||||
if (slmSize > 4u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K;
|
||||
}
|
||||
if (slmSize > 2u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K;
|
||||
}
|
||||
if (slmSize > 1u * MemoryConstants::kiloByte) {
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K;
|
||||
}
|
||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K;
|
||||
}
|
||||
|
||||
template <>
|
||||
int32_t GfxCoreHelperHw<Family>::getDefaultThreadArbitrationPolicy() const {
|
||||
return ThreadArbitrationPolicy::RoundRobinAfterDependency;
|
||||
|
|
|
@ -34,8 +34,8 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptor
|
|||
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
|
||||
const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
|
||||
const uint32_t workGroupCountPerDss = threadsPerDssCount / threadsPerThreadGroup;
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
|
||||
const uint32_t workgroupSlmSize = gfxCoreHelper.alignSlmSize(slmTotalSize);
|
||||
|
||||
const uint32_t workgroupSlmSize = EncodeDispatchKernel<Family>::alignSlmSize(slmTotalSize);
|
||||
|
||||
uint32_t slmSize = 0u;
|
||||
|
||||
|
@ -195,6 +195,30 @@ void EncodeDispatchKernel<Family>::adjustWalkOrder(WalkerType &walkerCmd, uint32
|
|||
}
|
||||
}
|
||||
|
||||
// Family-specific SLM size alignment.
// Returns 0 for a zero size; otherwise raises the size to at least 1024 and
// passes it through Math::nextPowerOfTwo (presumably rounding up to a power of
// two - TODO confirm). UNRECOVERABLE_IF fires when the result exceeds
// 64 * MemoryConstants::kiloByte.
template <>
|
||||
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
|
||||
if (slmSize == 0u) {
|
||||
return 0u;
|
||||
}
|
||||
slmSize = std::max(slmSize, 1024u); // enforce the 1024-byte minimum before rounding
|
||||
slmSize = Math::nextPowerOfTwo(slmSize);
|
||||
UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte);
|
||||
return slmSize;
|
||||
}
|
||||
|
||||
// Family-specific conversion of an SLM size in bytes into its encoded value.
// hwInfo is not used here.
// Steps: clamp to at least 1024, Math::nextPowerOfTwo, Math::getMinLsbSet, minus 9.
// Assuming getMinLsbSet yields the index of the lowest set bit, 1024 maps to 1,
// 2048 to 2, and so on - TODO confirm.
template <>
|
||||
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) {
|
||||
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE; // alias is not referenced in this body
|
||||

||||
auto slmValue = std::max(slmSize, 1024u);
|
||||
slmValue = Math::nextPowerOfTwo(slmValue);
|
||||
slmValue = Math::getMinLsbSet(slmValue);
|
||||
slmValue = slmValue - 9;
|
||||
DEBUG_BREAK_IF(slmValue > 7);
|
||||
slmValue *= !!slmSize; // !!slmSize is 0 for a zero size, so no SLM yields 0
|
||||
return slmValue;
|
||||
}
|
||||
|
||||
template <>
|
||||
void adjustL3ControlField<Family>(void *l3ControlBuffer) {
|
||||
using L3_CONTROL = typename Family::L3_CONTROL;
|
||||
|
|
|
@ -107,19 +107,6 @@ bool GfxCoreHelperHw<Family>::isBufferSizeSuitableForCompression(const size_t si
|
|||
}
|
||||
}
|
||||
|
||||
// Core-helper conversion of an SLM size in bytes into its encoded value.
// hwInfo is not used here.
// Steps: clamp to at least 1024, Math::nextPowerOfTwo, Math::getMinLsbSet, minus 9.
// Assuming getMinLsbSet yields the index of the lowest set bit, 1024 maps to 1,
// 2048 to 2, and so on - TODO confirm.
template <>
|
||||
uint32_t GfxCoreHelperHw<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const {
|
||||
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE; // alias is not referenced in this body
|
||||

||||
auto slmValue = std::max(slmSize, 1024u);
|
||||
slmValue = Math::nextPowerOfTwo(slmValue);
|
||||
slmValue = Math::getMinLsbSet(slmValue);
|
||||
slmValue = slmValue - 9;
|
||||
DEBUG_BREAK_IF(slmValue > 7);
|
||||
slmValue *= !!slmSize; // !!slmSize is 0 for a zero size, so no SLM yields 0
|
||||
return slmValue;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool GfxCoreHelperHw<Family>::copyThroughLockedPtrEnabled(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const {
|
||||
if (debugManager.flags.ExperimentalCopyThroughLock.get() != -1) {
|
||||
|
|
Loading…
Reference in New Issue