performance: replace virtual calls with native class methods
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
parent
b0420fbf40
commit
7ce4a8adc2
|
@ -178,7 +178,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
|
||||||
|
|
||||||
const auto &hardwareInfo = device.getHardwareInfo();
|
const auto &hardwareInfo = device.getHardwareInfo();
|
||||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||||
auto programmableIDSLMSize = static_cast<uint32_t>(gfxCoreHelper.computeSlmValues(hardwareInfo, slmTotalSize));
|
auto programmableIDSLMSize = EncodeDispatchKernel<GfxFamily>::computeSlmValues(hardwareInfo, slmTotalSize);
|
||||||
|
|
||||||
if (debugManager.flags.OverrideSlmAllocationSize.get() != -1) {
|
if (debugManager.flags.OverrideSlmAllocationSize.get() != -1) {
|
||||||
programmableIDSLMSize = static_cast<uint32_t>(debugManager.flags.OverrideSlmAllocationSize.get());
|
programmableIDSLMSize = static_cast<uint32_t>(debugManager.flags.OverrideSlmAllocationSize.get());
|
||||||
|
|
|
@ -219,6 +219,11 @@ struct EncodeDispatchKernel {
|
||||||
|
|
||||||
template <typename WalkerType>
|
template <typename WalkerType>
|
||||||
static void forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd);
|
static void forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd);
|
||||||
|
|
||||||
|
static uint32_t alignSlmSize(uint32_t slmSize);
|
||||||
|
static uint32_t computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize);
|
||||||
|
|
||||||
|
static bool singleTileExecImplicitScalingRequired(bool cooperativeKernel);
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
|
|
|
@ -63,7 +63,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||||
auto pImplicitArgs = args.dispatchInterface->getImplicitArgs();
|
auto pImplicitArgs = args.dispatchInterface->getImplicitArgs();
|
||||||
|
|
||||||
auto &hwInfo = args.device->getHardwareInfo();
|
auto &hwInfo = args.device->getHardwareInfo();
|
||||||
auto &gfxCoreHelper = args.device->getGfxCoreHelper();
|
|
||||||
auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
|
auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
|
||||||
|
|
||||||
LinearStream *listCmdBufferStream = container.getCommandStream();
|
LinearStream *listCmdBufferStream = container.getCommandStream();
|
||||||
|
@ -95,8 +94,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||||
EncodeDispatchKernel<Family>::programBarrierEnable(idd,
|
EncodeDispatchKernel<Family>::programBarrierEnable(idd,
|
||||||
kernelDescriptor.kernelAttributes.barrierCount,
|
kernelDescriptor.kernelAttributes.barrierCount,
|
||||||
hwInfo);
|
hwInfo);
|
||||||
auto slmSize = static_cast<uint32_t>(
|
auto slmSize = EncodeDispatchKernel<Family>::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize());
|
||||||
gfxCoreHelper.computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()));
|
|
||||||
idd.setSharedLocalMemorySize(slmSize);
|
idd.setSharedLocalMemorySize(slmSize);
|
||||||
|
|
||||||
uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
|
uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
|
||||||
|
@ -643,6 +641,32 @@ template <typename WalkerType>
|
||||||
void EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd) {
|
void EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename Family>
|
||||||
|
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
|
||||||
|
if (slmSize == 0u) {
|
||||||
|
return 0u;
|
||||||
|
}
|
||||||
|
slmSize = std::max(slmSize, 1024u);
|
||||||
|
slmSize = Math::nextPowerOfTwo(slmSize);
|
||||||
|
UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte);
|
||||||
|
return slmSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Family>
|
||||||
|
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) {
|
||||||
|
auto value = std::max(slmSize, 1024u);
|
||||||
|
value = Math::nextPowerOfTwo(value);
|
||||||
|
value = Math::getMinLsbSet(value);
|
||||||
|
value = value - 9;
|
||||||
|
DEBUG_BREAK_IF(value > 7);
|
||||||
|
return value * !!slmSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Family>
|
||||||
|
bool EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(bool cooperativeKernel) {
|
||||||
|
return cooperativeKernel;
|
||||||
|
}
|
||||||
|
|
||||||
template <typename Family>
|
template <typename Family>
|
||||||
size_t EncodeStates<Family>::getSshHeapSize() {
|
size_t EncodeStates<Family>::getSshHeapSize() {
|
||||||
return 64 * MemoryConstants::kiloByte;
|
return 64 * MemoryConstants::kiloByte;
|
||||||
|
|
|
@ -119,9 +119,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||||
|
|
||||||
EncodeDispatchKernel<Family>::encodeEuSchedulingPolicy(&idd, kernelDescriptor, args.defaultPipelinedThreadArbitrationPolicy);
|
EncodeDispatchKernel<Family>::encodeEuSchedulingPolicy(&idd, kernelDescriptor, args.defaultPipelinedThreadArbitrationPolicy);
|
||||||
|
|
||||||
auto &gfxCoreHelper = args.device->getGfxCoreHelper();
|
auto slmSize = EncodeDispatchKernel<Family>::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize());
|
||||||
auto slmSize = static_cast<uint32_t>(
|
|
||||||
gfxCoreHelper.computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize()));
|
|
||||||
|
|
||||||
if (debugManager.flags.OverrideSlmAllocationSize.get() != -1) {
|
if (debugManager.flags.OverrideSlmAllocationSize.get() != -1) {
|
||||||
slmSize = static_cast<uint32_t>(debugManager.flags.OverrideSlmAllocationSize.get());
|
slmSize = static_cast<uint32_t>(debugManager.flags.OverrideSlmAllocationSize.get());
|
||||||
|
@ -262,7 +260,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||||
}
|
}
|
||||||
|
|
||||||
offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - args.reserveExtraPayloadSpace);
|
offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - args.reserveExtraPayloadSpace);
|
||||||
auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
|
|
||||||
if (pImplicitArgs) {
|
if (pImplicitArgs) {
|
||||||
offsetThreadData -= sizeForImplicitArgsStruct;
|
offsetThreadData -= sizeForImplicitArgsStruct;
|
||||||
pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize;
|
pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize;
|
||||||
|
@ -429,7 +426,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||||
!(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
|
!(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
|
||||||
!args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup
|
!args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup
|
||||||
args.dcFlushEnable, // dcFlush
|
args.dcFlushEnable, // dcFlush
|
||||||
gfxCoreHelper.singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
|
EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
|
||||||
args.makeCommandView}; // blockDispatchToCommandBuffer
|
args.makeCommandView}; // blockDispatchToCommandBuffer
|
||||||
|
|
||||||
ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,
|
ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,
|
||||||
|
@ -990,6 +987,82 @@ void EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(Walk
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename Family>
|
||||||
|
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
|
||||||
|
const uint32_t alignedSlmSizes[] = {
|
||||||
|
0u,
|
||||||
|
1u * MemoryConstants::kiloByte,
|
||||||
|
2u * MemoryConstants::kiloByte,
|
||||||
|
4u * MemoryConstants::kiloByte,
|
||||||
|
8u * MemoryConstants::kiloByte,
|
||||||
|
16u * MemoryConstants::kiloByte,
|
||||||
|
24u * MemoryConstants::kiloByte,
|
||||||
|
32u * MemoryConstants::kiloByte,
|
||||||
|
48u * MemoryConstants::kiloByte,
|
||||||
|
64u * MemoryConstants::kiloByte,
|
||||||
|
96u * MemoryConstants::kiloByte,
|
||||||
|
128u * MemoryConstants::kiloByte,
|
||||||
|
};
|
||||||
|
|
||||||
|
for (auto &alignedSlmSize : alignedSlmSizes) {
|
||||||
|
if (slmSize <= alignedSlmSize) {
|
||||||
|
return alignedSlmSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
UNRECOVERABLE_IF(true);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Family>
|
||||||
|
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) {
|
||||||
|
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
|
||||||
|
auto alignedSlmSize = EncodeDispatchKernel<Family>::alignSlmSize(slmSize);
|
||||||
|
|
||||||
|
if (alignedSlmSize == 0u) {
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K;
|
||||||
|
}
|
||||||
|
|
||||||
|
UNRECOVERABLE_IF(slmSize > 128u * MemoryConstants::kiloByte);
|
||||||
|
|
||||||
|
if (alignedSlmSize > 96u * MemoryConstants::kiloByte) {
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_128K;
|
||||||
|
}
|
||||||
|
if (alignedSlmSize > 64u * MemoryConstants::kiloByte) {
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_96K;
|
||||||
|
}
|
||||||
|
if (alignedSlmSize > 48u * MemoryConstants::kiloByte) {
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K;
|
||||||
|
}
|
||||||
|
if (alignedSlmSize > 32u * MemoryConstants::kiloByte) {
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_48K;
|
||||||
|
}
|
||||||
|
if (alignedSlmSize > 24u * MemoryConstants::kiloByte) {
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K;
|
||||||
|
}
|
||||||
|
if (alignedSlmSize > 16u * MemoryConstants::kiloByte) {
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_24K;
|
||||||
|
}
|
||||||
|
if (alignedSlmSize > 8u * MemoryConstants::kiloByte) {
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K;
|
||||||
|
}
|
||||||
|
if (alignedSlmSize > 4u * MemoryConstants::kiloByte) {
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K;
|
||||||
|
}
|
||||||
|
if (alignedSlmSize > 2u * MemoryConstants::kiloByte) {
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K;
|
||||||
|
}
|
||||||
|
if (alignedSlmSize > 1u * MemoryConstants::kiloByte) {
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K;
|
||||||
|
}
|
||||||
|
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Family>
|
||||||
|
bool EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(bool cooperativeKernel) {
|
||||||
|
return cooperativeKernel;
|
||||||
|
}
|
||||||
|
|
||||||
template <typename Family>
|
template <typename Family>
|
||||||
size_t EncodeStates<Family>::getSshHeapSize() {
|
size_t EncodeStates<Family>::getSshHeapSize() {
|
||||||
return 2 * MemoryConstants::megaByte;
|
return 2 * MemoryConstants::megaByte;
|
||||||
|
|
|
@ -62,6 +62,27 @@ template <>
|
||||||
void EncodeBatchBufferStartOrEnd<Family>::appendBatchBufferStart(MI_BATCH_BUFFER_START &cmd, bool indirect, bool predicate) {
|
void EncodeBatchBufferStartOrEnd<Family>::appendBatchBufferStart(MI_BATCH_BUFFER_START &cmd, bool indirect, bool predicate) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Read-only lookup table used by computeSlmValues: the index is the requested
// SLM size in 4 KB units (rounded up, clamped to 15) and the entry is that
// count rounded up to a power of two. Declared constexpr because the table is
// only ever read.
static constexpr uint32_t slmSizeId[] = {0, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
|
||||||
|
if (slmSize == 0u) {
|
||||||
|
return 0u;
|
||||||
|
}
|
||||||
|
slmSize = std::max(slmSize, 4096u);
|
||||||
|
slmSize = Math::nextPowerOfTwo(slmSize);
|
||||||
|
return slmSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) {
|
||||||
|
slmSize += (4 * MemoryConstants::kiloByte - 1);
|
||||||
|
slmSize = slmSize >> 12;
|
||||||
|
slmSize = std::min(slmSize, 15u);
|
||||||
|
slmSize = slmSizeId[slmSize];
|
||||||
|
return slmSize;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace NEO
|
} // namespace NEO
|
||||||
|
|
||||||
#include "shared/source/command_container/command_encoder_enablers.inl"
|
#include "shared/source/command_container/command_encoder_enablers.inl"
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2023 Intel Corporation
|
* Copyright (C) 2018-2024 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -17,27 +17,6 @@
|
||||||
namespace NEO {
|
namespace NEO {
|
||||||
typedef Gen8Family Family;
|
typedef Gen8Family Family;
|
||||||
|
|
||||||
static uint32_t slmSizeId[] = {0, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16};
|
|
||||||
|
|
||||||
template <>
|
|
||||||
uint32_t GfxCoreHelperHw<Family>::alignSlmSize(uint32_t slmSize) const {
|
|
||||||
if (slmSize == 0u) {
|
|
||||||
return 0u;
|
|
||||||
}
|
|
||||||
slmSize = std::max(slmSize, 4096u);
|
|
||||||
slmSize = Math::nextPowerOfTwo(slmSize);
|
|
||||||
return slmSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
uint32_t GfxCoreHelperHw<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const {
|
|
||||||
slmSize += (4 * MemoryConstants::kiloByte - 1);
|
|
||||||
slmSize = slmSize >> 12;
|
|
||||||
slmSize = std::min(slmSize, 15u);
|
|
||||||
slmSize = slmSizeId[slmSize];
|
|
||||||
return slmSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
size_t GfxCoreHelperHw<Family>::getMaxBarrierRegisterPerSlice() const {
|
size_t GfxCoreHelperHw<Family>::getMaxBarrierRegisterPerSlice() const {
|
||||||
return 16;
|
return 16;
|
||||||
|
|
|
@ -402,23 +402,12 @@ uint32_t GfxCoreHelperHw<GfxFamily>::getMetricsLibraryGenId() const {
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
uint32_t GfxCoreHelperHw<GfxFamily>::alignSlmSize(uint32_t slmSize) const {
|
uint32_t GfxCoreHelperHw<GfxFamily>::alignSlmSize(uint32_t slmSize) const {
|
||||||
if (slmSize == 0u) {
|
return EncodeDispatchKernel<GfxFamily>::alignSlmSize(slmSize);
|
||||||
return 0u;
|
|
||||||
}
|
|
||||||
slmSize = std::max(slmSize, 1024u);
|
|
||||||
slmSize = Math::nextPowerOfTwo(slmSize);
|
|
||||||
UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte);
|
|
||||||
return slmSize;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
uint32_t GfxCoreHelperHw<GfxFamily>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const {
|
uint32_t GfxCoreHelperHw<GfxFamily>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const {
|
||||||
auto value = std::max(slmSize, 1024u);
|
return EncodeDispatchKernel<GfxFamily>::computeSlmValues(hwInfo, slmSize);
|
||||||
value = Math::nextPowerOfTwo(value);
|
|
||||||
value = Math::getMinLsbSet(value);
|
|
||||||
value = value - 9;
|
|
||||||
DEBUG_BREAK_IF(value > 7);
|
|
||||||
return value * !!slmSize;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
|
@ -756,7 +745,7 @@ bool GfxCoreHelperHw<GfxFamily>::is48ResourceNeededForCmdBuffer() const {
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
bool GfxCoreHelperHw<GfxFamily>::singleTileExecImplicitScalingRequired(bool cooperativeKernel) const {
|
bool GfxCoreHelperHw<GfxFamily>::singleTileExecImplicitScalingRequired(bool cooperativeKernel) const {
|
||||||
return cooperativeKernel;
|
return EncodeDispatchKernel<GfxFamily>::singleTileExecImplicitScalingRequired(cooperativeKernel);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
|
|
|
@ -277,77 +277,6 @@ void GfxCoreHelperHw<Family>::setExtraAllocationData(AllocationData &allocationD
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
|
||||||
uint32_t GfxCoreHelperHw<Family>::alignSlmSize(uint32_t slmSize) const {
|
|
||||||
const uint32_t alignedSlmSizes[] = {
|
|
||||||
0u,
|
|
||||||
1u * MemoryConstants::kiloByte,
|
|
||||||
2u * MemoryConstants::kiloByte,
|
|
||||||
4u * MemoryConstants::kiloByte,
|
|
||||||
8u * MemoryConstants::kiloByte,
|
|
||||||
16u * MemoryConstants::kiloByte,
|
|
||||||
24u * MemoryConstants::kiloByte,
|
|
||||||
32u * MemoryConstants::kiloByte,
|
|
||||||
48u * MemoryConstants::kiloByte,
|
|
||||||
64u * MemoryConstants::kiloByte,
|
|
||||||
96u * MemoryConstants::kiloByte,
|
|
||||||
128u * MemoryConstants::kiloByte,
|
|
||||||
};
|
|
||||||
|
|
||||||
for (auto &alignedSlmSize : alignedSlmSizes) {
|
|
||||||
if (slmSize <= alignedSlmSize) {
|
|
||||||
return alignedSlmSize;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
UNRECOVERABLE_IF(true);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
uint32_t GfxCoreHelperHw<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const {
|
|
||||||
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
|
|
||||||
auto alignedSlmSize = alignSlmSize(slmSize);
|
|
||||||
|
|
||||||
if (alignedSlmSize == 0u) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K;
|
|
||||||
}
|
|
||||||
|
|
||||||
UNRECOVERABLE_IF(slmSize > 128u * MemoryConstants::kiloByte);
|
|
||||||
|
|
||||||
if (alignedSlmSize > 96u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_128K;
|
|
||||||
}
|
|
||||||
if (alignedSlmSize > 64u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_96K;
|
|
||||||
}
|
|
||||||
if (alignedSlmSize > 48u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K;
|
|
||||||
}
|
|
||||||
if (alignedSlmSize > 32u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_48K;
|
|
||||||
}
|
|
||||||
if (alignedSlmSize > 24u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K;
|
|
||||||
}
|
|
||||||
if (alignedSlmSize > 16u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_24K;
|
|
||||||
}
|
|
||||||
if (alignedSlmSize > 8u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K;
|
|
||||||
}
|
|
||||||
if (alignedSlmSize > 4u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K;
|
|
||||||
}
|
|
||||||
if (alignedSlmSize > 2u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K;
|
|
||||||
}
|
|
||||||
if (alignedSlmSize > 1u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K;
|
|
||||||
}
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
int32_t GfxCoreHelperHw<Family>::getDefaultThreadArbitrationPolicy() const {
|
int32_t GfxCoreHelperHw<Family>::getDefaultThreadArbitrationPolicy() const {
|
||||||
return ThreadArbitrationPolicy::RoundRobinAfterDependency;
|
return ThreadArbitrationPolicy::RoundRobinAfterDependency;
|
||||||
|
|
|
@ -193,9 +193,7 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptor
|
||||||
const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
|
const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
|
||||||
const uint32_t workGroupCountPerDss = static_cast<uint32_t>(Math::divideAndRoundUp(threadsPerDssCount, threadsPerThreadGroup));
|
const uint32_t workGroupCountPerDss = static_cast<uint32_t>(Math::divideAndRoundUp(threadsPerDssCount, threadsPerThreadGroup));
|
||||||
|
|
||||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
|
const uint32_t workgroupSlmSize = EncodeDispatchKernel<Family>::alignSlmSize(slmTotalSize);
|
||||||
|
|
||||||
const uint32_t workgroupSlmSize = gfxCoreHelper.alignSlmSize(slmTotalSize);
|
|
||||||
|
|
||||||
uint32_t slmSize = 0u;
|
uint32_t slmSize = 0u;
|
||||||
|
|
||||||
|
|
|
@ -248,75 +248,6 @@ void GfxCoreHelperHw<Family>::setExtraAllocationData(AllocationData &allocationD
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
|
||||||
uint32_t GfxCoreHelperHw<Family>::alignSlmSize(uint32_t slmSize) const {
|
|
||||||
const uint32_t alignedSlmSizes[] = {
|
|
||||||
0u,
|
|
||||||
1u * MemoryConstants::kiloByte,
|
|
||||||
2u * MemoryConstants::kiloByte,
|
|
||||||
4u * MemoryConstants::kiloByte,
|
|
||||||
8u * MemoryConstants::kiloByte,
|
|
||||||
16u * MemoryConstants::kiloByte,
|
|
||||||
24u * MemoryConstants::kiloByte,
|
|
||||||
32u * MemoryConstants::kiloByte,
|
|
||||||
48u * MemoryConstants::kiloByte,
|
|
||||||
64u * MemoryConstants::kiloByte,
|
|
||||||
96u * MemoryConstants::kiloByte,
|
|
||||||
128u * MemoryConstants::kiloByte,
|
|
||||||
};
|
|
||||||
|
|
||||||
for (auto &alignedSlmSize : alignedSlmSizes) {
|
|
||||||
if (slmSize <= alignedSlmSize) {
|
|
||||||
return alignedSlmSize;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
UNRECOVERABLE_IF(true);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
uint32_t GfxCoreHelperHw<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const {
|
|
||||||
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
|
|
||||||
if (slmSize == 0u) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K;
|
|
||||||
}
|
|
||||||
|
|
||||||
UNRECOVERABLE_IF(slmSize > 128u * MemoryConstants::kiloByte);
|
|
||||||
|
|
||||||
if (slmSize > 96u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_128K;
|
|
||||||
}
|
|
||||||
if (slmSize > 64u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_96K;
|
|
||||||
}
|
|
||||||
if (slmSize > 48u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_64K;
|
|
||||||
}
|
|
||||||
if (slmSize > 32u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_48K;
|
|
||||||
}
|
|
||||||
if (slmSize > 24u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_32K;
|
|
||||||
}
|
|
||||||
if (slmSize > 16u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_24K;
|
|
||||||
}
|
|
||||||
if (slmSize > 8u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_16K;
|
|
||||||
}
|
|
||||||
if (slmSize > 4u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_8K;
|
|
||||||
}
|
|
||||||
if (slmSize > 2u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_4K;
|
|
||||||
}
|
|
||||||
if (slmSize > 1u * MemoryConstants::kiloByte) {
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_2K;
|
|
||||||
}
|
|
||||||
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_ENCODES_1K;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
int32_t GfxCoreHelperHw<Family>::getDefaultThreadArbitrationPolicy() const {
|
int32_t GfxCoreHelperHw<Family>::getDefaultThreadArbitrationPolicy() const {
|
||||||
return ThreadArbitrationPolicy::RoundRobinAfterDependency;
|
return ThreadArbitrationPolicy::RoundRobinAfterDependency;
|
||||||
|
|
|
@ -34,8 +34,8 @@ void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(InterfaceDescriptor
|
||||||
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
|
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
|
||||||
const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
|
const uint32_t threadsPerDssCount = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.DualSubSliceCount;
|
||||||
const uint32_t workGroupCountPerDss = threadsPerDssCount / threadsPerThreadGroup;
|
const uint32_t workGroupCountPerDss = threadsPerDssCount / threadsPerThreadGroup;
|
||||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
|
|
||||||
const uint32_t workgroupSlmSize = gfxCoreHelper.alignSlmSize(slmTotalSize);
|
const uint32_t workgroupSlmSize = EncodeDispatchKernel<Family>::alignSlmSize(slmTotalSize);
|
||||||
|
|
||||||
uint32_t slmSize = 0u;
|
uint32_t slmSize = 0u;
|
||||||
|
|
||||||
|
@ -195,6 +195,30 @@ void EncodeDispatchKernel<Family>::adjustWalkOrder(WalkerType &walkerCmd, uint32
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
|
||||||
|
if (slmSize == 0u) {
|
||||||
|
return 0u;
|
||||||
|
}
|
||||||
|
slmSize = std::max(slmSize, 1024u);
|
||||||
|
slmSize = Math::nextPowerOfTwo(slmSize);
|
||||||
|
UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte);
|
||||||
|
return slmSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) {
|
||||||
|
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
|
||||||
|
|
||||||
|
auto slmValue = std::max(slmSize, 1024u);
|
||||||
|
slmValue = Math::nextPowerOfTwo(slmValue);
|
||||||
|
slmValue = Math::getMinLsbSet(slmValue);
|
||||||
|
slmValue = slmValue - 9;
|
||||||
|
DEBUG_BREAK_IF(slmValue > 7);
|
||||||
|
slmValue *= !!slmSize;
|
||||||
|
return slmValue;
|
||||||
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
void adjustL3ControlField<Family>(void *l3ControlBuffer) {
|
void adjustL3ControlField<Family>(void *l3ControlBuffer) {
|
||||||
using L3_CONTROL = typename Family::L3_CONTROL;
|
using L3_CONTROL = typename Family::L3_CONTROL;
|
||||||
|
|
|
@ -107,19 +107,6 @@ bool GfxCoreHelperHw<Family>::isBufferSizeSuitableForCompression(const size_t si
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
|
||||||
uint32_t GfxCoreHelperHw<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) const {
|
|
||||||
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
|
|
||||||
|
|
||||||
auto slmValue = std::max(slmSize, 1024u);
|
|
||||||
slmValue = Math::nextPowerOfTwo(slmValue);
|
|
||||||
slmValue = Math::getMinLsbSet(slmValue);
|
|
||||||
slmValue = slmValue - 9;
|
|
||||||
DEBUG_BREAK_IF(slmValue > 7);
|
|
||||||
slmValue *= !!slmSize;
|
|
||||||
return slmValue;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
bool GfxCoreHelperHw<Family>::copyThroughLockedPtrEnabled(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const {
|
bool GfxCoreHelperHw<Family>::copyThroughLockedPtrEnabled(const HardwareInfo &hwInfo, const ProductHelper &productHelper) const {
|
||||||
if (debugManager.flags.ExperimentalCopyThroughLock.get() != -1) {
|
if (debugManager.flags.ExperimentalCopyThroughLock.get() != -1) {
|
||||||
|
|
Loading…
Reference in New Issue