compute-runtime/shared/source/command_container/command_encoder_xehp_and_later.inl
Young Jin Yoon 415fe245d7 feature: apply or set additional blit properties
Modified appendMemoryCopyBlitRegion, appendCopyImageBlit,
appendMemoryCopyRegion, and appendBlitFill in
CommandListCoreFamily to change their behavior when
useAdditionalBlitProperties is used.
Modified dispatchBlitMemoryFill and
dispatchBlitCommandForImageRegion in BlitCommandHelper to
call applyAdditionalProperties when useAdditionalBlitProperties
is enabled.

Related-To: NEO-13003

Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@intel.com>
Signed-off-by: Young Jin Yoon <young.jin.yoon@intel.com>
2025-05-09 00:04:57 +02:00
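
The gating described in the commit message can be sketched as follows. This is an illustrative example only, not the actual BlitCommandsHelper code: applyAdditionalProperties and useAdditionalBlitProperties are the names used in the commit message, while the fill-helper signature and the fill command initializer are assumptions made for the sketch.

template <typename GfxFamily>
void dispatchBlitMemoryFillSketch(const BlitProperties &blitProperties,
                                  LinearStream &linearStream,
                                  const RootDeviceEnvironment &rootDeviceEnvironment) {
    auto blitCmd = GfxFamily::cmdInitXyColorBlt; // assumed fill command initializer
    // ... program the common fill parameters (destination, pitch, fill color) ...
    if (rootDeviceEnvironment.getHelper<ProductHelper>().useAdditionalBlitProperties()) {
        // New path: let the product-specific helper apply or override additional blit
        // properties before the command is written to the stream.
        applyAdditionalProperties(blitCmd, blitProperties, rootDeviceEnvironment); // assumed helper
    }
    *linearStream.getSpaceForCmd<decltype(blitCmd)>() = blitCmd;
}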


/*
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/encode_surface_state.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/command_stream/scratch_space_controller.h"
#include "shared/source/command_stream/stream_properties.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/client_context/gmm_client_context.h"
#include "shared/source/gmm_helper/gmm.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/gmm_helper/resource_info.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/cache_policy.h"
#include "shared/source/helpers/compiler_product_helper.h"
#include "shared/source/helpers/constants.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_walk_order.h"
#include "shared/source/helpers/in_order_cmd_helpers.h"
#include "shared/source/helpers/pause_on_gpu_properties.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/helpers/ray_tracing_helper.h"
#include "shared/source/helpers/simd_helper.h"
#include "shared/source/helpers/state_base_address.h"
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
#include "shared/source/kernel/implicit_args_helper.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/os_interface/product_helper.h"
#include "shared/source/release_helper/release_helper.h"
#include "encode_dispatch_kernel_args_ext.h"
#include "encode_surface_state_args.h"
#include <algorithm>
#include <type_traits>
namespace NEO {
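// Encodes a complete kernel dispatch: pipeline select, interface descriptor (ISA pointer,
// barrier, SLM, binding table / sampler state), indirect heap payload (cross-thread and
// per-thread data, implicit args), optional STATE_BASE_ADDRESS reprogramming, post-sync
// programming and the walker command itself, honoring heapless mode, implicit scaling and
// command-view (no command buffer emission) requests.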
template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDispatchKernelArgs &args) {
using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS;
UNRECOVERABLE_IF(args.makeCommandView && (args.cpuWalkerBuffer == nullptr || args.cpuPayloadBuffer == nullptr));
constexpr bool heaplessModeEnabled = Family::template isHeaplessMode<WalkerType>();
const HardwareInfo &hwInfo = args.device->getHardwareInfo();
auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
const auto &kernelDescriptor = args.dispatchInterface->getKernelDescriptor();
auto sizeCrossThreadData = args.dispatchInterface->getCrossThreadDataSize();
auto sizePerThreadData = args.dispatchInterface->getPerThreadDataSize();
auto sizePerThreadDataForWholeGroup = args.dispatchInterface->getPerThreadDataSizeForWholeThreadGroup();
auto pImplicitArgs = args.dispatchInterface->getImplicitArgs();
LinearStream *listCmdBufferStream = container.getCommandStream();
auto threadGroupDims = static_cast<const uint32_t *>(args.threadGroupDimensions);
uint32_t threadDimsVec[3] = {0, 0, 0};
if (!args.isIndirect) {
threadDimsVec[0] = threadGroupDims[0];
threadDimsVec[1] = threadGroupDims[1];
threadDimsVec[2] = threadGroupDims[2];
}
if (!args.makeCommandView) {
bool systolicModeRequired = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode;
if (container.systolicModeSupportRef() && (container.lastPipelineSelectModeRequiredRef() != systolicModeRequired)) {
container.lastPipelineSelectModeRequiredRef() = systolicModeRequired;
EncodeComputeMode<Family>::adjustPipelineSelect(container, kernelDescriptor);
}
}
WalkerType walkerCmd = Family::template getInitGpuWalker<WalkerType>();
auto &idd = walkerCmd.getInterfaceDescriptor();
EncodeDispatchKernel<Family>::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData,
sizePerThreadData, rootDeviceEnvironment);
bool localIdsGenerationByRuntime = args.dispatchInterface->requiresGenerationOfLocalIdsByRuntime();
auto requiredWorkgroupOrder = args.dispatchInterface->getRequiredWorkgroupOrder();
{
auto isaAllocation = args.dispatchInterface->getIsaAllocation();
UNRECOVERABLE_IF(nullptr == isaAllocation);
uint64_t kernelStartPointer = args.dispatchInterface->getIsaOffsetInParentAllocation();
if constexpr (heaplessModeEnabled) {
kernelStartPointer += isaAllocation->getGpuAddress();
} else {
kernelStartPointer += isaAllocation->getGpuAddressToPatch();
}
if (!localIdsGenerationByRuntime) {
kernelStartPointer += kernelDescriptor.entryPoints.skipPerThreadDataLoad;
}
idd.setKernelStartPointer(kernelStartPointer);
}
if (args.dispatchInterface->getKernelDescriptor().kernelAttributes.flags.usesAssert && args.device->getL0Debugger() != nullptr) {
idd.setSoftwareExceptionEnable(1);
}
auto threadsPerThreadGroup = args.dispatchInterface->getNumThreadsPerThreadGroup();
idd.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
EncodeDispatchKernel<Family>::programBarrierEnable(idd,
kernelDescriptor,
hwInfo);
EncodeDispatchKernel<Family>::encodeEuSchedulingPolicy(&idd, kernelDescriptor, args.defaultPipelinedThreadArbitrationPolicy);
auto releaseHelper = rootDeviceEnvironment.getReleaseHelper();
auto slmSize = EncodeDispatchKernel<Family>::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize(), releaseHelper, heaplessModeEnabled);
if (debugManager.flags.OverrideSlmAllocationSize.get() != -1) {
slmSize = static_cast<uint32_t>(debugManager.flags.OverrideSlmAllocationSize.get());
}
idd.setSharedLocalMemorySize(slmSize);
auto bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
bool sshProgrammingRequired = true;
auto &productHelper = args.device->getProductHelper();
if (productHelper.isSkippingStatefulInformationRequired(kernelDescriptor)) {
bindingTableStateCount = 0u;
sshProgrammingRequired = false;
}
if (sshProgrammingRequired && !args.makeCommandView) {
bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernelDescriptor);
if (isBindlessKernel) {
bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr;
auto sshHeapSize = args.dispatchInterface->getSurfaceStateHeapDataSize();
if (sshHeapSize > 0u) {
auto ssh = args.surfaceStateHeap;
if (ssh == nullptr) {
container.prepareBindfulSsh();
ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, sshHeapSize, NEO::EncodeDispatchKernel<Family>::getDefaultSshAlignment());
}
uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase());
if (globalBindlessSsh) {
bindlessSshBaseOffset += ptrDiff(ssh->getGraphicsAllocation()->getGpuAddress(), ssh->getGraphicsAllocation()->getGpuBaseAddress());
}
DEBUG_BREAK_IF(bindingTableStateCount > 0u);
if (bindingTableStateCount == 0) {
// Allocate space for new ssh data
auto dstSurfaceState = ssh->getSpace(sshHeapSize);
memcpy_s(dstSurfaceState, sshHeapSize, args.dispatchInterface->getSurfaceStateHeapData(), sshHeapSize);
}
args.dispatchInterface->patchBindlessOffsetsInCrossThreadData(bindlessSshBaseOffset);
}
} else {
if constexpr (heaplessModeEnabled == false) {
if (bindingTableStateCount > 0u) {
auto ssh = args.surfaceStateHeap;
if (ssh == nullptr) {
container.prepareBindfulSsh();
ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), NEO::EncodeDispatchKernel<Family>::getDefaultSshAlignment());
}
auto bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
*ssh,
args.dispatchInterface->getSurfaceStateHeapData(),
args.dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
kernelDescriptor.payloadMappings.bindingTable.tableOffset));
idd.setBindingTablePointer(bindingTablePointer);
}
}
}
}
auto preemptionMode = args.device->getDebugger() ? PreemptionMode::ThreadGroup : args.preemptionMode;
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, preemptionMode);
uint32_t samplerCount = 0;
if constexpr (Family::supportsSampler) {
if (args.device->getDeviceInfo().imageSupport && !args.makeCommandView) {
if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
auto dsHeap = args.dynamicStateHeap;
if (dsHeap == nullptr) {
dsHeap = container.getIndirectHeap(HeapType::dynamicState);
auto dshSizeRequired = NEO::EncodeDispatchKernel<Family>::getSizeRequiredDsh(kernelDescriptor, container.getNumIddPerBlock());
if (dsHeap->getAvailableSpace() <= dshSizeRequired) {
dsHeap = container.getHeapWithRequiredSizeAndAlignment(HeapType::dynamicState, dsHeap->getMaxAvailableSpace(), NEO::EncodeDispatchKernel<Family>::getDefaultDshAlignment());
}
}
UNRECOVERABLE_IF(!dsHeap);
auto bindlessHeapsHelper = args.device->getBindlessHeapsHelper();
samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
uint64_t samplerStateOffset = EncodeStates<Family>::copySamplerState(
dsHeap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
kernelDescriptor.payloadMappings.samplerTable.numSamplers,
kernelDescriptor.payloadMappings.samplerTable.borderColor,
args.dispatchInterface->getDynamicStateHeapData(),
bindlessHeapsHelper, rootDeviceEnvironment);
if (bindlessHeapsHelper && !bindlessHeapsHelper->isGlobalDshSupported()) {
// add offset of graphics allocation base address relative to heap base address
samplerStateOffset += static_cast<uint32_t>(ptrDiff(dsHeap->getGpuBase(), bindlessHeapsHelper->getGlobalHeapsBase()));
}
if (heaplessModeEnabled && bindlessHeapsHelper) {
samplerStateOffset += bindlessHeapsHelper->getGlobalHeapsBase();
}
args.dispatchInterface->patchSamplerBindlessOffsetsInCrossThreadData(samplerStateOffset);
if constexpr (!heaplessModeEnabled) {
idd.setSamplerStatePointer(static_cast<uint32_t>(samplerStateOffset));
}
}
}
}
if constexpr (heaplessModeEnabled == false) {
EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(idd, samplerCount, bindingTableStateCount);
}
uint64_t offsetThreadData = 0u;
constexpr uint32_t inlineDataSize = WalkerType::getInlineDataSize();
auto crossThreadData = args.dispatchInterface->getCrossThreadData();
uint32_t inlineDataProgrammingOffset = 0u;
bool inlineDataProgramming = EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(kernelDescriptor);
if (inlineDataProgramming) {
inlineDataProgrammingOffset = std::min(inlineDataSize, sizeCrossThreadData);
auto dest = reinterpret_cast<char *>(walkerCmd.getInlineDataPointer());
memcpy_s(dest, inlineDataSize, crossThreadData, inlineDataProgrammingOffset);
sizeCrossThreadData -= inlineDataProgrammingOffset;
crossThreadData = ptrOffset(crossThreadData, inlineDataProgrammingOffset);
inlineDataProgramming = inlineDataProgrammingOffset != 0;
}
auto scratchAddressForImmediatePatching = EncodeDispatchKernel<Family>::getScratchAddressForImmediatePatching<heaplessModeEnabled>(container, args);
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, !localIdsGenerationByRuntime, rootDeviceEnvironment);
uint32_t sizeForImplicitArgsStruct = NEO::ImplicitArgsHelper::getSizeForImplicitArgsStruct(pImplicitArgs, kernelDescriptor, true, rootDeviceEnvironment);
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching + args.reserveExtraPayloadSpace;
IndirectParamsInInlineDataArgs encodeIndirectParamsArgs{};
{
void *ptr = nullptr;
if (!args.makeCommandView) {
auto heap = container.getIndirectHeap(HeapType::indirectObject);
UNRECOVERABLE_IF(!heap);
heap->align(Family::cacheLineSize);
if (args.isKernelDispatchedFromImmediateCmdList) {
ptr = container.getHeapWithRequiredSizeAndAlignment(HeapType::indirectObject, iohRequiredSize, Family::indirectDataAlignment)->getSpace(iohRequiredSize);
} else {
ptr = container.getHeapSpaceAllowGrow(HeapType::indirectObject, iohRequiredSize);
}
offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - args.reserveExtraPayloadSpace);
if (pImplicitArgs) {
offsetThreadData -= sizeForImplicitArgsStruct;
pImplicitArgs->setLocalIdTablePtr(heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize);
EncodeDispatchKernel<Family>::patchScratchAddressInImplicitArgs<heaplessModeEnabled>(*pImplicitArgs, scratchAddressForImmediatePatching, args.immediateScratchAddressPatching);
ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(!localIdsGenerationByRuntime, requiredWorkgroupOrder), rootDeviceEnvironment, &args.outImplicitArgsPtr);
}
if (args.isIndirect) {
auto gpuPtr = heap->getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - inlineDataProgrammingOffset);
uint64_t implicitArgsGpuPtr = 0u;
if (pImplicitArgs) {
implicitArgsGpuPtr = gpuPtr + inlineDataProgrammingOffset - sizeForImplicitArgsStruct;
}
EncodeIndirectParams<Family>::encode(container, gpuPtr, args.dispatchInterface, implicitArgsGpuPtr, &encodeIndirectParamsArgs);
}
} else {
ptr = args.cpuPayloadBuffer;
}
if (sizeCrossThreadData > 0) {
memcpy_s(ptr, sizeCrossThreadData,
crossThreadData, sizeCrossThreadData);
}
auto perThreadDataPtr = args.dispatchInterface->getPerThreadData();
if (perThreadDataPtr != nullptr) {
ptr = ptrOffset(ptr, sizeCrossThreadData);
memcpy_s(ptr, sizePerThreadDataForWholeGroup,
perThreadDataPtr, sizePerThreadDataForWholeGroup);
}
}
if (args.isHeaplessStateInitEnabled == false && !args.makeCommandView) {
if (container.isAnyHeapDirty() ||
args.requiresUncachedMocs) {
PipeControlArgs syncArgs;
syncArgs.dcFlushEnable = args.postSyncArgs.dcFlushEnable;
MemorySynchronizationCommands<Family>::addSingleBarrier(*container.getCommandStream(), syncArgs);
STATE_BASE_ADDRESS sbaCmd;
auto gmmHelper = container.getDevice()->getGmmHelper();
uint32_t statelessMocsIndex =
args.requiresUncachedMocs ? (gmmHelper->getUncachedMOCS() >> 1) : (gmmHelper->getL3EnabledMOCS() >> 1);
auto l1CachePolicy = container.l1CachePolicyDataRef()->getL1CacheValue(false);
auto l1CachePolicyDebuggerActive = container.l1CachePolicyDataRef()->getL1CacheValue(true);
EncodeStateBaseAddressArgs<Family> encodeStateBaseAddressArgs = {
&container, // container
sbaCmd, // sbaCmd
nullptr, // sbaProperties
statelessMocsIndex, // statelessMocsIndex
l1CachePolicy, // l1CachePolicy
l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
args.partitionCount > 1, // multiOsContextCapable
args.isRcs, // isRcs
container.doubleSbaWaRef(), // doubleSbaWa
heaplessModeEnabled // heaplessModeEnabled
};
EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
container.setDirtyStateForAllHeaps(false);
bool sbaTrackingEnabled = NEO::Debugger::isDebugEnabled(args.isInternal) && args.device->getL0Debugger();
NEO::EncodeStateBaseAddress<Family>::setSbaTrackingForL0DebuggerIfEnabled(sbaTrackingEnabled,
*args.device,
*container.getCommandStream(),
sbaCmd, container.isUsingPrimaryBuffer());
}
}
if (!args.makeCommandView) {
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false));
args.additionalCommands->push_back(commandBuffer);
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
}
}
uint8_t *inlineDataPtr = reinterpret_cast<uint8_t *>(walkerCmd.getInlineDataPointer());
EncodeDispatchKernel<Family>::programInlineDataHeapless<heaplessModeEnabled>(inlineDataPtr, args, container, offsetThreadData, scratchAddressForImmediatePatching);
if constexpr (heaplessModeEnabled == false) {
if (!args.makeCommandView) {
walkerCmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
walkerCmd.setIndirectDataLength(sizeThreadData);
}
}
container.getIndirectHeap(HeapType::indirectObject)->align(NEO::EncodeDispatchKernel<Family>::getDefaultIOHAlignment());
EncodeDispatchKernel<Family>::encodeThreadData(walkerCmd,
nullptr,
threadGroupDims,
args.dispatchInterface->getGroupSize(),
kernelDescriptor.kernelAttributes.simdSize,
kernelDescriptor.kernelAttributes.numLocalIdChannels,
threadsPerThreadGroup,
args.dispatchInterface->getThreadExecutionMask(),
localIdsGenerationByRuntime,
inlineDataProgramming,
args.isIndirect,
requiredWorkgroupOrder,
rootDeviceEnvironment);
if (args.postSyncArgs.inOrderExecInfo) {
EncodePostSync<Family>::setupPostSyncForInOrderExec(walkerCmd, args.postSyncArgs);
} else if (args.postSyncArgs.isRegularEvent()) {
EncodePostSync<Family>::setupPostSyncForRegularEvent(walkerCmd, args.postSyncArgs);
} else {
EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(walkerCmd);
}
if (debugManager.flags.ForceComputeWalkerPostSyncFlush.get() == 1) {
auto &postSync = walkerCmd.getPostSync();
postSync.setDataportPipelineFlush(true);
postSync.setDataportSubsliceCacheFlush(true);
}
walkerCmd.setPredicateEnable(args.isPredicate);
auto threadGroupCount = walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension();
EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadDimsVec, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, threadsPerThreadGroup, walkerCmd);
if (debugManager.flags.PrintKernelDispatchParameters.get()) {
fprintf(stdout, "kernel, %s, grfCount, %d, simdSize, %d, tilesCount, %d, implicitScaling, %s, threadGroupCount, %d, numberOfThreadsInGpgpuThreadGroup, %d, threadGroupDimensions, %d, %d, %d, threadGroupDispatchSize enum, %d\n",
kernelDescriptor.kernelMetadata.kernelName.c_str(),
kernelDescriptor.kernelAttributes.numGrfRequired,
kernelDescriptor.kernelAttributes.simdSize,
args.device->getNumSubDevices(),
ImplicitScalingHelper::isImplicitScalingEnabled(args.device->getDeviceBitfield(), true) ? "Yes" : "no",
threadGroupCount,
idd.getNumberOfThreadsInGpgpuThreadGroup(),
walkerCmd.getThreadGroupIdXDimension(),
walkerCmd.getThreadGroupIdYDimension(),
walkerCmd.getThreadGroupIdZDimension(),
idd.getThreadGroupDispatchSize());
}
EncodeDispatchKernel<Family>::setupPreferredSlmSize(&idd, rootDeviceEnvironment, threadsPerThreadGroup,
args.dispatchInterface->getSlmTotalSize(),
args.dispatchInterface->getSlmPolicy());
auto kernelExecutionType = args.isCooperative ? KernelExecutionType::concurrent : KernelExecutionType::defaultType;
EncodeWalkerArgs walkerArgs{
.argsExtended = args.extendedArgs,
.kernelExecutionType = kernelExecutionType,
.requiredDispatchWalkOrder = args.requiredDispatchWalkOrder,
.localRegionSize = args.localRegionSize,
.maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads,
.requiredSystemFence = args.postSyncArgs.requiresSystemMemoryFence(),
.hasSample = kernelDescriptor.kernelAttributes.flags.hasSample,
.l0DebuggerEnabled = args.device->getL0Debugger() != nullptr};
EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(walkerCmd, rootDeviceEnvironment, walkerArgs);
EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(walkerCmd, &idd, rootDeviceEnvironment, walkerArgs);
EncodeDispatchKernel<Family>::overrideDefaultValues(walkerCmd, idd);
uint32_t workgroupSize = args.dispatchInterface->getGroupSize()[0] * args.dispatchInterface->getGroupSize()[1] * args.dispatchInterface->getGroupSize()[2];
bool isRequiredDispatchWorkGroupOrder = args.requiredDispatchWalkOrder != NEO::RequiredDispatchWalkOrder::none;
if (args.partitionCount > 1 && !args.isInternal) {
const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
ImplicitScalingDispatchCommandArgs implicitScalingArgs{
workPartitionAllocationGpuVa, // workPartitionAllocationGpuVa
args.device, // device
&args.outWalkerPtr, // outWalkerPtr
args.requiredPartitionDim, // requiredPartitionDim
args.partitionCount, // partitionCount
workgroupSize, // workgroupSize
threadGroupCount, // threadGroupCount
args.maxWgCountPerTile, // maxWgCountPerTile
!(container.getFlushTaskUsedForImmediate() || container.isUsingPrimaryBuffer()), // useSecondaryBatchBuffer
!args.isKernelDispatchedFromImmediateCmdList, // apiSelfCleanup
args.postSyncArgs.dcFlushEnable, // dcFlush
EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(args.isCooperative), // forceExecutionOnSingleTile
args.makeCommandView, // blockDispatchToCommandBuffer
isRequiredDispatchWorkGroupOrder}; // isRequiredDispatchWorkGroupOrder
ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,
walkerCmd,
args.device->getDeviceBitfield(),
implicitScalingArgs);
args.partitionCount = implicitScalingArgs.partitionCount;
} else {
args.partitionCount = 1;
EncodeDispatchKernel<Family>::setWalkerRegionSettings(walkerCmd, *args.device, args.partitionCount, workgroupSize, threadGroupCount, args.maxWgCountPerTile, isRequiredDispatchWorkGroupOrder);
if (!args.makeCommandView) {
auto buffer = listCmdBufferStream->getSpaceForCmd<WalkerType>();
args.outWalkerPtr = buffer;
*buffer = walkerCmd;
}
}
if (args.isIndirect) {
auto walkerGpuVa = listCmdBufferStream->getGpuBase() + ptrDiff(args.outWalkerPtr, listCmdBufferStream->getCpuBase());
EncodeIndirectParams<Family>::applyInlineDataGpuVA(encodeIndirectParamsArgs, walkerGpuVa + ptrDiff(walkerCmd.getInlineDataPointer(), &walkerCmd));
}
if (args.cpuWalkerBuffer) {
*reinterpret_cast<WalkerType *>(args.cpuWalkerBuffer) = walkerCmd;
}
if (!args.makeCommandView) {
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) {
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false));
args.additionalCommands->push_back(commandBuffer);
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
}
}
}
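// Programs the walker post-sync for a regular event: an immediate-data write, or a timestamp
// write for timestamp events, to the event address with DC-flush aware MOCS, followed by the
// optional L3 flush and timestamp-packet adjustments.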
template <typename Family>
template <typename CommandType>
void EncodePostSync<Family>::setupPostSyncForRegularEvent(CommandType &cmd, const EncodePostSyncArgs &args) {
using POSTSYNC_DATA = decltype(Family::template getPostSyncType<CommandType>());
auto &postSync = getPostSync(cmd, 0);
auto operationType = POSTSYNC_DATA::OPERATION_WRITE_IMMEDIATE_DATA;
uint64_t gpuVa = args.eventAddress;
uint64_t immData = args.postSyncImmValue;
if (args.isTimestampEvent) {
operationType = POSTSYNC_DATA::OPERATION_WRITE_TIMESTAMP;
immData = 0;
UNRECOVERABLE_IF(!(isAligned<timestampDestinationAddressAlignment>(gpuVa)));
} else {
UNRECOVERABLE_IF(!(isAligned<immWriteDestinationAddressAlignment>(gpuVa)));
}
uint32_t mocs = getPostSyncMocs(args.device->getRootDeviceEnvironment(), args.dcFlushEnable);
setPostSyncData(postSync, operationType, gpuVa, immData, 0, mocs, false, false);
encodeL3Flush(cmd, args);
adjustTimestampPacket(cmd, args);
}
template <typename Family>
template <typename PostSyncT>
void EncodePostSync<Family>::setPostSyncDataCommon(PostSyncT &postSyncData, typename PostSyncT::OPERATION operation, uint64_t gpuVa, uint64_t immData) {
postSyncData.setOperation(operation);
postSyncData.setImmediateData(immData);
postSyncData.setDestinationAddress(gpuVa);
}
template <typename Family>
inline uint32_t EncodePostSync<Family>::getPostSyncMocs(const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {
auto gmmHelper = rootDeviceEnvironment.getGmmHelper();
if (debugManager.flags.OverridePostSyncMocs.get() != -1) {
return debugManager.flags.OverridePostSyncMocs.get();
}
if (dcFlush) {
return gmmHelper->getUncachedMOCS();
} else {
return gmmHelper->getL3EnabledMOCS();
}
}
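// Decides whether local IDs must be generated by the runtime rather than by hardware:
// SIMD1 kernels, a total local work size above 1024, or walk-order / power-of-two constraints
// that the HW walker cannot honor force runtime generation; otherwise the compatible HW walk
// order index is returned through requiredWalkOrder.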
template <typename Family>
bool EncodeDispatchKernel<Family>::isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
const size_t *lws,
std::array<uint8_t, 3> walkOrder,
bool requireInputWalkOrder,
uint32_t &requiredWalkOrder,
uint32_t simd) {
if (simd == 1) {
return true;
}
bool hwGenerationOfLocalIdsEnabled = true;
if (debugManager.flags.EnableHwGenerationLocalIds.get() != -1) {
hwGenerationOfLocalIdsEnabled = !!debugManager.flags.EnableHwGenerationLocalIds.get();
}
if (hwGenerationOfLocalIdsEnabled) {
if (activeChannels == 0) {
return false;
}
size_t totalLwsSize = 1u;
for (auto dimension = 0u; dimension < activeChannels; dimension++) {
totalLwsSize *= lws[dimension];
}
if (totalLwsSize > 1024u) {
return true;
}
// check if we need to follow kernel requirements
if (requireInputWalkOrder) {
for (uint32_t dimension = 0; dimension < activeChannels - 1; dimension++) {
if (!Math::isPow2<size_t>(lws[walkOrder[dimension]])) {
return true;
}
}
auto index = 0u;
while (index < HwWalkOrderHelper::walkOrderPossibilties) {
if (walkOrder[0] == HwWalkOrderHelper::compatibleDimensionOrders[index][0] &&
walkOrder[1] == HwWalkOrderHelper::compatibleDimensionOrders[index][1]) {
break;
}
index++;
}
DEBUG_BREAK_IF(index >= HwWalkOrderHelper::walkOrderPossibilties);
requiredWalkOrder = index;
return false;
}
// kernel doesn't specify any walk order requirements, check if any compatible HW walk order exists
for (uint32_t walkOrder = 0; walkOrder < HwWalkOrderHelper::walkOrderPossibilties; walkOrder++) {
bool allDimensionsCompatible = true;
for (uint32_t dimension = 0; dimension < activeChannels - 1; dimension++) {
if (!Math::isPow2<size_t>(lws[HwWalkOrderHelper::compatibleDimensionOrders[walkOrder][dimension]])) {
allDimensionsCompatible = false;
break;
}
}
if (allDimensionsCompatible) {
requiredWalkOrder = walkOrder;
return false;
}
}
}
return true;
}
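// Programs the walker thread-group counts (or indirect-parameter mode), starting offsets,
// execution mask, SIMD and message SIMD sizes and, when local IDs are HW-generated, the local
// dimension maxima and walk order; also enables inline-parameter emission when required.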
template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeThreadData(WalkerType &walkerCmd,
const uint32_t *startWorkGroup,
const uint32_t *numWorkGroups,
const uint32_t *workGroupSizes,
uint32_t simd,
uint32_t localIdDimensions,
uint32_t threadsPerThreadGroup,
uint32_t threadExecutionMask,
bool localIdsGenerationByRuntime,
bool inlineDataProgrammingRequired,
bool isIndirect,
uint32_t requiredWorkGroupOrder,
const RootDeviceEnvironment &rootDeviceEnvironment) {
if (isIndirect) {
walkerCmd.setIndirectParameterEnable(true);
} else {
walkerCmd.setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
walkerCmd.setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
walkerCmd.setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
}
if (startWorkGroup) {
walkerCmd.setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroup[0]));
walkerCmd.setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroup[1]));
walkerCmd.setThreadGroupIdStartingZ(static_cast<uint32_t>(startWorkGroup[2]));
}
uint64_t executionMask = threadExecutionMask;
if (executionMask == 0) {
auto workGroupSize = workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2];
auto remainderSimdLanes = workGroupSize & (simd - 1);
executionMask = maxNBitValue(remainderSimdLanes);
if (!executionMask) {
executionMask = maxNBitValue(isSimd1(simd) ? 32 : simd);
}
}
walkerCmd.setExecutionMask(static_cast<uint32_t>(executionMask));
walkerCmd.setSimdSize(getSimdConfig<WalkerType>(simd));
walkerCmd.setMessageSimd(walkerCmd.getSimdSize());
if (debugManager.flags.ForceSimdMessageSizeInWalker.get() != -1) {
walkerCmd.setMessageSimd(debugManager.flags.ForceSimdMessageSizeInWalker.get());
}
// 1) Cross-thread inline data is placed starting at R1, but if the kernel uses local IDs the
// cross-thread data must be placed after them; whenever local IDs are driver- or HW-generated,
// reserve that space by programming emitLocalIds accordingly.
// 2) HW auto-generation of local IDs is only enabled when local IDs are actually used.
if (!localIdsGenerationByRuntime && localIdDimensions > 0) {
UNRECOVERABLE_IF(localIdDimensions != 3);
uint32_t emitLocalIdsForDim = (1 << 0) | (1 << 1) | (1 << 2);
walkerCmd.setEmitLocalId(emitLocalIdsForDim);
walkerCmd.setLocalXMaximum(static_cast<uint32_t>(workGroupSizes[0] - 1));
walkerCmd.setLocalYMaximum(static_cast<uint32_t>(workGroupSizes[1] - 1));
walkerCmd.setLocalZMaximum(static_cast<uint32_t>(workGroupSizes[2] - 1));
walkerCmd.setGenerateLocalId(1);
walkerCmd.setWalkOrder(requiredWorkGroupOrder);
}
adjustWalkOrder(walkerCmd, requiredWorkGroupOrder, rootDeviceEnvironment);
if (inlineDataProgrammingRequired == true) {
walkerCmd.setEmitInlineParameter(1);
}
}
template <typename Family>
inline bool EncodeDispatchKernel<Family>::isDshNeeded(const DeviceInfo &deviceInfo) {
if constexpr (Family::supportsSampler) {
return deviceInfo.imageSupport;
}
return false;
}
template <typename Family>
void EncodeStateBaseAddress<Family>::setSbaAddressesForDebugger(NEO::Debugger::SbaAddresses &sbaAddress, const STATE_BASE_ADDRESS &sbaCmd) {
sbaAddress.bindlessSurfaceStateBaseAddress = sbaCmd.getBindlessSurfaceStateBaseAddress();
sbaAddress.dynamicStateBaseAddress = sbaCmd.getDynamicStateBaseAddress();
sbaAddress.generalStateBaseAddress = sbaCmd.getGeneralStateBaseAddress();
sbaAddress.instructionBaseAddress = sbaCmd.getInstructionBaseAddress();
sbaAddress.surfaceStateBaseAddress = sbaCmd.getSurfaceStateBaseAddress();
sbaAddress.indirectObjectBaseAddress = 0;
}
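// Emits STATE_BASE_ADDRESS covering the dirty heaps, using the global bindless heaps base
// when available, and programs the binding table pool base when the surface state heap changed.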
template <typename Family>
void EncodeStateBaseAddress<Family>::encode(EncodeStateBaseAddressArgs<Family> &args) {
auto &device = *args.container->getDevice();
auto gmmHelper = device.getRootDeviceEnvironment().getGmmHelper();
auto dsh = args.container->isHeapDirty(HeapType::dynamicState) ? args.container->getIndirectHeap(HeapType::dynamicState) : nullptr;
auto ioh = args.container->isHeapDirty(HeapType::indirectObject) ? args.container->getIndirectHeap(HeapType::indirectObject) : nullptr;
auto ssh = args.container->isHeapDirty(HeapType::surfaceState) ? args.container->getIndirectHeap(HeapType::surfaceState) : nullptr;
auto isDebuggerActive = device.getDebugger() != nullptr;
bool setGeneralStateBaseAddress = args.sbaProperties ? false : true;
uint64_t globalHeapsBase = 0;
uint64_t bindlessSurfStateBase = 0;
bool useGlobalSshAndDsh = false;
if (device.getBindlessHeapsHelper()) {
bindlessSurfStateBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase();
globalHeapsBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase();
useGlobalSshAndDsh = true;
}
StateBaseAddressHelperArgs<Family> stateBaseAddressHelperArgs = {
0, // generalStateBaseAddress
args.container->getIndirectObjectHeapBaseAddress(), // indirectObjectHeapBaseAddress
args.container->getInstructionHeapBaseAddress(), // instructionHeapBaseAddress
globalHeapsBase, // globalHeapsBaseAddress
0, // surfaceStateBaseAddress
bindlessSurfStateBase, // bindlessSurfaceStateBaseAddress
&args.sbaCmd, // stateBaseAddressCmd
args.sbaProperties, // sbaProperties
dsh, // dsh
ioh, // ioh
ssh, // ssh
gmmHelper, // gmmHelper
args.statelessMocsIndex, // statelessMocsIndex
args.l1CachePolicy, // l1CachePolicy
args.l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
NEO::MemoryCompressionState::notApplicable, // memoryCompressionState
true, // setInstructionStateBaseAddress
setGeneralStateBaseAddress, // setGeneralStateBaseAddress
useGlobalSshAndDsh, // useGlobalHeapsBaseAddress
args.multiOsContextCapable, // isMultiOsContextCapable
false, // areMultipleSubDevicesInContext
false, // overrideSurfaceStateBaseAddress
isDebuggerActive, // isDebuggerActive
args.doubleSbaWa, // doubleSbaWa
args.heaplessModeEnabled // heaplessModeEnabled
};
StateBaseAddressHelper<Family>::programStateBaseAddressIntoCommandStream(stateBaseAddressHelperArgs,
*args.container->getCommandStream());
if (args.sbaProperties) {
if (args.sbaProperties->bindingTablePoolBaseAddress.value != StreamProperty64::initValue) {
StateBaseAddressHelper<Family>::programBindingTableBaseAddress(*args.container->getCommandStream(),
static_cast<uint64_t>(args.sbaProperties->bindingTablePoolBaseAddress.value),
static_cast<uint32_t>(args.sbaProperties->bindingTablePoolSize.value),
gmmHelper);
}
} else if (args.container->isHeapDirty(HeapType::surfaceState) && ssh != nullptr) {
auto heap = args.container->getIndirectHeap(HeapType::surfaceState);
StateBaseAddressHelper<Family>::programBindingTableBaseAddress(*args.container->getCommandStream(),
*heap,
gmmHelper);
}
}
template <typename Family>
size_t EncodeStateBaseAddress<Family>::getRequiredSizeForStateBaseAddress(Device &device, CommandContainer &container, bool isRcs) {
if constexpr (!Family::isHeaplessRequired()) {
auto &hwInfo = device.getHardwareInfo();
auto &productHelper = device.getProductHelper();
size_t size = sizeof(typename Family::STATE_BASE_ADDRESS);
if (productHelper.isAdditionalStateBaseAddressWARequired(hwInfo)) {
size += sizeof(typename Family::STATE_BASE_ADDRESS);
}
if (container.isHeapDirty(HeapType::surfaceState)) {
size += sizeof(typename Family::_3DSTATE_BINDING_TABLE_POOL_ALLOC);
}
return size;
} else {
UNRECOVERABLE_IF(true);
return 0;
}
}
template <typename Family>
inline void EncodeMediaInterfaceDescriptorLoad<Family>::encode(CommandContainer &container, IndirectHeap *childDsh) {}
template <typename Family>
void EncodeSurfaceState<Family>::encodeExtraBufferParams(EncodeSurfaceStateArgs &args) {
auto surfaceState = reinterpret_cast<R_SURFACE_STATE *>(args.outMemory);
Gmm *gmm = args.allocation ? args.allocation->getDefaultGmm() : nullptr;
uint32_t compressionFormat = 0;
bool setConstCachePolicy = false;
if (args.allocation && args.allocation->getAllocationType() == AllocationType::constantSurface) {
setConstCachePolicy = true;
}
if (surfaceState->getMemoryObjectControlState() == args.gmmHelper->getL3EnabledMOCS() &&
debugManager.flags.ForceL1Caching.get() != 0) {
setConstCachePolicy = true;
}
if (setConstCachePolicy == true) {
surfaceState->setMemoryObjectControlState(args.gmmHelper->getL1EnabledMOCS());
}
encodeExtraCacheSettings(surfaceState, args);
if (EncodeSurfaceState<Family>::isAuxModeEnabled(surfaceState, gmm)) {
auto resourceFormat = gmm->gmmResourceInfo->getResourceFormat();
compressionFormat = args.gmmHelper->getClientContext()->getSurfaceStateCompressionFormat(resourceFormat);
if (debugManager.flags.ForceBufferCompressionFormat.get() != -1) {
compressionFormat = debugManager.flags.ForceBufferCompressionFormat.get();
}
}
if (debugManager.flags.EnableStatelessCompressionWithUnifiedMemory.get()) {
if (args.allocation && !MemoryPoolHelper::isSystemMemoryPool(args.allocation->getMemoryPool())) {
setCoherencyType(surfaceState, R_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT);
setBufferAuxParamsForCCS(surfaceState);
compressionFormat = debugManager.flags.FormatForStatelessCompressionWithUnifiedMemory.get();
}
}
surfaceState->setCompressionFormat(compressionFormat);
}
template <typename Family>
void EncodeSemaphore<Family>::programMiSemaphoreWait(MI_SEMAPHORE_WAIT *cmd,
uint64_t compareAddress,
uint64_t compareData,
COMPARE_OPERATION compareMode,
bool registerPollMode,
bool waitMode,
bool useQwordData,
bool indirect,
bool switchOnUnsuccessful) {
MI_SEMAPHORE_WAIT localCmd = Family::cmdInitMiSemaphoreWait;
localCmd.setCompareOperation(compareMode);
localCmd.setSemaphoreDataDword(static_cast<uint32_t>(compareData));
localCmd.setSemaphoreGraphicsAddress(compareAddress);
localCmd.setWaitMode(waitMode ? MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE : MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_SIGNAL_MODE);
localCmd.setRegisterPollMode(registerPollMode ? MI_SEMAPHORE_WAIT::REGISTER_POLL_MODE::REGISTER_POLL_MODE_REGISTER_POLL : MI_SEMAPHORE_WAIT::REGISTER_POLL_MODE::REGISTER_POLL_MODE_MEMORY_POLL);
localCmd.setIndirectSemaphoreDataDword(indirect);
EncodeSemaphore<Family>::appendSemaphoreCommand(localCmd, compareData, indirect, useQwordData, switchOnUnsuccessful);
*cmd = localCmd;
}
template <typename Family>
inline void EncodeWA<Family>::encodeAdditionalPipelineSelect(LinearStream &stream, const PipelineSelectArgs &args, bool is3DPipeline,
const RootDeviceEnvironment &rootDeviceEnvironment, bool isRcs) {}
template <typename Family>
inline size_t EncodeWA<Family>::getAdditionalPipelineSelectSize(Device &device, bool isRcs) {
return 0u;
}
template <typename Family>
inline void EncodeWA<Family>::addPipeControlPriorToNonPipelinedStateCommand(LinearStream &commandStream, PipeControlArgs args,
const RootDeviceEnvironment &rootDeviceEnvironment, bool isRcs) {
auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
auto *releaseHelper = rootDeviceEnvironment.getReleaseHelper();
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
const auto &[isBasicWARequired, isExtendedWARequired] = productHelper.isPipeControlPriorToNonPipelinedStateCommandsWARequired(hwInfo, isRcs, releaseHelper);
if (isExtendedWARequired) {
args.textureCacheInvalidationEnable = true;
args.hdcPipelineFlush = true;
args.amfsFlushEnable = true;
args.instructionCacheInvalidateEnable = true;
args.constantCacheInvalidationEnable = true;
args.stateCacheInvalidationEnable = true;
args.dcFlushEnable = false;
NEO::EncodeWA<Family>::setAdditionalPipeControlFlagsForNonPipelineStateCommand(args);
} else if (isBasicWARequired) {
args.hdcPipelineFlush = true;
NEO::EncodeWA<Family>::setAdditionalPipeControlFlagsForNonPipelineStateCommand(args);
}
MemorySynchronizationCommands<Family>::addSingleBarrier(commandStream, args);
}
template <typename Family>
inline void EncodeStoreMemory<Family>::programStoreDataImm(MI_STORE_DATA_IMM *cmdBuffer,
uint64_t gpuAddress,
uint32_t dataDword0,
uint32_t dataDword1,
bool storeQword,
bool workloadPartitionOffset) {
MI_STORE_DATA_IMM storeDataImmediate = Family::cmdInitStoreDataImm;
storeDataImmediate.setAddress(gpuAddress);
storeDataImmediate.setStoreQword(storeQword);
storeDataImmediate.setDataDword0(dataDword0);
if (storeQword) {
storeDataImmediate.setDataDword1(dataDword1);
storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_QWORD);
} else {
storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD);
}
storeDataImmediate.setWorkloadPartitionIdOffsetEnable(workloadPartitionOffset);
EncodeStoreMemory<Family>::encodeForceCompletionCheck(storeDataImmediate);
*cmdBuffer = storeDataImmediate;
}
template <typename Family>
inline void EncodeStoreMMIO<Family>::appendFlags(MI_STORE_REGISTER_MEM *storeRegMem, bool workloadPartition) {
storeRegMem->setMmioRemapEnable(true);
storeRegMem->setWorkloadPartitionIdOffsetEnable(workloadPartition);
}
template <typename Family>
size_t EncodeDispatchKernel<Family>::additionalSizeRequiredDsh(uint32_t iddCount) {
return 0u;
}
template <typename Family>
inline size_t EncodeDispatchKernel<Family>::getInlineDataOffset(EncodeDispatchKernelArgs &args) {
using DefaultWalkerType = typename Family::DefaultWalkerType;
return offsetof(DefaultWalkerType, TheStructure.Common.InlineData);
}
template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd) {
using POSTSYNC_DATA = decltype(Family::template getPostSyncType<WalkerType>());
using OperationType = typename POSTSYNC_DATA::OPERATION;
if (debugManager.flags.ForceComputeWalkerPostSyncFlushWithWrite.get() != -1) {
auto &postSync = walkerCmd.getPostSync();
postSync.setDataportPipelineFlush(true);
postSync.setDataportSubsliceCacheFlush(true);
postSync.setDestinationAddress(static_cast<uint64_t>(debugManager.flags.ForceComputeWalkerPostSyncFlushWithWrite.get()));
postSync.setOperation(OperationType::OPERATION_WRITE_IMMEDIATE_DATA);
postSync.setImmediateData(0u);
}
}
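// Rounds the requested SLM size up to the nearest supported SLM allocation size.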
template <typename Family>
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
const uint32_t alignedSlmSizes[] = {
0u,
1u * MemoryConstants::kiloByte,
2u * MemoryConstants::kiloByte,
4u * MemoryConstants::kiloByte,
8u * MemoryConstants::kiloByte,
16u * MemoryConstants::kiloByte,
24u * MemoryConstants::kiloByte,
32u * MemoryConstants::kiloByte,
48u * MemoryConstants::kiloByte,
64u * MemoryConstants::kiloByte,
96u * MemoryConstants::kiloByte,
128u * MemoryConstants::kiloByte,
};
for (auto &alignedSlmSize : alignedSlmSizes) {
if (slmSize <= alignedSlmSize) {
return alignedSlmSize;
}
}
UNRECOVERABLE_IF(true);
return 0;
}
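// Maps an SLM size in bytes to the INTERFACE_DESCRIPTOR_DATA SHARED_LOCAL_MEMORY_SIZE
// encoding, selecting the smallest bucket that fits (up to 128 KB).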
template <typename Family>
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize, ReleaseHelper *releaseHelper, bool isHeapless) {
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
if (slmSize == 0u) {
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_0K;
}
UNRECOVERABLE_IF(slmSize > 128u * MemoryConstants::kiloByte);
if (slmSize > 96u * MemoryConstants::kiloByte) {
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_128K;
}
if (slmSize > 64u * MemoryConstants::kiloByte) {
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_96K;
}
if (slmSize > 48u * MemoryConstants::kiloByte) {
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_64K;
}
if (slmSize > 32u * MemoryConstants::kiloByte) {
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_48K;
}
if (slmSize > 24u * MemoryConstants::kiloByte) {
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_32K;
}
if (slmSize > 16u * MemoryConstants::kiloByte) {
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_24K;
}
if (slmSize > 8u * MemoryConstants::kiloByte) {
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_16K;
}
if (slmSize > 4u * MemoryConstants::kiloByte) {
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_8K;
}
if (slmSize > 2u * MemoryConstants::kiloByte) {
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_4K;
}
if (slmSize > 1u * MemoryConstants::kiloByte) {
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_2K;
}
return SHARED_LOCAL_MEMORY_SIZE::SHARED_LOCAL_MEMORY_SIZE_SLM_ENCODES_1K;
}
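// Programs PREFERRED_SLM_ALLOCATION_SIZE: depending on the SLM policy, either the
// per-workgroup SLM size or the SLM footprint of all workgroups expected per DSS, mapped
// through the release helper's size-to-value table.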
template <typename Family>
template <typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {
using PREFERRED_SLM_ALLOCATION_SIZE = typename InterfaceDescriptorType::PREFERRED_SLM_ALLOCATION_SIZE;
auto &hwInfo = *rootDeviceEnvironment.getHardwareInfo();
const uint32_t threadsPerDssCount = EncodeDispatchKernel<Family>::getThreadCountPerSubslice(hwInfo);
const uint32_t workGroupCountPerDss = static_cast<uint32_t>(Math::divideAndRoundUp(threadsPerDssCount, threadsPerThreadGroup));
slmTotalSize = EncodeDispatchKernel<Family>::alignPreferredSlmSize(slmTotalSize);
uint32_t slmSize = 0u;
switch (slmPolicy) {
case SlmPolicy::slmPolicyLargeData:
slmSize = slmTotalSize;
break;
case SlmPolicy::slmPolicyLargeSlm:
default:
slmSize = slmTotalSize * workGroupCountPerDss;
break;
}
constexpr bool isHeapless = Family::template isInterfaceDescriptorHeaplessMode<InterfaceDescriptorType>();
auto releaseHelper = rootDeviceEnvironment.getReleaseHelper();
const auto &sizeToPreferredSlmValueArray = releaseHelper->getSizeToPreferredSlmValue(isHeapless);
uint32_t programmableIdPreferredSlmSize = 0;
for (auto &range : sizeToPreferredSlmValueArray) {
if (slmSize <= range.upperLimit) {
programmableIdPreferredSlmSize = range.valueToProgram;
break;
}
}
if (debugManager.flags.OverridePreferredSlmAllocationSizePerDss.get() != -1) {
programmableIdPreferredSlmSize = static_cast<uint32_t>(debugManager.flags.OverridePreferredSlmAllocationSizePerDss.get());
}
pInterfaceDescriptor->setPreferredSlmAllocationSize(static_cast<PREFERRED_SLM_ALLOCATION_SIZE>(programmableIdPreferredSlmSize));
}
template <typename Family>
size_t EncodeStates<Family>::getSshHeapSize() {
return 2 * MemoryConstants::megaByte;
}
template <typename Family>
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor) {
int32_t forceL3PrefetchForComputeWalker = debugManager.flags.ForceL3PrefetchForComputeWalker.get();
if (forceL3PrefetchForComputeWalker != -1) {
walkerCmd.setL3PrefetchDisable(!forceL3PrefetchForComputeWalker);
}
}
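// Selects THREAD_GROUP_DISPATCH_SIZE when overdispatch disabling is available. The v2
// heuristic (parts where MaxSubSlicesSupported equals SubSliceCount) starts at 8 and halves
// until the value divides the leading thread-group dimension, keeps every Xe core occupied and
// stays within the threads available per Xe core; the v1 heuristic picks 1, 8, 4 or 2 from the
// dispatched thread count and threads per thread group, then applies the same divisibility
// adjustment.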
template <typename Family>
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup, WalkerType &walkerCmd) {
const auto &productHelper = device.getProductHelper();
if (productHelper.isDisableOverdispatchAvailable(hwInfo)) {
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
bool adjustTGDispatchSize = true;
if (debugManager.flags.AdjustThreadGroupDispatchSize.get() != -1) {
adjustTGDispatchSize = !!debugManager.flags.AdjustThreadGroupDispatchSize.get();
}
// apply v2 algorithm only for parts where MaxSubSlicesSupported is equal to SubSliceCount
auto algorithmVersion = hwInfo.gtSystemInfo.MaxSubSlicesSupported == hwInfo.gtSystemInfo.SubSliceCount ? 2 : 1;
if (debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get() != -1) {
algorithmVersion = debugManager.flags.ForceThreadGroupDispatchSizeAlgorithm.get();
}
auto tileCount = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), true) ? device.getNumSubDevices() : 1u;
if (algorithmVersion == 2) {
auto threadsPerXeCore = hwInfo.gtSystemInfo.ThreadCount / hwInfo.gtSystemInfo.MaxSubSlicesSupported;
if (grfCount == 256) {
threadsPerXeCore /= 2;
}
auto tgDispatchSizeSelected = 8;
if (threadGroupDimensions[0] > 1 && (threadGroupDimensions[1] > 1 || threadGroupDimensions[2] > 1)) {
while (threadGroupDimensions[0] % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
} else if (threadGroupDimensions[1] > 1 && threadGroupDimensions[2] > 1) {
while (threadGroupDimensions[1] % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
}
// make sure all Xe cores are occupied
while (threadGroupCount / tgDispatchSizeSelected < hwInfo.gtSystemInfo.MaxSubSlicesSupported * tileCount && tgDispatchSizeSelected > 1) {
tgDispatchSizeSelected /= 2;
}
auto threadCountPerGrouping = tgDispatchSizeSelected * threadsPerThreadGroup;
// make sure we do not use more threads than are present on each Xe core
while (threadCountPerGrouping > threadsPerXeCore && tgDispatchSizeSelected > 1) {
tgDispatchSizeSelected /= 2;
threadCountPerGrouping /= 2;
}
if (tgDispatchSizeSelected == 8) {
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
} else if (tgDispatchSizeSelected == 1) {
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
} else if (tgDispatchSizeSelected == 2) {
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
} else {
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
}
} else {
if (adjustTGDispatchSize) {
UNRECOVERABLE_IF(grfCount == 0u);
constexpr uint32_t maxThreadsInTGForTGDispatchSize8 = 16u;
constexpr uint32_t maxThreadsInTGForTGDispatchSize4 = 32u;
auto &gfxCoreHelper = device.getGfxCoreHelper();
uint32_t availableThreadCount = gfxCoreHelper.calculateAvailableThreadCount(hwInfo, grfCount);
availableThreadCount *= tileCount;
uint32_t dispatchedTotalThreadCount = threadsPerThreadGroup * threadGroupCount;
UNRECOVERABLE_IF(threadsPerThreadGroup == 0u);
auto tgDispatchSizeSelected = 1u;
if (dispatchedTotalThreadCount <= availableThreadCount) {
tgDispatchSizeSelected = 1;
} else if (threadsPerThreadGroup <= maxThreadsInTGForTGDispatchSize8) {
tgDispatchSizeSelected = 8;
} else if (threadsPerThreadGroup <= maxThreadsInTGForTGDispatchSize4) {
tgDispatchSizeSelected = 4;
} else {
tgDispatchSizeSelected = 2;
}
if (threadGroupDimensions[0] > 1 && (threadGroupDimensions[1] > 1 || threadGroupDimensions[2] > 1)) {
while (threadGroupDimensions[0] % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
} else if (threadGroupDimensions[1] > 1 && threadGroupDimensions[2] > 1) {
while (threadGroupDimensions[1] % tgDispatchSizeSelected != 0) {
tgDispatchSizeSelected /= 2;
}
}
if (tgDispatchSizeSelected == 8) {
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_8);
} else if (tgDispatchSizeSelected == 1) {
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
} else if (tgDispatchSizeSelected == 2) {
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_2);
} else {
interfaceDescriptor.setThreadGroupDispatchSize(InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_4);
}
}
}
}
if (debugManager.flags.ForceThreadGroupDispatchSize.get() != -1) {
interfaceDescriptor.setThreadGroupDispatchSize(static_cast<typename InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE>(
debugManager.flags.ForceThreadGroupDispatchSize.get()));
}
}
template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {
auto programGlobalFenceAsPostSyncOperationInComputeWalker = rootDeviceEnvironment.getProductHelper().isGlobalFenceInPostSyncRequired(*rootDeviceEnvironment.getHardwareInfo()) && walkerArgs.requiredSystemFence;
int32_t overrideProgramSystemMemoryFence = debugManager.flags.ProgramGlobalFenceAsPostSyncOperationInComputeWalker.get();
if (overrideProgramSystemMemoryFence != -1) {
programGlobalFenceAsPostSyncOperationInComputeWalker = !!overrideProgramSystemMemoryFence;
}
auto &postSyncData = walkerCmd.getPostSync();
postSyncData.setSystemMemoryFenceRequest(programGlobalFenceAsPostSyncOperationInComputeWalker);
}
template <typename Family>
void EncodeSurfaceState<Family>::encodeExtraCacheSettings(R_SURFACE_STATE *surfaceState, const EncodeSurfaceStateArgs &args) {
using L1_CACHE_CONTROL = typename R_SURFACE_STATE::L1_CACHE_CONTROL;
auto &productHelper = args.gmmHelper->getRootDeviceEnvironment().getHelper<ProductHelper>();
auto cachePolicy = static_cast<L1_CACHE_CONTROL>(productHelper.getL1CachePolicy(args.isDebuggerActive));
if (debugManager.flags.OverrideL1CacheControlInSurfaceState.get() != -1 &&
debugManager.flags.ForceAllResourcesUncached.get() == false) {
cachePolicy = static_cast<L1_CACHE_CONTROL>(debugManager.flags.OverrideL1CacheControlInSurfaceState.get());
}
surfaceState->setL1CacheControlCachePolicy(cachePolicy);
}
template <typename Family>
void EncodeEnableRayTracing<Family>::programEnableRayTracing(LinearStream &commandStream, uint64_t backBuffer) {
auto cmd = Family::cmd3dStateBtd;
cmd.setPerDssMemoryBackedBufferSize(static_cast<typename Family::_3DSTATE_BTD::PER_DSS_MEMORY_BACKED_BUFFER_SIZE>(RayTracingHelper::getMemoryBackedFifoSizeToPatch()));
cmd.setMemoryBackedBufferBasePointer(backBuffer);
append3dStateBtd(&cmd);
*commandStream.getSpaceForCmd<typename Family::_3DSTATE_BTD>() = cmd;
}
template <typename Family>
inline void EncodeWA<Family>::setAdditionalPipeControlFlagsForNonPipelineStateCommand(PipeControlArgs &args) {
args.unTypedDataPortCacheFlush = true;
}
} // namespace NEO