mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-30 01:35:20 +08:00
refactor: adjust file names after pre-gen12 removal 4/n
Related-To: NEO-12681 Signed-off-by: Michał Pryba <michal.pryba@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
b8504913e3
commit
3c027cfedc
@@ -10,7 +10,6 @@ set(NEO_CORE_COMMAND_CONTAINER
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cmdcontainer.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/command_encoder.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/command_encoder.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_bdw_and_later.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_enablers.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_tgllp_and_later.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/encode_alu_helper.h
|
||||
|
||||
@@ -1,692 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "shared/source/command_container/command_encoder.h"
|
||||
#include "shared/source/command_container/encode_surface_state.h"
|
||||
#include "shared/source/command_stream/linear_stream.h"
|
||||
#include "shared/source/command_stream/memory_compression_state.h"
|
||||
#include "shared/source/command_stream/preemption.h"
|
||||
#include "shared/source/execution_environment/execution_environment.h"
|
||||
#include "shared/source/gmm_helper/gmm_helper.h"
|
||||
#include "shared/source/helpers/api_specific_config.h"
|
||||
#include "shared/source/helpers/cache_policy.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/in_order_cmd_helpers.h"
|
||||
#include "shared/source/helpers/pause_on_gpu_properties.h"
|
||||
#include "shared/source/helpers/pipe_control_args.h"
|
||||
#include "shared/source/helpers/pipeline_select_args.h"
|
||||
#include "shared/source/helpers/simd_helper.h"
|
||||
#include "shared/source/helpers/state_base_address.h"
|
||||
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
|
||||
#include "shared/source/kernel/implicit_args_helper.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace NEO {
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount,
|
||||
const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
auto grfSize = sizeof(typename Family::GRF);
|
||||
DEBUG_BREAK_IF((sizeCrossThreadData % grfSize) != 0);
|
||||
auto numGrfCrossThreadData = static_cast<uint32_t>(sizeCrossThreadData / grfSize);
|
||||
DEBUG_BREAK_IF(numGrfCrossThreadData == 0);
|
||||
pInterfaceDescriptor->setCrossThreadConstantDataReadLength(numGrfCrossThreadData);
|
||||
|
||||
DEBUG_BREAK_IF((sizePerThreadData % grfSize) != 0);
|
||||
auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / grfSize);
|
||||
|
||||
// at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0
|
||||
numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);
|
||||
pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDispatchKernelArgs &args) {
|
||||
|
||||
using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
|
||||
using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS;
|
||||
|
||||
auto &kernelDescriptor = args.dispatchInterface->getKernelDescriptor();
|
||||
auto sizeCrossThreadData = args.dispatchInterface->getCrossThreadDataSize();
|
||||
auto sizePerThreadData = args.dispatchInterface->getPerThreadDataSize();
|
||||
auto sizePerThreadDataForWholeGroup = args.dispatchInterface->getPerThreadDataSizeForWholeThreadGroup();
|
||||
auto pImplicitArgs = args.dispatchInterface->getImplicitArgs();
|
||||
|
||||
auto &hwInfo = args.device->getHardwareInfo();
|
||||
auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
|
||||
|
||||
LinearStream *listCmdBufferStream = container.getCommandStream();
|
||||
|
||||
auto threadGroupDims = static_cast<const uint32_t *>(args.threadGroupDimensions);
|
||||
|
||||
DefaultWalkerType cmd = Family::cmdInitGpgpuWalker;
|
||||
auto idd = Family::cmdInitInterfaceDescriptorData;
|
||||
{
|
||||
auto alloc = args.dispatchInterface->getIsaAllocation();
|
||||
UNRECOVERABLE_IF(nullptr == alloc);
|
||||
auto offset = alloc->getGpuAddressToPatch() + args.dispatchInterface->getIsaOffsetInParentAllocation();
|
||||
idd.setKernelStartPointer(offset);
|
||||
idd.setKernelStartPointerHigh(0u);
|
||||
}
|
||||
|
||||
if (args.dispatchInterface->getKernelDescriptor().kernelAttributes.flags.usesAssert && args.device->getL0Debugger() != nullptr) {
|
||||
idd.setSoftwareExceptionEnable(1);
|
||||
}
|
||||
|
||||
auto numThreadsPerThreadGroup = args.dispatchInterface->getNumThreadsPerThreadGroup();
|
||||
idd.setNumberOfThreadsInGpgpuThreadGroup(numThreadsPerThreadGroup);
|
||||
|
||||
EncodeDispatchKernel<Family>::programBarrierEnable(idd,
|
||||
kernelDescriptor,
|
||||
hwInfo);
|
||||
auto slmSize = EncodeDispatchKernel<Family>::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize());
|
||||
idd.setSharedLocalMemorySize(slmSize);
|
||||
|
||||
uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
|
||||
uint32_t bindingTablePointer = 0u;
|
||||
bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernelDescriptor);
|
||||
|
||||
if (!isBindlessKernel) {
|
||||
container.prepareBindfulSsh();
|
||||
if (bindingTableStateCount > 0u) {
|
||||
auto ssh = args.surfaceStateHeap;
|
||||
if (ssh == nullptr) {
|
||||
ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), NEO::EncodeDispatchKernel<Family>::getDefaultSshAlignment());
|
||||
}
|
||||
bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
|
||||
*ssh,
|
||||
args.dispatchInterface->getSurfaceStateHeapData(),
|
||||
args.dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
|
||||
kernelDescriptor.payloadMappings.bindingTable.tableOffset));
|
||||
}
|
||||
} else {
|
||||
bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr;
|
||||
auto sshHeapSize = args.dispatchInterface->getSurfaceStateHeapDataSize();
|
||||
|
||||
if (sshHeapSize > 0u) {
|
||||
auto ssh = args.surfaceStateHeap;
|
||||
if (ssh == nullptr) {
|
||||
container.prepareBindfulSsh();
|
||||
ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, sshHeapSize, NEO::EncodeDispatchKernel<Family>::getDefaultSshAlignment());
|
||||
}
|
||||
uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase());
|
||||
if (globalBindlessSsh) {
|
||||
bindlessSshBaseOffset += ptrDiff(ssh->getGraphicsAllocation()->getGpuAddress(), ssh->getGraphicsAllocation()->getGpuBaseAddress());
|
||||
}
|
||||
|
||||
DEBUG_BREAK_IF(bindingTableStateCount > 0u);
|
||||
|
||||
// Allocate space for new ssh data
|
||||
auto dstSurfaceState = ssh->getSpace(sshHeapSize);
|
||||
memcpy_s(dstSurfaceState, sshHeapSize, args.dispatchInterface->getSurfaceStateHeapData(), sshHeapSize);
|
||||
|
||||
args.dispatchInterface->patchBindlessOffsetsInCrossThreadData(bindlessSshBaseOffset);
|
||||
}
|
||||
}
|
||||
idd.setBindingTablePointer(bindingTablePointer);
|
||||
|
||||
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, args.preemptionMode);
|
||||
|
||||
uint32_t samplerStateOffset = 0;
|
||||
uint32_t samplerCount = 0;
|
||||
|
||||
if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
|
||||
auto dsHeap = args.dynamicStateHeap;
|
||||
if (dsHeap == nullptr) {
|
||||
dsHeap = container.getIndirectHeap(HeapType::dynamicState);
|
||||
auto dshSizeRequired = NEO::EncodeDispatchKernel<Family>::getSizeRequiredDsh(kernelDescriptor, container.getNumIddPerBlock());
|
||||
if (dsHeap->getAvailableSpace() <= dshSizeRequired) {
|
||||
dsHeap = container.getHeapWithRequiredSizeAndAlignment(HeapType::dynamicState, dsHeap->getMaxAvailableSpace(), NEO::EncodeDispatchKernel<Family>::getDefaultDshAlignment());
|
||||
}
|
||||
}
|
||||
UNRECOVERABLE_IF(!dsHeap);
|
||||
|
||||
samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
|
||||
samplerStateOffset = EncodeStates<Family>::copySamplerState(dsHeap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
|
||||
kernelDescriptor.payloadMappings.samplerTable.numSamplers,
|
||||
kernelDescriptor.payloadMappings.samplerTable.borderColor,
|
||||
args.dispatchInterface->getDynamicStateHeapData(),
|
||||
args.device->getBindlessHeapsHelper(), args.device->getRootDeviceEnvironment());
|
||||
}
|
||||
|
||||
idd.setSamplerStatePointer(samplerStateOffset);
|
||||
if (!isBindlessKernel) {
|
||||
EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(idd, samplerCount, bindingTableStateCount);
|
||||
}
|
||||
|
||||
EncodeDispatchKernel<Family>::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData,
|
||||
sizePerThreadData, rootDeviceEnvironment);
|
||||
|
||||
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
||||
bool isHwLocalIdGeneration = false;
|
||||
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
|
||||
uint64_t offsetThreadData = 0u;
|
||||
{
|
||||
auto heapIndirect = container.getIndirectHeap(HeapType::indirectObject);
|
||||
UNRECOVERABLE_IF(!(heapIndirect));
|
||||
heapIndirect->align(Family::cacheLineSize);
|
||||
void *ptr = nullptr;
|
||||
if (args.isKernelDispatchedFromImmediateCmdList) {
|
||||
ptr = container.getHeapWithRequiredSizeAndAlignment(HeapType::indirectObject, iohRequiredSize, DefaultWalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE)->getSpace(iohRequiredSize);
|
||||
} else {
|
||||
ptr = container.getHeapSpaceAllowGrow(HeapType::indirectObject, iohRequiredSize);
|
||||
}
|
||||
UNRECOVERABLE_IF(!(ptr));
|
||||
offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast<uint64_t>(heapIndirect->getUsed() - sizeThreadData);
|
||||
|
||||
uint64_t implicitArgsGpuVA = 0u;
|
||||
if (pImplicitArgs) {
|
||||
implicitArgsGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(heapIndirect->getUsed() - iohRequiredSize);
|
||||
auto implicitArgsCrossThreadPtr = ptrOffset(const_cast<uint64_t *>(reinterpret_cast<const uint64_t *>(args.dispatchInterface->getCrossThreadData())), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
||||
*implicitArgsCrossThreadPtr = implicitArgsGpuVA;
|
||||
|
||||
ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}, rootDeviceEnvironment, nullptr);
|
||||
}
|
||||
|
||||
memcpy_s(ptr, sizeCrossThreadData,
|
||||
args.dispatchInterface->getCrossThreadData(), sizeCrossThreadData);
|
||||
|
||||
if (args.isIndirect) {
|
||||
auto crossThreadDataGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData;
|
||||
EncodeIndirectParams<Family>::encode(container, crossThreadDataGpuVA, args.dispatchInterface, implicitArgsGpuVA);
|
||||
}
|
||||
|
||||
ptr = ptrOffset(ptr, sizeCrossThreadData);
|
||||
memcpy_s(ptr, sizePerThreadDataForWholeGroup,
|
||||
args.dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup);
|
||||
}
|
||||
|
||||
uint32_t numIDD = 0u;
|
||||
void *iddPtr = getInterfaceDescriptor(container, args.dynamicStateHeap, numIDD);
|
||||
|
||||
auto slmSizeNew = args.dispatchInterface->getSlmTotalSize();
|
||||
bool dirtyHeaps = container.isAnyHeapDirty();
|
||||
bool flush = container.slmSizeRef() != slmSizeNew || dirtyHeaps || args.requiresUncachedMocs;
|
||||
|
||||
if (flush) {
|
||||
PipeControlArgs syncArgs;
|
||||
syncArgs.dcFlushEnable = args.dcFlushEnable;
|
||||
if (dirtyHeaps) {
|
||||
syncArgs.hdcPipelineFlush = true;
|
||||
}
|
||||
MemorySynchronizationCommands<Family>::addSingleBarrier(*container.getCommandStream(), syncArgs);
|
||||
|
||||
if (dirtyHeaps || args.requiresUncachedMocs) {
|
||||
STATE_BASE_ADDRESS sba;
|
||||
auto gmmHelper = container.getDevice()->getGmmHelper();
|
||||
uint32_t statelessMocsIndex =
|
||||
args.requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1) : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
|
||||
auto l1CachePolicy = container.l1CachePolicyDataRef()->getL1CacheValue(false);
|
||||
auto l1CachePolicyDebuggerActive = container.l1CachePolicyDataRef()->getL1CacheValue(true);
|
||||
EncodeStateBaseAddressArgs<Family> encodeStateBaseAddressArgs = {
|
||||
&container, // container
|
||||
sba, // sbaCmd
|
||||
nullptr, // sbaProperties
|
||||
statelessMocsIndex, // statelessMocsIndex
|
||||
l1CachePolicy, // l1CachePolicy
|
||||
l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
|
||||
false, // multiOsContextCapable
|
||||
args.isRcs, // isRcs
|
||||
container.doubleSbaWaRef(), // doubleSbaWa
|
||||
false, // heaplessModeEnabled
|
||||
};
|
||||
EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
|
||||
container.setDirtyStateForAllHeaps(false);
|
||||
args.requiresUncachedMocs = false;
|
||||
}
|
||||
|
||||
if (container.slmSizeRef() != slmSizeNew) {
|
||||
EncodeL3State<Family>::encode(container, slmSizeNew != 0u);
|
||||
container.slmSizeRef() = slmSizeNew;
|
||||
}
|
||||
}
|
||||
|
||||
if (numIDD == 0 || flush) {
|
||||
EncodeMediaInterfaceDescriptorLoad<Family>::encode(container, args.dynamicStateHeap);
|
||||
}
|
||||
|
||||
cmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
|
||||
cmd.setIndirectDataLength(sizeThreadData);
|
||||
cmd.setInterfaceDescriptorOffset(numIDD);
|
||||
|
||||
EncodeDispatchKernel<Family>::encodeThreadData(cmd,
|
||||
nullptr,
|
||||
threadGroupDims,
|
||||
args.dispatchInterface->getGroupSize(),
|
||||
kernelDescriptor.kernelAttributes.simdSize,
|
||||
kernelDescriptor.kernelAttributes.numLocalIdChannels,
|
||||
numThreadsPerThreadGroup,
|
||||
args.dispatchInterface->getThreadExecutionMask(),
|
||||
true,
|
||||
false,
|
||||
args.isIndirect,
|
||||
args.dispatchInterface->getRequiredWorkgroupOrder(),
|
||||
rootDeviceEnvironment);
|
||||
|
||||
cmd.setPredicateEnable(args.isPredicate);
|
||||
|
||||
auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension();
|
||||
EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd);
|
||||
|
||||
EncodeWalkerArgs walkerArgs{
|
||||
.kernelExecutionType = KernelExecutionType::defaultType,
|
||||
.requiredDispatchWalkOrder = args.requiredDispatchWalkOrder,
|
||||
.localRegionSize = args.localRegionSize,
|
||||
.maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads,
|
||||
.requiredSystemFence = args.requiresSystemMemoryFence(),
|
||||
.hasSample = false};
|
||||
|
||||
EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(rootDeviceEnvironment, cmd, walkerArgs);
|
||||
EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(cmd, walkerArgs);
|
||||
EncodeDispatchKernel<Family>::template encodeComputeDispatchAllWalker<WalkerType, INTERFACE_DESCRIPTOR_DATA>(cmd, nullptr, rootDeviceEnvironment, walkerArgs);
|
||||
|
||||
memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd));
|
||||
|
||||
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
|
||||
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false));
|
||||
args.additionalCommands->push_back(commandBuffer);
|
||||
|
||||
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
|
||||
}
|
||||
|
||||
auto buffer = listCmdBufferStream->getSpaceForCmd<DefaultWalkerType>();
|
||||
*buffer = cmd;
|
||||
|
||||
{
|
||||
auto mediaStateFlush = listCmdBufferStream->getSpaceForCmd<MEDIA_STATE_FLUSH>();
|
||||
*mediaStateFlush = Family::cmdInitMediaStateFlush;
|
||||
}
|
||||
|
||||
args.partitionCount = 1;
|
||||
|
||||
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) {
|
||||
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false));
|
||||
args.additionalCommands->push_back(commandBuffer);
|
||||
|
||||
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeMediaInterfaceDescriptorLoad<Family>::encode(CommandContainer &container, IndirectHeap *childDsh) {
|
||||
using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
|
||||
using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
|
||||
void *heapBase = nullptr;
|
||||
if (childDsh != nullptr) {
|
||||
heapBase = childDsh->getCpuBase();
|
||||
} else {
|
||||
heapBase = container.getIndirectHeap(HeapType::dynamicState)->getCpuBase();
|
||||
}
|
||||
|
||||
auto mediaStateFlush = container.getCommandStream()->getSpaceForCmd<MEDIA_STATE_FLUSH>();
|
||||
*mediaStateFlush = Family::cmdInitMediaStateFlush;
|
||||
|
||||
auto iddOffset = static_cast<uint32_t>(ptrDiff(container.getIddBlock(), heapBase));
|
||||
|
||||
MEDIA_INTERFACE_DESCRIPTOR_LOAD cmd = Family::cmdInitMediaInterfaceDescriptorLoad;
|
||||
cmd.setInterfaceDescriptorDataStartAddress(iddOffset);
|
||||
cmd.setInterfaceDescriptorTotalLength(sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock());
|
||||
|
||||
auto buffer = container.getCommandStream()->getSpace(sizeof(cmd));
|
||||
*(decltype(cmd) *)buffer = cmd;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline bool EncodeDispatchKernel<Family>::isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
|
||||
const size_t *lws,
|
||||
std::array<uint8_t, 3> walkOrder,
|
||||
bool requireInputWalkOrder,
|
||||
uint32_t &requiredWalkOrder,
|
||||
uint32_t simd) {
|
||||
requiredWalkOrder = 0u;
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::encodeThreadData(WalkerType &walkerCmd,
|
||||
const uint32_t *startWorkGroup,
|
||||
const uint32_t *numWorkGroups,
|
||||
const uint32_t *workGroupSizes,
|
||||
uint32_t simd,
|
||||
uint32_t localIdDimensions,
|
||||
uint32_t threadsPerThreadGroup,
|
||||
uint32_t threadExecutionMask,
|
||||
bool localIdsGenerationByRuntime,
|
||||
bool inlineDataProgrammingRequired,
|
||||
bool isIndirect,
|
||||
uint32_t requiredWorkGroupOrder,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
|
||||
if (isIndirect) {
|
||||
walkerCmd.setIndirectParameterEnable(true);
|
||||
} else {
|
||||
walkerCmd.setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
|
||||
walkerCmd.setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
|
||||
walkerCmd.setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
|
||||
}
|
||||
|
||||
if (startWorkGroup) {
|
||||
walkerCmd.setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroup[0]));
|
||||
walkerCmd.setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroup[1]));
|
||||
walkerCmd.setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroup[2]));
|
||||
}
|
||||
|
||||
walkerCmd.setSimdSize(getSimdConfig<WalkerType>(simd));
|
||||
|
||||
auto localWorkSize = static_cast<uint32_t>(workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2]);
|
||||
if (threadsPerThreadGroup == 0) {
|
||||
threadsPerThreadGroup = getThreadsPerWG(simd, localWorkSize);
|
||||
}
|
||||
walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup);
|
||||
|
||||
uint64_t executionMask = threadExecutionMask;
|
||||
if (executionMask == 0) {
|
||||
auto remainderSimdLanes = localWorkSize & (simd - 1);
|
||||
executionMask = maxNBitValue(remainderSimdLanes);
|
||||
if (!executionMask)
|
||||
executionMask = ~executionMask;
|
||||
}
|
||||
|
||||
constexpr uint32_t maxDword = std::numeric_limits<uint32_t>::max();
|
||||
walkerCmd.setRightExecutionMask(static_cast<uint32_t>(executionMask));
|
||||
walkerCmd.setBottomExecutionMask(maxDword);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor,
|
||||
const KernelDescriptor &kernelDescriptor,
|
||||
const HardwareInfo &hwInfo) {
|
||||
interfaceDescriptor.setBarrierEnable(kernelDescriptor.kernelAttributes.barrierCount);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
inline void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
inline void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}
|
||||
|
||||
template <typename Family>
|
||||
inline bool EncodeDispatchKernel<Family>::isDshNeeded(const DeviceInfo &deviceInfo) {
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeStateBaseAddress<Family>::setSbaAddressesForDebugger(NEO::Debugger::SbaAddresses &sbaAddress, const STATE_BASE_ADDRESS &sbaCmd) {
|
||||
sbaAddress.indirectObjectBaseAddress = sbaCmd.getIndirectObjectBaseAddress();
|
||||
sbaAddress.bindlessSurfaceStateBaseAddress = sbaCmd.getBindlessSurfaceStateBaseAddress();
|
||||
sbaAddress.dynamicStateBaseAddress = sbaCmd.getDynamicStateBaseAddress();
|
||||
sbaAddress.generalStateBaseAddress = sbaCmd.getGeneralStateBaseAddress();
|
||||
sbaAddress.instructionBaseAddress = sbaCmd.getInstructionBaseAddress();
|
||||
sbaAddress.surfaceStateBaseAddress = sbaCmd.getSurfaceStateBaseAddress();
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeStateBaseAddress<Family>::encode(EncodeStateBaseAddressArgs<Family> &args) {
|
||||
auto &device = *args.container->getDevice();
|
||||
|
||||
if (args.container->isAnyHeapDirty()) {
|
||||
EncodeWA<Family>::encodeAdditionalPipelineSelect(*args.container->getCommandStream(), {}, true, device.getRootDeviceEnvironment(), args.isRcs);
|
||||
}
|
||||
|
||||
auto gmmHelper = device.getGmmHelper();
|
||||
|
||||
auto dsh = args.container->isHeapDirty(HeapType::dynamicState) ? args.container->getIndirectHeap(HeapType::dynamicState) : nullptr;
|
||||
auto ioh = args.container->isHeapDirty(HeapType::indirectObject) ? args.container->getIndirectHeap(HeapType::indirectObject) : nullptr;
|
||||
auto ssh = args.container->isHeapDirty(HeapType::surfaceState) ? args.container->getIndirectHeap(HeapType::surfaceState) : nullptr;
|
||||
auto isDebuggerActive = device.getDebugger() != nullptr;
|
||||
uint64_t globalHeapsBase = 0;
|
||||
uint64_t bindlessSurfStateBase = 0;
|
||||
bool useGlobalSshAndDsh = false;
|
||||
|
||||
if (device.getBindlessHeapsHelper()) {
|
||||
bindlessSurfStateBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase();
|
||||
globalHeapsBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase();
|
||||
useGlobalSshAndDsh = true;
|
||||
}
|
||||
|
||||
StateBaseAddressHelperArgs<Family> stateBaseAddressHelperArgs = {
|
||||
0, // generalStateBaseAddress
|
||||
args.container->getIndirectObjectHeapBaseAddress(), // indirectObjectHeapBaseAddress
|
||||
args.container->getInstructionHeapBaseAddress(), // instructionHeapBaseAddress
|
||||
globalHeapsBase, // globalHeapsBaseAddress
|
||||
0, // surfaceStateBaseAddress
|
||||
bindlessSurfStateBase, // bindlessSurfaceStateBaseAddress
|
||||
&args.sbaCmd, // stateBaseAddressCmd
|
||||
args.sbaProperties, // sbaProperties
|
||||
dsh, // dsh
|
||||
ioh, // ioh
|
||||
ssh, // ssh
|
||||
gmmHelper, // gmmHelper
|
||||
args.statelessMocsIndex, // statelessMocsIndex
|
||||
args.l1CachePolicy, // l1CachePolicy
|
||||
args.l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
|
||||
NEO::MemoryCompressionState::notApplicable, // memoryCompressionState
|
||||
false, // setInstructionStateBaseAddress
|
||||
false, // setGeneralStateBaseAddress
|
||||
useGlobalSshAndDsh, // useGlobalHeapsBaseAddress
|
||||
false, // isMultiOsContextCapable
|
||||
false, // areMultipleSubDevicesInContext
|
||||
false, // overrideSurfaceStateBaseAddress
|
||||
isDebuggerActive, // isDebuggerActive
|
||||
args.doubleSbaWa, // doubleSbaWa
|
||||
args.heaplessModeEnabled // heaplessModeEnabled
|
||||
};
|
||||
|
||||
StateBaseAddressHelper<Family>::programStateBaseAddressIntoCommandStream(stateBaseAddressHelperArgs,
|
||||
*args.container->getCommandStream());
|
||||
|
||||
EncodeWA<Family>::encodeAdditionalPipelineSelect(*args.container->getCommandStream(), {}, false, device.getRootDeviceEnvironment(), args.isRcs);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeStateBaseAddress<Family>::getRequiredSizeForStateBaseAddress(Device &device, CommandContainer &container, bool isRcs) {
|
||||
return sizeof(typename Family::STATE_BASE_ADDRESS) + 2 * EncodeWA<Family>::getAdditionalPipelineSelectSize(device, isRcs);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void EncodeMiFlushDW<GfxFamily>::adjust(MI_FLUSH_DW *miFlushDwCmd, const ProductHelper &productHelper) {}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void EncodeWA<GfxFamily>::addPipeControlPriorToNonPipelinedStateCommand(LinearStream &commandStream, PipeControlArgs args,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment, bool isRcs) {
|
||||
MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(commandStream, args);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void EncodeWA<GfxFamily>::adjustCompressionFormatForPlanarImage(uint32_t &compressionFormat, int plane) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeSurfaceState<Family>::setCoherencyType(R_SURFACE_STATE *surfaceState, COHERENCY_TYPE coherencyType) {
|
||||
surfaceState->setCoherencyType(coherencyType);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeSemaphore<Family>::programMiSemaphoreWait(MI_SEMAPHORE_WAIT *cmd,
|
||||
uint64_t compareAddress,
|
||||
uint64_t compareData,
|
||||
COMPARE_OPERATION compareMode,
|
||||
bool registerPollMode,
|
||||
bool waitMode,
|
||||
bool useQwordData,
|
||||
bool indirect,
|
||||
bool switchOnUnsuccessful) {
|
||||
constexpr uint64_t upper32b = static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) << 32;
|
||||
UNRECOVERABLE_IF(useQwordData || (compareData & upper32b));
|
||||
UNRECOVERABLE_IF(indirect);
|
||||
|
||||
MI_SEMAPHORE_WAIT localCmd = Family::cmdInitMiSemaphoreWait;
|
||||
localCmd.setCompareOperation(compareMode);
|
||||
localCmd.setSemaphoreDataDword(static_cast<uint32_t>(compareData));
|
||||
localCmd.setSemaphoreGraphicsAddress(compareAddress);
|
||||
localCmd.setWaitMode(waitMode ? MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE : MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_SIGNAL_MODE);
|
||||
|
||||
*cmd = localCmd;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void EncodeEnableRayTracing<GfxFamily>::programEnableRayTracing(LinearStream &commandStream, uint64_t backBuffer) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline void EncodeStoreMemory<Family>::programStoreDataImm(MI_STORE_DATA_IMM *cmdBuffer,
|
||||
uint64_t gpuAddress,
|
||||
uint32_t dataDword0,
|
||||
uint32_t dataDword1,
|
||||
bool storeQword,
|
||||
bool workloadPartitionOffset) {
|
||||
MI_STORE_DATA_IMM storeDataImmediate = Family::cmdInitStoreDataImm;
|
||||
storeDataImmediate.setAddress(gpuAddress);
|
||||
storeDataImmediate.setStoreQword(storeQword);
|
||||
storeDataImmediate.setDataDword0(dataDword0);
|
||||
if (storeQword) {
|
||||
storeDataImmediate.setDataDword1(dataDword1);
|
||||
storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_QWORD);
|
||||
} else {
|
||||
storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD);
|
||||
}
|
||||
EncodeStoreMemory<Family>::encodeForceCompletionCheck(storeDataImmediate);
|
||||
|
||||
*cmdBuffer = storeDataImmediate;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::setupPostSyncForRegularEvent(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::setupPostSyncForInOrderExec(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::adjustWalkOrder(WalkerType &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment) {}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::additionalSizeRequiredDsh(uint32_t iddCount) {
|
||||
return iddCount * sizeof(typename Family::INTERFACE_DESCRIPTOR_DATA);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline size_t EncodeDispatchKernel<Family>::getInlineDataOffset(EncodeDispatchKernelArgs &args) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
|
||||
if (slmSize == 0u) {
|
||||
return 0u;
|
||||
}
|
||||
slmSize = std::max(slmSize, 1024u);
|
||||
slmSize = Math::nextPowerOfTwo(slmSize);
|
||||
UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte);
|
||||
return slmSize;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) {
|
||||
auto value = std::max(slmSize, 1024u);
|
||||
value = Math::nextPowerOfTwo(value);
|
||||
value = Math::getMinLsbSet(value);
|
||||
value = value - 9;
|
||||
DEBUG_BREAK_IF(value > 7);
|
||||
return value * !!slmSize;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
bool EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(bool cooperativeKernel) {
|
||||
return cooperativeKernel;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeStates<Family>::getSshHeapSize() {
|
||||
return 64 * MemoryConstants::kiloByte;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void InOrderPatchCommandHelpers::PatchCmd<Family>::patchComputeWalker(uint64_t appendCounterValue) {
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
|
||||
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup,
|
||||
WalkerType &walkerCmd) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::getScratchPtrOffsetOfImplicitArgs() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeSurfaceState<Family>::setPitchForScratch(R_SURFACE_STATE *surfaceState, uint32_t pitch, const ProductHelper &productHelper) {
|
||||
surfaceState->setSurfacePitch(pitch);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
uint32_t EncodeSurfaceState<Family>::getPitchForScratchInBytes(R_SURFACE_STATE *surfaceState, const ProductHelper &productHelper) {
|
||||
return surfaceState->getSurfacePitch();
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeSemaphore<Family>::appendSemaphoreCommand(MI_SEMAPHORE_WAIT &cmd, uint64_t compareData, bool indirect, bool useQwordData, bool switchOnUnsuccessful) {
|
||||
constexpr uint64_t upper32b = static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) << 32;
|
||||
UNRECOVERABLE_IF(useQwordData || (compareData & upper32b));
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <bool isHeapless>
|
||||
void EncodeDispatchKernel<Family>::setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &submissionCsr) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::encodeEuSchedulingPolicy(InterfaceDescriptorType *pInterfaceDescriptor, const KernelDescriptor &kernelDesc, int32_t defaultPipelinedThreadArbitrationPolicy) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::setWalkerRegionSettings(WalkerType &walkerCmd, const NEO::Device &device, uint32_t partitionCount, uint32_t workgroupSize, uint32_t threadGroupCount, uint32_t maxWgCountPerTile, bool requiredDispatchWalkOrder) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}
|
||||
} // namespace NEO
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2024 Intel Corporation
|
||||
* Copyright (C) 2020-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -7,20 +7,37 @@
|
||||
|
||||
#include "shared/source/command_container/command_encoder.h"
|
||||
#include "shared/source/command_container/command_encoder.inl"
|
||||
#include "shared/source/command_container/command_encoder_bdw_and_later.inl"
|
||||
#include "shared/source/command_container/command_encoder_from_gen12lp_to_xe2_hpg.inl"
|
||||
#include "shared/source/command_container/command_encoder_gen12lp_and_xe_hpg.inl"
|
||||
#include "shared/source/command_container/command_encoder_pre_xe2_hpg_core.inl"
|
||||
#include "shared/source/command_container/command_encoder_tgllp_and_later.inl"
|
||||
#include "shared/source/command_container/encode_surface_state.h"
|
||||
#include "shared/source/command_stream/linear_stream.h"
|
||||
#include "shared/source/command_stream/memory_compression_state.h"
|
||||
#include "shared/source/command_stream/preemption.h"
|
||||
#include "shared/source/command_stream/stream_properties.h"
|
||||
#include "shared/source/execution_environment/execution_environment.h"
|
||||
#include "shared/source/gen12lp/hw_cmds_base.h"
|
||||
#include "shared/source/gen12lp/reg_configs.h"
|
||||
#include "shared/source/gmm_helper/gmm_helper.h"
|
||||
#include "shared/source/helpers/api_specific_config.h"
|
||||
#include "shared/source/helpers/cache_policy.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/in_order_cmd_helpers.h"
|
||||
#include "shared/source/helpers/pause_on_gpu_properties.h"
|
||||
#include "shared/source/helpers/pipe_control_args.h"
|
||||
#include "shared/source/helpers/pipeline_select_args.h"
|
||||
#include "shared/source/helpers/preamble.h"
|
||||
#include "shared/source/helpers/simd_helper.h"
|
||||
#include "shared/source/helpers/state_base_address.h"
|
||||
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
|
||||
#include "shared/source/kernel/implicit_args_helper.h"
|
||||
#include "shared/source/release_helper/release_helper.h"
|
||||
|
||||
#include "encode_surface_state_args.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
using Family = NEO::Gen12LpFamily;
|
||||
|
||||
#include "shared/source/command_container/command_encoder_heap_addressing.inl"
|
||||
@@ -28,6 +45,667 @@ using Family = NEO::Gen12LpFamily;
|
||||
|
||||
namespace NEO {
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::setGrfInfo(InterfaceDescriptorType *pInterfaceDescriptor, uint32_t grfCount,
|
||||
const size_t &sizeCrossThreadData, const size_t &sizePerThreadData,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
auto grfSize = sizeof(typename Family::GRF);
|
||||
DEBUG_BREAK_IF((sizeCrossThreadData % grfSize) != 0);
|
||||
auto numGrfCrossThreadData = static_cast<uint32_t>(sizeCrossThreadData / grfSize);
|
||||
DEBUG_BREAK_IF(numGrfCrossThreadData == 0);
|
||||
pInterfaceDescriptor->setCrossThreadConstantDataReadLength(numGrfCrossThreadData);
|
||||
|
||||
DEBUG_BREAK_IF((sizePerThreadData % grfSize) != 0);
|
||||
auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / grfSize);
|
||||
|
||||
// at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0
|
||||
numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);
|
||||
pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);
|
||||
}
|
||||
|
||||
// Encodes a single kernel dispatch into the container's command stream.
// In order, this routine:
//  1. fills a local interface descriptor (kernel start pointer, thread count, barrier,
//     SLM size, binding table pointer, preemption, sampler state pointer, GRF read lengths),
//  2. reserves indirect-object-heap space and copies implicit args (if any),
//     cross-thread data and per-thread data into it,
//  3. emits a barrier plus state-base-address / L3 programming when the SLM size changed,
//     a heap is dirty or uncached MOCS was requested,
//  4. emits a media interface descriptor load when needed,
//  5. writes the walker command followed by a MEDIA_STATE_FLUSH.
// `args` is also written to: requiresUncachedMocs may be cleared, partitionCount is set to 1
// and additionalCommands may receive pointers when PauseOnEnqueue is active.
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDispatchKernelArgs &args) {
|
||||
|
||||
using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
|
||||
using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS;
|
||||
|
||||
auto &kernelDescriptor = args.dispatchInterface->getKernelDescriptor();
|
||||
auto sizeCrossThreadData = args.dispatchInterface->getCrossThreadDataSize();
|
||||
auto sizePerThreadData = args.dispatchInterface->getPerThreadDataSize();
|
||||
auto sizePerThreadDataForWholeGroup = args.dispatchInterface->getPerThreadDataSizeForWholeThreadGroup();
|
||||
auto pImplicitArgs = args.dispatchInterface->getImplicitArgs();
|
||||
|
||||
auto &hwInfo = args.device->getHardwareInfo();
|
||||
auto &rootDeviceEnvironment = args.device->getRootDeviceEnvironment();
|
||||
|
||||
LinearStream *listCmdBufferStream = container.getCommandStream();
|
||||
|
||||
auto threadGroupDims = static_cast<const uint32_t *>(args.threadGroupDimensions);
|
||||
|
||||
// Start from the family's default walker and interface descriptor templates.
DefaultWalkerType cmd = Family::cmdInitGpgpuWalker;
|
||||
auto idd = Family::cmdInitInterfaceDescriptorData;
|
||||
// Kernel start pointer = ISA allocation's patchable GPU address + offset inside the parent allocation.
{
|
||||
auto alloc = args.dispatchInterface->getIsaAllocation();
|
||||
UNRECOVERABLE_IF(nullptr == alloc);
|
||||
auto offset = alloc->getGpuAddressToPatch() + args.dispatchInterface->getIsaOffsetInParentAllocation();
|
||||
idd.setKernelStartPointer(offset);
|
||||
idd.setKernelStartPointerHigh(0u);
|
||||
}
|
||||
|
||||
// Software exceptions are enabled only when the kernel uses assert and an L0 debugger is present.
if (args.dispatchInterface->getKernelDescriptor().kernelAttributes.flags.usesAssert && args.device->getL0Debugger() != nullptr) {
|
||||
idd.setSoftwareExceptionEnable(1);
|
||||
}
|
||||
|
||||
auto numThreadsPerThreadGroup = args.dispatchInterface->getNumThreadsPerThreadGroup();
|
||||
idd.setNumberOfThreadsInGpgpuThreadGroup(numThreadsPerThreadGroup);
|
||||
|
||||
EncodeDispatchKernel<Family>::programBarrierEnable(idd,
|
||||
kernelDescriptor,
|
||||
hwInfo);
|
||||
auto slmSize = EncodeDispatchKernel<Family>::computeSlmValues(hwInfo, args.dispatchInterface->getSlmTotalSize());
|
||||
idd.setSharedLocalMemorySize(slmSize);
|
||||
|
||||
uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
|
||||
uint32_t bindingTablePointer = 0u;
|
||||
bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernelDescriptor);
|
||||
|
||||
// Non-bindless kernels: push the binding table and surface states into the SSH and remember
// the resulting binding table pointer. Bindless kernels: copy the raw surface state heap data
// and patch bindless offsets in cross-thread data; the binding table pointer stays 0.
if (!isBindlessKernel) {
|
||||
container.prepareBindfulSsh();
|
||||
if (bindingTableStateCount > 0u) {
|
||||
auto ssh = args.surfaceStateHeap;
|
||||
if (ssh == nullptr) {
|
||||
ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), NEO::EncodeDispatchKernel<Family>::getDefaultSshAlignment());
|
||||
}
|
||||
bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
|
||||
*ssh,
|
||||
args.dispatchInterface->getSurfaceStateHeapData(),
|
||||
args.dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
|
||||
kernelDescriptor.payloadMappings.bindingTable.tableOffset));
|
||||
}
|
||||
} else {
|
||||
bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr;
|
||||
auto sshHeapSize = args.dispatchInterface->getSurfaceStateHeapDataSize();
|
||||
|
||||
if (sshHeapSize > 0u) {
|
||||
auto ssh = args.surfaceStateHeap;
|
||||
if (ssh == nullptr) {
|
||||
container.prepareBindfulSsh();
|
||||
ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::surfaceState, sshHeapSize, NEO::EncodeDispatchKernel<Family>::getDefaultSshAlignment());
|
||||
}
|
||||
// Offset of the current heap position from the heap's CPU base, taken before new space is reserved.
uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase());
|
||||
if (globalBindlessSsh) {
|
||||
bindlessSshBaseOffset += ptrDiff(ssh->getGraphicsAllocation()->getGpuAddress(), ssh->getGraphicsAllocation()->getGpuBaseAddress());
|
||||
}
|
||||
|
||||
DEBUG_BREAK_IF(bindingTableStateCount > 0u);
|
||||
|
||||
// Allocate space for new ssh data
|
||||
auto dstSurfaceState = ssh->getSpace(sshHeapSize);
|
||||
memcpy_s(dstSurfaceState, sshHeapSize, args.dispatchInterface->getSurfaceStateHeapData(), sshHeapSize);
|
||||
|
||||
args.dispatchInterface->patchBindlessOffsetsInCrossThreadData(bindlessSshBaseOffset);
|
||||
}
|
||||
}
|
||||
idd.setBindingTablePointer(bindingTablePointer);
|
||||
|
||||
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, args.preemptionMode);
|
||||
|
||||
uint32_t samplerStateOffset = 0;
|
||||
uint32_t samplerCount = 0;
|
||||
|
||||
// Sampler states are copied into the dynamic state heap only when the kernel declares samplers.
if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
|
||||
auto dsHeap = args.dynamicStateHeap;
|
||||
if (dsHeap == nullptr) {
|
||||
dsHeap = container.getIndirectHeap(HeapType::dynamicState);
|
||||
auto dshSizeRequired = NEO::EncodeDispatchKernel<Family>::getSizeRequiredDsh(kernelDescriptor, container.getNumIddPerBlock());
|
||||
if (dsHeap->getAvailableSpace() <= dshSizeRequired) {
|
||||
dsHeap = container.getHeapWithRequiredSizeAndAlignment(HeapType::dynamicState, dsHeap->getMaxAvailableSpace(), NEO::EncodeDispatchKernel<Family>::getDefaultDshAlignment());
|
||||
}
|
||||
}
|
||||
UNRECOVERABLE_IF(!dsHeap);
|
||||
|
||||
samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
|
||||
samplerStateOffset = EncodeStates<Family>::copySamplerState(dsHeap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
|
||||
kernelDescriptor.payloadMappings.samplerTable.numSamplers,
|
||||
kernelDescriptor.payloadMappings.samplerTable.borderColor,
|
||||
args.dispatchInterface->getDynamicStateHeapData(),
|
||||
args.device->getBindlessHeapsHelper(), args.device->getRootDeviceEnvironment());
|
||||
}
|
||||
|
||||
idd.setSamplerStatePointer(samplerStateOffset);
|
||||
if (!isBindlessKernel) {
|
||||
EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(idd, samplerCount, bindingTableStateCount);
|
||||
}
|
||||
|
||||
EncodeDispatchKernel<Family>::setGrfInfo(&idd, kernelDescriptor.kernelAttributes.numGrfRequired, sizeCrossThreadData,
|
||||
sizePerThreadData, rootDeviceEnvironment);
|
||||
|
||||
// Indirect-object-heap layout reserved below: [implicit args patching area][cross-thread data][per-thread data].
// sizeThreadData covers only the last two parts; iohRequiredSize covers all three.
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
||||
bool isHwLocalIdGeneration = false;
|
||||
uint32_t sizeForImplicitArgsPatching = NEO::ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, rootDeviceEnvironment);
|
||||
uint32_t iohRequiredSize = sizeThreadData + sizeForImplicitArgsPatching;
|
||||
uint64_t offsetThreadData = 0u;
|
||||
{
|
||||
auto heapIndirect = container.getIndirectHeap(HeapType::indirectObject);
|
||||
UNRECOVERABLE_IF(!(heapIndirect));
|
||||
heapIndirect->align(Family::cacheLineSize);
|
||||
void *ptr = nullptr;
|
||||
if (args.isKernelDispatchedFromImmediateCmdList) {
|
||||
ptr = container.getHeapWithRequiredSizeAndAlignment(HeapType::indirectObject, iohRequiredSize, DefaultWalkerType::INDIRECTDATASTARTADDRESS_ALIGN_SIZE)->getSpace(iohRequiredSize);
|
||||
} else {
|
||||
ptr = container.getHeapSpaceAllowGrow(HeapType::indirectObject, iohRequiredSize);
|
||||
}
|
||||
UNRECOVERABLE_IF(!(ptr));
|
||||
// Thread data starts sizeThreadData bytes before the heap's current used mark.
offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast<uint64_t>(heapIndirect->getUsed() - sizeThreadData);
|
||||
|
||||
uint64_t implicitArgsGpuVA = 0u;
|
||||
if (pImplicitArgs) {
|
||||
// Implicit args sit at the start of the reserved range; their GPU address is written
// into the kernel's cross-thread data before that data is copied to the heap.
implicitArgsGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(heapIndirect->getUsed() - iohRequiredSize);
|
||||
auto implicitArgsCrossThreadPtr = ptrOffset(const_cast<uint64_t *>(reinterpret_cast<const uint64_t *>(args.dispatchInterface->getCrossThreadData())), kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
||||
*implicitArgsCrossThreadPtr = implicitArgsGpuVA;
|
||||
|
||||
ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, {}, rootDeviceEnvironment, nullptr);
|
||||
}
|
||||
|
||||
memcpy_s(ptr, sizeCrossThreadData,
|
||||
args.dispatchInterface->getCrossThreadData(), sizeCrossThreadData);
|
||||
|
||||
if (args.isIndirect) {
|
||||
auto crossThreadDataGpuVA = heapIndirect->getGraphicsAllocation()->getGpuAddress() + heapIndirect->getUsed() - sizeThreadData;
|
||||
EncodeIndirectParams<Family>::encode(container, crossThreadDataGpuVA, args.dispatchInterface, implicitArgsGpuVA);
|
||||
}
|
||||
|
||||
// Per-thread data follows the cross-thread data.
ptr = ptrOffset(ptr, sizeCrossThreadData);
|
||||
memcpy_s(ptr, sizePerThreadDataForWholeGroup,
|
||||
args.dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup);
|
||||
}
|
||||
|
||||
uint32_t numIDD = 0u;
|
||||
void *iddPtr = getInterfaceDescriptor(container, args.dynamicStateHeap, numIDD);
|
||||
|
||||
// A flush is needed when the SLM size differs from the container's cached value,
// when any heap is dirty, or when uncached MOCS was requested.
auto slmSizeNew = args.dispatchInterface->getSlmTotalSize();
|
||||
bool dirtyHeaps = container.isAnyHeapDirty();
|
||||
bool flush = container.slmSizeRef() != slmSizeNew || dirtyHeaps || args.requiresUncachedMocs;
|
||||
|
||||
if (flush) {
|
||||
PipeControlArgs syncArgs;
|
||||
syncArgs.dcFlushEnable = args.dcFlushEnable;
|
||||
if (dirtyHeaps) {
|
||||
syncArgs.hdcPipelineFlush = true;
|
||||
}
|
||||
MemorySynchronizationCommands<Family>::addSingleBarrier(*container.getCommandStream(), syncArgs);
|
||||
|
||||
// Re-program state base address; afterwards heaps are marked clean and the uncached-MOCS request is consumed.
if (dirtyHeaps || args.requiresUncachedMocs) {
|
||||
STATE_BASE_ADDRESS sba;
|
||||
auto gmmHelper = container.getDevice()->getGmmHelper();
|
||||
uint32_t statelessMocsIndex =
|
||||
args.requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1) : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
|
||||
auto l1CachePolicy = container.l1CachePolicyDataRef()->getL1CacheValue(false);
|
||||
auto l1CachePolicyDebuggerActive = container.l1CachePolicyDataRef()->getL1CacheValue(true);
|
||||
EncodeStateBaseAddressArgs<Family> encodeStateBaseAddressArgs = {
|
||||
&container, // container
|
||||
sba, // sbaCmd
|
||||
nullptr, // sbaProperties
|
||||
statelessMocsIndex, // statelessMocsIndex
|
||||
l1CachePolicy, // l1CachePolicy
|
||||
l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
|
||||
false, // multiOsContextCapable
|
||||
args.isRcs, // isRcs
|
||||
container.doubleSbaWaRef(), // doubleSbaWa
|
||||
false, // heaplessModeEnabled
|
||||
};
|
||||
EncodeStateBaseAddress<Family>::encode(encodeStateBaseAddressArgs);
|
||||
container.setDirtyStateForAllHeaps(false);
|
||||
args.requiresUncachedMocs = false;
|
||||
}
|
||||
|
||||
// Re-program L3 state when the SLM size changed and cache the new size in the container.
if (container.slmSizeRef() != slmSizeNew) {
|
||||
EncodeL3State<Family>::encode(container, slmSizeNew != 0u);
|
||||
container.slmSizeRef() = slmSizeNew;
|
||||
}
|
||||
}
|
||||
|
||||
// Emit a media interface descriptor load when numIDD is 0 or when a flush was emitted above.
if (numIDD == 0 || flush) {
|
||||
EncodeMediaInterfaceDescriptorLoad<Family>::encode(container, args.dynamicStateHeap);
|
||||
}
|
||||
|
||||
cmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
|
||||
cmd.setIndirectDataLength(sizeThreadData);
|
||||
cmd.setInterfaceDescriptorOffset(numIDD);
|
||||
|
||||
EncodeDispatchKernel<Family>::encodeThreadData(cmd,
|
||||
nullptr,
|
||||
threadGroupDims,
|
||||
args.dispatchInterface->getGroupSize(),
|
||||
kernelDescriptor.kernelAttributes.simdSize,
|
||||
kernelDescriptor.kernelAttributes.numLocalIdChannels,
|
||||
numThreadsPerThreadGroup,
|
||||
args.dispatchInterface->getThreadExecutionMask(),
|
||||
true,
|
||||
false,
|
||||
args.isIndirect,
|
||||
args.dispatchInterface->getRequiredWorkgroupOrder(),
|
||||
rootDeviceEnvironment);
|
||||
|
||||
cmd.setPredicateEnable(args.isPredicate);
|
||||
|
||||
auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension();
|
||||
EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(idd, *args.device, hwInfo, threadGroupDims, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired, numThreadsPerThreadGroup, cmd);
|
||||
|
||||
EncodeWalkerArgs walkerArgs{
|
||||
.kernelExecutionType = KernelExecutionType::defaultType,
|
||||
.requiredDispatchWalkOrder = args.requiredDispatchWalkOrder,
|
||||
.localRegionSize = args.localRegionSize,
|
||||
.maxFrontEndThreads = args.device->getDeviceInfo().maxFrontEndThreads,
|
||||
.requiredSystemFence = args.requiresSystemMemoryFence(),
|
||||
.hasSample = false};
|
||||
|
||||
EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(rootDeviceEnvironment, cmd, walkerArgs);
|
||||
EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(cmd, walkerArgs);
|
||||
EncodeDispatchKernel<Family>::template encodeComputeDispatchAllWalker<WalkerType, INTERFACE_DESCRIPTOR_DATA>(cmd, nullptr, rootDeviceEnvironment, walkerArgs);
|
||||
|
||||
// Publish the finished interface descriptor into its slot.
memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd));
|
||||
|
||||
// PauseOnEnqueue (before workload): reserve command space for a barrier with post-sync operation,
// record its address in args.additionalCommands, then let the semaphore helper add its entry.
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
|
||||
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false));
|
||||
args.additionalCommands->push_back(commandBuffer);
|
||||
|
||||
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
|
||||
}
|
||||
|
||||
// Write the walker command into the command stream.
auto buffer = listCmdBufferStream->getSpaceForCmd<DefaultWalkerType>();
|
||||
*buffer = cmd;
|
||||
|
||||
// The walker is always followed by a MEDIA_STATE_FLUSH.
{
|
||||
auto mediaStateFlush = listCmdBufferStream->getSpaceForCmd<MEDIA_STATE_FLUSH>();
|
||||
*mediaStateFlush = Family::cmdInitMediaStateFlush;
|
||||
}
|
||||
|
||||
// This path always reports a partition count of 1 back to the caller.
args.partitionCount = 1;
|
||||
|
||||
// PauseOnEnqueue (after workload): same reservation sequence as the before-workload case.
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) {
|
||||
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false));
|
||||
args.additionalCommands->push_back(commandBuffer);
|
||||
|
||||
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeMediaInterfaceDescriptorLoad<Family>::encode(CommandContainer &container, IndirectHeap *childDsh) {
|
||||
using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
|
||||
using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
|
||||
void *heapBase = nullptr;
|
||||
if (childDsh != nullptr) {
|
||||
heapBase = childDsh->getCpuBase();
|
||||
} else {
|
||||
heapBase = container.getIndirectHeap(HeapType::dynamicState)->getCpuBase();
|
||||
}
|
||||
|
||||
auto mediaStateFlush = container.getCommandStream()->getSpaceForCmd<MEDIA_STATE_FLUSH>();
|
||||
*mediaStateFlush = Family::cmdInitMediaStateFlush;
|
||||
|
||||
auto iddOffset = static_cast<uint32_t>(ptrDiff(container.getIddBlock(), heapBase));
|
||||
|
||||
MEDIA_INTERFACE_DESCRIPTOR_LOAD cmd = Family::cmdInitMediaInterfaceDescriptorLoad;
|
||||
cmd.setInterfaceDescriptorDataStartAddress(iddOffset);
|
||||
cmd.setInterfaceDescriptorTotalLength(sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock());
|
||||
|
||||
auto buffer = container.getCommandStream()->getSpace(sizeof(cmd));
|
||||
*(decltype(cmd) *)buffer = cmd;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline bool EncodeDispatchKernel<Family>::isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
|
||||
const size_t *lws,
|
||||
std::array<uint8_t, 3> walkOrder,
|
||||
bool requireInputWalkOrder,
|
||||
uint32_t &requiredWalkOrder,
|
||||
uint32_t simd) {
|
||||
requiredWalkOrder = 0u;
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::encodeThreadData(WalkerType &walkerCmd,
|
||||
const uint32_t *startWorkGroup,
|
||||
const uint32_t *numWorkGroups,
|
||||
const uint32_t *workGroupSizes,
|
||||
uint32_t simd,
|
||||
uint32_t localIdDimensions,
|
||||
uint32_t threadsPerThreadGroup,
|
||||
uint32_t threadExecutionMask,
|
||||
bool localIdsGenerationByRuntime,
|
||||
bool inlineDataProgrammingRequired,
|
||||
bool isIndirect,
|
||||
uint32_t requiredWorkGroupOrder,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
|
||||
if (isIndirect) {
|
||||
walkerCmd.setIndirectParameterEnable(true);
|
||||
} else {
|
||||
walkerCmd.setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
|
||||
walkerCmd.setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
|
||||
walkerCmd.setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
|
||||
}
|
||||
|
||||
if (startWorkGroup) {
|
||||
walkerCmd.setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroup[0]));
|
||||
walkerCmd.setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroup[1]));
|
||||
walkerCmd.setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroup[2]));
|
||||
}
|
||||
|
||||
walkerCmd.setSimdSize(getSimdConfig<WalkerType>(simd));
|
||||
|
||||
auto localWorkSize = static_cast<uint32_t>(workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2]);
|
||||
if (threadsPerThreadGroup == 0) {
|
||||
threadsPerThreadGroup = getThreadsPerWG(simd, localWorkSize);
|
||||
}
|
||||
walkerCmd.setThreadWidthCounterMaximum(threadsPerThreadGroup);
|
||||
|
||||
uint64_t executionMask = threadExecutionMask;
|
||||
if (executionMask == 0) {
|
||||
auto remainderSimdLanes = localWorkSize & (simd - 1);
|
||||
executionMask = maxNBitValue(remainderSimdLanes);
|
||||
if (!executionMask)
|
||||
executionMask = ~executionMask;
|
||||
}
|
||||
|
||||
constexpr uint32_t maxDword = std::numeric_limits<uint32_t>::max();
|
||||
walkerCmd.setRightExecutionMask(static_cast<uint32_t>(executionMask));
|
||||
walkerCmd.setBottomExecutionMask(maxDword);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::programBarrierEnable(InterfaceDescriptorType &interfaceDescriptor,
|
||||
const KernelDescriptor &kernelDescriptor,
|
||||
const HardwareInfo &hwInfo) {
|
||||
interfaceDescriptor.setBarrierEnable(kernelDescriptor.kernelAttributes.barrierCount);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const RootDeviceEnvironment &rootDeviceEnvironment, WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
inline void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
inline void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::setupPreferredSlmSize(InterfaceDescriptorType *pInterfaceDescriptor, const RootDeviceEnvironment &rootDeviceEnvironment, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}
|
||||
|
||||
template <typename Family>
|
||||
inline bool EncodeDispatchKernel<Family>::isDshNeeded(const DeviceInfo &deviceInfo) {
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeStateBaseAddress<Family>::setSbaAddressesForDebugger(NEO::Debugger::SbaAddresses &sbaAddress, const STATE_BASE_ADDRESS &sbaCmd) {
|
||||
sbaAddress.indirectObjectBaseAddress = sbaCmd.getIndirectObjectBaseAddress();
|
||||
sbaAddress.bindlessSurfaceStateBaseAddress = sbaCmd.getBindlessSurfaceStateBaseAddress();
|
||||
sbaAddress.dynamicStateBaseAddress = sbaCmd.getDynamicStateBaseAddress();
|
||||
sbaAddress.generalStateBaseAddress = sbaCmd.getGeneralStateBaseAddress();
|
||||
sbaAddress.instructionBaseAddress = sbaCmd.getInstructionBaseAddress();
|
||||
sbaAddress.surfaceStateBaseAddress = sbaCmd.getSurfaceStateBaseAddress();
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeStateBaseAddress<Family>::encode(EncodeStateBaseAddressArgs<Family> &args) {
|
||||
auto &device = *args.container->getDevice();
|
||||
|
||||
if (args.container->isAnyHeapDirty()) {
|
||||
EncodeWA<Family>::encodeAdditionalPipelineSelect(*args.container->getCommandStream(), {}, true, device.getRootDeviceEnvironment(), args.isRcs);
|
||||
}
|
||||
|
||||
auto gmmHelper = device.getGmmHelper();
|
||||
|
||||
auto dsh = args.container->isHeapDirty(HeapType::dynamicState) ? args.container->getIndirectHeap(HeapType::dynamicState) : nullptr;
|
||||
auto ioh = args.container->isHeapDirty(HeapType::indirectObject) ? args.container->getIndirectHeap(HeapType::indirectObject) : nullptr;
|
||||
auto ssh = args.container->isHeapDirty(HeapType::surfaceState) ? args.container->getIndirectHeap(HeapType::surfaceState) : nullptr;
|
||||
auto isDebuggerActive = device.getDebugger() != nullptr;
|
||||
uint64_t globalHeapsBase = 0;
|
||||
uint64_t bindlessSurfStateBase = 0;
|
||||
bool useGlobalSshAndDsh = false;
|
||||
|
||||
if (device.getBindlessHeapsHelper()) {
|
||||
bindlessSurfStateBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase();
|
||||
globalHeapsBase = device.getBindlessHeapsHelper()->getGlobalHeapsBase();
|
||||
useGlobalSshAndDsh = true;
|
||||
}
|
||||
|
||||
StateBaseAddressHelperArgs<Family> stateBaseAddressHelperArgs = {
|
||||
0, // generalStateBaseAddress
|
||||
args.container->getIndirectObjectHeapBaseAddress(), // indirectObjectHeapBaseAddress
|
||||
args.container->getInstructionHeapBaseAddress(), // instructionHeapBaseAddress
|
||||
globalHeapsBase, // globalHeapsBaseAddress
|
||||
0, // surfaceStateBaseAddress
|
||||
bindlessSurfStateBase, // bindlessSurfaceStateBaseAddress
|
||||
&args.sbaCmd, // stateBaseAddressCmd
|
||||
args.sbaProperties, // sbaProperties
|
||||
dsh, // dsh
|
||||
ioh, // ioh
|
||||
ssh, // ssh
|
||||
gmmHelper, // gmmHelper
|
||||
args.statelessMocsIndex, // statelessMocsIndex
|
||||
args.l1CachePolicy, // l1CachePolicy
|
||||
args.l1CachePolicyDebuggerActive, // l1CachePolicyDebuggerActive
|
||||
NEO::MemoryCompressionState::notApplicable, // memoryCompressionState
|
||||
false, // setInstructionStateBaseAddress
|
||||
false, // setGeneralStateBaseAddress
|
||||
useGlobalSshAndDsh, // useGlobalHeapsBaseAddress
|
||||
false, // isMultiOsContextCapable
|
||||
false, // areMultipleSubDevicesInContext
|
||||
false, // overrideSurfaceStateBaseAddress
|
||||
isDebuggerActive, // isDebuggerActive
|
||||
args.doubleSbaWa, // doubleSbaWa
|
||||
args.heaplessModeEnabled // heaplessModeEnabled
|
||||
};
|
||||
|
||||
StateBaseAddressHelper<Family>::programStateBaseAddressIntoCommandStream(stateBaseAddressHelperArgs,
|
||||
*args.container->getCommandStream());
|
||||
|
||||
EncodeWA<Family>::encodeAdditionalPipelineSelect(*args.container->getCommandStream(), {}, false, device.getRootDeviceEnvironment(), args.isRcs);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeStateBaseAddress<Family>::getRequiredSizeForStateBaseAddress(Device &device, CommandContainer &container, bool isRcs) {
|
||||
return sizeof(typename Family::STATE_BASE_ADDRESS) + 2 * EncodeWA<Family>::getAdditionalPipelineSelectSize(device, isRcs);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void EncodeMiFlushDW<GfxFamily>::adjust(MI_FLUSH_DW *miFlushDwCmd, const ProductHelper &productHelper) {}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void EncodeWA<GfxFamily>::addPipeControlPriorToNonPipelinedStateCommand(LinearStream &commandStream, PipeControlArgs args,
|
||||
const RootDeviceEnvironment &rootDeviceEnvironment, bool isRcs) {
|
||||
MemorySynchronizationCommands<GfxFamily>::addSingleBarrier(commandStream, args);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void EncodeWA<GfxFamily>::adjustCompressionFormatForPlanarImage(uint32_t &compressionFormat, int plane) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeSurfaceState<Family>::setCoherencyType(R_SURFACE_STATE *surfaceState, COHERENCY_TYPE coherencyType) {
|
||||
surfaceState->setCoherencyType(coherencyType);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeSemaphore<Family>::programMiSemaphoreWait(MI_SEMAPHORE_WAIT *cmd,
|
||||
uint64_t compareAddress,
|
||||
uint64_t compareData,
|
||||
COMPARE_OPERATION compareMode,
|
||||
bool registerPollMode,
|
||||
bool waitMode,
|
||||
bool useQwordData,
|
||||
bool indirect,
|
||||
bool switchOnUnsuccessful) {
|
||||
constexpr uint64_t upper32b = static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) << 32;
|
||||
UNRECOVERABLE_IF(useQwordData || (compareData & upper32b));
|
||||
UNRECOVERABLE_IF(indirect);
|
||||
|
||||
MI_SEMAPHORE_WAIT localCmd = Family::cmdInitMiSemaphoreWait;
|
||||
localCmd.setCompareOperation(compareMode);
|
||||
localCmd.setSemaphoreDataDword(static_cast<uint32_t>(compareData));
|
||||
localCmd.setSemaphoreGraphicsAddress(compareAddress);
|
||||
localCmd.setWaitMode(waitMode ? MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE : MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_SIGNAL_MODE);
|
||||
|
||||
*cmd = localCmd;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void EncodeEnableRayTracing<GfxFamily>::programEnableRayTracing(LinearStream &commandStream, uint64_t backBuffer) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline void EncodeStoreMemory<Family>::programStoreDataImm(MI_STORE_DATA_IMM *cmdBuffer,
|
||||
uint64_t gpuAddress,
|
||||
uint32_t dataDword0,
|
||||
uint32_t dataDword1,
|
||||
bool storeQword,
|
||||
bool workloadPartitionOffset) {
|
||||
MI_STORE_DATA_IMM storeDataImmediate = Family::cmdInitStoreDataImm;
|
||||
storeDataImmediate.setAddress(gpuAddress);
|
||||
storeDataImmediate.setStoreQword(storeQword);
|
||||
storeDataImmediate.setDataDword0(dataDword0);
|
||||
if (storeQword) {
|
||||
storeDataImmediate.setDataDword1(dataDword1);
|
||||
storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_QWORD);
|
||||
} else {
|
||||
storeDataImmediate.setDwordLength(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD);
|
||||
}
|
||||
EncodeStoreMemory<Family>::encodeForceCompletionCheck(storeDataImmediate);
|
||||
|
||||
*cmdBuffer = storeDataImmediate;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::setupPostSyncMocs(WalkerType &walkerCmd, const RootDeviceEnvironment &rootDeviceEnvironment, bool dcFlush) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::setupPostSyncForRegularEvent(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::setupPostSyncForInOrderExec(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::adjustWalkOrder(WalkerType &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment) {}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::additionalSizeRequiredDsh(uint32_t iddCount) {
|
||||
return iddCount * sizeof(typename Family::INTERFACE_DESCRIPTOR_DATA);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline size_t EncodeDispatchKernel<Family>::getInlineDataOffset(EncodeDispatchKernelArgs &args) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::forceComputeWalkerPostSyncFlushWithWrite(WalkerType &walkerCmd) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
uint32_t EncodeDispatchKernel<Family>::alignSlmSize(uint32_t slmSize) {
|
||||
if (slmSize == 0u) {
|
||||
return 0u;
|
||||
}
|
||||
slmSize = std::max(slmSize, 1024u);
|
||||
slmSize = Math::nextPowerOfTwo(slmSize);
|
||||
UNRECOVERABLE_IF(slmSize > 64u * MemoryConstants::kiloByte);
|
||||
return slmSize;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
uint32_t EncodeDispatchKernel<Family>::computeSlmValues(const HardwareInfo &hwInfo, uint32_t slmSize) {
|
||||
auto value = std::max(slmSize, 1024u);
|
||||
value = Math::nextPowerOfTwo(value);
|
||||
value = Math::getMinLsbSet(value);
|
||||
value = value - 9;
|
||||
DEBUG_BREAK_IF(value > 7);
|
||||
return value * !!slmSize;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
bool EncodeDispatchKernel<Family>::singleTileExecImplicitScalingRequired(bool cooperativeKernel) {
|
||||
return cooperativeKernel;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeStates<Family>::getSshHeapSize() {
|
||||
return 64 * MemoryConstants::kiloByte;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void InOrderPatchCommandHelpers::PatchCmd<Family>::patchComputeWalker(uint64_t appendCounterValue) {
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType, typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::encodeThreadGroupDispatch(InterfaceDescriptorType &interfaceDescriptor, const Device &device, const HardwareInfo &hwInfo,
|
||||
const uint32_t *threadGroupDimensions, const uint32_t threadGroupCount, const uint32_t grfCount, const uint32_t threadsPerThreadGroup,
|
||||
WalkerType &walkerCmd) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::getScratchPtrOffsetOfImplicitArgs() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeSurfaceState<Family>::setPitchForScratch(R_SURFACE_STATE *surfaceState, uint32_t pitch, const ProductHelper &productHelper) {
|
||||
surfaceState->setSurfacePitch(pitch);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
uint32_t EncodeSurfaceState<Family>::getPitchForScratchInBytes(R_SURFACE_STATE *surfaceState, const ProductHelper &productHelper) {
|
||||
return surfaceState->getSurfacePitch();
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeSemaphore<Family>::appendSemaphoreCommand(MI_SEMAPHORE_WAIT &cmd, uint64_t compareData, bool indirect, bool useQwordData, bool switchOnUnsuccessful) {
|
||||
constexpr uint64_t upper32b = static_cast<uint64_t>(std::numeric_limits<uint32_t>::max()) << 32;
|
||||
UNRECOVERABLE_IF(useQwordData || (compareData & upper32b));
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <bool isHeapless>
|
||||
void EncodeDispatchKernel<Family>::setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &submissionCsr) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename InterfaceDescriptorType>
|
||||
void EncodeDispatchKernel<Family>::encodeEuSchedulingPolicy(InterfaceDescriptorType *pInterfaceDescriptor, const KernelDescriptor &kernelDesc, int32_t defaultPipelinedThreadArbitrationPolicy) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::setWalkerRegionSettings(WalkerType &walkerCmd, const NEO::Device &device, uint32_t partitionCount, uint32_t workgroupSize, uint32_t threadGroupCount, uint32_t maxWgCountPerTile, bool requiredDispatchWalkOrder) {}
|
||||
|
||||
template <typename Family>
|
||||
template <typename WalkerType>
|
||||
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WalkerType &walkerCmd, const EncodeDispatchKernelArgs &args) {}
|
||||
|
||||
template <>
|
||||
size_t EncodeWA<Family>::getAdditionalPipelineSelectSize(Device &device, bool isRcs) {
|
||||
size_t size = 0;
|
||||
|
||||
Reference in New Issue
Block a user