Partial support for XE_HP_SDV
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
Committed by: Compute-Runtime-Automation
Parent: e0a50d3143
Commit: 96d14967ac
shared/source/command_container/CMakeLists.txt
@@ -13,8 +13,18 @@ set(NEO_CORE_COMMAND_CONTAINER
    ${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_bdw_plus.inl
    ${CMAKE_CURRENT_SOURCE_DIR}/encode_compute_mode_bdw_plus.inl
    ${CMAKE_CURRENT_SOURCE_DIR}/encode_compute_mode_tgllp_plus.inl
    ${CMAKE_CURRENT_SOURCE_DIR}/implicit_scaling.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/implicit_scaling.h
)

if(SUPPORT_XEHP_PLUS)
  list(APPEND NEO_CORE_COMMAND_CONTAINER
       ${CMAKE_CURRENT_SOURCE_DIR}/command_encoder_xehp_plus.inl
       ${CMAKE_CURRENT_SOURCE_DIR}/implicit_scaling_xehp_plus.inl
       ${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_xehp_plus.h
  )
endif()

set_property(GLOBAL PROPERTY NEO_CORE_COMMAND_CONTAINER ${NEO_CORE_COMMAND_CONTAINER})

add_subdirectories()
shared/source/command_container/command_encoder_xehp_plus.inl (new file, 649 lines)
@@ -0,0 +1,649 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2021 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "shared/source/command_container/command_encoder.h"
|
||||
#include "shared/source/command_container/implicit_scaling.h"
|
||||
#include "shared/source/command_stream/command_stream_receiver.h"
|
||||
#include "shared/source/command_stream/linear_stream.h"
|
||||
#include "shared/source/command_stream/preemption.h"
|
||||
#include "shared/source/command_stream/stream_properties.h"
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/execution_environment/execution_environment.h"
|
||||
#include "shared/source/gmm_helper/gmm_helper.h"
|
||||
#include "shared/source/helpers/basic_math.h"
|
||||
#include "shared/source/helpers/constants.h"
|
||||
#include "shared/source/helpers/hw_helper.h"
|
||||
#include "shared/source/helpers/pipeline_select_helper.h"
|
||||
#include "shared/source/helpers/simd_helper.h"
|
||||
#include "shared/source/helpers/state_base_address.h"
|
||||
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
|
||||
#include "shared/source/kernel/kernel_descriptor.h"
|
||||
|
||||
#include "gmm_client_context.h"
|
||||
#include "pipe_control_args.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace NEO {
|
||||
constexpr size_t TimestampDestinationAddressAlignment = 16;
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
|
||||
const void *pThreadGroupDimensions, bool isIndirect, bool isPredicate, DispatchKernelEncoderI *dispatchInterface,
|
||||
uint64_t eventAddress, bool isTimestampEvent, bool L3FlushEnable, Device *device, PreemptionMode preemptionMode,
|
||||
bool &requiresUncachedMocs, bool useGlobalAtomics, uint32_t &partitionCount, bool isInternal) {
|
||||
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
|
||||
using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS;
|
||||
using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;
|
||||
using INLINE_DATA = typename Family::INLINE_DATA;
|
||||
|
||||
const HardwareInfo &hwInfo = device->getHardwareInfo();
|
||||
|
||||
const auto &kernelDescriptor = dispatchInterface->getKernelDescriptor();
|
||||
auto sizeCrossThreadData = dispatchInterface->getCrossThreadDataSize();
|
||||
auto sizePerThreadDataForWholeGroup = dispatchInterface->getPerThreadDataSizeForWholeThreadGroup();
|
||||
|
||||
LinearStream *listCmdBufferStream = container.getCommandStream();
|
||||
size_t sshOffset = 0;
|
||||
|
||||
auto threadDims = static_cast<const uint32_t *>(pThreadGroupDimensions);
|
||||
const Vec3<size_t> threadStartVec{0, 0, 0};
|
||||
Vec3<size_t> threadDimsVec{0, 0, 0};
|
||||
if (!isIndirect) {
|
||||
threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
|
||||
}
|
||||
size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device, threadStartVec, threadDimsVec, isInternal);
|
||||
if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
|
||||
auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
|
||||
*bbEnd = Family::cmdInitBatchBufferEnd;
|
||||
|
||||
container.allocateNextCommandBuffer();
|
||||
}
|
||||
|
||||
if (kernelDescriptor.extendedInfo) {
|
||||
bool specialModeRequired = kernelDescriptor.extendedInfo->specialPipelineSelectModeRequired();
|
||||
if (container.lastPipelineSelectModeRequired != specialModeRequired) {
|
||||
container.lastPipelineSelectModeRequired = specialModeRequired;
|
||||
EncodeComputeMode<Family>::adjustPipelineSelect(container, kernelDescriptor);
|
||||
}
|
||||
}
|
||||
|
||||
WALKER_TYPE walkerCmd = Family::cmdInitGpgpuWalker;
|
||||
auto &idd = walkerCmd.getInterfaceDescriptor();
|
||||
|
||||
bool localIdsGenerationByRuntime = dispatchInterface->requiresGenerationOfLocalIdsByRuntime();
|
||||
bool inlineDataProgramming = EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(kernelDescriptor);
|
||||
{
|
||||
auto alloc = dispatchInterface->getIsaAllocation();
|
||||
UNRECOVERABLE_IF(nullptr == alloc);
|
||||
auto offset = alloc->getGpuAddressToPatch();
|
||||
if (!localIdsGenerationByRuntime) {
|
||||
offset += kernelDescriptor.entryPoints.skipPerThreadDataLoad;
|
||||
}
|
||||
idd.setKernelStartPointer(offset);
|
||||
idd.setKernelStartPointerHigh(0u);
|
||||
}
|
||||
|
||||
auto threadsPerThreadGroup = dispatchInterface->getNumThreadsPerThreadGroup();
|
||||
idd.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
|
||||
|
||||
EncodeDispatchKernel<Family>::programBarrierEnable(idd,
|
||||
kernelDescriptor.kernelAttributes.barrierCount,
|
||||
hwInfo);
|
||||
|
||||
auto slmSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(
|
||||
HwHelperHw<Family>::get().computeSlmValues(hwInfo, dispatchInterface->getSlmTotalSize()));
|
||||
|
||||
if (DebugManager.flags.OverrideSlmAllocationSize.get() != -1) {
|
||||
slmSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(DebugManager.flags.OverrideSlmAllocationSize.get());
|
||||
}
|
||||
idd.setSharedLocalMemorySize(slmSize);
|
||||
|
||||
auto bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
|
||||
uint32_t bindingTablePointer = 0u;
|
||||
if (kernelDescriptor.kernelAttributes.bufferAddressingMode == KernelDescriptor::BindfulAndStateless) {
|
||||
container.prepareBindfulSsh();
|
||||
if (bindingTableStateCount > 0u) {
|
||||
auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
|
||||
sshOffset = ssh->getUsed();
|
||||
bindingTablePointer = static_cast<uint32_t>(EncodeSurfaceState<Family>::pushBindingTableAndSurfaceStates(
|
||||
*ssh, bindingTableStateCount,
|
||||
dispatchInterface->getSurfaceStateHeapData(),
|
||||
dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
|
||||
kernelDescriptor.payloadMappings.bindingTable.tableOffset));
|
||||
}
|
||||
}
|
||||
idd.setBindingTablePointer(bindingTablePointer);
|
||||
|
||||
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, preemptionMode);
|
||||
|
||||
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
|
||||
UNRECOVERABLE_IF(!heap);
|
||||
|
||||
uint32_t samplerStateOffset = 0;
|
||||
uint32_t samplerCount = 0;
|
||||
|
||||
if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
|
||||
samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
|
||||
samplerStateOffset = EncodeStates<Family>::copySamplerState(
|
||||
heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
|
||||
kernelDescriptor.payloadMappings.samplerTable.numSamplers, kernelDescriptor.payloadMappings.samplerTable.borderColor,
|
||||
dispatchInterface->getDynamicStateHeapData(),
|
||||
device->getBindlessHeapsHelper());
|
||||
if (ApiSpecificConfig::getBindlessConfiguration()) {
|
||||
container.getResidencyContainer().push_back(device->getBindlessHeapsHelper()->getHeap(NEO::BindlessHeapsHelper::BindlesHeapType::GLOBAL_DSH)->getGraphicsAllocation());
|
||||
}
|
||||
}
|
||||
|
||||
idd.setSamplerStatePointer(samplerStateOffset);
|
||||
|
||||
EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(idd, samplerCount, bindingTableStateCount);
|
||||
|
||||
uint64_t offsetThreadData = 0u;
|
||||
const uint32_t inlineDataSize = sizeof(INLINE_DATA);
|
||||
auto crossThreadData = dispatchInterface->getCrossThreadData();
|
||||
|
||||
if (inlineDataProgramming) {
|
||||
auto copySize = std::min(inlineDataSize, sizeCrossThreadData);
|
||||
auto dest = reinterpret_cast<char *>(walkerCmd.getInlineDataPointer());
|
||||
memcpy_s(dest, copySize, crossThreadData, copySize);
|
||||
auto offset = std::min(inlineDataSize, sizeCrossThreadData);
|
||||
sizeCrossThreadData -= copySize;
|
||||
crossThreadData = ptrOffset(crossThreadData, offset);
|
||||
inlineDataProgramming = copySize != 0;
|
||||
}
|
||||
|
||||
uint32_t sizeThreadData = sizePerThreadDataForWholeGroup + sizeCrossThreadData;
|
||||
{
|
||||
auto heap = container.getIndirectHeap(HeapType::INDIRECT_OBJECT);
|
||||
UNRECOVERABLE_IF(!heap);
|
||||
heap->align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||
|
||||
auto ptr = container.getHeapSpaceAllowGrow(HeapType::INDIRECT_OBJECT, sizeThreadData);
|
||||
UNRECOVERABLE_IF(!ptr);
|
||||
offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData);
|
||||
|
||||
if (sizeCrossThreadData > 0) {
|
||||
memcpy_s(ptr, sizeCrossThreadData,
|
||||
crossThreadData, sizeCrossThreadData);
|
||||
}
|
||||
if (isIndirect) {
|
||||
void *gpuPtr = reinterpret_cast<void *>(heap->getHeapGpuBase() + heap->getUsed() - sizeThreadData);
|
||||
EncodeIndirectParams<Family>::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr);
|
||||
EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
|
||||
}
|
||||
|
||||
auto perThreadDataPtr = dispatchInterface->getPerThreadData();
|
||||
if (perThreadDataPtr != nullptr) {
|
||||
ptr = ptrOffset(ptr, sizeCrossThreadData);
|
||||
memcpy_s(ptr, sizePerThreadDataForWholeGroup,
|
||||
perThreadDataPtr, sizePerThreadDataForWholeGroup);
|
||||
}
|
||||
}
|
||||
|
||||
bool requiresGlobalAtomicsUpdate = false;
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(container.getDevice()->getDeviceBitfield(), true)) {
|
||||
requiresGlobalAtomicsUpdate = container.lastSentUseGlobalAtomics != useGlobalAtomics;
|
||||
container.lastSentUseGlobalAtomics = useGlobalAtomics;
|
||||
}
|
||||
|
||||
if (container.isAnyHeapDirty() || requiresUncachedMocs || requiresGlobalAtomicsUpdate) {
|
||||
PipeControlArgs args(true);
|
||||
MemorySynchronizationCommands<Family>::addPipeControl(*container.getCommandStream(), args);
|
||||
STATE_BASE_ADDRESS sbaCmd;
|
||||
auto gmmHelper = container.getDevice()->getGmmHelper();
|
||||
uint32_t statelessMocsIndex =
|
||||
requiresUncachedMocs ? (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED) >> 1) : (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
|
||||
EncodeStateBaseAddress<Family>::encode(container, sbaCmd, statelessMocsIndex, useGlobalAtomics);
|
||||
container.setDirtyStateForAllHeaps(false);
|
||||
requiresUncachedMocs = false;
|
||||
}
|
||||
|
||||
walkerCmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
|
||||
walkerCmd.setIndirectDataLength(sizeThreadData);
|
||||
|
||||
EncodeDispatchKernel<Family>::encodeThreadData(walkerCmd,
|
||||
nullptr,
|
||||
threadDims,
|
||||
dispatchInterface->getGroupSize(),
|
||||
kernelDescriptor.kernelAttributes.simdSize,
|
||||
kernelDescriptor.kernelAttributes.numLocalIdChannels,
|
||||
dispatchInterface->getNumThreadsPerThreadGroup(),
|
||||
dispatchInterface->getThreadExecutionMask(),
|
||||
localIdsGenerationByRuntime,
|
||||
inlineDataProgramming,
|
||||
isIndirect,
|
||||
dispatchInterface->getRequiredWorkgroupOrder());
|
||||
|
||||
using POSTSYNC_DATA = typename Family::POSTSYNC_DATA;
|
||||
auto &postSync = walkerCmd.getPostSync();
|
||||
if (eventAddress != 0) {
|
||||
postSync.setDataportPipelineFlush(true);
|
||||
postSync.setL3Flush(L3FlushEnable);
|
||||
if (isTimestampEvent) {
|
||||
postSync.setOperation(POSTSYNC_DATA::OPERATION_WRITE_TIMESTAMP);
|
||||
} else {
|
||||
uint32_t STATE_SIGNALED = 0u;
|
||||
postSync.setOperation(POSTSYNC_DATA::OPERATION_WRITE_IMMEDIATE_DATA);
|
||||
postSync.setImmediateData(STATE_SIGNALED);
|
||||
}
|
||||
UNRECOVERABLE_IF(!(isAligned<TimestampDestinationAddressAlignment>(eventAddress)));
|
||||
postSync.setDestinationAddress(eventAddress);
|
||||
|
||||
auto gmmHelper = device->getRootDeviceEnvironment().getGmmHelper();
|
||||
postSync.setMocs(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED));
|
||||
|
||||
EncodeDispatchKernel<Family>::adjustTimestampPacket(walkerCmd, hwInfo);
|
||||
}
|
||||
|
||||
walkerCmd.setPredicateEnable(isPredicate);
|
||||
|
||||
EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(idd, hwInfo);
|
||||
|
||||
EncodeDispatchKernel<Family>::appendAdditionalIDDFields(&idd, hwInfo, threadsPerThreadGroup,
|
||||
dispatchInterface->getSlmTotalSize(),
|
||||
dispatchInterface->getSlmPolicy());
|
||||
|
||||
EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(hwInfo, walkerCmd);
|
||||
|
||||
PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *device);
|
||||
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device->getDeviceBitfield(), true) &&
|
||||
!isInternal) {
|
||||
const uint64_t workPartitionAllocationGpuVa = device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
|
||||
ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,
|
||||
walkerCmd,
|
||||
device->getDeviceBitfield(),
|
||||
partitionCount,
|
||||
true,
|
||||
true,
|
||||
false,
|
||||
workPartitionAllocationGpuVa);
|
||||
} else {
|
||||
partitionCount = 1;
|
||||
auto buffer = listCmdBufferStream->getSpace(sizeof(walkerCmd));
|
||||
*(decltype(walkerCmd) *)buffer = walkerCmd;
|
||||
}
|
||||
|
||||
PreemptionHelper::applyPreemptionWaCmdsEnd<Family>(listCmdBufferStream, *device);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const HardwareInfo &hwInfo, WALKER_TYPE &walkerCmd) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
bool EncodeDispatchKernel<Family>::isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
|
||||
size_t *lws,
|
||||
std::array<uint8_t, 3> walkOrder,
|
||||
bool requireInputWalkOrder,
|
||||
uint32_t &requiredWalkOrder,
|
||||
uint32_t simd) {
|
||||
if (simd == 1) {
|
||||
return true;
|
||||
}
|
||||
bool hwGenerationOfLocalIdsEnabled = true;
|
||||
if (DebugManager.flags.EnableHwGenerationLocalIds.get() != -1) {
|
||||
hwGenerationOfLocalIdsEnabled = !!DebugManager.flags.EnableHwGenerationLocalIds.get();
|
||||
}
|
||||
if (hwGenerationOfLocalIdsEnabled) {
|
||||
if (activeChannels == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t totalLwsSize = 1u;
|
||||
for (auto dimension = 0u; dimension < activeChannels; dimension++) {
|
||||
totalLwsSize *= lws[dimension];
|
||||
}
|
||||
|
||||
if (totalLwsSize > 1024u) {
|
||||
return true;
|
||||
}
|
||||
|
||||
//make sure table below matches Hardware Spec
|
||||
constexpr uint32_t walkOrderPossibilties = 6u;
|
||||
constexpr uint8_t possibleWalkOrders[walkOrderPossibilties][3] = {{0, 1, 2},
|
||||
{0, 2, 1},
|
||||
{1, 0, 2},
|
||||
{2, 0, 1},
|
||||
{1, 2, 0},
|
||||
{2, 1, 0}};
|
||||
|
||||
//check if we need to follow kernel requirements
|
||||
if (requireInputWalkOrder) {
|
||||
for (uint32_t dimension = 0; dimension < activeChannels - 1; dimension++) {
|
||||
if (!Math::isPow2<size_t>(lws[walkOrder[dimension]])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
auto index = 0u;
|
||||
while (index < walkOrderPossibilties) {
|
||||
if (walkOrder[0] == possibleWalkOrders[index][0] &&
|
||||
walkOrder[1] == possibleWalkOrders[index][1]) {
|
||||
break;
|
||||
};
|
||||
index++;
|
||||
}
|
||||
DEBUG_BREAK_IF(index >= walkOrderPossibilties);
|
||||
|
||||
requiredWalkOrder = index;
|
||||
return false;
|
||||
}
|
||||
|
||||
//kernel doesn't specify any walk order requirements, check if we have any compatible
|
||||
for (uint32_t walkOrder = 0; walkOrder < walkOrderPossibilties; walkOrder++) {
|
||||
bool allDimensionsCompatible = true;
|
||||
for (uint32_t dimension = 0; dimension < activeChannels - 1; dimension++) {
|
||||
if (!Math::isPow2<size_t>(lws[possibleWalkOrders[walkOrder][dimension]])) {
|
||||
allDimensionsCompatible = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (allDimensionsCompatible) {
|
||||
requiredWalkOrder = walkOrder;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
|
||||
const uint32_t *startWorkGroup,
|
||||
const uint32_t *numWorkGroups,
|
||||
const uint32_t *workGroupSizes,
|
||||
uint32_t simd,
|
||||
uint32_t localIdDimensions,
|
||||
uint32_t threadsPerThreadGroup,
|
||||
uint32_t threadExecutionMask,
|
||||
bool localIdsGenerationByRuntime,
|
||||
bool inlineDataProgrammingRequired,
|
||||
bool isIndirect,
|
||||
uint32_t requiredWorkGroupOrder) {
|
||||
|
||||
if (isIndirect) {
|
||||
walkerCmd.setIndirectParameterEnable(true);
|
||||
} else {
|
||||
walkerCmd.setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
|
||||
walkerCmd.setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
|
||||
walkerCmd.setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
|
||||
}
|
||||
|
||||
if (startWorkGroup) {
|
||||
walkerCmd.setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroup[0]));
|
||||
walkerCmd.setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroup[1]));
|
||||
walkerCmd.setThreadGroupIdStartingZ(static_cast<uint32_t>(startWorkGroup[2]));
|
||||
}
|
||||
|
||||
uint64_t executionMask = threadExecutionMask;
|
||||
if (executionMask == 0) {
|
||||
auto workGroupSize = workGroupSizes[0] * workGroupSizes[1] * workGroupSizes[2];
|
||||
auto remainderSimdLanes = workGroupSize & (simd - 1);
|
||||
executionMask = maxNBitValue(remainderSimdLanes);
|
||||
if (!executionMask) {
|
||||
executionMask = maxNBitValue((simd == 1) ? 32 : simd);
|
||||
}
|
||||
}
|
||||
|
||||
walkerCmd.setExecutionMask(static_cast<uint32_t>(executionMask));
|
||||
walkerCmd.setSimdSize(getSimdConfig<WALKER_TYPE>(simd));
|
||||
|
||||
walkerCmd.setMessageSimd(walkerCmd.getSimdSize());
|
||||
|
||||
//1) cross-thread inline data will be put into R1, but if kernel uses local ids, then cross-thread should be put further back
|
||||
//so whenever local ids are driver or hw generated, reserve space by setting right values for emitLocalIds
|
||||
//2) Auto-generation of local ids should be possible, when in fact local ids are used
|
||||
if (!localIdsGenerationByRuntime && localIdDimensions > 0) {
|
||||
UNRECOVERABLE_IF(localIdDimensions != 3);
|
||||
uint32_t emitLocalIdsForDim = (1 << 0) | (1 << 1) | (1 << 2);
|
||||
walkerCmd.setEmitLocalId(emitLocalIdsForDim);
|
||||
|
||||
walkerCmd.setLocalXMaximum(static_cast<uint32_t>(workGroupSizes[0] - 1));
|
||||
walkerCmd.setLocalYMaximum(static_cast<uint32_t>(workGroupSizes[1] - 1));
|
||||
walkerCmd.setLocalZMaximum(static_cast<uint32_t>(workGroupSizes[2] - 1));
|
||||
|
||||
walkerCmd.setGenerateLocalId(1);
|
||||
walkerCmd.setWalkOrder(requiredWorkGroupOrder);
|
||||
}
|
||||
if (inlineDataProgrammingRequired == true) {
|
||||
walkerCmd.setEmitInlineParameter(1);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart, Vec3<size_t> groupCount,
|
||||
bool isInternal) {
|
||||
size_t totalSize = sizeof(WALKER_TYPE);
|
||||
totalSize += PreemptionHelper::getPreemptionWaCsSize<Family>(*device);
|
||||
totalSize += EncodeStates<Family>::getAdjustStateComputeModeSize();
|
||||
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams();
|
||||
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
|
||||
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device->getDeviceBitfield(), true) &&
|
||||
!isInternal) {
|
||||
const bool staticPartitioning = device->getDefaultEngine().commandStreamReceiver->isStaticWorkPartitioningEnabled();
|
||||
totalSize += ImplicitScalingDispatch<Family>::getSize(true, staticPartitioning, device->getDeviceBitfield(), groupStart, groupCount);
|
||||
}
|
||||
|
||||
return totalSize;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeStateBaseAddress<Family>::encode(CommandContainer &container, STATE_BASE_ADDRESS &sbaCmd) {
|
||||
auto gmmHelper = container.getDevice()->getRootDeviceEnvironment().getGmmHelper();
|
||||
uint32_t statelessMocsIndex = (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1);
|
||||
EncodeStateBaseAddress<Family>::encode(container, sbaCmd, statelessMocsIndex, false);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeStateBaseAddress<Family>::encode(CommandContainer &container, STATE_BASE_ADDRESS &sbaCmd, uint32_t statelessMocsIndex, bool useGlobalAtomics) {
|
||||
auto gmmHelper = container.getDevice()->getRootDeviceEnvironment().getGmmHelper();
|
||||
bool multiOsContextCapable =
|
||||
ImplicitScalingHelper::isImplicitScalingEnabled(container.getDevice()->getDeviceBitfield(), true);
|
||||
|
||||
StateBaseAddressHelper<Family>::programStateBaseAddress(
|
||||
&sbaCmd,
|
||||
container.isHeapDirty(HeapType::DYNAMIC_STATE) ? container.getIndirectHeap(HeapType::DYNAMIC_STATE) : nullptr,
|
||||
container.isHeapDirty(HeapType::INDIRECT_OBJECT) ? container.getIndirectHeap(HeapType::INDIRECT_OBJECT) : nullptr,
|
||||
container.isHeapDirty(HeapType::SURFACE_STATE) ? container.getIndirectHeap(HeapType::SURFACE_STATE) : nullptr,
|
||||
0,
|
||||
true,
|
||||
statelessMocsIndex,
|
||||
container.getIndirectObjectHeapBaseAddress(),
|
||||
container.getInstructionHeapBaseAddress(),
|
||||
0,
|
||||
true,
|
||||
false,
|
||||
gmmHelper,
|
||||
multiOsContextCapable,
|
||||
MemoryCompressionState::NotApplicable,
|
||||
useGlobalAtomics,
|
||||
1u);
|
||||
|
||||
auto pCmd = reinterpret_cast<STATE_BASE_ADDRESS *>(container.getCommandStream()->getSpace(sizeof(STATE_BASE_ADDRESS)));
|
||||
*pCmd = sbaCmd;
|
||||
|
||||
if (container.isHeapDirty(HeapType::SURFACE_STATE)) {
|
||||
auto heap = container.getIndirectHeap(HeapType::SURFACE_STATE);
|
||||
auto cmd = Family::cmdInitStateBindingTablePoolAlloc;
|
||||
cmd.setBindingTablePoolBaseAddress(heap->getHeapGpuBase());
|
||||
cmd.setBindingTablePoolBufferSize(heap->getHeapSizeInPages());
|
||||
cmd.setSurfaceObjectControlStateIndexToMocsTables(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_STATE_HEAP_BUFFER));
|
||||
|
||||
auto buffer = container.getCommandStream()->getSpace(sizeof(cmd));
|
||||
*(typename Family::_3DSTATE_BINDING_TABLE_POOL_ALLOC *)buffer = cmd;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeComputeMode<Family>::adjustComputeMode(LinearStream &csr, void *const stateComputeModePtr, StateComputeModeProperties &properties) {
|
||||
using STATE_COMPUTE_MODE = typename Family::STATE_COMPUTE_MODE;
|
||||
using FORCE_NON_COHERENT = typename STATE_COMPUTE_MODE::FORCE_NON_COHERENT;
|
||||
|
||||
STATE_COMPUTE_MODE stateComputeMode = (stateComputeModePtr != nullptr) ? *(static_cast<STATE_COMPUTE_MODE *>(stateComputeModePtr)) : Family::cmdInitStateComputeMode;
|
||||
auto maskBits = stateComputeMode.getMaskBits();
|
||||
|
||||
if (properties.isCoherencyRequired.isDirty) {
|
||||
FORCE_NON_COHERENT coherencyValue = !properties.isCoherencyRequired.value ? FORCE_NON_COHERENT::FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT
|
||||
: FORCE_NON_COHERENT::FORCE_NON_COHERENT_FORCE_DISABLED;
|
||||
stateComputeMode.setForceNonCoherent(coherencyValue);
|
||||
maskBits |= Family::stateComputeModeForceNonCoherentMask;
|
||||
}
|
||||
|
||||
if (properties.largeGrfMode.isDirty) {
|
||||
stateComputeMode.setLargeGrfMode(properties.largeGrfMode.value);
|
||||
maskBits |= Family::stateComputeModeLargeGrfModeMask;
|
||||
}
|
||||
|
||||
stateComputeMode.setMaskBits(maskBits);
|
||||
|
||||
auto buffer = csr.getSpaceForCmd<STATE_COMPUTE_MODE>();
|
||||
*buffer = stateComputeMode;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeComputeMode<Family>::adjustPipelineSelect(CommandContainer &container, const NEO::KernelDescriptor &kernelDescriptor) {
|
||||
using PIPELINE_SELECT = typename Family::PIPELINE_SELECT;
|
||||
auto pipelineSelectCmd = Family::cmdInitPipelineSelect;
|
||||
|
||||
if (kernelDescriptor.extendedInfo && kernelDescriptor.extendedInfo->specialPipelineSelectModeRequired()) {
|
||||
pipelineSelectCmd.setSystolicModeEnable(true);
|
||||
} else {
|
||||
pipelineSelectCmd.setSystolicModeEnable(false);
|
||||
}
|
||||
|
||||
if (DebugManager.flags.OverrideSystolicPipelineSelect.get() != -1) {
|
||||
pipelineSelectCmd.setSystolicModeEnable(DebugManager.flags.OverrideSystolicPipelineSelect.get());
|
||||
}
|
||||
|
||||
pipelineSelectCmd.setMaskBits(pipelineSelectSystolicModeEnableMaskBits);
|
||||
pipelineSelectCmd.setPipelineSelection(PIPELINE_SELECT::PIPELINE_SELECTION_GPGPU);
|
||||
|
||||
auto buffer = container.getCommandStream()->getSpace(sizeof(pipelineSelectCmd));
|
||||
*(decltype(pipelineSelectCmd) *)buffer = pipelineSelectCmd;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline void EncodeMediaInterfaceDescriptorLoad<Family>::encode(CommandContainer &container) {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeMiFlushDW<Family>::appendMiFlushDw(MI_FLUSH_DW *miFlushDwCmd) {
|
||||
miFlushDwCmd->setFlushCcs(1);
|
||||
miFlushDwCmd->setFlushLlc(1);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeMiFlushDW<Family>::programMiFlushDwWA(LinearStream &commandStream) {
|
||||
auto miFlushDwCmd = commandStream.getSpaceForCmd<MI_FLUSH_DW>();
|
||||
*miFlushDwCmd = Family::cmdInitMiFlushDw;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeMiFlushDW<Family>::getMiFlushDwWaSize() {
|
||||
return sizeof(typename Family::MI_FLUSH_DW);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
bool EncodeSurfaceState<Family>::doBindingTablePrefetch() {
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void EncodeSurfaceState<GfxFamily>::encodeExtraBufferParams(R_SURFACE_STATE *surfaceState, GraphicsAllocation *allocation, GmmHelper *gmmHelper,
|
||||
bool isReadOnly, uint32_t numAvailableDevices, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) {
|
||||
Gmm *gmm = allocation ? allocation->getDefaultGmm() : nullptr;
|
||||
uint32_t compressionFormat = 0;
|
||||
|
||||
bool setConstCachePolicy = false;
|
||||
if (allocation && allocation->getAllocationType() == GraphicsAllocation::AllocationType::CONSTANT_SURFACE) {
|
||||
setConstCachePolicy = true;
|
||||
}
|
||||
|
||||
if (surfaceState->getMemoryObjectControlState() == gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) &&
|
||||
DebugManager.flags.ForceL1Caching.get() != 0) {
|
||||
setConstCachePolicy = true;
|
||||
}
|
||||
|
||||
if (setConstCachePolicy == true) {
|
||||
surfaceState->setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CONST));
|
||||
}
|
||||
|
||||
encodeExtraCacheSettings(surfaceState, *gmmHelper->getHardwareInfo());
|
||||
DeviceBitfield deviceBitfield{static_cast<uint32_t>(maxNBitValue(numAvailableDevices))};
|
||||
bool implicitScaling = ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, true);
|
||||
bool enablePartialWrites = implicitScaling;
|
||||
bool enableMultiGpuAtomics = enablePartialWrites;
|
||||
|
||||
if (DebugManager.flags.EnableMultiGpuAtomicsOptimization.get()) {
|
||||
enableMultiGpuAtomics = useGlobalAtomics && (enablePartialWrites || areMultipleSubDevicesInContext);
|
||||
}
|
||||
|
||||
surfaceState->setDisableSupportForMultiGpuAtomics(!enableMultiGpuAtomics);
|
||||
surfaceState->setDisableSupportForMultiGpuPartialWrites(!enablePartialWrites);
|
||||
|
||||
if (DebugManager.flags.ForceMultiGpuAtomics.get() != -1) {
|
||||
surfaceState->setDisableSupportForMultiGpuAtomics(!!DebugManager.flags.ForceMultiGpuAtomics.get());
|
||||
}
|
||||
|
||||
if (DebugManager.flags.ForceMultiGpuPartialWrites.get() != -1) {
|
||||
surfaceState->setDisableSupportForMultiGpuPartialWrites(!!DebugManager.flags.ForceMultiGpuPartialWrites.get());
|
||||
}
|
||||
|
||||
if (EncodeSurfaceState<GfxFamily>::isAuxModeEnabled(surfaceState, gmm)) {
|
||||
auto resourceFormat = gmm->gmmResourceInfo->getResourceFormat();
|
||||
compressionFormat = gmmHelper->getClientContext()->getSurfaceStateCompressionFormat(resourceFormat);
|
||||
|
||||
if (DebugManager.flags.ForceBufferCompressionFormat.get() != -1) {
|
||||
compressionFormat = DebugManager.flags.ForceBufferCompressionFormat.get();
|
||||
}
|
||||
}
|
||||
|
||||
if (DebugManager.flags.EnableStatelessCompressionWithUnifiedMemory.get()) {
|
||||
if (allocation && !MemoryPool::isSystemMemoryPool(allocation->getMemoryPool())) {
|
||||
setCoherencyType(surfaceState, R_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT);
|
||||
setBufferAuxParamsForCCS(surfaceState);
|
||||
compressionFormat = DebugManager.flags.FormatForStatelessCompressionWithUnifiedMemory.get();
|
||||
}
|
||||
}
|
||||
|
||||
surfaceState->setCompressionFormat(compressionFormat);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline void EncodeSurfaceState<Family>::setCoherencyType(R_SURFACE_STATE *surfaceState, COHERENCY_TYPE coherencyType) {
|
||||
surfaceState->setCoherencyType(R_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeSempahore<Family>::programMiSemaphoreWait(MI_SEMAPHORE_WAIT *cmd,
|
||||
uint64_t compareAddress,
|
||||
uint32_t compareData,
|
||||
COMPARE_OPERATION compareMode,
|
||||
bool registerPollMode) {
|
||||
MI_SEMAPHORE_WAIT localCmd = Family::cmdInitMiSemaphoreWait;
|
||||
localCmd.setCompareOperation(compareMode);
|
||||
localCmd.setSemaphoreDataDword(compareData);
|
||||
localCmd.setSemaphoreGraphicsAddress(compareAddress);
|
||||
localCmd.setWaitMode(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE);
|
||||
localCmd.setRegisterPollMode(registerPollMode ? MI_SEMAPHORE_WAIT::REGISTER_POLL_MODE::REGISTER_POLL_MODE_REGISTER_POLL : MI_SEMAPHORE_WAIT::REGISTER_POLL_MODE::REGISTER_POLL_MODE_MEMORY_POLL);
|
||||
|
||||
*cmd = localCmd;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline void EncodeWA<Family>::encodeAdditionalPipelineSelect(Device &device, LinearStream &stream, bool is3DPipeline) {}
|
||||
|
||||
template <typename Family>
|
||||
inline size_t EncodeWA<Family>::getAdditionalPipelineSelectSize(Device &device) {
|
||||
return 0u;
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
@@ -8,6 +8,7 @@ set(NEO_CORE_COMMAND_CONTAINER_IMAGE_SURFACE_STATE
    ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
    ${CMAKE_CURRENT_SOURCE_DIR}/compression_params_bdw_plus.inl
    ${CMAKE_CURRENT_SOURCE_DIR}/compression_params_tgllp_plus.inl
    ${CMAKE_CURRENT_SOURCE_DIR}/compression_params_xehp_plus.inl
)

set_property(GLOBAL APPEND PROPERTY NEO_CORE_COMMAND_CONTAINER ${NEO_CORE_COMMAND_CONTAINER_IMAGE_SURFACE_STATE})
compression_params_xehp_plus.inl (new file, 36 lines)
@@ -0,0 +1,36 @@
/*
 * Copyright (C) 2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/gmm_helper/resource_info.h"

#include "gmm_client_context.h"

namespace NEO {

template <typename Family>
void EncodeSurfaceState<Family>::appendImageCompressionParams(R_SURFACE_STATE *surfaceState, GraphicsAllocation *allocation, GmmHelper *gmmHelper, bool imageFromBuffer) {
    const auto ccsMode = R_SURFACE_STATE::AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E;
    if ((ccsMode == surfaceState->getAuxiliarySurfaceMode() || surfaceState->getMemoryCompressionEnable())) {
        uint8_t compressionFormat;
        auto gmmResourceInfo = allocation->getDefaultGmm()->gmmResourceInfo.get();
        if (gmmResourceInfo->getResourceFlags()->Info.MediaCompressed) {
            compressionFormat = gmmHelper->getClientContext()->getMediaSurfaceStateCompressionFormat(gmmResourceInfo->getResourceFormat());
        } else {
            compressionFormat = gmmHelper->getClientContext()->getSurfaceStateCompressionFormat(gmmResourceInfo->getResourceFormat());
        }

        if (imageFromBuffer) {
            if (DebugManager.flags.ForceBufferCompressionFormat.get() != -1) {
                compressionFormat = DebugManager.flags.ForceBufferCompressionFormat.get();
            }
            appendParamsForImageFromBuffer(surfaceState);
        }

        surfaceState->setCompressionFormat(compressionFormat);
    }
}
} // namespace NEO
shared/source/command_container/implicit_scaling.cpp (new file, 37 lines)
@@ -0,0 +1,37 @@
/*
 * Copyright (C) 2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/command_container/implicit_scaling.h"

#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/os_interface/os_interface.h"

namespace NEO {

bool ImplicitScalingHelper::isImplicitScalingEnabled(const DeviceBitfield &devices, bool preCondition) {
    bool partitionWalker = (devices.count() > 1u) &&
                           preCondition &&
                           ImplicitScaling::apiSupport;

    if (DebugManager.flags.EnableWalkerPartition.get() != -1) {
        partitionWalker = !!DebugManager.flags.EnableWalkerPartition.get();
    }
    //we can't do this without local memory
    partitionWalker &= OSInterface::osEnableLocalMemory;

    return partitionWalker;
}

bool ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired() {
    auto synchronizeBeforeExecution = false;
    if (DebugManager.flags.SynchronizeWalkerInWparidMode.get() != -1) {
        synchronizeBeforeExecution = static_cast<bool>(DebugManager.flags.SynchronizeWalkerInWparidMode.get());
    }
    return synchronizeBeforeExecution;
}

} // namespace NEO
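Note (illustration only, not part of the commit): the helper above is the single gate for multi-tile walker partitioning. The following self-contained C++ sketch restates the same decision, with std::bitset standing in for NEO::DeviceBitfield and plain parameters standing in for the EnableWalkerPartition debug flag, the API-support switch, and the OS local-memory input.

// --- illustrative sketch, not part of the commit ---
#include <bitset>
#include <iostream>

bool isImplicitScalingEnabledSketch(const std::bitset<32> &devices, bool preCondition,
                                    int enableWalkerPartitionFlag, // -1 means "not overridden"
                                    bool apiSupport, bool osEnableLocalMemory) {
    // More than one tile must be active and the caller plus the API must opt in.
    bool partitionWalker = (devices.count() > 1u) && preCondition && apiSupport;

    // A debug flag can force the decision either way.
    if (enableWalkerPartitionFlag != -1) {
        partitionWalker = (enableWalkerPartitionFlag != 0);
    }
    // Walker partitioning relies on local memory being enabled.
    partitionWalker &= osEnableLocalMemory;
    return partitionWalker;
}

int main() {
    std::bitset<32> twoTiles{0b11};
    std::cout << isImplicitScalingEnabledSketch(twoTiles, true, -1, true, true) << "\n"; // 1
    std::cout << isImplicitScalingEnabledSketch(twoTiles, true, 0, true, true) << "\n";  // 0
}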
shared/source/command_container/implicit_scaling.h (new file, 44 lines)
@@ -0,0 +1,44 @@
/*
 * Copyright (C) 2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "shared/source/helpers/common_types.h"
#include "shared/source/helpers/vec.h"

namespace NEO {
class LinearStream;

namespace ImplicitScaling {
extern bool apiSupport;
}

struct ImplicitScalingHelper {
    static bool isImplicitScalingEnabled(const DeviceBitfield &devices, bool preCondition);
    static bool isSynchronizeBeforeExecutionRequired();
};

template <typename GfxFamily>
struct ImplicitScalingDispatch {
    using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;

    static size_t getSize(bool nativeCrossTileAtomicSync,
                          bool preferStaticPartitioning,
                          const DeviceBitfield &devices,
                          Vec3<size_t> groupStart,
                          Vec3<size_t> groupCount);
    static void dispatchCommands(LinearStream &commandStream,
                                 WALKER_TYPE &walkerCmd,
                                 const DeviceBitfield &devices,
                                 uint32_t &partitionCount,
                                 bool useSecondaryBatchBuffer,
                                 bool nativeCrossTileAtomicSync,
                                 bool usesImages,
                                 uint64_t workPartitionAllocationGpuVa);
};

} // namespace NEO
shared/source/command_container/implicit_scaling_xehp_plus.inl (new file, 83 lines)
@@ -0,0 +1,83 @@
/*
 * Copyright (C) 2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_container/walker_partition_xehp_plus.h"
#include "shared/source/command_stream/linear_stream.h"

namespace NEO {

template <typename GfxFamily>
size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool nativeCrossTileAtomicSync,
                                                   bool preferStaticPartitioning,
                                                   const DeviceBitfield &devices,
                                                   Vec3<size_t> groupStart,
                                                   Vec3<size_t> groupCount) {
    typename GfxFamily::COMPUTE_WALKER::PARTITION_TYPE partitionType{};
    bool staticPartitioning = false;
    const uint32_t tileCount = static_cast<uint32_t>(devices.count());
    const uint32_t partitionCount = WalkerPartition::computePartitionCountAndPartitionType<GfxFamily>(tileCount,
                                                                                                      preferStaticPartitioning,
                                                                                                      groupStart,
                                                                                                      groupCount,
                                                                                                      {},
                                                                                                      &partitionType,
                                                                                                      &staticPartitioning);
    UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount));

    auto synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
    return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(
        false, 16u, synchronizeBeforeExecution, nativeCrossTileAtomicSync, staticPartitioning));
}

template <typename GfxFamily>
void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandStream,
                                                          WALKER_TYPE &walkerCmd,
                                                          const DeviceBitfield &devices,
                                                          uint32_t &partitionCount,
                                                          bool useSecondaryBatchBuffer,
                                                          bool nativeCrossTileAtomicSync,
                                                          bool usesImages,
                                                          uint64_t workPartitionAllocationGpuVa) {
    uint32_t totalProgrammedSize = 0u;
    const uint32_t tileCount = static_cast<uint32_t>(devices.count());
    const bool preferStaticPartitioning = workPartitionAllocationGpuVa != 0u;

    bool staticPartitioning = false;
    partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily>(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning);
    const bool synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
    if (staticPartitioning) {
        UNRECOVERABLE_IF(tileCount != partitionCount);
        WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
                                                                                commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
                                                                                &walkerCmd,
                                                                                totalProgrammedSize,
                                                                                partitionCount,
                                                                                tileCount,
                                                                                synchronizeBeforeExecution,
                                                                                useSecondaryBatchBuffer,
                                                                                nativeCrossTileAtomicSync,
                                                                                workPartitionAllocationGpuVa);
    } else {
        if (DebugManager.flags.ExperimentalSetWalkerPartitionCount.get()) {
            partitionCount = DebugManager.flags.ExperimentalSetWalkerPartitionCount.get();
            if (partitionCount == 1u) {
                walkerCmd.setPartitionType(GfxFamily::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
            }
        }

        WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
                                                                                 commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
                                                                                 &walkerCmd, totalProgrammedSize,
                                                                                 partitionCount, tileCount,
                                                                                 false, synchronizeBeforeExecution, useSecondaryBatchBuffer,
                                                                                 nativeCrossTileAtomicSync);
    }
    commandStream.getSpace(totalProgrammedSize);
}

} // namespace NEO
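Note (illustration only, not part of the commit): in the dispatch path above, the partition commands are built directly at the stream's current write pointer obtained via commandStream.getSpace(0u), and the stream is advanced afterwards by exactly the number of bytes the construct* helpers report in totalProgrammedSize via commandStream.getSpace(totalProgrammedSize). The minimal sketch below shows that reserve-after-build pattern with a hypothetical ByteStream stand-in rather than NEO's LinearStream.

// --- illustrative sketch with a hypothetical ByteStream, not part of the commit ---
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Stand-in for a linear command stream: getSpace(0) peeks at the current write
// pointer without advancing, getSpace(n) advances the used counter by n bytes.
struct ByteStream {
    std::vector<uint8_t> storage = std::vector<uint8_t>(4096);
    size_t used = 0;
    void *getSpace(size_t size) {
        void *ptr = storage.data() + used;
        used += size;
        return ptr;
    }
};

int main() {
    ByteStream stream;
    uint32_t totalProgrammedSize = 0;
    void *writePtr = stream.getSpace(0);          // peek, do not advance
    const uint32_t payload[] = {0x1u, 0x2u, 0x3u};
    std::memcpy(writePtr, payload, sizeof(payload));
    totalProgrammedSize += sizeof(payload);       // builders accumulate what they emit
    stream.getSpace(totalProgrammedSize);         // commit exactly what was written
    return stream.used == sizeof(payload) ? 0 : 1;
}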
shared/source/command_container/walker_partition_xehp_plus.h (new file, 730 lines)
@@ -0,0 +1,730 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "shared/source/command_container/command_encoder.h"
|
||||
#include "shared/source/debug_settings/debug_settings_manager.h"
|
||||
#include "shared/source/helpers/basic_math.h"
|
||||
#include "shared/source/helpers/hw_helper.h"
|
||||
#include "shared/source/helpers/ptr_math.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <optional>
|
||||
|
||||
namespace WalkerPartition {
|
||||
|
||||
template <typename GfxFamily>
|
||||
using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
|
||||
template <typename GfxFamily>
|
||||
using POSTSYNC_DATA = typename GfxFamily::POSTSYNC_DATA;
|
||||
template <typename GfxFamily>
|
||||
using BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
|
||||
template <typename GfxFamily>
|
||||
using BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;
|
||||
template <typename GfxFamily>
|
||||
using LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
|
||||
template <typename GfxFamily>
|
||||
using LOAD_REGISTER_MEM = typename GfxFamily::MI_LOAD_REGISTER_MEM;
|
||||
template <typename GfxFamily>
|
||||
using MI_SET_PREDICATE = typename GfxFamily::MI_SET_PREDICATE;
|
||||
template <typename GfxFamily>
|
||||
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
|
||||
template <typename GfxFamily>
|
||||
using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;
|
||||
template <typename GfxFamily>
|
||||
using DATA_SIZE = typename GfxFamily::MI_ATOMIC::DATA_SIZE;
|
||||
template <typename GfxFamily>
|
||||
using LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG;
|
||||
template <typename GfxFamily>
|
||||
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
|
||||
template <typename GfxFamily>
|
||||
using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM;
|
||||
|
||||
constexpr uint32_t wparidCCSOffset = 0x221C;
|
||||
constexpr uint32_t addressOffsetCCSOffset = 0x23B4;
|
||||
constexpr uint32_t predicationMaskCCSOffset = 0x21FC;
|
||||
|
||||
constexpr uint32_t generalPurposeRegister0 = 0x2600;
|
||||
constexpr uint32_t generalPurposeRegister1 = 0x2608;
|
||||
constexpr uint32_t generalPurposeRegister2 = 0x2610;
|
||||
constexpr uint32_t generalPurposeRegister3 = 0x2618;
|
||||
constexpr uint32_t generalPurposeRegister4 = 0x2620;
|
||||
constexpr uint32_t generalPurposeRegister5 = 0x2628;
|
||||
constexpr uint32_t generalPurposeRegister6 = 0x2630;
|
||||
|
||||
struct BatchBufferControlData {
|
||||
uint32_t partitionCount = 0u;
|
||||
uint32_t tileCount = 0u;
|
||||
uint32_t inTileCount = 0u;
|
||||
uint32_t finalSyncTileCount = 0u;
|
||||
};
|
||||
static constexpr inline size_t dynamicPartitioningFieldsForCleanupCount = sizeof(BatchBufferControlData) / sizeof(uint32_t) - 1;
|
||||
|
||||
template <typename Command>
|
||||
Command *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed) {
|
||||
totalBytesProgrammed += sizeof(Command);
|
||||
auto commandToReturn = reinterpret_cast<Command *>(inputAddress);
|
||||
inputAddress = ptrOffset(inputAddress, sizeof(Command));
|
||||
return commandToReturn;
|
||||
}
|
||||
|
||||
bool inline isSemaphoreProgrammingRequired() {
|
||||
auto semaphoreProgrammingRequired = false;
|
||||
if (NEO::DebugManager.flags.ExperimentalSynchronizeWithSemaphores.get() == 1) {
|
||||
semaphoreProgrammingRequired = true;
|
||||
}
|
||||
return semaphoreProgrammingRequired;
|
||||
}
|
||||
|
||||
bool inline isCrossTileAtomicRequired() {
|
||||
auto crossTileAtomicSynchronization = true;
|
||||
if (NEO::DebugManager.flags.ExperimentalForceCrossAtomicSynchronization.get() == 0) {
|
||||
crossTileAtomicSynchronization = false;
|
||||
}
|
||||
return crossTileAtomicSynchronization;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitionCount,
|
||||
bool preferStaticPartitioning,
|
||||
Vec3<size_t> groupStart,
|
||||
Vec3<size_t> groupCount,
|
||||
std::optional<typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE> requestedPartitionType,
|
||||
typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE *outSelectedPartitionType,
|
||||
bool *outSelectStaticPartitioning) {
|
||||
// For non uniform starting point, there is no support for partition in Hardware. Disable partitioning and select dynamic algorithm
|
||||
if (groupStart.x || groupStart.y || groupStart.z) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_DISABLED;
|
||||
*outSelectStaticPartitioning = false;
|
||||
return 1u;
|
||||
}
|
||||
|
||||
size_t workgroupCount = 0u;
|
||||
bool disablePartitionForPartitionCountOne{};
|
||||
|
||||
if (NEO::DebugManager.flags.ExperimentalSetWalkerPartitionType.get() != -1) {
|
||||
requestedPartitionType = static_cast<typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE>(NEO::DebugManager.flags.ExperimentalSetWalkerPartitionType.get());
|
||||
}
|
||||
|
||||
if (requestedPartitionType.has_value()) {
|
||||
switch (requestedPartitionType.value()) {
|
||||
case COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X:
|
||||
workgroupCount = groupCount.x;
|
||||
break;
|
||||
case COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y:
|
||||
workgroupCount = groupCount.y;
|
||||
break;
|
||||
case COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Z:
|
||||
workgroupCount = groupCount.z;
|
||||
break;
|
||||
default:
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
*outSelectedPartitionType = requestedPartitionType.value();
|
||||
disablePartitionForPartitionCountOne = false;
|
||||
} else {
|
||||
const size_t maxDimension = std::max({groupCount.z, groupCount.y, groupCount.x});
|
||||
|
||||
auto goWithMaxAlgorithm = !preferStaticPartitioning;
|
||||
if (NEO::DebugManager.flags.WalkerPartitionPreferHighestDimension.get() != -1) {
|
||||
goWithMaxAlgorithm = !!!NEO::DebugManager.flags.WalkerPartitionPreferHighestDimension.get();
|
||||
}
|
||||
|
||||
//compute misaligned %, accept imbalance below threshold in favor of Z/Y/X distribution.
|
||||
const float minimalThreshold = 0.05f;
|
||||
float zImbalance = static_cast<float>(groupCount.z - alignDown(groupCount.z, preferredMinimalPartitionCount)) / static_cast<float>(groupCount.z);
|
||||
float yImbalance = static_cast<float>(groupCount.y - alignDown(groupCount.y, preferredMinimalPartitionCount)) / static_cast<float>(groupCount.y);
|
||||
|
||||
//we first try with deepest dimension to see if we can partition there
|
||||
if (groupCount.z > 1 && (zImbalance <= minimalThreshold)) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Z;
|
||||
} else if (groupCount.y > 1 && (yImbalance < minimalThreshold)) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y;
|
||||
} else if (groupCount.x % preferredMinimalPartitionCount == 0) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X;
|
||||
}
|
||||
//if we are here then there is no dimension that results in even distribution, choose max dimension to minimize impact
|
||||
else {
|
||||
goWithMaxAlgorithm = true;
|
||||
}
|
||||
|
||||
if (goWithMaxAlgorithm) {
|
||||
// default mode, select greatest dimension
|
||||
if (maxDimension == groupCount.x) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X;
|
||||
} else if (maxDimension == groupCount.y) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y;
|
||||
} else {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Z;
|
||||
}
|
||||
}
|
||||
|
||||
workgroupCount = maxDimension;
|
||||
disablePartitionForPartitionCountOne = true;
|
||||
}
|
||||
|
||||
// Static partitioning - partition count == tile count
|
||||
*outSelectStaticPartitioning = preferStaticPartitioning;
|
||||
if (preferStaticPartitioning) {
|
||||
return preferredMinimalPartitionCount;
|
||||
}
|
||||
|
||||
// Dynamic partitioning - compute optimal partition count
|
||||
size_t partitionCount = std::min(static_cast<size_t>(16u), workgroupCount);
|
||||
partitionCount = Math::prevPowerOfTwo(partitionCount);
|
||||
if (NEO::DebugManager.flags.SetMinimalPartitionSize.get() != 0) {
|
||||
const auto workgroupPerPartitionThreshold = NEO::DebugManager.flags.SetMinimalPartitionSize.get() == -1
|
||||
? 512u
|
||||
: static_cast<unsigned>(NEO::DebugManager.flags.SetMinimalPartitionSize.get());
|
||||
preferredMinimalPartitionCount = std::max(2u, preferredMinimalPartitionCount);
|
||||
|
||||
while (partitionCount > preferredMinimalPartitionCount) {
|
||||
auto workgroupsPerPartition = workgroupCount / partitionCount;
|
||||
if (workgroupsPerPartition >= workgroupPerPartitionThreshold) {
|
||||
break;
|
||||
}
|
||||
partitionCount = partitionCount / 2;
|
||||
}
|
||||
}
|
||||
|
||||
if (partitionCount == 1u && disablePartitionForPartitionCountOne) {
|
||||
*outSelectedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_DISABLED;
|
||||
}
|
||||
|
||||
return static_cast<uint32_t>(partitionCount);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
uint32_t computePartitionCountAndSetPartitionType(COMPUTE_WALKER<GfxFamily> *walker,
|
||||
uint32_t preferredMinimalPartitionCount,
|
||||
bool preferStaticPartitioning,
|
||||
bool usesImages,
|
||||
bool *outSelectStaticPartitioning) {
|
||||
const Vec3<size_t> groupStart = {walker->getThreadGroupIdStartingX(), walker->getThreadGroupIdStartingY(), walker->getThreadGroupIdStartingZ()};
|
||||
const Vec3<size_t> groupCount = {walker->getThreadGroupIdXDimension(), walker->getThreadGroupIdYDimension(), walker->getThreadGroupIdZDimension()};
|
||||
std::optional<typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE> requestedPartitionType{};
|
||||
if (usesImages) {
|
||||
requestedPartitionType = COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X;
|
||||
}
|
||||
typename COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE partitionType{};
|
||||
const auto partitionCount = computePartitionCountAndPartitionType<GfxFamily>(preferredMinimalPartitionCount,
|
||||
preferStaticPartitioning,
|
||||
groupStart,
|
||||
groupCount,
|
||||
requestedPartitionType,
|
||||
&partitionType,
|
||||
outSelectStaticPartitioning);
|
||||
walker->setPartitionType(partitionType);
|
||||
return partitionCount;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void programRegisterWithValue(void *&inputAddress, uint32_t registerOffset, uint32_t &totalBytesProgrammed, uint32_t registerValue) {
|
||||
auto loadRegisterImmediate = putCommand<LOAD_REGISTER_IMM<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
||||
LOAD_REGISTER_IMM<GfxFamily> cmd = GfxFamily::cmdInitLoadRegisterImm;
|
||||
|
||||
cmd.setRegisterOffset(registerOffset);
|
||||
cmd.setDataDword(registerValue);
|
||||
cmd.setMmioRemapEnable(true);
|
||||
*loadRegisterImmediate = cmd;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void programWaitForSemaphore(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddress, uint32_t semaphoreCompareValue, typename MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION compareOperation) {
|
||||
auto semaphoreWait = putCommand<MI_SEMAPHORE_WAIT<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
||||
MI_SEMAPHORE_WAIT<GfxFamily> cmd = GfxFamily::cmdInitMiSemaphoreWait;
|
||||
|
||||
cmd.setSemaphoreDataDword(semaphoreCompareValue);
|
||||
cmd.setSemaphoreGraphicsAddress(gpuAddress);
|
||||
cmd.setWaitMode(MI_SEMAPHORE_WAIT<GfxFamily>::WAIT_MODE::WAIT_MODE_POLLING_MODE);
|
||||
cmd.setCompareOperation(compareOperation);
|
||||
*semaphoreWait = cmd;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
bool programWparidMask(void *&inputAddress, uint32_t &totalBytesProgrammed, uint32_t partitionCount) {
|
||||
//currently only power of 2 values of partitionCount are being supported
|
||||
if (!Math::isPow2(partitionCount) || partitionCount > 16) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto mask = 0xFFE0;
|
||||
auto fillValue = 0x10;
|
||||
auto count = partitionCount;
|
||||
while (count < 16) {
|
||||
fillValue |= (fillValue >> 1);
|
||||
count *= 2;
|
||||
}
|
||||
mask |= (mask | fillValue);
|
||||
|
||||
programRegisterWithValue<GfxFamily>(inputAddress, predicationMaskCCSOffset, totalBytesProgrammed, mask);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void programWparidPredication(void *&inputAddress, uint32_t &totalBytesProgrammed, bool predicationEnabled) {
|
||||
auto miSetPredicate = putCommand<MI_SET_PREDICATE<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
||||
MI_SET_PREDICATE<GfxFamily> cmd = GfxFamily::cmdInitSetPredicate;
|
||||
|
||||
if (predicationEnabled) {
|
||||
cmd.setPredicateEnableWparid(MI_SET_PREDICATE<GfxFamily>::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE);
|
||||
} else {
|
||||
cmd.setPredicateEnable(MI_SET_PREDICATE<GfxFamily>::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE);
|
||||
}
|
||||
*miSetPredicate = cmd;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void programMiAtomic(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddress, bool requireReturnValue, typename MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES atomicOpcode) {
|
||||
auto miAtomic = putCommand<MI_ATOMIC<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
||||
NEO::EncodeAtomic<GfxFamily>::programMiAtomic(miAtomic, gpuAddress, atomicOpcode, DATA_SIZE<GfxFamily>::DATA_SIZE_DWORD,
|
||||
requireReturnValue, requireReturnValue, 0x0u, 0x0u);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void programMiBatchBufferStart(void *&inputAddress, uint32_t &totalBytesProgrammed,
|
||||
uint64_t gpuAddress, bool predicationEnabled, bool secondary) {
|
||||
auto batchBufferStart = putCommand<BATCH_BUFFER_START<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
||||
BATCH_BUFFER_START<GfxFamily> cmd = GfxFamily::cmdInitBatchBufferStart;
|
||||
|
||||
cmd.setSecondLevelBatchBuffer(static_cast<typename BATCH_BUFFER_START<GfxFamily>::SECOND_LEVEL_BATCH_BUFFER>(secondary));
|
||||
cmd.setAddressSpaceIndicator(BATCH_BUFFER_START<GfxFamily>::ADDRESS_SPACE_INDICATOR::ADDRESS_SPACE_INDICATOR_PPGTT);
|
||||
cmd.setPredicationEnable(predicationEnabled);
|
||||
cmd.setBatchBufferStartAddress(gpuAddress);
|
||||
*batchBufferStart = cmd;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void programMiLoadRegisterReg(void *&inputAddress, uint32_t &totalBytesProgrammed, uint32_t sourceRegisterOffset, uint32_t destinationRegisterOffset) {
|
||||
auto loadRegisterReg = putCommand<LOAD_REGISTER_REG<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
||||
LOAD_REGISTER_REG<GfxFamily> cmd = GfxFamily::cmdInitLoadRegisterReg;
|
||||
|
||||
cmd.setMmioRemapEnableSource(true);
|
||||
cmd.setMmioRemapEnableDestination(true);
|
||||
cmd.setSourceRegisterAddress(sourceRegisterOffset);
|
||||
cmd.setDestinationRegisterAddress(destinationRegisterOffset);
|
||||
*loadRegisterReg = cmd;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void programMiLoadRegisterMem(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddressToLoad, uint32_t destinationRegisterOffset) {
|
||||
auto loadRegisterReg = putCommand<LOAD_REGISTER_MEM<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
||||
LOAD_REGISTER_MEM<GfxFamily> cmd = GfxFamily::cmdInitLoadRegisterMem;
|
||||
|
||||
cmd.setMmioRemapEnable(true);
|
||||
cmd.setMemoryAddress(gpuAddressToLoad);
|
||||
cmd.setRegisterAddress(destinationRegisterOffset);
|
||||
*loadRegisterReg = cmd;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
void programPipeControlCommand(void *&inputAddress, uint32_t &totalBytesProgrammed, bool dcFlush) {
    auto pipeControl = putCommand<PIPE_CONTROL<GfxFamily>>(inputAddress, totalBytesProgrammed);
    PIPE_CONTROL<GfxFamily> cmd = GfxFamily::cmdInitPipeControl;

    if (NEO::MemorySynchronizationCommands<GfxFamily>::isDcFlushAllowed()) {
        cmd.setDcFlushEnable(dcFlush);
    }
    if (NEO::DebugManager.flags.DoNotFlushCaches.get()) {
        cmd.setDcFlushEnable(false);
    }
    *pipeControl = cmd;
}

template <typename GfxFamily>
void programStoreMemImmediateDword(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddress, uint32_t data) {
    auto storeDataImmediate = putCommand<MI_STORE_DATA_IMM<GfxFamily>>(inputAddress, totalBytesProgrammed);
    MI_STORE_DATA_IMM<GfxFamily> cmd = GfxFamily::cmdInitStoreDataImm;

    cmd.setAddress(gpuAddress);
    cmd.setStoreQword(false);
    cmd.setDwordLength(MI_STORE_DATA_IMM<GfxFamily>::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD);
    cmd.setDataDword0(static_cast<uint32_t>(data));

    *storeDataImmediate = cmd;
}

template <typename GfxFamily>
void programNativeCrossTileSyncControl(void *&inputAddress,
                                       uint32_t &totalBytesProgrammed,
                                       uint64_t finalSyncTileCountField) {
    programStoreMemImmediateDword<GfxFamily>(inputAddress,
                                             totalBytesProgrammed,
                                             finalSyncTileCountField,
                                             0u);
}

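// Zeroes the control-section counters so the command buffer can be safely reused. The two
// atomic-increment + semaphore-wait pairs fence the cleanup: the first makes sure no tile is
// still using the fields before they are cleared, the second makes sure every tile finished
// zeroing before any tile touches the control-section atomics again.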
template <typename GfxFamily>
void programNativeCrossTileSyncCleanup(void *&inputAddress,
                                       uint32_t &totalBytesProgrammed,
                                       uint64_t finalSyncTileCountAddress,
                                       uint64_t baseAddressForCleanup,
                                       size_t fieldsForCleanupCount,
                                       uint32_t tileCount) {
    // Synchronize tiles, so the fields are not cleared while still in use
    programMiAtomic<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
    programWaitForSemaphore<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);

    for (auto fieldIndex = 0u; fieldIndex < fieldsForCleanupCount; fieldIndex++) {
        const uint64_t addressForCleanup = baseAddressForCleanup + fieldIndex * sizeof(uint32_t);
        programStoreMemImmediateDword<GfxFamily>(inputAddress,
                                                 totalBytesProgrammed,
                                                 addressForCleanup,
                                                 0u);
    }

    //this synchronization point ensures that all tiles finished zeroing and will fairly access control section atomic variables
    programMiAtomic<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
    programWaitForSemaphore<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
}

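// Emits one semaphore wait per partition on the walker post-sync area (entries spaced 16 bytes
// apart, with the awaited data 8 bytes into each entry). A wait assumes the entry starts out as 1
// and completes once the walker's completion write overwrites it.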
template <typename GfxFamily>
void programTilesSynchronizationWithPostSyncs(void *&currentBatchBufferPointer,
                                              uint32_t &totalBytesProgrammed,
                                              COMPUTE_WALKER<GfxFamily> *inputWalker,
                                              uint32_t partitionCount) {
    const auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
    for (uint32_t partitionId = 0u; partitionId < partitionCount; partitionId++) {
        programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, postSyncAddress + partitionId * 16llu, 1u, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
    }
}

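// Cross-tile barrier: every tile increments the shared counter at atomicAddress and then waits
// until the counter reaches tileCount, so no tile proceeds before all of them have arrived here.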
template <typename GfxFamily>
void programTilesSynchronizationWithAtomics(void *&currentBatchBufferPointer,
                                            uint32_t &totalBytesProgrammed,
                                            uint64_t atomicAddress,
                                            uint32_t tileCount) {
    programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
    programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
}

template <typename GfxFamily>
uint64_t computeWalkerSectionSize() {
    return sizeof(BATCH_BUFFER_START<GfxFamily>) +
           sizeof(COMPUTE_WALKER<GfxFamily>);
}

template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncControlSectionSize() {
    return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
}

template <typename GfxFamily>
uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount) {
    return fieldsForCleanupCount * sizeof(MI_STORE_DATA_IMM<GfxFamily>) +
           2 * sizeof(MI_ATOMIC<GfxFamily>) +
           2 * sizeof(MI_SEMAPHORE_WAIT<GfxFamily>);
}

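// Byte offset, from the start of the dynamically partitioned command buffer, at which the
// BatchBufferControlData section is placed. It has to account for every command emitted before
// the control section in constructDynamicallyPartitionedCommandBuffer, since the control-data
// addresses and the jump targets are all derived from it.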
template <typename GfxFamily>
uint64_t computeControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync) {
    auto synchronizationCount = (synchronizeBeforeExecution) ? 2u : 1u;
    if (!isCrossTileAtomicRequired() && !nativeCrossTileAtomicSync) {
        synchronizationCount--;
    }

    return sizeof(LOAD_REGISTER_IMM<GfxFamily>) +
           sizeof(MI_ATOMIC<GfxFamily>) * (1u + synchronizationCount) +
           sizeof(LOAD_REGISTER_REG<GfxFamily>) +
           sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
           sizeof(BATCH_BUFFER_START<GfxFamily>) * 2 +
           sizeof(PIPE_CONTROL<GfxFamily>) +
           sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * synchronizationCount +
           (isSemaphoreProgrammingRequired() ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * partitionCount : 0u) +
           computeWalkerSectionSize<GfxFamily>() +
           (nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>() : 0u);
}

template <typename GfxFamily>
uint64_t computeWalkerSectionStart(uint32_t partitionCount,
                                   bool synchronizeBeforeExecution,
                                   bool nativeCrossTileAtomicSync) {
    return computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync) -
           computeWalkerSectionSize<GfxFamily>();
}

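// Copies the caller's COMPUTE_WALKER and, for partitionCount > 1, enables workload partitioning
// with a partition size rounded up to cover the whole dispatch. For example, with PARTITION_TYPE_X
// and 15 thread groups in X split across 4 partitions, the partition size is (15 + 3) / 4 = 4,
// so the last partition covers only the remaining 3 groups.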
template <typename GfxFamily>
void programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed,
                              COMPUTE_WALKER<GfxFamily> *inputWalker,
                              uint32_t partitionCount) {
    auto computeWalker = putCommand<COMPUTE_WALKER<GfxFamily>>(inputAddress, totalBytesProgrammed);
    COMPUTE_WALKER<GfxFamily> cmd = *inputWalker;

    if (partitionCount > 1) {
        auto partitionType = inputWalker->getPartitionType();

        assert(inputWalker->getThreadGroupIdStartingX() == 0u);
        assert(inputWalker->getThreadGroupIdStartingY() == 0u);
        assert(inputWalker->getThreadGroupIdStartingZ() == 0u);
        assert(partitionType != COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_DISABLED);

        cmd.setWorkloadPartitionEnable(true);

        auto workgroupCount = 0u;
        if (partitionType == COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X) {
            workgroupCount = inputWalker->getThreadGroupIdXDimension();
        } else if (partitionType == COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_Y) {
            workgroupCount = inputWalker->getThreadGroupIdYDimension();
        } else {
            workgroupCount = inputWalker->getThreadGroupIdZDimension();
        }

        cmd.setPartitionSize((workgroupCount + partitionCount - 1u) / partitionCount);
    }
    *computeWalker = cmd;
}

/* SAMPLE COMMAND BUFFER STRUCTURE, birds eye view for 16 partitions, 4 tiles
//initial setup section
1. MI_LOAD_REGISTER(PREDICATION_MASK, active partition mask )
//loop 1 - loop as long as there are partitions to be serviced
2. MI_ATOMIC_INC( ATOMIC LOCATION #31 within CMD buffer )
3. MI_LOAD_REGISTER_REG ( ATOMIC RESULT -> WPARID )
4. MI_SET_PREDICATE( WPARID MODE )
5. BATCH_BUFFER_START( LOCATION #28 ) // this will not be executed if the partition is outside of the active virtual partitions
//loop 1 ends here, if we are here it means there are no more partitions
6. MI_SET_PREDICATE ( OFF )
//Walker synchronization section starts here, make sure that Walker is done
7. PIPE_CONTROL ( DC_FLUSH )
//8-23. wait for all post syncs (one per partition) to make sure the whole work is done, the caller needs to set them to 1.
//now the epilogue starts: synchronize all engines prior to returning to the RING, this happens once per command buffer to make sure that all engines actually went through the cmd buffer.
//epilogue section, make sure every tile completed prior to continuing
//This is cross-tile synchronization
24. ATOMIC_INC( LOCATION #31)
25. WAIT_FOR_SEMAPHORE ( LOCATION #31, LOWER THAN 4 ) // wait till all tiles hit the atomic
26. PIPE_CONTROL ( TAG UPDATE ) (not implemented)
27. BATCH_BUFFER_START (LOCATION #32) // go to the very end
//Walker section
28. COMPUTE_WALKER
29. BATCH_BUFFER_START ( GO BACK TO #2)
//Batch Buffer Control Data section, there are no real commands here but we have memory here
//that will be updated via atomic operations.
30. uint32_t virtualPartitionID //atomic location
31. uint32_t completionTileID //all tiles need to report completion
32. BATCH_BUFFER_END ( optional )
*/

template <typename GfxFamily>
void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
                                                  uint64_t gpuAddressOfAllocation,
                                                  COMPUTE_WALKER<GfxFamily> *inputWalker,
                                                  uint32_t &totalBytesProgrammed,
                                                  uint32_t partitionCount,
                                                  uint32_t tileCount,
                                                  bool emitBatchBufferEnd,
                                                  bool synchronizeBeforeExecution,
                                                  bool secondaryBatchBuffer,
                                                  bool nativeCrossTileAtomicSync) {
    totalBytesProgrammed = 0u;
    void *currentBatchBufferPointer = cpuPointer;

    auto controlSectionOffset = computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync);
    if (synchronizeBeforeExecution) {
        auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, inTileCount);
        programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);

        //once all tiles hit the atomic, it means we may go further
        programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
    }

    programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, partitionCount);

    programMiAtomic<GfxFamily>(currentBatchBufferPointer,
                               totalBytesProgrammed,
                               gpuAddressOfAllocation + controlSectionOffset,
                               true,
                               MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);

    //move atomic result to wparid
    programMiLoadRegisterReg<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, generalPurposeRegister4, wparidCCSOffset);

    //enable predication based on the wparid value
    programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);

    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer,
                                         totalBytesProgrammed,
                                         gpuAddressOfAllocation +
                                             computeWalkerSectionStart<GfxFamily>(partitionCount,
                                                                                  synchronizeBeforeExecution,
                                                                                  nativeCrossTileAtomicSync),
                                         true,
                                         secondaryBatchBuffer);

    //disable predication so that subsequent commands are not noop'ed.
    programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, false);

    if (nativeCrossTileAtomicSync) {
        const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
        programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField);
    }

    programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);

    if (isSemaphoreProgrammingRequired()) {
        auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
        for (uint32_t partitionId = 0u; partitionId < partitionCount; partitionId++) {
            programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, postSyncAddress + partitionId * 16llu, 1u, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
        }
    }

    if (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) {
        auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, tileCount);
        programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);

        //once all tiles hit the atomic, it means we may go further
        programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
    }

    //this bb start goes to the end of the partitioned command buffer
    programMiBatchBufferStart<GfxFamily>(
        currentBatchBufferPointer,
        totalBytesProgrammed,
        gpuAddressOfAllocation + controlSectionOffset + sizeof(BatchBufferControlData),
        false,
        secondaryBatchBuffer);

    //Walker section
    programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);

    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, secondaryBatchBuffer);

    auto controlSection = reinterpret_cast<BatchBufferControlData *>(ptrOffset(cpuPointer, static_cast<size_t>(controlSectionOffset)));
    controlSection->partitionCount = 0u;
    controlSection->tileCount = 0u;
    controlSection->inTileCount = 0u;
    controlSection->finalSyncTileCount = 0u;
    totalBytesProgrammed += sizeof(BatchBufferControlData);
    currentBatchBufferPointer = ptrOffset(currentBatchBufferPointer, sizeof(BatchBufferControlData));

    if (nativeCrossTileAtomicSync) {
        const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
        programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
                                                     totalBytesProgrammed,
                                                     finalSyncTileCountAddress,
                                                     gpuAddressOfAllocation + controlSectionOffset,
                                                     dynamicPartitioningFieldsForCleanupCount,
                                                     tileCount);
    }

    if (emitBatchBufferEnd) {
        auto batchBufferEnd = putCommand<BATCH_BUFFER_END<GfxFamily>>(currentBatchBufferPointer, totalBytesProgrammed);
        *batchBufferEnd = GfxFamily::cmdInitBatchBufferEnd;
    }
}

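// Control data for the statically partitioned path. finalSyncTileCounter is intentionally not
// counted in staticPartitioningFieldsForCleanupCount below, because it is the counter the cleanup
// section itself synchronizes on while the other fields are being zeroed.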
struct StaticPartitioningControlSection {
    uint32_t synchronizeBeforeWalkerCounter = 0;
    uint32_t synchronizeAfterWalkerCounter = 0;
    uint32_t finalSyncTileCounter = 0;
};
static constexpr inline size_t staticPartitioningFieldsForCleanupCount = sizeof(StaticPartitioningControlSection) / sizeof(uint32_t) - 1;

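// Byte offset of StaticPartitioningControlSection within the statically partitioned command
// buffer; the DEBUG_BREAK_IF checks in constructStaticallyPartitionedCommandBuffer verify that it
// matches the amount of commands actually emitted before the control section.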
template <typename GfxFamily>
uint64_t computeStaticPartitioningControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync) {
    const auto beforeExecutionSyncAtomicSize = synchronizeBeforeExecution ? (sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) + sizeof(MI_ATOMIC<GfxFamily>)) : 0u;
    const auto afterExecutionSyncAtomicSize = (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) ? (sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) + sizeof(MI_ATOMIC<GfxFamily>)) : 0u;
    const auto afterExecutionSyncPostSyncSize = isSemaphoreProgrammingRequired() ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * partitionCount : 0u;
    const auto nativeCrossTileSyncSize = nativeCrossTileAtomicSync ? sizeof(MI_STORE_DATA_IMM<GfxFamily>) : 0u;
    return beforeExecutionSyncAtomicSize +
           sizeof(LOAD_REGISTER_MEM<GfxFamily>) +
           sizeof(PIPE_CONTROL<GfxFamily>) +
           sizeof(COMPUTE_WALKER<GfxFamily>) +
           nativeCrossTileSyncSize +
           afterExecutionSyncAtomicSize +
           afterExecutionSyncPostSyncSize +
           sizeof(BATCH_BUFFER_START<GfxFamily>);
}

template <typename GfxFamily>
void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
                                                 uint64_t gpuAddressOfAllocation,
                                                 COMPUTE_WALKER<GfxFamily> *inputWalker,
                                                 uint32_t &totalBytesProgrammed,
                                                 uint32_t partitionCount,
                                                 uint32_t tileCount,
                                                 bool synchronizeBeforeExecution,
                                                 bool secondaryBatchBuffer,
                                                 bool nativeCrossTileAtomicSync,
                                                 uint64_t workPartitionAllocationGpuVa) {
    totalBytesProgrammed = 0u;
    void *currentBatchBufferPointer = cpuPointer;

    // Get address of the control section
    const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync);
    const auto afterControlSectionOffset = controlSectionOffset + sizeof(StaticPartitioningControlSection);

    // Synchronize tiles before walker
    if (synchronizeBeforeExecution) {
        const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount);
    }

    // Load partition ID to wparid register and execute walker
    programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, workPartitionAllocationGpuVa, wparidCCSOffset);
    programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
    programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true); // flush L3 cache

    // Prepare for cleanup section
    if (nativeCrossTileAtomicSync) {
        const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
        programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField);
    }

    // Synchronize tiles after walker
    if (isSemaphoreProgrammingRequired()) {
        programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
    }
    if (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) {
        const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount);
    }

    // Jump over the control section
    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, secondaryBatchBuffer);

    // Control section
    DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
    StaticPartitioningControlSection *controlSection = putCommand<StaticPartitioningControlSection>(currentBatchBufferPointer, totalBytesProgrammed);
    controlSection->synchronizeBeforeWalkerCounter = 0u;
    controlSection->synchronizeAfterWalkerCounter = 0u;
    controlSection->finalSyncTileCounter = 0u;
    DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);

    // Cleanup section
    if (nativeCrossTileAtomicSync) {
        const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
        programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
                                                     totalBytesProgrammed,
                                                     finalSyncTileCountAddress,
                                                     gpuAddressOfAllocation + controlSectionOffset,
                                                     staticPartitioningFieldsForCleanupCount,
                                                     tileCount);
    }
}

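// Returns the space consumed by the matching construct*PartitionedCommandBuffer call when invoked
// with the same flags. Illustrative pairing (caller-side names are hypothetical):
//   const auto requiredSize = estimateSpaceRequiredInCommandBuffer<GfxFamily>(false, partitionCount, false, false, true);
//   // reserve at least requiredSize bytes in the command buffer allocation, then:
//   constructStaticallyPartitionedCommandBuffer<GfxFamily>(cpuPtr, gpuVa, &walkerCmd, bytesProgrammed,
//                                                          partitionCount, tileCount, false, false, false, workPartitionGpuVa);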
template <typename GfxFamily>
uint64_t estimateSpaceRequiredInCommandBuffer(bool requiresBatchBufferEnd,
                                              uint32_t partitionCount,
                                              bool synchronizeBeforeExecution,
                                              bool nativeCrossTileAtomicSync,
                                              bool staticPartitioning) {
    uint64_t size = {};
    if (staticPartitioning) {
        size += computeStaticPartitioningControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync);
        size += sizeof(StaticPartitioningControlSection);
        size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount) : 0u;
    } else {
        size += computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync);
        size += sizeof(BatchBufferControlData);
        size += requiresBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
        size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount) : 0u;
    }
    return size;
}

} // namespace WalkerPartition