878 lines
45 KiB
C++
878 lines
45 KiB
C++
/*
|
|
* Copyright (C) 2021-2024 Intel Corporation
|
|
*
|
|
* SPDX-License-Identifier: MIT
|
|
*
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "shared/source/command_container/command_encoder.h"
|
|
#include "shared/source/command_container/walker_partition_interface.h"
|
|
#include "shared/source/debug_settings/debug_settings_manager.h"
|
|
#include "shared/source/helpers/aligned_memory.h"
|
|
#include "shared/source/helpers/basic_math.h"
|
|
#include "shared/source/helpers/common_types.h"
|
|
#include "shared/source/helpers/gfx_core_helper.h"
|
|
#include "shared/source/helpers/pipe_control_args.h"
|
|
#include "shared/source/helpers/ptr_math.h"
|
|
#include "shared/source/helpers/string.h"
|
|
|
|
#include <cassert>
|
|
#include <optional>
|
|
|
|
namespace NEO {
|
|
struct PipeControlArgs;
|
|
}
|
|
|
|
namespace WalkerPartition {
|
|
|
|
template <typename GfxFamily>
|
|
using POSTSYNC_DATA = typename GfxFamily::POSTSYNC_DATA;
|
|
template <typename GfxFamily>
|
|
using BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
|
|
template <typename GfxFamily>
|
|
using BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;
|
|
template <typename GfxFamily>
|
|
using LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
|
|
template <typename GfxFamily>
|
|
using LOAD_REGISTER_MEM = typename GfxFamily::MI_LOAD_REGISTER_MEM;
|
|
template <typename GfxFamily>
|
|
using MI_SET_PREDICATE = typename GfxFamily::MI_SET_PREDICATE;
|
|
template <typename GfxFamily>
|
|
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
|
|
template <typename GfxFamily>
|
|
using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;
|
|
template <typename GfxFamily>
|
|
using DATA_SIZE = typename GfxFamily::MI_ATOMIC::DATA_SIZE;
|
|
template <typename GfxFamily>
|
|
using LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG;
|
|
template <typename GfxFamily>
|
|
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
|
|
template <typename GfxFamily>
|
|
using MI_STORE_DATA_IMM = typename GfxFamily::MI_STORE_DATA_IMM;
|
|
template <typename GfxFamily>
|
|
using POST_SYNC_OPERATION = typename PIPE_CONTROL<GfxFamily>::POST_SYNC_OPERATION;
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
void appendWalkerFields(WalkerType &walkerCmd, uint32_t tileCount);
|
|
|
|
template <typename Command>
|
|
Command *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed) {
|
|
totalBytesProgrammed += sizeof(Command);
|
|
auto commandToReturn = reinterpret_cast<Command *>(inputAddress);
|
|
inputAddress = ptrOffset(inputAddress, sizeof(Command));
|
|
return commandToReturn;
|
|
}
|
|
|
|
inline void *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed, size_t commandSize) {
|
|
totalBytesProgrammed += static_cast<uint32_t>(commandSize);
|
|
auto commandToReturn = inputAddress;
|
|
inputAddress = ptrOffset(inputAddress, commandSize);
|
|
return commandToReturn;
|
|
}
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitionCount,
|
|
bool preferStaticPartitioning,
|
|
const Vec3<size_t> &groupStart,
|
|
const Vec3<size_t> &groupCount,
|
|
std::optional<typename WalkerType::PARTITION_TYPE> requestedPartitionType,
|
|
typename WalkerType::PARTITION_TYPE *outSelectedPartitionType,
|
|
bool *outSelectStaticPartitioning) {
|
|
|
|
using PARTITION_TYPE = typename WalkerType::PARTITION_TYPE;
|
|
// For non uniform starting point, there is no support for partition in Hardware. Disable partitioning and select dynamic algorithm
|
|
if (groupStart.x || groupStart.y || groupStart.z) {
|
|
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_DISABLED;
|
|
*outSelectStaticPartitioning = false;
|
|
return 1u;
|
|
}
|
|
|
|
size_t workgroupCount = 0u;
|
|
bool disablePartitionForPartitionCountOne{};
|
|
|
|
if (NEO::debugManager.flags.ExperimentalSetWalkerPartitionType.get() != -1) {
|
|
requestedPartitionType = static_cast<PARTITION_TYPE>(NEO::debugManager.flags.ExperimentalSetWalkerPartitionType.get());
|
|
}
|
|
|
|
if (requestedPartitionType.has_value()) {
|
|
switch (requestedPartitionType.value()) {
|
|
case PARTITION_TYPE::PARTITION_TYPE_X:
|
|
workgroupCount = groupCount.x;
|
|
break;
|
|
case PARTITION_TYPE::PARTITION_TYPE_Y:
|
|
workgroupCount = groupCount.y;
|
|
break;
|
|
case PARTITION_TYPE::PARTITION_TYPE_Z:
|
|
workgroupCount = groupCount.z;
|
|
break;
|
|
default:
|
|
UNRECOVERABLE_IF(true);
|
|
}
|
|
*outSelectedPartitionType = requestedPartitionType.value();
|
|
disablePartitionForPartitionCountOne = false;
|
|
} else {
|
|
const size_t maxDimension = std::max({groupCount.z, groupCount.y, groupCount.x});
|
|
|
|
auto goWithMaxAlgorithm = !preferStaticPartitioning;
|
|
if (NEO::debugManager.flags.WalkerPartitionPreferHighestDimension.get() != -1) {
|
|
goWithMaxAlgorithm = !!!NEO::debugManager.flags.WalkerPartitionPreferHighestDimension.get();
|
|
}
|
|
|
|
// compute misaligned %, accept imbalance below threshold in favor of Z/Y/X distribution.
|
|
const float minimalThreshold = 0.05f;
|
|
float zImbalance = static_cast<float>(groupCount.z - alignDown(groupCount.z, preferredMinimalPartitionCount)) / static_cast<float>(groupCount.z);
|
|
float yImbalance = static_cast<float>(groupCount.y - alignDown(groupCount.y, preferredMinimalPartitionCount)) / static_cast<float>(groupCount.y);
|
|
|
|
// we first try with deepest dimension to see if we can partition there
|
|
if (groupCount.z > 1 && (zImbalance <= minimalThreshold)) {
|
|
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
|
|
} else if (groupCount.y > 1 && (yImbalance < minimalThreshold)) {
|
|
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
|
|
} else if (groupCount.x % preferredMinimalPartitionCount == 0) {
|
|
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
|
|
}
|
|
// if we are here then there is no dimension that results in even distribution, choose max dimension to minimize impact
|
|
else {
|
|
goWithMaxAlgorithm = true;
|
|
}
|
|
|
|
if (goWithMaxAlgorithm) {
|
|
// default mode, select greatest dimension
|
|
if (maxDimension == groupCount.x) {
|
|
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
|
|
} else if (maxDimension == groupCount.y) {
|
|
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
|
|
} else {
|
|
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
|
|
}
|
|
}
|
|
|
|
workgroupCount = maxDimension;
|
|
disablePartitionForPartitionCountOne = true;
|
|
}
|
|
|
|
// Static partitioning - partition count == tile count
|
|
*outSelectStaticPartitioning = preferStaticPartitioning;
|
|
if (preferStaticPartitioning) {
|
|
return preferredMinimalPartitionCount;
|
|
}
|
|
|
|
// Dynamic partitioning - compute optimal partition count
|
|
size_t partitionCount = std::min(static_cast<size_t>(16u), workgroupCount);
|
|
partitionCount = Math::prevPowerOfTwo(partitionCount);
|
|
if (NEO::debugManager.flags.SetMinimalPartitionSize.get() != 0) {
|
|
const auto workgroupPerPartitionThreshold = NEO::debugManager.flags.SetMinimalPartitionSize.get() == -1
|
|
? 512u
|
|
: static_cast<unsigned>(NEO::debugManager.flags.SetMinimalPartitionSize.get());
|
|
preferredMinimalPartitionCount = std::max(2u, preferredMinimalPartitionCount);
|
|
|
|
while (partitionCount > preferredMinimalPartitionCount) {
|
|
auto workgroupsPerPartition = workgroupCount / partitionCount;
|
|
if (workgroupsPerPartition >= workgroupPerPartitionThreshold) {
|
|
break;
|
|
}
|
|
partitionCount = partitionCount / 2;
|
|
}
|
|
}
|
|
|
|
if (partitionCount == 1u && disablePartitionForPartitionCountOne) {
|
|
*outSelectedPartitionType = PARTITION_TYPE::PARTITION_TYPE_DISABLED;
|
|
}
|
|
|
|
return static_cast<uint32_t>(partitionCount);
|
|
}
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
uint32_t computePartitionCountAndSetPartitionType(WalkerType *walker,
|
|
NEO::RequiredPartitionDim requiredPartitionDim,
|
|
uint32_t preferredMinimalPartitionCount,
|
|
bool preferStaticPartitioning,
|
|
bool *outSelectStaticPartitioning) {
|
|
|
|
using PARTITION_TYPE = typename WalkerType::PARTITION_TYPE;
|
|
|
|
const Vec3<size_t> groupStart = {walker->getThreadGroupIdStartingX(), walker->getThreadGroupIdStartingY(), walker->getThreadGroupIdStartingZ()};
|
|
const Vec3<size_t> groupCount = {walker->getThreadGroupIdXDimension(), walker->getThreadGroupIdYDimension(), walker->getThreadGroupIdZDimension()};
|
|
std::optional<PARTITION_TYPE> requestedPartitionType{};
|
|
|
|
switch (requiredPartitionDim) {
|
|
case NEO::RequiredPartitionDim::x:
|
|
requestedPartitionType = PARTITION_TYPE::PARTITION_TYPE_X;
|
|
break;
|
|
case NEO::RequiredPartitionDim::y:
|
|
requestedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Y;
|
|
break;
|
|
case NEO::RequiredPartitionDim::z:
|
|
requestedPartitionType = PARTITION_TYPE::PARTITION_TYPE_Z;
|
|
break;
|
|
default:
|
|
UNRECOVERABLE_IF(requiredPartitionDim != NEO::RequiredPartitionDim::none);
|
|
break;
|
|
}
|
|
|
|
PARTITION_TYPE partitionType{};
|
|
const auto partitionCount = computePartitionCountAndPartitionType<GfxFamily, WalkerType>(preferredMinimalPartitionCount,
|
|
preferStaticPartitioning,
|
|
groupStart,
|
|
groupCount,
|
|
requestedPartitionType,
|
|
&partitionType,
|
|
outSelectStaticPartitioning);
|
|
walker->setPartitionType(partitionType);
|
|
return partitionCount;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programRegisterWithValue(void *&inputAddress, uint32_t registerOffset, uint32_t &totalBytesProgrammed, uint32_t registerValue) {
|
|
auto loadRegisterImmediate = putCommand<LOAD_REGISTER_IMM<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
|
LOAD_REGISTER_IMM<GfxFamily> cmd = GfxFamily::cmdInitLoadRegisterImm;
|
|
|
|
cmd.setRegisterOffset(registerOffset);
|
|
cmd.setDataDword(registerValue);
|
|
cmd.setMmioRemapEnable(true);
|
|
*loadRegisterImmediate = cmd;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programWaitForSemaphore(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddress, uint32_t semaphoreCompareValue, typename MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION compareOperation) {
|
|
auto semaphoreWait = putCommand<MI_SEMAPHORE_WAIT<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
|
NEO::EncodeSemaphore<GfxFamily>::programMiSemaphoreWait(semaphoreWait, gpuAddress, semaphoreCompareValue, compareOperation, false, true, false, false, false);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
bool programWparidMask(void *&inputAddress, uint32_t &totalBytesProgrammed, uint32_t partitionCount) {
|
|
// currently only power of 2 values of partitionCount are being supported
|
|
if (!Math::isPow2(partitionCount) || partitionCount > 16) {
|
|
return false;
|
|
}
|
|
|
|
auto mask = 0xFFE0;
|
|
auto fillValue = 0x10;
|
|
auto count = partitionCount;
|
|
while (count < 16) {
|
|
fillValue |= (fillValue >> 1);
|
|
count *= 2;
|
|
}
|
|
mask |= (mask | fillValue);
|
|
|
|
programRegisterWithValue<GfxFamily>(inputAddress, predicationMaskCCSOffset, totalBytesProgrammed, mask);
|
|
return true;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programWparidPredication(void *&inputAddress, uint32_t &totalBytesProgrammed, bool predicationEnabled) {
|
|
auto miSetPredicate = putCommand<MI_SET_PREDICATE<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
|
MI_SET_PREDICATE<GfxFamily> cmd = GfxFamily::cmdInitSetPredicate;
|
|
|
|
if (predicationEnabled) {
|
|
cmd.setPredicateEnableWparid(MI_SET_PREDICATE<GfxFamily>::PREDICATE_ENABLE_WPARID::PREDICATE_ENABLE_WPARID_NOOP_ON_NON_ZERO_VALUE);
|
|
} else {
|
|
cmd.setPredicateEnable(MI_SET_PREDICATE<GfxFamily>::PREDICATE_ENABLE::PREDICATE_ENABLE_PREDICATE_DISABLE);
|
|
}
|
|
*miSetPredicate = cmd;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programMiAtomic(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddress, bool requireReturnValue, typename MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES atomicOpcode) {
|
|
auto miAtomic = putCommand<MI_ATOMIC<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
|
NEO::EncodeAtomic<GfxFamily>::programMiAtomic(miAtomic, gpuAddress, atomicOpcode, DATA_SIZE<GfxFamily>::DATA_SIZE_DWORD,
|
|
requireReturnValue, requireReturnValue, 0x0u, 0x0u);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programMiBatchBufferStart(void *&inputAddress, uint32_t &totalBytesProgrammed,
|
|
uint64_t gpuAddress, bool predicationEnabled, bool secondary) {
|
|
auto batchBufferStart = putCommand<BATCH_BUFFER_START<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
|
BATCH_BUFFER_START<GfxFamily> cmd = GfxFamily::cmdInitBatchBufferStart;
|
|
|
|
cmd.setSecondLevelBatchBuffer(static_cast<typename BATCH_BUFFER_START<GfxFamily>::SECOND_LEVEL_BATCH_BUFFER>(secondary));
|
|
cmd.setAddressSpaceIndicator(BATCH_BUFFER_START<GfxFamily>::ADDRESS_SPACE_INDICATOR::ADDRESS_SPACE_INDICATOR_PPGTT);
|
|
cmd.setPredicationEnable(predicationEnabled);
|
|
cmd.setBatchBufferStartAddress(gpuAddress);
|
|
*batchBufferStart = cmd;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programMiLoadRegisterReg(void *&inputAddress, uint32_t &totalBytesProgrammed, uint32_t sourceRegisterOffset, uint32_t destinationRegisterOffset) {
|
|
auto loadRegisterReg = putCommand<LOAD_REGISTER_REG<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
|
LOAD_REGISTER_REG<GfxFamily> cmd = GfxFamily::cmdInitLoadRegisterReg;
|
|
|
|
cmd.setMmioRemapEnableSource(true);
|
|
cmd.setMmioRemapEnableDestination(true);
|
|
cmd.setSourceRegisterAddress(sourceRegisterOffset);
|
|
cmd.setDestinationRegisterAddress(destinationRegisterOffset);
|
|
*loadRegisterReg = cmd;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programMiLoadRegisterMem(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddressToLoad, uint32_t destinationRegisterOffset) {
|
|
auto loadRegisterReg = putCommand<LOAD_REGISTER_MEM<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
|
LOAD_REGISTER_MEM<GfxFamily> cmd = GfxFamily::cmdInitLoadRegisterMem;
|
|
|
|
cmd.setMmioRemapEnable(true);
|
|
cmd.setMemoryAddress(gpuAddressToLoad);
|
|
cmd.setRegisterAddress(destinationRegisterOffset);
|
|
*loadRegisterReg = cmd;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programPipeControlCommand(void *&inputAddress, uint32_t &totalBytesProgrammed, NEO::PipeControlArgs &flushArgs) {
|
|
auto singleBarrierSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(flushArgs.tlbInvalidation);
|
|
|
|
auto pipeControl = putCommand(inputAddress, totalBytesProgrammed, singleBarrierSize);
|
|
|
|
UNRECOVERABLE_IF(sizeof(PIPE_CONTROL<GfxFamily>) < singleBarrierSize);
|
|
uint8_t cmd[sizeof(PIPE_CONTROL<GfxFamily>)] = {};
|
|
|
|
NEO::MemorySynchronizationCommands<GfxFamily>::setSingleBarrier(cmd, flushArgs);
|
|
|
|
memcpy_s(pipeControl, singleBarrierSize, cmd, singleBarrierSize);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programPostSyncPipeControlCommand(void *&inputAddress,
|
|
uint32_t &totalBytesProgrammed,
|
|
WalkerPartitionArgs &args,
|
|
NEO::PipeControlArgs &flushArgs,
|
|
const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
|
|
|
|
NEO::MemorySynchronizationCommands<GfxFamily>::setBarrierWithPostSyncOperation(inputAddress,
|
|
NEO::PostSyncMode::immediateData,
|
|
args.postSyncGpuAddress,
|
|
args.postSyncImmediateValue,
|
|
rootDeviceEnvironment,
|
|
flushArgs);
|
|
|
|
totalBytesProgrammed += static_cast<uint32_t>(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, flushArgs.tlbInvalidation));
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programStoreMemImmediateDword(void *&inputAddress, uint32_t &totalBytesProgrammed, uint64_t gpuAddress, uint32_t data) {
|
|
auto storeDataImmediate = putCommand<MI_STORE_DATA_IMM<GfxFamily>>(inputAddress, totalBytesProgrammed);
|
|
MI_STORE_DATA_IMM<GfxFamily> cmd = GfxFamily::cmdInitStoreDataImm;
|
|
|
|
cmd.setAddress(gpuAddress);
|
|
cmd.setStoreQword(false);
|
|
cmd.setDwordLength(MI_STORE_DATA_IMM<GfxFamily>::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD);
|
|
cmd.setDataDword0(static_cast<uint32_t>(data));
|
|
|
|
*storeDataImmediate = cmd;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
uint64_t computeSelfCleanupSectionSize(bool useAtomicsForSelfCleanup) {
|
|
if (useAtomicsForSelfCleanup) {
|
|
return sizeof(MI_ATOMIC<GfxFamily>);
|
|
} else {
|
|
return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
|
|
}
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programSelfCleanupSection(void *&inputAddress,
|
|
uint32_t &totalBytesProgrammed,
|
|
uint64_t address,
|
|
bool useAtomicsForSelfCleanup) {
|
|
if (useAtomicsForSelfCleanup) {
|
|
programMiAtomic<GfxFamily>(inputAddress,
|
|
totalBytesProgrammed,
|
|
address,
|
|
false,
|
|
MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
|
|
} else {
|
|
programStoreMemImmediateDword<GfxFamily>(inputAddress,
|
|
totalBytesProgrammed,
|
|
address,
|
|
0u);
|
|
}
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
uint64_t computeTilesSynchronizationWithAtomicsSectionSize() {
|
|
return sizeof(MI_ATOMIC<GfxFamily>) +
|
|
NEO::EncodeSemaphore<GfxFamily>::getSizeMiSemaphoreWait();
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programTilesSynchronizationWithAtomics(void *¤tBatchBufferPointer,
|
|
uint32_t &totalBytesProgrammed,
|
|
uint64_t atomicAddress,
|
|
uint32_t tileCount) {
|
|
programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
|
|
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
uint64_t computeSelfCleanupEndSectionSize(size_t fieldsForCleanupCount, WalkerPartitionArgs &args) {
|
|
size_t extraSize = 0;
|
|
if (args.pipeControlBeforeCleanupCrossTileSync) {
|
|
extraSize += 2 * NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false);
|
|
}
|
|
return fieldsForCleanupCount * computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup) +
|
|
2 * computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() + extraSize;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void programSelfCleanupEndSection(void *&inputAddress,
|
|
uint32_t &totalBytesProgrammed,
|
|
uint64_t finalSyncTileCountAddress,
|
|
uint64_t baseAddressForCleanup,
|
|
size_t fieldsForCleanupCount,
|
|
WalkerPartitionArgs &args) {
|
|
NEO::PipeControlArgs pipeControlArgs;
|
|
if (args.pipeControlBeforeCleanupCrossTileSync) {
|
|
programPipeControlCommand<GfxFamily>(inputAddress, totalBytesProgrammed, pipeControlArgs);
|
|
}
|
|
|
|
// Synchronize tiles, so the fields are not cleared while still in use
|
|
programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, args.tileCount);
|
|
|
|
for (auto fieldIndex = 0u; fieldIndex < fieldsForCleanupCount; fieldIndex++) {
|
|
const uint64_t addressForCleanup = baseAddressForCleanup + fieldIndex * sizeof(uint32_t);
|
|
programSelfCleanupSection<GfxFamily>(inputAddress,
|
|
totalBytesProgrammed,
|
|
addressForCleanup,
|
|
args.useAtomicsForSelfCleanup);
|
|
}
|
|
|
|
if (args.pipeControlBeforeCleanupCrossTileSync) {
|
|
programPipeControlCommand<GfxFamily>(inputAddress, totalBytesProgrammed, pipeControlArgs);
|
|
}
|
|
|
|
// this synchronization point ensures that all tiles finished zeroing and will fairly access control section atomic variables
|
|
programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * args.tileCount);
|
|
}
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
void programTilesSynchronizationWithPostSyncs(void *¤tBatchBufferPointer,
|
|
uint32_t &totalBytesProgrammed,
|
|
WalkerType *inputWalker,
|
|
uint32_t partitionCount) {
|
|
const auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
|
|
for (uint32_t partitionId = 0u; partitionId < partitionCount; partitionId++) {
|
|
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, postSyncAddress + partitionId * 16llu, 1u, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
|
|
}
|
|
}
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
uint64_t computeWalkerSectionSize() {
|
|
return sizeof(BATCH_BUFFER_START<GfxFamily>) +
|
|
sizeof(WalkerType);
|
|
}
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
uint64_t computeControlSectionOffset(WalkerPartitionArgs &args) {
|
|
uint64_t size = 0u;
|
|
|
|
size += args.synchronizeBeforeExecution ? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() : 0;
|
|
size += sizeof(LOAD_REGISTER_IMM<GfxFamily>); // predication mask
|
|
size += sizeof(MI_ATOMIC<GfxFamily>); // current id for partition
|
|
size += sizeof(LOAD_REGISTER_REG<GfxFamily>); // id into register
|
|
size += sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
|
|
sizeof(BATCH_BUFFER_START<GfxFamily>) * 2;
|
|
size += (args.semaphoreProgrammingRequired ? NEO::EncodeSemaphore<GfxFamily>::getSizeMiSemaphoreWait() * args.partitionCount : 0u);
|
|
size += computeWalkerSectionSize<GfxFamily, WalkerType>();
|
|
size += args.emitPipeControlStall ? NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false) : 0u;
|
|
if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
|
|
size += computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
|
|
}
|
|
if (args.emitSelfCleanup) {
|
|
size += computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup);
|
|
}
|
|
size += args.preferredStaticPartitioning ? sizeof(LOAD_REGISTER_MEM<GfxFamily>) : 0u;
|
|
return size;
|
|
}
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
|
|
return computeControlSectionOffset<GfxFamily, WalkerType>(args) -
|
|
computeWalkerSectionSize<GfxFamily, WalkerType>();
|
|
}
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
void *programPartitionedWalker(void *&inputAddress, uint32_t &totalBytesProgrammed,
|
|
WalkerType *inputWalker,
|
|
uint32_t partitionCount,
|
|
uint32_t tileCount,
|
|
bool forceExecutionOnSingleTile) {
|
|
auto computeWalker = putCommand<WalkerType>(inputAddress, totalBytesProgrammed);
|
|
|
|
if (partitionCount > 1) {
|
|
auto partitionType = inputWalker->getPartitionType();
|
|
|
|
assert(inputWalker->getThreadGroupIdStartingX() == 0u);
|
|
assert(inputWalker->getThreadGroupIdStartingY() == 0u);
|
|
assert(inputWalker->getThreadGroupIdStartingZ() == 0u);
|
|
assert(partitionType != WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
|
|
|
|
inputWalker->setWorkloadPartitionEnable(true);
|
|
|
|
auto workgroupCount = 0u;
|
|
if (partitionType == WalkerType::PARTITION_TYPE::PARTITION_TYPE_X) {
|
|
workgroupCount = inputWalker->getThreadGroupIdXDimension();
|
|
} else if (partitionType == WalkerType::PARTITION_TYPE::PARTITION_TYPE_Y) {
|
|
workgroupCount = inputWalker->getThreadGroupIdYDimension();
|
|
} else {
|
|
workgroupCount = inputWalker->getThreadGroupIdZDimension();
|
|
}
|
|
|
|
if (forceExecutionOnSingleTile) {
|
|
inputWalker->setPartitionSize(workgroupCount);
|
|
} else {
|
|
inputWalker->setPartitionSize(Math::divideAndRoundUp(workgroupCount, partitionCount));
|
|
}
|
|
}
|
|
|
|
appendWalkerFields<GfxFamily, WalkerType>(*inputWalker, tileCount);
|
|
|
|
*computeWalker = *inputWalker;
|
|
|
|
return computeWalker;
|
|
}
|
|
|
|
/* SAMPLE COMMAND BUFFER STRUCTURE, birds eye view for 16 partitions, 4 tiles
|
|
//inital setup section
|
|
1. MI_LOAD_REGISTER(PREDICATION_MASK, active partition mask )
|
|
//loop 1 - loop as long as there are partitions to be serviced
|
|
2. MI_ATOMIC_INC( ATOMIC LOCATION #31 within CMD buffer )
|
|
3. MI_LOAD_REGISTER_REG ( ATOMIC RESULT -> WPARID )
|
|
4. MI_SET_PREDICATE( WPARID MODE )
|
|
5. BATCH_BUFFER_START( LOCATION #28 ) // this will not be executed if partition outside of active virtual partitions
|
|
//loop 1 ends here, if we are here it means there are no more partitions
|
|
6. MI_SET_PREDICATE ( OFF )
|
|
//Walker synchronization section starts here, make sure that Walker is done
|
|
7, PIPE_CONTROL ( DC_FLUSH )
|
|
//wait for all post syncs to make sure whole work is done, caller needs to set them to 1.
|
|
//now epilogue starts synchro all engines prior to coming back to RING, this will be once per command buffer to make sure that all engines actually passed via cmd buffer.
|
|
//epilogue section, make sure every tile completed prior to continuing
|
|
//This is cross-tile synchronization
|
|
24. ATOMIC_INC( LOCATION #31)
|
|
25. WAIT_FOR_SEMAPHORE ( LOCATION #31, LOWER THEN 4 ) // wait till all tiles hit atomic
|
|
26. PIPE_CONTROL ( TAG UPDATE ) (not implemented)
|
|
27. BATCH_BUFFER_STAT (LOCATION #32) // go to the very end
|
|
//Walker section
|
|
28. COMPUTE_WALKER
|
|
29. BATCH BUFFER_START ( GO BACK TO #2)
|
|
//Batch Buffer Control Data section, there are no real commands here but we have memory here
|
|
//That will be updated via atomic operations.
|
|
30. uint32_t virtualPartitionID //atomic location
|
|
31. uint32_t completionTileID //all tiles needs to report completion
|
|
32. BATCH_BUFFER_END ( optional )
|
|
*/
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
|
|
void **outWalkerPtr,
|
|
uint64_t gpuAddressOfAllocation,
|
|
WalkerType *inputWalker,
|
|
uint32_t &totalBytesProgrammed,
|
|
WalkerPartitionArgs &args,
|
|
const NEO::HardwareInfo &hwInfo) {
|
|
totalBytesProgrammed = 0u;
|
|
void *currentBatchBufferPointer = cpuPointer;
|
|
|
|
auto controlSectionOffset = computeControlSectionOffset<GfxFamily, WalkerType>(args);
|
|
if (args.synchronizeBeforeExecution) {
|
|
auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, inTileCount);
|
|
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
|
|
}
|
|
|
|
programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.partitionCount);
|
|
|
|
programMiAtomic<GfxFamily>(currentBatchBufferPointer,
|
|
totalBytesProgrammed,
|
|
gpuAddressOfAllocation + controlSectionOffset,
|
|
true,
|
|
MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
|
|
|
|
// move atomic result to wparid
|
|
programMiLoadRegisterReg<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, generalPurposeRegister4, wparidCCSOffset);
|
|
|
|
// enable predication basing on wparid value
|
|
programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);
|
|
|
|
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer,
|
|
totalBytesProgrammed,
|
|
gpuAddressOfAllocation +
|
|
computeWalkerSectionStart<GfxFamily, WalkerType>(args),
|
|
true,
|
|
args.secondaryBatchBuffer);
|
|
|
|
// disable predication to not noop subsequent commands.
|
|
programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, false);
|
|
|
|
if (args.emitSelfCleanup) {
|
|
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
|
|
programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
|
|
}
|
|
|
|
if (args.emitPipeControlStall) {
|
|
NEO::PipeControlArgs pipeControlArgs;
|
|
pipeControlArgs.dcFlushEnable = args.dcFlushEnable;
|
|
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, pipeControlArgs);
|
|
}
|
|
|
|
if (args.semaphoreProgrammingRequired) {
|
|
auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
|
|
for (uint32_t partitionId = 0u; partitionId < args.partitionCount; partitionId++) {
|
|
programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, postSyncAddress + partitionId * 16llu, 1u, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
|
|
}
|
|
}
|
|
|
|
if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
|
|
auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, tileCount);
|
|
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
|
|
}
|
|
|
|
if (args.preferredStaticPartitioning) {
|
|
programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
|
|
}
|
|
|
|
// this bb start goes to the end of partitioned command buffer
|
|
programMiBatchBufferStart<GfxFamily>(
|
|
currentBatchBufferPointer,
|
|
totalBytesProgrammed,
|
|
gpuAddressOfAllocation + controlSectionOffset + sizeof(BatchBufferControlData),
|
|
false,
|
|
args.secondaryBatchBuffer);
|
|
|
|
// Walker section
|
|
auto walkerPtr = programPartitionedWalker<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile);
|
|
if (outWalkerPtr) {
|
|
*outWalkerPtr = walkerPtr;
|
|
}
|
|
|
|
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, args.secondaryBatchBuffer);
|
|
|
|
auto controlSection = reinterpret_cast<BatchBufferControlData *>(ptrOffset(cpuPointer, static_cast<size_t>(controlSectionOffset)));
|
|
controlSection->partitionCount = 0u;
|
|
controlSection->tileCount = 0u;
|
|
controlSection->inTileCount = 0u;
|
|
controlSection->finalSyncTileCount = 0u;
|
|
totalBytesProgrammed += sizeof(BatchBufferControlData);
|
|
currentBatchBufferPointer = ptrOffset(currentBatchBufferPointer, sizeof(BatchBufferControlData));
|
|
|
|
if (args.emitSelfCleanup) {
|
|
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
|
|
programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer,
|
|
totalBytesProgrammed,
|
|
finalSyncTileCountAddress,
|
|
gpuAddressOfAllocation + controlSectionOffset,
|
|
dynamicPartitioningFieldsForCleanupCount,
|
|
args);
|
|
}
|
|
|
|
if (args.emitBatchBufferEnd) {
|
|
auto batchBufferEnd = putCommand<BATCH_BUFFER_END<GfxFamily>>(currentBatchBufferPointer, totalBytesProgrammed);
|
|
*batchBufferEnd = GfxFamily::cmdInitBatchBufferEnd;
|
|
}
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
bool isStartAndControlSectionRequired(WalkerPartitionArgs &args) {
|
|
return args.synchronizeBeforeExecution || args.crossTileAtomicSynchronization || args.emitSelfCleanup;
|
|
}
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args) {
|
|
const auto beforeExecutionSyncAtomicSize = args.synchronizeBeforeExecution
|
|
? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
|
|
: 0u;
|
|
const auto afterExecutionSyncAtomicSize = (args.crossTileAtomicSynchronization || args.emitSelfCleanup)
|
|
? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
|
|
: 0u;
|
|
const auto afterExecutionSyncPostSyncSize = args.semaphoreProgrammingRequired
|
|
? NEO::EncodeSemaphore<GfxFamily>::getSizeMiSemaphoreWait() * args.partitionCount
|
|
: 0u;
|
|
const auto selfCleanupSectionSize = args.emitSelfCleanup
|
|
? computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup)
|
|
: 0u;
|
|
const auto wparidRegisterSize = args.initializeWparidRegister
|
|
? sizeof(LOAD_REGISTER_MEM<GfxFamily>)
|
|
: 0u;
|
|
const auto pipeControlSize = args.emitPipeControlStall
|
|
? NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false)
|
|
: 0u;
|
|
const auto bbStartSize = isStartAndControlSectionRequired<GfxFamily>(args)
|
|
? sizeof(BATCH_BUFFER_START<GfxFamily>)
|
|
: 0u;
|
|
return beforeExecutionSyncAtomicSize +
|
|
wparidRegisterSize +
|
|
pipeControlSize +
|
|
sizeof(WalkerType) +
|
|
selfCleanupSectionSize +
|
|
afterExecutionSyncAtomicSize +
|
|
afterExecutionSyncPostSyncSize +
|
|
bbStartSize;
|
|
}
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
|
|
void **outWalkerPtr,
|
|
uint64_t gpuAddressOfAllocation,
|
|
WalkerType *inputWalker,
|
|
uint32_t &totalBytesProgrammed,
|
|
WalkerPartitionArgs &args,
|
|
const NEO::HardwareInfo &hwInfo) {
|
|
totalBytesProgrammed = 0u;
|
|
void *currentBatchBufferPointer = cpuPointer;
|
|
|
|
// Get address of the control section
|
|
const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset<GfxFamily, WalkerType>(args);
|
|
const auto afterControlSectionOffset = controlSectionOffset + sizeof(StaticPartitioningControlSection);
|
|
|
|
// Synchronize tiles before walker
|
|
if (args.synchronizeBeforeExecution) {
|
|
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
|
|
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
|
|
}
|
|
|
|
// Load partition ID to wparid register and execute walker
|
|
if (args.initializeWparidRegister) {
|
|
programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
|
|
}
|
|
auto walkerPtr = programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount, args.tileCount, args.forceExecutionOnSingleTile);
|
|
if (outWalkerPtr) {
|
|
*outWalkerPtr = walkerPtr;
|
|
}
|
|
|
|
// Prepare for cleanup section
|
|
if (args.emitSelfCleanup) {
|
|
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
|
|
programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
|
|
}
|
|
|
|
if (args.emitPipeControlStall) {
|
|
NEO::PipeControlArgs pipeControlArgs;
|
|
pipeControlArgs.dcFlushEnable = args.dcFlushEnable;
|
|
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, pipeControlArgs);
|
|
}
|
|
|
|
// Synchronize tiles after walker
|
|
if (args.semaphoreProgrammingRequired) {
|
|
programTilesSynchronizationWithPostSyncs<GfxFamily, WalkerType>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
|
|
}
|
|
|
|
if (args.crossTileAtomicSynchronization || args.emitSelfCleanup) {
|
|
const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
|
|
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
|
|
}
|
|
|
|
// Jump over the control section only when needed
|
|
if (isStartAndControlSectionRequired<GfxFamily>(args)) {
|
|
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);
|
|
|
|
// Control section
|
|
DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
|
|
StaticPartitioningControlSection *controlSection = putCommand<StaticPartitioningControlSection>(currentBatchBufferPointer, totalBytesProgrammed);
|
|
controlSection->synchronizeBeforeWalkerCounter = 0u;
|
|
controlSection->synchronizeAfterWalkerCounter = 0u;
|
|
controlSection->finalSyncTileCounter = 0u;
|
|
DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);
|
|
}
|
|
|
|
// Cleanup section
|
|
if (args.emitSelfCleanup) {
|
|
const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
|
|
programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer,
|
|
totalBytesProgrammed,
|
|
finalSyncTileCountAddress,
|
|
gpuAddressOfAllocation + controlSectionOffset,
|
|
staticPartitioningFieldsForCleanupCount,
|
|
args);
|
|
}
|
|
}
|
|
|
|
template <typename GfxFamily, typename WalkerType>
|
|
uint64_t estimateSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) {
|
|
uint64_t size = {};
|
|
if (args.staticPartitioning) {
|
|
size += computeStaticPartitioningControlSectionOffset<GfxFamily, WalkerType>(args);
|
|
size += isStartAndControlSectionRequired<GfxFamily>(args) ? sizeof(StaticPartitioningControlSection) : 0u;
|
|
size += args.emitSelfCleanup ? computeSelfCleanupEndSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, args) : 0u;
|
|
} else {
|
|
size += computeControlSectionOffset<GfxFamily, WalkerType>(args);
|
|
size += sizeof(BatchBufferControlData);
|
|
size += args.emitBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
|
|
size += args.emitSelfCleanup ? computeSelfCleanupEndSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, args) : 0u;
|
|
}
|
|
return size;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
uint64_t computeBarrierControlSectionOffset(WalkerPartitionArgs &args,
|
|
const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
|
|
uint64_t offset = 0u;
|
|
if (args.emitSelfCleanup) {
|
|
offset += computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup);
|
|
}
|
|
|
|
if (args.usePostSync) {
|
|
offset += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false);
|
|
} else {
|
|
offset += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false);
|
|
}
|
|
|
|
offset += (computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() +
|
|
sizeof(BATCH_BUFFER_START<GfxFamily>));
|
|
return offset;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
uint64_t estimateBarrierSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args,
|
|
const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
|
|
uint64_t size = computeBarrierControlSectionOffset<GfxFamily>(args, rootDeviceEnvironment) +
|
|
sizeof(BarrierControlSection);
|
|
if (args.emitSelfCleanup) {
|
|
size += computeSelfCleanupEndSectionSize<GfxFamily>(barrierControlSectionFieldsForCleanupCount, args);
|
|
}
|
|
return size;
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
void constructBarrierCommandBuffer(void *cpuPointer,
|
|
uint64_t gpuAddressOfAllocation,
|
|
uint32_t &totalBytesProgrammed,
|
|
WalkerPartitionArgs &args,
|
|
NEO::PipeControlArgs &flushArgs,
|
|
const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
|
|
void *currentBatchBufferPointer = cpuPointer;
|
|
const auto controlSectionOffset = computeBarrierControlSectionOffset<GfxFamily>(args, rootDeviceEnvironment);
|
|
|
|
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BarrierControlSection, finalSyncTileCount);
|
|
if (args.emitSelfCleanup) {
|
|
programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
|
|
}
|
|
|
|
if (args.usePostSync) {
|
|
programPostSyncPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args, flushArgs, rootDeviceEnvironment);
|
|
} else {
|
|
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, flushArgs);
|
|
}
|
|
|
|
const auto crossTileSyncCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BarrierControlSection, crossTileSyncCount);
|
|
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, crossTileSyncCountField, args.tileCount);
|
|
|
|
const auto afterControlSectionOffset = controlSectionOffset + sizeof(BarrierControlSection);
|
|
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);
|
|
|
|
DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
|
|
BarrierControlSection *controlSection = putCommand<BarrierControlSection>(currentBatchBufferPointer, totalBytesProgrammed);
|
|
controlSection->crossTileSyncCount = 0u;
|
|
controlSection->finalSyncTileCount = 0u;
|
|
DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);
|
|
|
|
if (args.emitSelfCleanup) {
|
|
programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer,
|
|
totalBytesProgrammed,
|
|
finalSyncTileCountField,
|
|
gpuAddressOfAllocation + controlSectionOffset,
|
|
barrierControlSectionFieldsForCleanupCount,
|
|
args);
|
|
}
|
|
}
|
|
|
|
} // namespace WalkerPartition
|