/*
 * Copyright (C) 2021-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_container/walker_partition_xehp_and_later.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/memory_manager/graphics_allocation.h"

namespace NEO {

template <typename GfxFamily>
WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(uint64_t workPartitionAllocationGpuVa,
                                                                uint32_t tileCount,
                                                                uint32_t partitionCount,
                                                                bool emitSelfCleanup,
                                                                bool preferStaticPartitioning,
                                                                bool staticPartitioning,
                                                                bool useSecondaryBatchBuffer,
                                                                bool dcFlush,
                                                                bool forceExecutionOnSingleTile) {
    WalkerPartition::WalkerPartitionArgs args = {};

    args.workPartitionAllocationGpuVa = workPartitionAllocationGpuVa;
    args.partitionCount = partitionCount;
    args.tileCount = tileCount;
    args.staticPartitioning = staticPartitioning;
    args.preferredStaticPartitioning = preferStaticPartitioning;
    args.forceExecutionOnSingleTile = forceExecutionOnSingleTile;

    args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();
    args.initializeWparidRegister = ImplicitScalingHelper::isWparidRegisterInitializationRequired();

    args.emitPipeControlStall = ImplicitScalingHelper::isPipeControlStallRequired(ImplicitScalingDispatch<GfxFamily>::getPipeControlStallRequired());
    args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
    args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired(args.emitPipeControlStall);
    args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();

    args.emitSelfCleanup = ImplicitScalingHelper::isSelfCleanupRequired(args, emitSelfCleanup);
    args.emitBatchBufferEnd = false;
    args.secondaryBatchBuffer = useSecondaryBatchBuffer;
    args.dcFlushEnable = dcFlush;

    args.pipeControlBeforeCleanupCrossTileSync = ImplicitScalingHelper::pipeControlBeforeCleanupAtomicSyncRequired();

    return args;
}

template <typename GfxFamily>
size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool apiSelfCleanup,
                                                   bool preferStaticPartitioning,
                                                   const DeviceBitfield &devices,
                                                   const Vec3<size_t> &groupStart,
                                                   const Vec3<size_t> &groupCount) {
    typename GfxFamily::COMPUTE_WALKER::PARTITION_TYPE partitionType{};
    bool staticPartitioning = false;
    const uint32_t tileCount = static_cast<uint32_t>(devices.count());
    const uint32_t partitionCount = WalkerPartition::computePartitionCountAndPartitionType<GfxFamily>(tileCount,
                                                                                                      preferStaticPartitioning,
                                                                                                      groupStart,
                                                                                                      groupCount,
                                                                                                      {},
                                                                                                      &partitionType,
                                                                                                      &staticPartitioning);
    UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount));

    WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs<GfxFamily>(0u,
                                                                                      tileCount,
                                                                                      partitionCount,
                                                                                      apiSelfCleanup,
                                                                                      preferStaticPartitioning,
                                                                                      staticPartitioning,
                                                                                      false,
                                                                                      false,
                                                                                      false);

    return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args));
}

template <typename GfxFamily>
void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandStream,
                                                          WALKER_TYPE &walkerCmd,
                                                          const DeviceBitfield &devices,
                                                          uint32_t &partitionCount,
                                                          bool useSecondaryBatchBuffer,
                                                          bool apiSelfCleanup,
                                                          bool usesImages,
                                                          bool dcFlush,
                                                          bool forceExecutionOnSingleTile,
                                                          uint64_t workPartitionAllocationGpuVa,
                                                          const HardwareInfo &hwInfo) {
    uint32_t totalProgrammedSize = 0u;
    const uint32_t tileCount = static_cast<uint32_t>(devices.count());
    // A valid work partition allocation address means static partitioning is preferred.
    const bool preferStaticPartitioning = workPartitionAllocationGpuVa != 0u;
    bool staticPartitioning = false;
    partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily>(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning);

    WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs<GfxFamily>(workPartitionAllocationGpuVa,
                                                                                      tileCount,
                                                                                      partitionCount,
                                                                                      apiSelfCleanup,
                                                                                      preferStaticPartitioning,
                                                                                      staticPartitioning,
                                                                                      useSecondaryBatchBuffer,
                                                                                      dcFlush,
                                                                                      forceExecutionOnSingleTile);

    auto dispatchCommandsSize = getSize(apiSelfCleanup,
                                        preferStaticPartitioning,
                                        devices,
                                        {walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()},
                                        {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()});

    void *commandBuffer = commandStream.getSpace(dispatchCommandsSize);
    uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - dispatchCommandsSize;

    if (staticPartitioning) {
        // Static partitioning requires exactly one partition per tile.
        UNRECOVERABLE_IF(tileCount != partitionCount);
        WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
                                                                                cmdBufferGpuAddress,
                                                                                &walkerCmd,
                                                                                totalProgrammedSize,
                                                                                args,
                                                                                hwInfo);
    } else {
        // Debug flag may override the computed partition count.
        if (DebugManager.flags.ExperimentalSetWalkerPartitionCount.get()) {
            partitionCount = DebugManager.flags.ExperimentalSetWalkerPartitionCount.get();
            if (partitionCount == 1u) {
                walkerCmd.setPartitionType(GfxFamily::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
            }
            args.partitionCount = partitionCount;
        }

        WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
                                                                                 cmdBufferGpuAddress,
                                                                                 &walkerCmd,
                                                                                 totalProgrammedSize,
                                                                                 args,
                                                                                 hwInfo);
    }
    UNRECOVERABLE_IF(totalProgrammedSize != dispatchCommandsSize);
}

template <typename GfxFamily>
bool &ImplicitScalingDispatch<GfxFamily>::getPipeControlStallRequired() {
    return ImplicitScalingDispatch<GfxFamily>::pipeControlStallRequired;
}

template <typename GfxFamily>
WalkerPartition::WalkerPartitionArgs prepareBarrierWalkerPartitionArgs(bool emitSelfCleanup,
                                                                       bool usePostSync) {
    WalkerPartition::WalkerPartitionArgs args = {};
    args.crossTileAtomicSynchronization = true;
    args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();
    args.usePostSync = usePostSync;

    args.emitSelfCleanup = ImplicitScalingHelper::isSelfCleanupRequired(args, emitSelfCleanup);
    args.pipeControlBeforeCleanupCrossTileSync = ImplicitScalingHelper::pipeControlBeforeCleanupAtomicSyncRequired();

    return args;
}

template <typename GfxFamily>
size_t ImplicitScalingDispatch<GfxFamily>::getBarrierSize(const RootDeviceEnvironment &rootDeviceEnvironment,
                                                          bool apiSelfCleanup,
                                                          bool usePostSync) {
    WalkerPartition::WalkerPartitionArgs args = prepareBarrierWalkerPartitionArgs<GfxFamily>(apiSelfCleanup, usePostSync);

    return static_cast<size_t>(WalkerPartition::estimateBarrierSpaceRequiredInCommandBuffer<GfxFamily>(args, rootDeviceEnvironment));
}

template <typename GfxFamily>
void ImplicitScalingDispatch<GfxFamily>::dispatchBarrierCommands(LinearStream &commandStream,
                                                                 const DeviceBitfield &devices,
                                                                 PipeControlArgs &flushArgs,
                                                                 const RootDeviceEnvironment &rootDeviceEnvironment,
                                                                 uint64_t gpuAddress,
                                                                 uint64_t immediateData,
                                                                 bool apiSelfCleanup,
                                                                 bool useSecondaryBatchBuffer) {
    uint32_t totalProgrammedSize = 0u;

    WalkerPartition::WalkerPartitionArgs args = prepareBarrierWalkerPartitionArgs<GfxFamily>(apiSelfCleanup, gpuAddress > 0);
    args.tileCount = static_cast<uint32_t>(devices.count());
    args.secondaryBatchBuffer = useSecondaryBatchBuffer;
    args.postSyncGpuAddress = gpuAddress;
    args.postSyncImmediateValue = immediateData;
    auto barrierCommandsSize = getBarrierSize(rootDeviceEnvironment, args.emitSelfCleanup, args.usePostSync);
    void *commandBuffer = commandStream.getSpace(barrierCommandsSize);
    uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - barrierCommandsSize;

    WalkerPartition::constructBarrierCommandBuffer<GfxFamily>(commandBuffer,
                                                              cmdBufferGpuAddress,
                                                              totalProgrammedSize,
                                                              args,
                                                              flushArgs,
                                                              rootDeviceEnvironment);
    UNRECOVERABLE_IF(totalProgrammedSize != barrierCommandsSize);
}

template <typename GfxFamily>
inline size_t ImplicitScalingDispatch<GfxFamily>::getRegisterConfigurationSize() {
    return EncodeSetMMIO<GfxFamily>::sizeMEM +
           getOffsetRegisterSize();
}

template <typename GfxFamily>
inline void ImplicitScalingDispatch<GfxFamily>::dispatchRegisterConfiguration(LinearStream &commandStream,
                                                                              uint64_t workPartitionSurfaceAddress,
                                                                              uint32_t addressOffset) {
    EncodeSetMMIO<GfxFamily>::encodeMEM(commandStream,
                                        PartitionRegisters<GfxFamily>::wparidCCSOffset,
                                        workPartitionSurfaceAddress);
    dispatchOffsetRegister(commandStream, addressOffset);
}

template <typename GfxFamily>
inline size_t ImplicitScalingDispatch<GfxFamily>::getOffsetRegisterSize() {
    return EncodeSetMMIO<GfxFamily>::sizeIMM;
}

template <typename GfxFamily>
inline void ImplicitScalingDispatch<GfxFamily>::dispatchOffsetRegister(LinearStream &commandStream,
                                                                       uint32_t addressOffset) {
    EncodeSetMMIO<GfxFamily>::encodeIMM(commandStream,
                                        PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
                                        addressOffset,
                                        true);
}

template <typename GfxFamily>
inline uint32_t ImplicitScalingDispatch<GfxFamily>::getImmediateWritePostSyncOffset() {
    if (ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled()) {
        return static_cast<uint32_t>(sizeof(uint64_t));
    }
    return static_cast<uint32_t>(GfxCoreHelperHw<GfxFamily>::getSingleTimestampPacketSizeHw());
}

template <typename GfxFamily>
inline uint32_t ImplicitScalingDispatch<GfxFamily>::getTimeStampPostSyncOffset() {
    return static_cast<uint32_t>(GfxCoreHelperHw<GfxFamily>::getSingleTimestampPacketSizeHw());
}

template <typename GfxFamily>
inline bool ImplicitScalingDispatch<GfxFamily>::platformSupportsImplicitScaling(const RootDeviceEnvironment &rootDeviceEnvironment) {
    return false;
}

} // namespace NEO