/*
 * Copyright (C) 2021-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_container/walker_partition_xehp_and_later.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/memory_manager/graphics_allocation.h"

namespace NEO {

template <typename GfxFamily>
WalkerPartition::WalkerPartitionArgs prepareWalkerPartitionArgs(ImplicitScalingDispatchCommandArgs &dispatchCommandArgs,
                                                                uint32_t tileCount,
                                                                bool preferStaticPartitioning,
                                                                bool staticPartitioning) {
    WalkerPartition::WalkerPartitionArgs args = {};

    args.workPartitionAllocationGpuVa = dispatchCommandArgs.workPartitionAllocationGpuVa;
    args.partitionCount = dispatchCommandArgs.partitionCount;
    args.tileCount = tileCount;
    args.staticPartitioning = staticPartitioning;
    args.preferredStaticPartitioning = preferStaticPartitioning;
    args.forceExecutionOnSingleTile = dispatchCommandArgs.forceExecutionOnSingleTile;

    args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();
    args.initializeWparidRegister = ImplicitScalingHelper::isWparidRegisterInitializationRequired();

    args.emitPipeControlStall = ImplicitScalingHelper::isPipeControlStallRequired(ImplicitScalingDispatch<GfxFamily>::getPipeControlStallRequired());

    args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
    args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired(args.emitPipeControlStall);
    args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();

    args.emitSelfCleanup = ImplicitScalingHelper::isSelfCleanupRequired(args, dispatchCommandArgs.apiSelfCleanup);
    args.emitBatchBufferEnd = false;
    args.secondaryBatchBuffer = dispatchCommandArgs.useSecondaryBatchBuffer;

    args.dcFlushEnable = dispatchCommandArgs.dcFlush;

    args.pipeControlBeforeCleanupCrossTileSync = ImplicitScalingHelper::pipeControlBeforeCleanupAtomicSyncRequired();
    args.blockDispatchToCommandBuffer = dispatchCommandArgs.blockDispatchToCommandBuffer;
    args.workgroupSize = dispatchCommandArgs.workgroupSize;
    args.maxWgCountPerTile = dispatchCommandArgs.maxWgCountPerTile;
    args.isRequiredDispatchWorkGroupOrder = dispatchCommandArgs.isRequiredDispatchWorkGroupOrder;

    return args;
}

template <typename GfxFamily>
template <typename WalkerType>
size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool apiSelfCleanup,
                                                   bool preferStaticPartitioning,
                                                   const DeviceBitfield &devices,
                                                   const Vec3<size_t> &groupStart,
                                                   const Vec3<size_t> &groupCount) {
    typename WalkerType::PARTITION_TYPE partitionType{};
    bool staticPartitioning = false;
    const uint32_t tileCount = static_cast<uint32_t>(devices.count());

    const uint32_t partitionCount = WalkerPartition::computePartitionCountAndPartitionType<WalkerType>(tileCount,
                                                                                                       preferStaticPartitioning,
                                                                                                       groupStart,
                                                                                                       groupCount,
                                                                                                       {},
                                                                                                       &partitionType,
                                                                                                       &staticPartitioning);
    UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount));

    ImplicitScalingDispatchCommandArgs dispatchCommandArgs = {};
    dispatchCommandArgs.partitionCount = partitionCount;
    dispatchCommandArgs.apiSelfCleanup = apiSelfCleanup;

    WalkerPartition::WalkerPartitionArgs args = prepareWalkerPartitionArgs<GfxFamily>(dispatchCommandArgs,
                                                                                      tileCount,
                                                                                      preferStaticPartitioning,
                                                                                      staticPartitioning);

    return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily, WalkerType>(args));
}
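
// Emits an implicitly scaled walker: the thread group space of walkerCmd is split across the
// tiles selected in devices, either statically (a fixed partition per tile, addressed through
// the work partition allocation / WPARID register) or dynamically (partition assignment
// resolved on the GPU while the command buffer executes).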
template <typename GfxFamily>
template <typename WalkerType>
void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandStream,
                                                          WalkerType &walkerCmd,
                                                          const DeviceBitfield &devices,
                                                          ImplicitScalingDispatchCommandArgs &dispatchCommandArgs) {
    uint32_t totalProgrammedSize = 0u;
    const uint32_t tileCount = static_cast<uint32_t>(devices.count());
    const bool preferStaticPartitioning = dispatchCommandArgs.workPartitionAllocationGpuVa != 0u;

    bool staticPartitioning = false;
    dispatchCommandArgs.partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<WalkerType>(&walkerCmd,
                                                                                                               dispatchCommandArgs.requiredPartitionDim,
                                                                                                               tileCount,
                                                                                                               preferStaticPartitioning,
                                                                                                               &staticPartitioning);

    WalkerPartition::WalkerPartitionArgs walkerPartitionArgs = prepareWalkerPartitionArgs<GfxFamily>(dispatchCommandArgs,
                                                                                                     tileCount,
                                                                                                     preferStaticPartitioning,
                                                                                                     staticPartitioning);

    size_t dispatchCommandsSize = 0;
    void *commandBuffer = nullptr;
    uint64_t cmdBufferGpuAddress = 0;
    if (!dispatchCommandArgs.blockDispatchToCommandBuffer) {
        dispatchCommandsSize = getSize<WalkerType>(dispatchCommandArgs.apiSelfCleanup,
                                                   preferStaticPartitioning,
                                                   devices,
                                                   {walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()},
                                                   {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()});
        commandBuffer = commandStream.getSpace(dispatchCommandsSize);
        cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - dispatchCommandsSize;
    }

    if (staticPartitioning) {
        UNRECOVERABLE_IF(tileCount != dispatchCommandArgs.partitionCount);
        WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily, WalkerType>(commandBuffer,
                                                                                            dispatchCommandArgs.outWalkerPtr,
                                                                                            cmdBufferGpuAddress,
                                                                                            &walkerCmd,
                                                                                            totalProgrammedSize,
                                                                                            walkerPartitionArgs,
                                                                                            *dispatchCommandArgs.device);
    } else {
        if (debugManager.flags.ExperimentalSetWalkerPartitionCount.get()) {
            dispatchCommandArgs.partitionCount = debugManager.flags.ExperimentalSetWalkerPartitionCount.get();
            if (dispatchCommandArgs.partitionCount == 1u) {
                walkerCmd.setPartitionType(WalkerType::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
            }
            walkerPartitionArgs.partitionCount = dispatchCommandArgs.partitionCount;
        }

        WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily, WalkerType>(commandBuffer,
                                                                                             dispatchCommandArgs.outWalkerPtr,
                                                                                             cmdBufferGpuAddress,
                                                                                             &walkerCmd,
                                                                                             totalProgrammedSize,
                                                                                             walkerPartitionArgs,
                                                                                             *dispatchCommandArgs.device);
    }
    UNRECOVERABLE_IF(totalProgrammedSize != dispatchCommandsSize);
}

template <typename GfxFamily>
bool &ImplicitScalingDispatch<GfxFamily>::getPipeControlStallRequired() {
    return ImplicitScalingDispatch<GfxFamily>::pipeControlStallRequired;
}

template <typename GfxFamily>
WalkerPartition::WalkerPartitionArgs prepareBarrierWalkerPartitionArgs(bool emitSelfCleanup,
                                                                       bool usePostSync) {
    WalkerPartition::WalkerPartitionArgs args = {};
    args.crossTileAtomicSynchronization = true;
    args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();
    args.usePostSync = usePostSync;

    args.emitSelfCleanup = ImplicitScalingHelper::isSelfCleanupRequired(args, emitSelfCleanup);
    args.pipeControlBeforeCleanupCrossTileSync = ImplicitScalingHelper::pipeControlBeforeCleanupAtomicSyncRequired();

    return args;
}

template <typename GfxFamily>
size_t ImplicitScalingDispatch<GfxFamily>::getBarrierSize(const RootDeviceEnvironment &rootDeviceEnvironment,
                                                          bool apiSelfCleanup,
                                                          bool usePostSync) {
    WalkerPartition::WalkerPartitionArgs args = prepareBarrierWalkerPartitionArgs<GfxFamily>(apiSelfCleanup, usePostSync);

    return static_cast<size_t>(WalkerPartition::estimateBarrierSpaceRequiredInCommandBuffer<GfxFamily>(args, rootDeviceEnvironment));
}
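
// Emits a multi-tile barrier: each tile signals its arrival and waits until all tiles in
// devices have reached the barrier (cross-tile atomic synchronization). When gpuAddress is
// non-zero, a post-sync write of immediateData to gpuAddress is emitted once the barrier
// completes.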
template <typename GfxFamily>
void ImplicitScalingDispatch<GfxFamily>::dispatchBarrierCommands(LinearStream &commandStream,
                                                                 const DeviceBitfield &devices,
                                                                 PipeControlArgs &flushArgs,
                                                                 const RootDeviceEnvironment &rootDeviceEnvironment,
                                                                 uint64_t gpuAddress,
                                                                 uint64_t immediateData,
                                                                 bool apiSelfCleanup,
                                                                 bool useSecondaryBatchBuffer) {
    uint32_t totalProgrammedSize = 0u;

    WalkerPartition::WalkerPartitionArgs args = prepareBarrierWalkerPartitionArgs<GfxFamily>(apiSelfCleanup, gpuAddress > 0);
    args.tileCount = static_cast<uint32_t>(devices.count());
    args.secondaryBatchBuffer = useSecondaryBatchBuffer;
    args.postSyncGpuAddress = gpuAddress;
    args.postSyncImmediateValue = immediateData;

    auto barrierCommandsSize = getBarrierSize(rootDeviceEnvironment, args.emitSelfCleanup, args.usePostSync);
    void *commandBuffer = commandStream.getSpace(barrierCommandsSize);
    uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - barrierCommandsSize;

    WalkerPartition::constructBarrierCommandBuffer<GfxFamily>(commandBuffer,
                                                              cmdBufferGpuAddress,
                                                              totalProgrammedSize,
                                                              args,
                                                              flushArgs,
                                                              rootDeviceEnvironment);
    UNRECOVERABLE_IF(totalProgrammedSize != barrierCommandsSize);
}

template <typename GfxFamily>
inline size_t ImplicitScalingDispatch<GfxFamily>::getRegisterConfigurationSize() {
    return EncodeSetMMIO<GfxFamily>::sizeMEM +
           getOffsetRegisterSize();
}

template <typename GfxFamily>
inline void ImplicitScalingDispatch<GfxFamily>::dispatchRegisterConfiguration(LinearStream &commandStream,
                                                                              uint64_t workPartitionSurfaceAddress,
                                                                              uint32_t addressOffset,
                                                                              bool isBcs) {
    EncodeSetMMIO<GfxFamily>::encodeMEM(commandStream,
                                        PartitionRegisters<GfxFamily>::wparidCCSOffset,
                                        workPartitionSurfaceAddress, isBcs);
    dispatchOffsetRegister(commandStream, addressOffset, isBcs);
}

template <typename GfxFamily>
inline size_t ImplicitScalingDispatch<GfxFamily>::getOffsetRegisterSize() {
    return EncodeSetMMIO<GfxFamily>::sizeIMM;
}

template <typename GfxFamily>
inline void ImplicitScalingDispatch<GfxFamily>::dispatchOffsetRegister(LinearStream &commandStream,
                                                                       uint32_t addressOffset, bool isBcs) {
    EncodeSetMMIO<GfxFamily>::encodeIMM(commandStream,
                                        PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
                                        addressOffset, true, isBcs);
}

template <typename GfxFamily>
inline uint32_t ImplicitScalingDispatch<GfxFamily>::getImmediateWritePostSyncOffset() {
    return static_cast<uint32_t>(sizeof(uint64_t));
}

template <typename GfxFamily>
inline uint32_t ImplicitScalingDispatch<GfxFamily>::getTimeStampPostSyncOffset() {
    return static_cast<uint32_t>(GfxCoreHelperHw<GfxFamily>::getSingleTimestampPacketSizeHw());
}

template <typename GfxFamily>
inline bool ImplicitScalingDispatch<GfxFamily>::platformSupportsImplicitScaling(const RootDeviceEnvironment &rootDeviceEnvironment) {
    return false;
}

} // namespace NEO