intel-graphics-compiler/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2024 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "AdaptorOCL/MergeAllocasOCL.h"
#include "Compiler/Legalizer/PeepholeTypeLegalizer.hpp"
#include "Compiler/CISACodeGen/DropTargetBBs.hpp"
#include "Compiler/CISACodeGen/layout.hpp"
#include "Compiler/CISACodeGen/DeSSA.hpp"
#include "Compiler/CISACodeGen/GenCodeGenModule.h"
#include "Compiler/CISACodeGen/AdvCodeMotion.h"
#include "Compiler/CISACodeGen/RematAddressArithmetic.h"
#include "Compiler/CISACodeGen/VectorShuffleAnalysis.hpp"
#include "Compiler/CISACodeGen/IGCLivenessAnalysis.h"
#include "Compiler/CISACodeGen/IGCVectorizer.h"
#include "Compiler/CISACodeGen/AdvMemOpt.h"
#include "Compiler/CISACodeGen/Emu64OpsPass.h"
#include "Compiler/CISACodeGen/PushAnalysis.hpp"
#include "Compiler/CISACodeGen/ScalarizerCodeGen.hpp"
#include "Compiler/CISACodeGen/HoistCongruentPhi.hpp"
#include "Compiler/CISACodeGen/CodeScheduling.hpp"
#include "Compiler/CISACodeGen/CodeSinking.hpp"
#include "Compiler/CISACodeGen/AddressArithmeticSinking.hpp"
#include "Compiler/CISACodeGen/AtomicOptPass.hpp"
#include "Compiler/CISACodeGen/BlockMemOpAddrScalarizationPass.hpp"
#include "Compiler/CISACodeGen/SinkCommonOffsetFromGEP.h"
#include "Compiler/CISACodeGen/ConstantCoalescing.hpp"
#include "Compiler/CISACodeGen/CheckInstrTypes.hpp"
#include "Compiler/CISACodeGen/EstimateFunctionSize.h"
#include "Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp"
#include "Compiler/CISACodeGen/GenerateFrequencyData.hpp"
#include "Compiler/CISACodeGen/FixAddrSpaceCast.h"
#include "Compiler/CISACodeGen/FixupExtractValuePair.h"
#include "Compiler/CISACodeGen/GenIRLowering.h"
#include "Compiler/CISACodeGen/GenSimplification.h"
#include "Compiler/CISACodeGen/LoopDCE.h"
#include "Compiler/CISACodeGen/LdShrink.h"
#include "Compiler/CISACodeGen/MemOpt.h"
#include "Compiler/CISACodeGen/MemOpt2.h"
#include "Compiler/CISACodeGen/SplitLoads.h"
#include "Compiler/CISACodeGen/PreRARematFlag.h"
#include "Compiler/CISACodeGen/PromoteConstantStructs.hpp"
#include "Compiler/Optimizer/OpenCLPasses/Decompose2DBlockFuncs/Decompose2DBlockFuncs.hpp"
#include "Compiler/Optimizer/OpenCLPasses/GenericAddressResolution/GASResolving.h"
#include "Compiler/Optimizer/OpenCLPasses/PrivateMemory/LowerByValAttribute.hpp"
#include "Compiler/CISACodeGen/Simd32Profitability.hpp"
#include "Compiler/CISACodeGen/TimeStatsCounter.h"
#include "Compiler/CISACodeGen/TypeDemote.h"
#include "Compiler/CISACodeGen/UniformAssumptions.hpp"
#include "Compiler/CISACodeGen/ResourceLoopUnroll.hpp"
#include "Compiler/CISACodeGen/VectorProcess.hpp"
#include "Compiler/CISACodeGen/RuntimeValueLegalizationPass.h"
#include "Compiler/CISACodeGen/LowerGEPForPrivMem.hpp"
#include "Compiler/CISACodeGen/MatchCommonKernelPatterns.hpp"
#include "Compiler/CISACodeGen/POSH_RemoveNonPositionOutput.h"
#include "Compiler/CISACodeGen/RegisterEstimator.hpp"
#include "Compiler/CISACodeGen/RegisterPressureEstimate.hpp"
#include "Compiler/CISACodeGen/OpenCLKernelCodeGen.hpp"
#include "Compiler/CISACodeGen/RayTracingShaderLowering.hpp"
#include "Compiler/CISACodeGen/RayTracingStatefulPass.h"
#include "Compiler/CISACodeGen/LSCCacheOptimizationPass.h"
#include "Compiler/CISACodeGen/LSCControlsAnalysisPass.h"
#include "Compiler/ConvertMSAAPayloadTo16Bit.hpp"
#include "Compiler/MSAAInsertDiscard.hpp"
#include "Compiler/CISACodeGen/PromoteInt8Type.hpp"
#include "Compiler/CISACodeGen/PrepareLoadsStoresPass.h"
#include "Compiler/CISACodeGen/CallMergerPass.hpp"
#include "Compiler/CISACodeGen/EvaluateFreeze.hpp"
#include "Compiler/CISACodeGen/DpasScan.hpp"
#include "Compiler/CISACodeGen/FPRoundingModeCoalescing.hpp"
#include "Compiler/CISACodeGen/SLMConstProp.hpp"
#include "Compiler/Optimizer/OpenCLPasses/SplitStructurePhisPass/SplitStructurePhisPass.hpp"
#include "Compiler/Optimizer/OpenCLPasses/MergeScalarPhisPass/MergeScalarPhisPass.hpp"
#include "Compiler/Legalizer/AddRequiredMemoryFences.h"
#include "Compiler/Optimizer/OpenCLPasses/GenericAddressResolution/GenericAddressDynamicResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/GenericAddressResolution/GenericNullPtrPropagation.hpp"
#include "Compiler/Optimizer/OpenCLPasses/PrivateMemory/PrivateMemoryResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/PrivateMemory/PrivateMemoryToSLM.hpp"
#include "Compiler/Optimizer/OpenCLPasses/ProgramScopeConstants/ProgramScopeConstantResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/WIFuncs/WIFuncResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/BreakConstantExpr/BreakConstantExpr.hpp"
#include "Compiler/Optimizer/OpenCLPasses/ReplaceUnsupportedIntrinsics/ReplaceUnsupportedIntrinsics.hpp"
#include "Compiler/Optimizer/PreCompiledFuncImport.hpp"
#include "Compiler/Optimizer/OpenCLPasses/AddressSpaceAliasAnalysis/AddressSpaceAliasAnalysis.h"
#include "Compiler/Optimizer/OpenCLPasses/StatelessToStateful/StatelessToStateful.hpp"
#include "Compiler/Optimizer/OpenCLPasses/DisableLoopUnrollOnRetry/DisableLoopUnrollOnRetry.hpp"
#include "Compiler/Optimizer/OpenCLPasses/TransformUnmaskedFunctionsPass/TransformUnmaskedFunctionsPass.h"
#include "Compiler/Optimizer/OpenCLPasses/UnreachableHandling/UnreachableHandling.hpp"
#include "Compiler/Optimizer/OpenCLPasses/WIFuncs/WIFuncResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/GEPLoopStrengthReduction/GEPLoopStrengthReduction.hpp"
#include "Compiler/Optimizer/OpenCLPasses/StackOverflowDetection/StackOverflowDetection.hpp"
#include "Compiler/Optimizer/OpenCLPasses/SubGroupReductionPattern/SubGroupReductionPattern.hpp"
#include "Compiler/Optimizer/MCSOptimization.hpp"
#include "Compiler/Optimizer/GatingSimilarSamples.hpp"
#include "Compiler/Optimizer/IntDivConstantReduction.hpp"
#include "Compiler/Optimizer/IntDivRemIncrementReduction.hpp"
#include "Compiler/Optimizer/IntDivRemCombine.hpp"
#include "Compiler/Optimizer/SynchronizationObjectCoalescing.hpp"
#include "Compiler/Optimizer/BarrierControlFlowOptimization.hpp"
#include "Compiler/Optimizer/RuntimeValueVectorExtractPass.h"
#include "Compiler/Optimizer/WaveShuffleIndexSinking.hpp"
#include "Compiler/Optimizer/WaveAllJointReduction.hpp"
#include "Compiler/Optimizer/InstructionHoistingOptimization.hpp"
#include "Compiler/Optimizer/WaveBallotCSE.hpp"
#include "Compiler/MetaDataApi/PurgeMetaDataUtils.hpp"
#include "Compiler/HandleLoadStoreInstructions.hpp"
#include "Compiler/CustomSafeOptPass.hpp"
#include "Compiler/CustomUnsafeOptPass.hpp"
#include "Compiler/CustomLoopOpt.hpp"
#include "Compiler/GenUpdateCB.h"
#include "Compiler/PromoteResourceToDirectAS.h"
#include "Compiler/PromoteStatelessToBindless.h"
#include "Compiler/ShrinkArrayAlloca.h"
#if defined(_DEBUG) && !defined(ANDROID)
#include "Compiler/VerificationPass.hpp"
#endif
#include "Compiler/FixInvalidFuncNamePass.hpp"
#include "Compiler/LegalizationPass.hpp"
#include "Compiler/LowPrecisionOptPass.hpp"
#include "Compiler/WorkaroundAnalysisPass.h"
#include "Compiler/MetaDataApi/MetaDataApi.h"
#include "Compiler/MetaDataUtilsWrapper.h"
#include "Compiler/CodeGenContextWrapper.hpp"
#include "Compiler/DynamicTextureFolding.h"
#include "Compiler/SampleMultiversioning.hpp"
#include "Compiler/InitializePasses.h"
#include "Compiler/GenRotate.hpp"
#include "Compiler/Optimizer/Scalarizer.h"
#include "Compiler/RemoveCodeAssumptions.hpp"
#include "common/igc_regkeys.hpp"
#include "common/debug/Dump.hpp"
#include "common/LLVMWarningsPush.hpp"
#include "llvm/Config/llvm-config.h"
#include <llvm/IR/DebugInfo.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Verifier.h>
#include <llvm/Analysis/Passes.h>
#include <llvm/Pass.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Scalar/GVN.h>
#include <llvm/IR/Function.h>
#include <llvm/Analysis/ScopedNoAliasAA.h>
#include <llvm/Analysis/TargetLibraryInfo.h>
#include <llvm/Support/ErrorHandling.h>
#include <llvm/Transforms/IPO/FunctionAttrs.h>
#include <llvm/Transforms/Utils.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Bitcode/BitcodeWriter.h>
#include "common/LLVMWarningsPop.hpp"
#include "Compiler/CISACodeGen/PatternMatchPass.hpp"
#include "Compiler/CISACodeGen/EmitVISAPass.hpp"
#include "Compiler/CISACodeGen/CoalescingEngine.hpp"
#include "Compiler/GenTTI.h"
#include "Compiler/GenRotate.hpp"
#include "Compiler/SampleCmpToDiscard.h"
#include "Compiler/Optimizer/IGCInstCombiner/IGCInstructionCombining.hpp"
#include "Compiler/Optimizer/HoistConvOpToDom.hpp"
#include "Compiler/Optimizer/PromoteToPredicatedMemoryAccess.hpp"
#include "AdaptorCommon/RayTracing/RayTracingPasses.hpp"
#include "AdaptorCommon/RayTracing/RayTracingAddressSpaceAliasAnalysis.h"
#include "AdaptorCommon/RayTracing/API/RayDispatchGlobalData.h"
#include "Compiler/SamplerPerfOptPass.hpp"
#include "Compiler/CISACodeGen/HalfPromotion.h"
#include "Compiler/CISACodeGen/CapLoopIterationsPass.h"
#include "Compiler/CISACodeGen/AnnotateUniformAllocas.h"
#include "Probe/Assertion.h"
#include "Compiler/CISACodeGen/PartialEmuI64OpsPass.h"
#include "Compiler/TranslateToProgrammableOffsetsPass.hpp"
#include "Compiler/CISACodeGen/RemoveLoopDependency.hpp"
#include <filesystem>
/***********************************************************************************
This file contains the generic code generation functions for all the shaders.
The class CShader is inherited by each specific type of shader to add
shader-specific information.
************************************************************************************/
using namespace llvm;
using namespace IGC;
using namespace IGC::IGCMD;
using namespace IGC::Debug;
namespace IGC {
const int LOOP_ROTATION_HEADER_INST_THRESHOLD = 32;
const int LOOP_NUM_THRESHOLD = 2000;
const int LOOP_INST_THRESHOLD = 65000;
const int INST_THRESHOLD = 80000;
void AddAnalysisPasses(CodeGenContext &ctx, IGCPassManager &mpm) {
COMPILER_TIME_START(&ctx, TIME_CG_Add_Analysis_Passes);
bool isOptDisabled = ctx.getModuleMetaData()->compOpt.OptDisable;
TODO("remove the following once all IGC passes are registered to PassRegistery in their constructor")
initializeLoopInfoWrapperPassPass(*PassRegistry::getPassRegistry());
initializeCastToGASInfoPass(*PassRegistry::getPassRegistry());
mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Analysis, STATS_COUNTER_START));
// transform pull constants and inputs into push constants and inputs
mpm.add(new PushAnalysis());
mpm.add(CreateSampleCmpToDiscardPass());
if (!isOptDisabled) {
mpm.add(llvm::createDeadCodeEliminationPass());
}
// The 1st thing we do when getting into the IGC middle end is to split critical-edges:
// PushAnalysis requires WIAnalysis
// WIAnalysis requires dominator and post-dominator analysis
// WIAnalysis also requires BreakCriticalEdge because it assumes that
// potential phi-moves will be placed at those blocks
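// (A critical edge is an edge from a block with multiple successors to a block
// with multiple predecessors; splitting it inserts a dedicated block, giving
// phi-related copies a well-defined place to land.)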
mpm.add(llvm::createBreakCriticalEdgesPass());
if (!isOptDisabled && IGC_IS_FLAG_DISABLED(DisableMemOpt2)) {
if (ctx.m_DriverInfo.WAEnableMemOpt2ForOCL())
mpm.add(createMemOpt2Pass(16));
}
if (!isOptDisabled) {
mpm.add(createSplitLoadsPass());
}
if (IGC_IS_FLAG_ENABLED(EnableScalarPhisMerger) && ctx.type == ShaderType::OPENCL_SHADER) {
mpm.add(new MergeScalarPhisPass());
}
// Only limited code-sinking is applied, and only to several shader types:
// VS input has the URB-reuse issue to be resolved, and we still need to
// understand the performance benefit better.
if (!isOptDisabled) {
mpm.add(new HoistCongruentPHI());
mpm.add(new CodeSinking());
if ((IGC_IS_FLAG_DISABLED(DisableLoopSink) || IGC_IS_FLAG_ENABLED(ForceLoopSink)) &&
ctx.type == ShaderType::OPENCL_SHADER && ctx.m_instrTypes.numOfLoop > 0 &&
ctx.m_instrTypes.numInsts >= IGC_GET_FLAG_VALUE(CodeLoopSinkingMinSize)) {
mpm.add(new CodeLoopSinking());
}
if (IGC_IS_FLAG_DISABLED(DisableCodeScheduling) && (ctx.type == ShaderType::OPENCL_SHADER) &&
(ctx.platform.isCoreChildOf(IGFX_XE_HPC_CORE) || ctx.platform.isCoreChildOf(IGFX_XE2_HPG_CORE))) {
if (IGC_IS_FLAG_DISABLED(CodeSchedulingOnlyRecompilation) || ctx.m_retryManager.AllowCodeScheduling()) {
mpm.add(new CodeScheduling());
}
}
}
// Run flag re-materialization if it's beneficial.
if (ctx.m_DriverInfo.benefitFromPreRARematFlag() && IGC_IS_FLAG_ENABLED(EnablePreRARematFlag)) {
mpm.add(createPreRARematFlagPass());
}
// Peephole framework for generic type legalization
mpm.add(new Legalizer::PeepholeTypeLegalizer());
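// Illustrative sketch (not the exact emitted IR): on targets without byte ALU
// support, the i8 promotion below widens i8 arithmetic, e.g.
//   %c = add i8 %a, %b
// becomes, roughly,
//   %a16 = sext i8 %a to i16
//   %b16 = sext i8 %b to i16
//   %c16 = add i16 %a16, %b16
//   %c   = trunc i16 %c16 to i8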
if (IGC_IS_FLAG_ENABLED(ForcePromoteI8) ||
(IGC_IS_FLAG_ENABLED(EnablePromoteI8) && !ctx.platform.supportByteALUOperation())) {
mpm.add(createPromoteInt8TypePass());
}
// Needed before WIAnalysis:
// insert phis to prevent later code motion from changing the WIAnalysis result
mpm.add(llvm::createLCSSAPass());
// Fixup extract value pairs.
mpm.add(createExtractValuePairFixupPass());
if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc)) {
mpm.add(new InlineUnmaskedFunctionsPass());
// Newly created memcpy intrinsics are lowered
mpm.add(createReplaceUnsupportedIntrinsicsPass());
// Split complex constant expression into 2 simple ones
mpm.add(new BreakConstantExpr());
// Expand newly created allocas
mpm.add(createSROAPass());
// Run legalization pass to expand non-supported instructions
// like shufflevector. The code below is just copied and
// pasted as is.
bool preserveNan = !ctx.getCompilerOption().NoNaNs;
mpm.add(new Legalization(preserveNan));
// Some clean up passes.
mpm.add(llvm::createEarlyCSEPass());
mpm.add(new BreakConstantExpr());
mpm.add(llvm::createCFGSimplificationPass());
mpm.add(createDeadCodeEliminationPass());
// Create functions groups after unmasked functions inlining
mpm.add(createGenXCodeGenModulePass());
// Allocate non-primitive allocas. This piece of code is copied
if (ctx.m_instrTypes.hasNonPrimitiveAlloca) {
mpm.add(createBreakCriticalEdgesPass());
mpm.add(createAnnotateUniformAllocasPass());
if (IGC_IS_FLAG_DISABLED(DisablePromotePrivMem) &&
!isOptDisabledForModule(ctx.getModuleMetaData(), IGCOpts::LowerGEPForPrivMemPass)) {
mpm.add(createPromotePrivateArrayToReg());
mpm.add(createCFGSimplificationPass());
}
}
mpm.add(createPromoteMemoryToRegisterPass());
if (IGC_IS_FLAG_DISABLED(DisableMergeAllocasPrivateMemory) && ctx.type == ShaderType::OPENCL_SHADER) {
mpm.add(createMergeAllocasOCL());
}
if (ctx.type == ShaderType::OPENCL_SHADER && !isOptDisabled && IGC_IS_FLAG_ENABLED(EnableExplicitCopyForByVal)) {
mpm.add(new LowerByValAttribute());
mpm.add(createReplaceUnsupportedIntrinsicsPass());
}
// Resolving private memory allocas
mpm.add(CreatePrivateMemoryResolution());
}
// Reorder FP instructions to minimize number of times rounding mode is switched.
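// For example (intent only): if round-toward-zero conversions are interleaved with
// default round-to-nearest arithmetic, grouping the RTZ operations together lets the
// emitter switch the rounding-mode state once per group rather than per instruction.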
mpm.add(createFPRoundingModeCoalescingPass());
// Expected to be the last ModulePass before EmitPass at this point.
// (Shall be after GenXCodeGenModulePass.)
//
// It uses CastToGASAnalysis and invalidates it by taking its result away.
// This means that after this point, no CastToGASAnalysis will be used,
// and the info should be accessed via CastToGASInfo immutable pass.
if (ctx.type == ShaderType::OPENCL_SHADER) {
mpm.add(new CastToGASInfoWrapper());
}
// Evaluates LLVM 10+ freeze instructions so EmitPass does not need to handle them.
// The pass first runs during optimization; however, new freeze instructions could
// have been inserted since then.
mpm.add(createEvaluateFreezePass());
// Clean up constant expressions after EarlyCSE
mpm.add(new BreakConstantExpr());
// This is for dumping register pressure info
if (IGC_IS_FLAG_ENABLED(ForceRPE)) {
mpm.add(new RegisterEstimator());
}
mpm.add(createFixInvalidFuncNamePass());
// collect stats after all the optimization. This info can be dumped to the cos file
mpm.add(new CheckInstrTypes(true, false));
if ((IGC_GET_FLAG_VALUE(StaticProfileGuidedSpillCostAnalysis) & FrequencyDataDS::PGSS_IGC_GEN) != 0) {
mpm.add(createGenerateFrequencyDataPass());
}
if (IGC_IS_FLAG_ENABLED(StackOverflowDetection)) {
mpm.add(new StackOverflowDetectionPass(StackOverflowDetectionPass::Mode::RemoveDummyCalls));
}
//
// Generally, passes that change IR should be prior to this place!
//
mpm.add(new DpasScan());
mpm.add(new MatchCommonKernelPatterns());
// let CleanPHINode be right before Layout
mpm.add(createCleanPHINodePass());
if (IGC_IS_FLAG_SET(DumpRegPressureEstimate))
mpm.add(new IGCRegisterPressurePrinter("final"));
// Let Layout be the last pass before Emit Pass
mpm.add(new Layout());
if (IGC_IS_FLAG_ENABLED(EnableDropTargetBBs)) {
mpm.add(new DropTargetBBs());
}
mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Analysis, STATS_COUNTER_END));
COMPILER_TIME_END(&ctx, TIME_CG_Add_Analysis_Passes);
} // AddAnalysisPasses
static void UpdateInstTypeHint(CodeGenContext &ctx) {
// WA: save original values as preRA heuristic is based on those
// we need to fix the preRA pass heuristic or get rid of preRA pass altogether
unsigned int numBB = ctx.m_instrTypes.numBB;
unsigned int numSample = ctx.m_instrTypes.numSample;
unsigned int numInsts = ctx.m_instrTypes.numInsts;
bool hasUnmaskedRegion = ctx.m_instrTypes.hasUnmaskedRegion;
IGCPassManager mpm(&ctx, "UpdateOptPre");
mpm.add(new CodeGenContextWrapper(&ctx));
mpm.add(new BreakConstantExpr());
mpm.add(new CheckInstrTypes(false, false));
mpm.run(*ctx.getModule());
ctx.m_instrTypes.numBB = numBB;
ctx.m_instrTypes.numSample = numSample;
ctx.m_instrTypes.numInsts = numInsts;
ctx.m_instrTypes.hasLoadStore = true;
ctx.m_instrTypes.hasUnmaskedRegion = hasUnmaskedRegion;
}
// forward declaration
llvm::ModulePass *createPruneUnusedArgumentsPass();
void AddLegalizationPasses(CodeGenContext &ctx, IGCPassManager &mpm, PSSignature *pSignature) {
COMPILER_TIME_START(&ctx, TIME_CG_Add_Legalization_Passes);
mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Legalization, STATS_COUNTER_START));
// update type of instructions to know what passes are needed.
UpdateInstTypeHint(ctx);
// check again after full inlining if subroutines are still present
ctx.CheckEnableSubroutine(*ctx.getModule());
MetaDataUtils *pMdUtils = ctx.getMetaDataUtils();
bool isOptDisabled = ctx.getModuleMetaData()->compOpt.OptDisable;
bool fastCompile = ctx.getModuleMetaData()->compOpt.FastCompilation;
bool highAllocaPressure = ctx.m_instrTypes.numAllocaInsts > IGC_GET_FLAG_VALUE(AllocaRAPressureThreshold);
bool isPotentialHPCKernel = (ctx.m_instrTypes.numInsts > IGC_GET_FLAG_VALUE(HPCInstNumThreshold)) ||
(ctx.m_instrTypes.numGlobalInsts > IGC_GET_FLAG_VALUE(HPCGlobalInstNumThreshold)) ||
IGC_GET_FLAG_VALUE(HPCFastCompilation);
highAllocaPressure = IGC_GET_FLAG_VALUE(DisableFastRAWA) ? false : highAllocaPressure;
isPotentialHPCKernel = IGC_GET_FLAG_VALUE(DisableFastRAWA) ? false : isPotentialHPCKernel;
if (highAllocaPressure || isPotentialHPCKernel) {
IGC_SET_FLAG_VALUE(FastCompileRA, 1);
IGC_SET_FLAG_VALUE(HybridRAWithSpill, 1);
}
// If Unmasked regions are present, disable loop-invariant code motion after
// Unmasked functions are inlined at the end of the optimization phase.
if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_DISABLED(LateInlineUnmaskedFunc) &&
ctx.m_instrTypes.hasUnmaskedRegion) {
IGC_SET_FLAG_VALUE(allowLICM, false);
}
bool disableConvergentInstructionsHoisting =
ctx.m_DriverInfo.DisableConvergentInstructionsHoisting() && ctx.m_instrTypes.numWaveIntrinsics > 0;
if (disableConvergentInstructionsHoisting || IGC_IS_FLAG_ENABLED(ForceAllPrivateMemoryToSLM) ||
IGC_IS_FLAG_ENABLED(ForcePrivateMemoryToSLMOnBuffers)) {
TargetIRAnalysis GenTTgetIIRAnalysis([&](const Function &F) {
GenIntrinsicsTTIImpl GTTI(&ctx);
return TargetTransformInfo(GTTI);
});
mpm.add(new TargetTransformInfoWrapperPass(std::move(GenTTgetIIRAnalysis)));
}
// Disable all target library functions.
// Right now we don't support any standard functions in codegen;
// maybe we will want to support some at some point to take advantage of LLVM optimizations.
TargetLibraryInfoImpl TLI;
TLI.disableAllFunctions();
mpm.add(new llvm::TargetLibraryInfoWrapperPass(TLI));
// Add Metadata API immutable pass
mpm.add(new MetaDataUtilsWrapper(pMdUtils, ctx.getModuleMetaData()));
// Add CodeGen Context Wrapper immutable pass
mpm.add(new CodeGenContextWrapper(&ctx));
// Add alias analysis pass
mpm.add(createAddressSpaceAAWrapperPass());
if (ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) {
if (IGC_IS_FLAG_DISABLED(DisableRTAliasAnalysis))
mpm.add(createRayTracingAddressSpaceAAWrapperPass());
}
mpm.add(createIGCExternalAAWrapper());
mpm.add(createScopedNoAliasAAWrapperPass());
TODO("remove the following once all IGC passes are registered to PassRegistery in their constructor")
initializeWIAnalysisPass(*PassRegistry::getPassRegistry());
initializeSimd32ProfitabilityAnalysisPass(*PassRegistry::getPassRegistry());
initializeGenXFunctionGroupAnalysisPass(*PassRegistry::getPassRegistry());
if (ctx.m_threadCombiningOptDone) {
mpm.add(createLoopCanonicalization());
mpm.add(llvm::createLoopDeletionPass());
mpm.add(llvm::createBreakCriticalEdgesPass());
mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));
mpm.add(llvm::createLowerSwitchPass());
int LoopUnrollThreshold = ctx.m_DriverInfo.GetLoopUnrollThreshold();
if (LoopUnrollThreshold > 0 && (ctx.m_tempCount < 64)) {
mpm.add(llvm::createLoopUnrollPass(2, false, false, LoopUnrollThreshold, -1, 1, -1, -1, -1));
}
mpm.add(createBarrierNoopPass());
if (IGC_IS_FLAG_ENABLED(allowLICM) && ctx.m_retryManager.AllowLICM()) {
mpm.add(createSpecialCasesDisableLICM());
mpm.add(llvm::createLICMPass(100, 500, true));
}
mpm.add(llvm::createLoopSimplifyPass());
}
// Lower/Resolve OCL inlined constants.
if (ctx.m_DriverInfo.NeedLoweringInlinedConstants()) {
// Run additional constant breaking which is assumed by the constant
// resolver.
mpm.add(new BreakConstantExpr());
mpm.add(new ProgramScopeConstantResolution());
}
// This is the condition under which double (FP64) emulation is used.
ctx.checkDPEmulationEnabled();
bool hasDPDivSqrtEmu =
!ctx.platform.hasNoFP64Inst() && !ctx.platform.hasCorrectlyRoundedMacros() && ctx.m_DriverInfo.NeedFP64DivSqrt();
uint32_t theEmuKind = (ctx.m_hasDPEmu ? EmuKind::EMU_DP : 0);
theEmuKind |= (hasDPDivSqrtEmu ? EmuKind::EMU_DP_DIV_SQRT : 0);
theEmuKind |= (ctx.m_hasDPConvEmu ? EmuKind::EMU_DP_CONV : 0);
theEmuKind |= (ctx.m_DriverInfo.NeedI64BitDivRem() ? EmuKind::EMU_I64DIVREM : 0);
theEmuKind |= (ctx.m_DriverInfo.NeedFP64toFP16Conv() && IGC_IS_FLAG_DISABLED(ForceDisableDPToHFConvEmu)
? EmuKind::EMU_FP64_FP16_CONV
: 0);
theEmuKind |= ((IGC_IS_FLAG_ENABLED(ForceSPDivEmulation) ||
(ctx.m_DriverInfo.NeedIEEESPDiv() && !ctx.platform.hasCorrectlyRoundedMacros()))
? EmuKind::EMU_SP_DIV
: 0);
if (ctx.platform.preferFP32IntDivRemEmu() && IGC_IS_FLAG_DISABLED(Force32BitIntDivRemEmu)) {
// Prefer using FP32 emulation even though DP support is available
theEmuKind |= EmuKind::EMU_I32DIVREM_SP;
} else if (!ctx.platform.hasNoFP64Inst() &&
(IGC_IS_FLAG_ENABLED(Force32BitIntDivRemEmu) || ctx.getCompilerOption().ForceInt32DivRemEmu ||
(ctx.platform.Enable32BitIntDivRemEmu() && !ctx.getCompilerOption().ForceInt32DivRemEmuSP &&
IGC_IS_FLAG_DISABLED(Force32BitIntDivRemEmuSP)))) {
// Use DP (and float) operations to emulate int32 div/rem
theEmuKind |= EmuKind::EMU_I32DIVREM;
} else if (ctx.platform.Enable32BitIntDivRemEmu() || ctx.getCompilerOption().ForceInt32DivRemEmuSP ||
IGC_IS_FLAG_ENABLED(Force32BitIntDivRemEmuSP)) {
// Use SP floating operations to emulate int32 div/rem
theEmuKind |= EmuKind::EMU_I32DIVREM_SP;
}
if (IGC_IS_FLAG_ENABLED(RayTracingKeepUDivRemWA)) {
theEmuKind &= ~EmuKind::EMU_I32DIVREM;
theEmuKind &= ~EmuKind::EMU_I32DIVREM_SP;
}
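// Illustrative composition of the mask (hypothetical configuration): on a platform
// with no native FP64 whose driver also requests 64-bit integer div/rem emulation,
// theEmuKind could end up as (EMU_DP | EMU_I64DIVREM), and PreCompiledFuncImport
// below would then link in the precompiled builtins for both kinds of emulation.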
if (theEmuKind > 0 || IGC_IS_FLAG_ENABLED(EnableTestIGCBuiltin)) {
// Need to break constant expr as PreCompiledFuncImport does not handle it.
mpm.add(new BreakConstantExpr());
mpm.add(new PreCompiledFuncImport(&ctx, theEmuKind));
mpm.add(createAlwaysInlinerLegacyPass());
// Using DCE here as AlwaysInliner does not completely remove dead functions.
// Once AlwaysInliner can delete all of them, this DCE is no longer needed.
// mpm.add(createDeadCodeEliminationPass());
//
// DCE doesn't remove dead control flow; ADCE does (currently).
// Otherwise you'd have to call createCFGSimplificationPass and DCE
// iteratively.
mpm.add(llvm::createAggressiveDCEPass());
// TODO: we probably should be running other passes on the result
if (!IGC::ForceAlwaysInline(&ctx)) {
mpm.add(new PurgeMetaDataUtils());
}
}
// Find rotate pattern.
// Invoked after DP emulation so that it'd handle emulation functions.
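// A typical pattern this matches (illustrative): "(x << n) | (x >> (32 - n))",
// which is rewritten into a single rotate-left operation when the platform
// supports rotate instructions.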
if (ctx.platform.supportRotateInstruction()) {
mpm.add(createGenRotatePass());
}
mpm.add(createReplaceUnsupportedIntrinsicsPass());
if (IGC_IS_FLAG_DISABLED(DisablePromoteToDirectAS) && !ctx.getModuleMetaData()->compOpt.IsLibraryCompilation) {
// Promotes indirect resource access to direct
mpm.add(new BreakConstantExpr());
mpm.add(new PromoteResourceToDirectAS());
}
if (!isOptDisabled) {
mpm.add(createPruneUnusedArgumentsPass());
}
if (ctx.m_instrTypes.hasReadOnlyArray) {
mpm.add(createDeadCodeEliminationPass());
mpm.add(createSROAPass());
}
if (ctx.m_instrTypes.hasGenericAddressSpacePointers) {
if (IGC_IS_FLAG_ENABLED(EnableGASResolver)) {
mpm.add(createSROAPass());
mpm.add(createFixAddrSpaceCastPass());
mpm.add(createResolveGASPass());
}
mpm.add(createGenericAddressDynamicResolutionPass());
mpm.add(createDeadCodeEliminationPass());
mpm.add(createGenericNullPtrPropagationPass());
}
// Promote private memory to registers
if (!isOptDisabled) {
// In case of late inlining of Unmasked functions, allocate non-primitive
// Allocas after inlining is done. Otherwise there is a possibility that
// RegAlloc cannot allocate registers for all virtual registers. This piece
// of code is copied at the place where inlining is done.
if (ctx.m_instrTypes.hasNonPrimitiveAlloca &&
!(IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc))) {
mpm.add(createBreakCriticalEdgesPass());
mpm.add(createAnnotateUniformAllocasPass());
if (IGC_IS_FLAG_DISABLED(DisablePromotePrivMem) &&
!isOptDisabledForModule(ctx.getModuleMetaData(), IGCOpts::LowerGEPForPrivMemPass)) {
mpm.add(createPromotePrivateArrayToReg());
mpm.add(createCFGSimplificationPass());
}
}
mpm.add(createPromoteMemoryToRegisterPass());
} else {
if (IGC_IS_FLAG_ENABLED(AllowMem2Reg))
mpm.add(createPromoteMemoryToRegisterPass());
}
if (ctx.type == ShaderType::OPENCL_SHADER || ctx.type == ShaderType::COMPUTE_SHADER) {
if (IGC_IS_FLAG_ENABLED(ForceAllPrivateMemoryToSLM)) {
mpm.add(new PrivateMemoryToSLM(IGC_IS_FLAG_ENABLED(EnableOptReportPrivateMemoryToSLM)));
mpm.add(createInferAddressSpacesPass());
} else if (IGC_IS_FLAG_ENABLED(ForcePrivateMemoryToSLMOnBuffers)) {
std::string forcedBuffers(IGC_GET_REGKEYSTRING(ForcePrivateMemoryToSLMOnBuffers));
mpm.add(new PrivateMemoryToSLM(std::move(forcedBuffers), IGC_IS_FLAG_ENABLED(EnableOptReportPrivateMemoryToSLM)));
mpm.add(createInferAddressSpacesPass());
}
}
if (ctx.m_instrTypes.numOfLoop) {
// need to run loop simplify to canonicalize loop and merge latches
mpm.add(createLoopCanonicalization());
mpm.add(createLoopSimplifyPass());
if (!IGC_IS_FLAG_ENABLED(DisableLoopSplitWidePHIs))
mpm.add(createLoopSplitWidePHIs());
}
if (IGC_IS_FLAG_ENABLED(StackOverflowDetection)) {
// Cleanup stack overflow detection calls if necessary.
mpm.add(new StackOverflowDetectionPass(StackOverflowDetectionPass::Mode::AnalyzeAndCleanup));
}
if (ctx.enableFunctionCall() || ctx.type == ShaderType::RAYTRACING_SHADER) {
// Sort functions if subroutine/indirect fcall is enabled.
mpm.add(llvm::createGlobalDCEPass());
mpm.add(new PurgeMetaDataUtils());
mpm.add(createGenXCodeGenModulePass());
}
// Remove all uses of implicit arg intrinsics after inlining by lowering them to kernel args
mpm.add(new LowerImplicitArgIntrinsics());
// Resolving private memory allocas
// In case of late inlining of Unmasked functions, postpone memory
// resolution until inlining is done, as new Allocas are created
// during inlining.
if (!(IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc))) {
if (IGC_IS_FLAG_DISABLED(DisableMergeAllocasPrivateMemory) && ctx.type == ShaderType::OPENCL_SHADER) {
mpm.add(createMergeAllocasOCL());
}
if (ctx.type == ShaderType::OPENCL_SHADER && !isOptDisabled && IGC_IS_FLAG_ENABLED(EnableExplicitCopyForByVal)) {
mpm.add(new LowerByValAttribute());
mpm.add(createReplaceUnsupportedIntrinsicsPass());
}
mpm.add(CreatePrivateMemoryResolution());
}
// Should help MemOpt pass to merge more loads
mpm.add(createSinkCommonOffsetFromGEPPass());
// Run MemOpt
if (!isOptDisabled && ctx.m_instrTypes.hasLoadStore && IGC_IS_FLAG_DISABLED(DisableMemOpt) &&
!ctx.getModuleMetaData()->disableMemOptforNegativeOffsetLoads) {
if ((ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) &&
IGC_IS_FLAG_DISABLED(DisablePrepareLoadsStores)) {
mpm.add(createPrepareLoadsStoresPass());
}
// run AdvMemOpt and MemOpt back-to-back so that we only
// need to run WIAnalysis once
if (IGC_IS_FLAG_ENABLED(EnableAdvMemOpt))
mpm.add(createAdvMemOptPass());
if (doLdStCombine(&ctx)) {
// Once it is stable, 64-bit stores/loads will no longer be split.
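// Illustrative example (assuming the pass proves the accesses contiguous and
// mergeable): two adjacent dword stores
//   store i32 %a, ptr %p         ; offset 0
//   store i32 %b, ptr %p.plus4   ; offset 4
// can be combined into a single wider (64-bit or vector) store.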
mpm.add(createLdStCombinePass());
}
bool AllowNegativeSymPtrsForLoad =
ctx.type == ShaderType::OPENCL_SHADER;
bool AllowVector8LoadStore =
IGC_IS_FLAG_ENABLED(EnableVector8LoadStore) ||
((ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) && ctx.platform.supports8DWLSCMessage());
mpm.add(createMemOptPass(AllowNegativeSymPtrsForLoad, AllowVector8LoadStore));
if ((ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) &&
IGC_IS_FLAG_ENABLED(EnableLSCCacheOptimization)) {
// Optimize store instructions for utilizing the LSC-L1 cache.
// This only runs for shaders with raytracing functionality.
mpm.add(createLSCCacheOptimizationPass());
}
mpm.add(createIGCInstructionCombiningPass());
}
if (ctx.hasSyncRTCalls()) {
mpm.add(createRaytracingStatefulPass());
}
if (ctx.type == ShaderType::OPENCL_SHADER &&
static_cast<OpenCLProgramContext &>(ctx).m_InternalOptions.PromoteStatelessToBindless) {
if (static_cast<OpenCLProgramContext &>(ctx).m_InternalOptions.UseBindlessLegacyMode) {
mpm.add(new PromoteStatelessToBindless());
} else if (!ctx.getModuleMetaData()->compOpt.GreaterThan4GBBufferRequired && !isOptDisabled) {
// Advanced bindless mode used by the regular OpenCL compilation path
mpm.add(new StatelessToStateful(TargetAddressing::BINDLESS));
}
}
if (!isOptDisabled && ctx.useStatelessToStateful()) {
mpm.add(new StatelessToStateful(TargetAddressing::BINDFUL));
}
// Light cleanup for subroutines after cloning. Note that the constant
// propagation order is reversed, compared to the opt sequence in
// OptimizeIR. There is a substantial gain with CFG simplification after
// interprocedural constant propagation.
if (ctx.m_enableSubroutine && !isOptDisabled) {
mpm.add(createPruneUnusedArgumentsPass());
const bool allowIPConstProp = !ctx.m_hasStackCalls && IGC_IS_FLAG_DISABLED(DisableIPConstantPropagation);
if (allowIPConstProp) {
mpm.add(createIPSCCPPass());
}
mpm.add(createDeadCodeEliminationPass());
mpm.add(createCFGSimplificationPass());
}
// Since we don't support switch statements, switch lowering is needed after the last CFG simplification
mpm.add(llvm::createLowerSwitchPass());
// This pass can create constant expression
if (ctx.m_DriverInfo.HasDoubleLoadStore()) {
mpm.add(new HandleLoadStoreInstructions());
}
// Split big vector & 3-element load/store, etc.
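// For instance (hedged sketch, not the exact split): a <3 x float> load may be
// broken into a <2 x float> load plus a scalar float load, and very wide vectors
// are split into legal-sized pieces before further lowering.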
mpm.add(createVectorPreProcessPass());
// Create Gen IR lowering.
// To replace SLM pointers if they are constants, break constant expressions first.
if (ctx.m_instrTypes.hasLocalLoadStore) {
mpm.add(new BreakConstantExpr());
}
bool KeepGEPs;
// In case of late inlining of Unmasked functions, postpone memory
// resolution until inlining is done, as new Allocas are created
// during inlining.
if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc)) {
KeepGEPs = true;
} else {
KeepGEPs = false;
}
mpm.add(createGenIRLowerPass());
if (KeepGEPs) {
mpm.add(createSeparateConstOffsetFromGEPPass());
} else {
// Also break and lower GEP constexpr.
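// Illustrative effect of GEP lowering: "getelementptr float, ptr %base, i32 %i"
// becomes explicit address arithmetic, roughly %addr = %base + (%i * 4), so later
// passes and the emitter work on plain integer address computations.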
mpm.add(new BreakConstantExpr());
mpm.add(createGEPLoweringPass());
}
mpm.add(new WorkaroundAnalysis());
if (!isOptDisabled) {
// Removing code assumptions can enable some InstructionCombining optimizations.
// Last instruction combining pass needs to be before Legalization pass, as it can produce illegal instructions.
mpm.add(new RemoveCodeAssumptions());
mpm.add(createIGCInstructionCombiningPass());
if (ctx.platform.doIntegerMad() && ctx.m_DriverInfo.EnableIntegerMad()) {
mpm.add(createCanonicalizeMulAddPass());
}
mpm.add(new GenSpecificPattern());
// Cases with DPDivSqrtEmu grow significantly.
// We can disable EarlyCSE when m_hasDPDivSqrtEmu is true,
// which causes the values to have shorter lifetimes so we can avoid spills.
if (!fastCompile && !highAllocaPressure && !isPotentialHPCKernel && !ctx.m_hasDPDivSqrtEmu) {
mpm.add(createEarlyCSEPass());
} else if (highAllocaPressure || isPotentialHPCKernel) {
mpm.add(createSinkingPass());
}
if (!fastCompile && !highAllocaPressure && !isPotentialHPCKernel && IGC_IS_FLAG_ENABLED(allowLICM) &&
ctx.m_retryManager.AllowLICM()) {
mpm.add(createSpecialCasesDisableLICM());
mpm.add(llvm::createLICMPass(100, 500, true));
mpm.add(llvm::createEarlyCSEPass());
}
mpm.add(createAggressiveDCEPass());
// As the DPC++ FE applies LICM, in some cases we cannot reduce register pressure
// just by turning off LICM in IGC, so apply address arithmetic sinking as well.
if ((IGC_IS_FLAG_ENABLED(ForceAddressArithSinking) ||
!isOptDisabledForModule(ctx.getModuleMetaData(), IGCOpts::AddressArithmeticSinkingPass)) &&
ctx.type == ShaderType::OPENCL_SHADER) {
mpm.add(new AddressArithmeticSinking());
}
}
// Enabling half promotion AIL for compute shaders only at this point.
// If needed, the ctx.type check can be removed to apply it to all shader types.
if (IGC_IS_FLAG_ENABLED(ForceHalfPromotion) ||
(ctx.getModuleMetaData()->compOpt.WaForceHalfPromotionComputeShader && ctx.type == ShaderType::COMPUTE_SHADER) ||
(ctx.getModuleMetaData()->compOpt.WaForceHalfPromotionPixelVertexShader &&
(ctx.type == ShaderType::PIXEL_SHADER || ctx.type == ShaderType::VERTEX_SHADER)) ||
(!ctx.platform.supportFP16() && IGC_IS_FLAG_ENABLED(EnableHalfPromotion))) {
mpm.add(new HalfPromotion());
if (IGC_IS_FLAG_ENABLED(EnableGVN)) {
mpm.add(createGVNPass());
}
mpm.add(createDeadCodeEliminationPass());
}
if (IGC_IS_FLAG_ENABLED(ForceNoInfiniteLoops)) {
mpm.add(createLoopSimplifyPass());
mpm.add(new CapLoopIterations(UINT_MAX));
}
// Run address remat after GVN as it may hoist address calculations and
// create PHI nodes with addresses.
if (IGC_IS_FLAG_ENABLED(RematEnable) ||
(ctx.m_retryManager.AllowCloneAddressArithmetic() && ctx.type == ShaderType::OPENCL_SHADER)) {
if (IGC_GET_FLAG_VALUE(RematInstCombineBefore))
mpm.add(createIGCInstructionCombiningPass());
// TODO: This is a workaround that helps to reduce the number of instructions for clone address arithmetic.
// It helps with chains of instructions like this:
// %remat12 = add i64 %baseArith, 100780848
// %remat13 = add i64 %remat12, %basePtr
// %remat14 = add i64 %remat13, %offsetI
// %remat15 = add i64 %remat14, %offsetJ
// load ...
// ....
// %remat21 = add i64 %baseArith, 201561696
// %remat22 = add i64 %remat21, %basePtr
// %remat23 = add i64 %remat22, %offsetI
// %remat24 = add i64 %remat23, %offsetJ
// load ...
// we can compress this chain of instructions into one "add" for each "load"
// this is achieved by combining reassoc + cse 3 times (each pair hoists one add)
// it should be replaced by a general pass when one is implemented
//
// For now it's accessible through a flag, for testing purposes
if (IGC_GET_FLAG_VALUE(RematReassocBefore)) {
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
}
mpm.add(createCloneAddressArithmeticPass());
// CloneAddressArithmetic leaves the old instructions unused;
// the DCE pass helps to clean that up
mpm.add(createDeadCodeEliminationPass());
if (IGC_IS_FLAG_SET(DumpRegPressureEstimate))
mpm.add(new IGCRegisterPressurePrinter("after_remat"));
} else if (ctx.m_retryManager.AllowCloneAddressArithmetic() && IGC_GET_FLAG_VALUE(RematOptionsForRetry) ||
ctx.platform.supportsVRT() && IGC_GET_FLAG_VALUE(RematOptionsForVRT)) {
if (IGC_GET_FLAG_VALUE(RematInstCombineBefore))
mpm.add(createIGCInstructionCombiningPass());
// see comment above
if (IGC_GET_FLAG_VALUE(RematReassocBefore)) {
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
}
// if both retry and VRT checks go through, retry is more important
auto rematOptions = ctx.m_retryManager.AllowCloneAddressArithmetic() && IGC_GET_FLAG_VALUE(RematOptionsForRetry)
? static_cast<IGC::REMAT_OPTIONS>(IGC_GET_FLAG_VALUE(RematOptionsForRetry))
: static_cast<IGC::REMAT_OPTIONS>(IGC_GET_FLAG_VALUE(RematOptionsForVRT));
mpm.add(createCloneAddressArithmeticPassWithFlags(rematOptions));
// CloneAddressArithmetic leaves the old instructions unused;
// the DCE pass helps to clean that up
mpm.add(createDeadCodeEliminationPass());
if (IGC_IS_FLAG_SET(DumpRegPressureEstimate))
mpm.add(new IGCRegisterPressurePrinter("after_remat"));
}
mpm.add(createRematAddressArithmeticPass());
// Run type demotion if it's beneficial.
if (ctx.m_DriverInfo.benefitFromTypeDemotion() && IGC_IS_FLAG_ENABLED(EnableTypeDemotion)) {
mpm.add(createTypeDemotePass());
}
// Do GenX strength reduction (do things like fdiv -> inv + mul)
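// Illustrative example: "%q = fdiv float %a, %b" becomes a reciprocal of %b
// followed by "%q = fmul float %a, %inv", when precision requirements allow it.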
if (!isOptDisabled) {
mpm.add(createGenStrengthReductionPass());
mpm.add(createVectorBitCastOptPass());
}
bool forceUniformSurfaceSampler = ctx.getModuleMetaData()->compOpt.ForceUniformSurfaceSampler;
bool forceUniformBuffer = ctx.getModuleMetaData()->compOpt.ForceUniformBuffer;
if (ctx.m_instrTypes.hasUniformAssumptions || IGC_IS_FLAG_ENABLED(ForceUniformSurfaceSampler) ||
forceUniformSurfaceSampler || IGC_IS_FLAG_ENABLED(ForceUniformBuffer) || forceUniformBuffer) {
mpm.add(new UniformAssumptions(IGC_IS_FLAG_ENABLED(ForceUniformSurfaceSampler) || forceUniformSurfaceSampler,
IGC_IS_FLAG_ENABLED(ForceUniformBuffer) || forceUniformBuffer));
}
// NanHandlingPass needs to be before Legalization since it might make
// some changes and require Legalization to "legalize" them
if (IGC_IS_FLAG_DISABLED(DisableBranchSwaping) && ctx.m_DriverInfo.BranchSwapping()) {
mpm.add(createNanHandlingPass());
}
// TODO: move to use instruction flags
// to figure out if we need to preserve Nan
bool preserveNan = !ctx.getCompilerOption().NoNaNs;
// Legalizer does not handle constant expressions
mpm.add(new BreakConstantExpr());
mpm.add(new Legalization(preserveNan));
// Scalarizer in codegen to handle the vector instructions
mpm.add(new ScalarizerCodeGen());
// Coalesce scalar loads into loads of a larger quantity.
// This requires and preserves uniform analysis; we should keep other passes
// that use uniformness together to avoid re-running it several times.
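// Illustrative example: several scalar loads from consecutive constant-buffer
// offsets (say offsets 0, 4, 8 and 12 of the same buffer) can be coalesced into
// one wider load whose elements are then extracted, assuming uniformity and
// alignment permit it.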
if (IGC_IS_FLAG_DISABLED(DisableConstantCoalescing) && !ctx.getModuleMetaData()->compOpt.DisableConstantCoalescing &&
!isOptDisabledForModule(ctx.getModuleMetaData(), IGCOpts::ConstantCoalescingPass)) {
mpm.add(createBreakCriticalEdgesPass());
mpm.add(new ConstantCoalescing());
}
if (ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) {
if (IGC_IS_FLAG_DISABLED(DisableLSCControlsForRayTracing))
mpm.add(CreateLSCControlsAnalysisPass());
// We do raytracing lowering a little earlier than the others here
// to take advantage of the instruction simplification below.
mpm.add(CreateRayTracingShaderLowering());
}
// Instruction combining may merge instructions back into unsupported intrinsics.
// Therefore the last ReplaceUnsupportedIntrinsics pass must run after the last
// instruction combining pass.
// ReplaceUnsupportedIntrinsics may generate new 64-bit operations.
// Therefore the last 64-bit emulation pass must run after the last ReplaceUnsupportedIntrinsics pass.
mpm.add(createReplaceUnsupportedIntrinsicsPass());
if (!ctx.platform.hasFP32GlobalAtomicAdd()) {
mpm.add(new AtomicOptPass());
}
// When m_hasDPEmu is true, enable Emu64Ops as well for now until
// DPEmu is able to get rid of all 64bit integer ops fully.
if ((ctx.m_hasDPEmu && IGC_IS_FLAG_ENABLED(DPEmuNeedI64Emu)) ||
(ctx.m_DriverInfo.Enable64BitEmu() &&
(IGC_GET_FLAG_VALUE(Enable64BitEmulation) ||
(IGC_GET_FLAG_VALUE(Enable64BitEmulationOnSelectedPlatform) && ctx.platform.need64BitEmulation()))) ||
ctx.platform.hasPartialInt64Support()) {
mpm.add(new BreakConstantExpr());
// Emu64OpsPass requires that we are working on legal types, specifically
// that i128 uses are expanded to i64. This is why we need to run PeepholeTypeLegalizer
// beforehand.
mpm.add(new Legalizer::PeepholeTypeLegalizer());
// Lower all GEPs now as Emu64 doesn't know how to handle them.
if (KeepGEPs) {
mpm.add(createGEPLoweringPass());
mpm.add(llvm::createEarlyCSEPass());
}
// Run dead code elimination pass right before Emu64OpsPass,
// as legalization passes do not always clear unused (operating
// on illegal types) instructions.
mpm.add(llvm::createDeadCodeEliminationPass());
if (ctx.type == ShaderType::OPENCL_SHADER && IGC_IS_FLAG_ENABLED(EnableKernelCostInfo)) {
mpm.add(createLoopCountAnalysisPass());
}
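// Illustrative sketch of the emulation: an i64 add is expanded into two i32 adds
// plus a carry, and other i64 operations (mul, shifts, compares) become sequences
// of i32 operations; the partial variant below is meant to expand only the subset
// the platform lacks natively.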
if (ctx.platform.hasPartialEmuI64Enabled()) {
mpm.add(createPartialEmuI64OpsPass());
} else {
mpm.add(createEmu64OpsPass());
}
ctx.m_hasEmu64BitInsts = true;
if (!isOptDisabled) {
mpm.add(new GenSpecificPattern());
}
}
if (ctx.m_instrTypes.hasRuntimeValueVector) {
// Legalize RuntimeValue calls for push analysis
mpm.add(new RuntimeValueLegalizationPass());
}
if ((ctx.m_instrTypes.hasLocalLoadStore || ctx.m_instrTypes.hasLocalAtomics) && ctx.platform.hasLSC() &&
!ctx.platform.NeedsLSCFenceUGMBeforeEOT() && // VISA will add the fence
IGC_IS_FLAG_DISABLED(DisableAddRequiredMemoryFencesPass)) {
mpm.add(createAddRequiredMemoryFencesPass());
}
mpm.add(createInstSimplifyLegacyPass());
// This pass inserts bitcasts for vector loads/stores.
// This pass could be moved further toward EmitPass.
mpm.add(createVectorProcessPass());
// Handle constant expressions created by the VectorProcess pass
mpm.add(new BreakConstantExpr());
mpm.add(new LowPrecisionOpt());
mpm.add(new WAFMinFMax());
mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Legalization, STATS_COUNTER_END));
COMPILER_TIME_END(&ctx, TIME_CG_Add_Legalization_Passes);
} // AddLegalizationPasses
void AddCodeGenPasses(CodeGenContext &ctx, CShaderProgram::KernelShaderMap &shaders, IGCPassManager &Passes,
SIMDMode simdMode, bool canAbortOnSpill, ShaderDispatchMode shaderMode, PSSignature *pSignature) {
// Generate CISA
COMPILER_TIME_START(&ctx, TIME_CG_Add_CodeGen_Passes);
Passes.add(new EmitPass(shaders, simdMode, canAbortOnSpill, shaderMode, pSignature));
COMPILER_TIME_END(&ctx, TIME_CG_Add_CodeGen_Passes);
}
// check based on performance measures.
bool SimdEarlyCheck(CodeGenContext *ctx) {
if (ctx->m_sampler < 11 || ctx->m_inputCount < 16 || ctx->m_tempCount < 40 || ctx->m_dxbcCount < 280 ||
ctx->m_ConstantBufferCount < 500) {
if (ctx->m_tempCount < 90 && ctx->m_ConstantBufferCount < 210) {
return true;
}
}
return false;
}
void destroyShaderMap(CShaderProgram::KernelShaderMap &shaders) {
for (const auto &i : shaders) {
CShaderProgram *shader = i.second;
COMPILER_SHADER_STATS_PRINT(shader->m_shaderStats, shader->GetContext()->type, shader->GetContext()->hash, "");
COMPILER_SHADER_STATS_SUM(shader->GetContext()->m_sumShaderStats, shader->m_shaderStats,
shader->GetContext()->type);
COMPILER_SHADER_STATS_DEL(shader->m_shaderStats);
delete shader;
}
}
void unify_opt_PreProcess(CodeGenContext *pContext) {
TODO("hasBuiltin should be calculated based on module");
if (IGC_IS_FLAG_ENABLED(DisableLLVMGenericOptimizations)) {
pContext->getModuleMetaData()->compOpt.OptDisable = true;
}
if (IGC_GET_FLAG_VALUE(StripDebugInfo) == FLAG_DEBUG_INFO_STRIP_ALL) {
StripDebugInfo(*pContext->getModule());
} else if (IGC_GET_FLAG_VALUE(StripDebugInfo) == FLAG_DEBUG_INFO_STRIP_NONLINE) {
stripNonLineTableDebugInfo(*pContext->getModule());
}
IGCPassManager mpm(pContext, "OPTPre");
mpm.add(new CodeGenContextWrapper(pContext));
mpm.add(new CheckInstrTypes(false, true));
if (pContext->isPOSH()) {
mpm.add(createRemoveNonPositionOutputPass());
}
mpm.run(*pContext->getModule());
// If the module does not contain called function declarations,
// indirect calls are the only way to detect function pointer usage.
if (pContext->m_instrTypes.hasIndirectCall)
pContext->m_enableFunctionPointer = true;
if (pContext->getMetaDataUtils()->size_FunctionsInfo() == 1 && !pContext->m_instrTypes.hasSubroutines) {
pContext->m_instrTypes.numBB = pContext->getMetaDataUtils()->begin_FunctionsInfo()->first->size();
pContext->m_instrTypes.hasMultipleBB = (pContext->m_instrTypes.numBB != 1);
} else {
pContext->m_instrTypes.hasMultipleBB = true;
}
pContext->m_instrTypes.hasLoadStore = true;
pContext->m_instrTypes.CorrelatedValuePropagationEnable =
(pContext->m_instrTypes.hasMultipleBB &&
(pContext->m_instrTypes.hasSel || pContext->m_instrTypes.hasCmp || pContext->m_instrTypes.hasSwitch ||
pContext->m_instrTypes.hasLoadStore));
}
static bool extensiveShader(CodeGenContext *pContext) {
return (pContext->type == ShaderType::OPENCL_SHADER && pContext->m_instrTypes.numInsts > INST_THRESHOLD &&
pContext->m_instrTypes.numLoopInsts > LOOP_INST_THRESHOLD &&
pContext->m_instrTypes.numOfLoop > LOOP_NUM_THRESHOLD && pContext->m_instrTypes.numBB == 0 &&
pContext->m_instrTypes.numSample == 0 && pContext->m_instrTypes.hasSubroutines);
}
// When we do not run optimizations, we still need to run the always-inline
// pass, otherwise codegen will fail.
static void alwaysInlineForNoOpt(CodeGenContext *pContext, bool NoOpt) {
if (NoOpt) {
MetaDataUtils *pMdUtils = pContext->getMetaDataUtils();
IGCPassManager mpm(pContext, "OPTPost");
mpm.add(new MetaDataUtilsWrapper(pMdUtils, pContext->getModuleMetaData()));
mpm.add(new CodeGenContextWrapper(pContext));
mpm.add(createAlwaysInlinerLegacyPass());
mpm.add(new PurgeMetaDataUtils());
mpm.run(*pContext->getModule());
}
}
#define GFX_ONLY_PASS if (pContext->type != ShaderType::OPENCL_SHADER)
void OptimizeIR(CodeGenContext *const pContext) {
IGC_ASSERT(nullptr != pContext);
MetaDataUtils *pMdUtils = pContext->getMetaDataUtils();
IGC_ASSERT(nullptr != pContext->getModuleMetaData());
bool NoOpt = pContext->getModuleMetaData()->compOpt.OptDisable;
DumpHashToOptions(pContext->hash, pContext->type);
alwaysInlineForNoOpt(pContext, NoOpt);
// Insert per-func optimization metadata
for (auto &F : *pContext->getModule()) {
if (!F.empty()) {
IGC::InsertOptsMetadata(pContext, &F);
}
}
if (NoOpt) {
return;
}
IGCPassManager mpm(pContext, "OPT");
#if !defined(_DEBUG)
if (IGC_IS_FLAG_ENABLED(EnableDebugging))
#endif
// do verifyModule for debug/release_internal only.
if (false == pContext->m_hasLegacyDebugInfo) {
IGC_ASSERT(nullptr != pContext->getModule());
IGC_ASSERT(false == llvm::verifyModule(*pContext->getModule(), &dbgs()));
}
COMPILER_TIME_START(pContext, TIME_OptimizationPasses);
// scope to force destructors before mem usage sampling
{
unify_opt_PreProcess(pContext);
/// Keeps track of the Dump objects so that we can free them after the pass manager has been run
// right now we don't support any standard function in the code gen
// maybe we want to support some at some point to take advantage of LLVM optimizations
TargetLibraryInfoImpl TLI;
TLI.disableAllFunctions();
mpm.add(new MetaDataUtilsWrapper(pMdUtils, pContext->getModuleMetaData()));
mpm.add(new CodeGenContextWrapper(pContext));
TargetIRAnalysis GenTTgetIIRAnalysis([&](const Function &F) {
GenIntrinsicsTTIImpl GTTI(pContext);
return TargetTransformInfo(GTTI);
});
mpm.add(new TargetTransformInfoWrapperPass(GenTTgetIIRAnalysis));
#if defined(_DEBUG) && !defined(__ANDROID__)
// IGC IR Verification pass checks that we get a correct IR after the Unification.
mpm.add(new VerificationPass());
#endif
mpm.add(new llvm::TargetLibraryInfoWrapperPass(TLI));
initializeWIAnalysisPass(*PassRegistry::getPassRegistry());
// Do inter-procedural constant propagation early.
if (pContext->m_enableSubroutine) {
// Here, we propagate function attributes across calls. Remaining
// function calls that were conservatively marked as 'convergent'
// in ProcessBuiltinMetaData can have that attribute stripped if
// possible which potentially allows late stage code sinking of
// those calls by the instruction combiner.
mpm.add(createPostOrderFunctionAttrsLegacyPass());
// Don't run IPConstantProp if there are stackcalls
const bool allowIPConstProp = !pContext->m_hasStackCalls && IGC_IS_FLAG_DISABLED(DisableIPConstantPropagation);
if (allowIPConstProp) {
mpm.add(createIPSCCPPass());
}
// Note / todo: LLVM < 12 also runs simple constant propagation pass
// regardless of IPSCCP in this case. This pass is not available on
// >= 12 version, but maybe SCCP pass would be suitable here.
}
if (IGC_IS_FLAG_ENABLED(MSAA16BitPayloadEnable) && pContext->platform.support16bitMSAAPayload()) {
mpm.add(new ConvertMSAAPayloadTo16Bit());
}
if (IGC_GET_FLAG_VALUE(MSAAClearedKernel) > 0) {
mpm.add(new MSAAInsertDiscard());
}
mpm.add(createSamplerPerfOptPass());
if ((!IGC_IS_FLAG_ENABLED(DisableDynamicTextureFolding) &&
pContext->getModuleMetaData()->inlineDynTextures.size() > 0) ||
(!IGC_IS_FLAG_ENABLED(DisableDynamicResInfoFolding))) {
mpm.add(new DynamicTextureFolding());
}
if (pContext->m_DriverInfo.CodeSinkingBeforeCFGSimplification()) {
mpm.add(new HoistCongruentPHI());
mpm.add(new CodeSinking());
}
mpm.add(llvm::createCFGSimplificationPass(SimplifyCFGOptions().hoistCommonInsts(true)));
mpm.add(llvm::createBasicAAWrapperPass());
mpm.add(createAddressSpaceAAWrapperPass());
if (pContext->type == ShaderType::RAYTRACING_SHADER || pContext->hasSyncRTCalls()) {
if (IGC_IS_FLAG_DISABLED(DisableRTAliasAnalysis))
mpm.add(createRayTracingAddressSpaceAAWrapperPass());
}
mpm.add(createIGCExternalAAWrapper());
mpm.add(createScopedNoAliasAAWrapperPass());
if (pContext->m_instrTypes.hasLoadStore) {
mpm.add(llvm::createDeadStoreEliminationPass());
mpm.add(createMarkReadOnlyLoadPass());
}
mpm.add(createLogicalAndToBranchPass());
mpm.add(llvm::createEarlyCSEPass());
if (pContext->m_instrTypes.CorrelatedValuePropagationEnable) {
mpm.add(llvm::createCorrelatedValuePropagationPass());
}
mpm.add(new BreakConstantExpr());
mpm.add(new IGCConstProp());
GFX_ONLY_PASS { mpm.add(createTranslateToProgrammableOffsetsPass()); }
mpm.add(new CustomSafeOptPass());
if (!pContext->m_DriverInfo.WADisableCustomPass()) {
mpm.add(new CustomUnsafeOptPass());
}
mpm.add(createSubGroupReductionPatternPass());
if (IGC_IS_FLAG_ENABLED(EmulateFDIV)) {
mpm.add(createGenFDIVEmulation());
}
mpm.add(createIGCInstructionCombiningPass());
if (IGC_IS_FLAG_ENABLED(EnableWaveShuffleIndexSinking)) {
mpm.add(createWaveShuffleIndexSinking());
}
mpm.add(new FCmpPaternMatch());
mpm.add(llvm::createDeadCodeEliminationPass()); // this should be done both before/after constant propagation
if (pContext->m_instrTypes.hasGenericAddressSpacePointers && IGC_IS_FLAG_ENABLED(EnableGASResolver)) {
mpm.add(createSROAPass());
mpm.add(createFixAddrSpaceCastPass());
mpm.add(createResolveGASPass());
}
if (pContext->m_instrTypes.hasNonPrimitiveAlloca && IGC_IS_FLAG_DISABLED(DisableShrinkArrayAllocaPass)) {
mpm.add(new ShrinkArrayAllocaPass());
}
if (IGC_IS_FLAG_ENABLED(SampleMultiversioning) || pContext->m_enableSampleMultiversioning) {
if (pContext->m_instrTypes.numOfLoop == 0)
mpm.add(new SampleMultiversioning(pContext));
}
bool disableGOPT =
((IsStage1FastestCompile(pContext->m_CgFlag, pContext->m_StagingCtx) || IGC_GET_FLAG_VALUE(ForceFastestSIMD)) &&
((FastestS1Options(pContext) & FCEXP_DISABLE_GOPT) || FastestS1Options(pContext) == FCEXP_NO_EXPRIMENT ||
pContext->getModuleMetaData()->compOpt.DisableFastestGopt));
// EnableBarrierControlFlowOptimizationPass: enable BCF optimization
// UseBarrierControlFlowOptimization: UMD AIL option to use BCF optimization
// supportBarrierControlFlowOptimization: API control (D3D12, Vulkan, OCL)
if ((IGC_IS_FLAG_ENABLED(EnableBarrierControlFlowOptimizationPass) ||
pContext->getModuleMetaData()->compOpt.UseBarrierControlFlowOptimization) &&
pContext->m_DriverInfo.supportBarrierControlFlowOptimization() &&
pContext->platform.hasBarrierControlFlowOpt() && !pContext->hasSyncRTCalls() &&
(pContext->type != ShaderType::PIXEL_SHADER)) {
mpm.add(createBarrierControlFlowOptimization());
}
if (pContext->m_instrTypes.hasMultipleBB && !disableGOPT) {
// disable loop unroll for excessively large shaders
if (pContext->m_instrTypes.numOfLoop) {
mpm.add(createLoopDeadCodeEliminationPass());
mpm.add(createLoopCanonicalization());
mpm.add(llvm::createLoopDeletionPass());
mpm.add(llvm::createBreakCriticalEdgesPass());
mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));
mpm.add(llvm::createLCSSAPass());
mpm.add(llvm::createLoopSimplifyPass());
}
}
// This pass needs to be extended for other devices
if (pContext->platform.getPlatformInfo().eProductFamily == IGFX_PVC) {
mpm.add(new GenerateBlockMemOpsPass());
}
mpm.add(new BlockMemOpAddrScalarizationPass());
if (pContext->m_instrTypes.hasMultipleBB && !disableGOPT) {
if (pContext->m_instrTypes.numOfLoop) {
bool allowLICM = IGC_IS_FLAG_ENABLED(allowLICM) && pContext->m_retryManager.AllowLICM();
bool runGEPLSR = IGC_IS_FLAG_ENABLED(EnableGEPLSR) && pContext->type == ShaderType::OPENCL_SHADER &&
pContext->platform.getPlatformInfo().eProductFamily == IGFX_PVC &&
!pContext->useStatelessToStateful() &&
pContext->m_retryManager.IsFirstTry();
if (runGEPLSR && IGC_IS_FLAG_DISABLED(RunGEPLSRAfterLICM)) {
mpm.add(createGEPLoopStrengthReductionPass(allowLICM));
}
if (allowLICM) {
mpm.add(createSpecialCasesDisableLICM());
int licmTh = IGC_GET_FLAG_VALUE(LICMStatThreshold);
mpm.add(new InstrStatistic(pContext, LICM_STAT, InstrStatStage::BEGIN, licmTh));
mpm.add(llvm::createLICMPass(100, 500, true));
mpm.add(new InstrStatistic(pContext, LICM_STAT, InstrStatStage::END, licmTh));
}
if (runGEPLSR && IGC_IS_FLAG_ENABLED(RunGEPLSRAfterLICM)) {
mpm.add(createGEPLoopStrengthReductionPass(allowLICM));
}
if (!pContext->m_retryManager.IsFirstTry() && pContext->type == ShaderType::OPENCL_SHADER) {
mpm.add(new DisableLoopUnrollOnRetry());
}
mpm.add(createIGCInstructionCombiningPass());
if (IGC_IS_FLAG_ENABLED(EnableIndVarSimplification) && pContext->type == ShaderType::OPENCL_SHADER) {
mpm.add(llvm::createIndVarSimplifyPass());
}
if (IGC_IS_FLAG_ENABLED(EnableLoopHoistConstant)) {
mpm.add(createLoopHoistConstant());
}
if (IGC_IS_FLAG_ENABLED(EnableAdvCodeMotion) && pContext->type == ShaderType::OPENCL_SHADER &&
!pContext->m_instrTypes.hasSwitch) {
mpm.add(createAdvCodeMotionPass(IGC_GET_FLAG_VALUE(AdvCodeMotionControl)));
}
mpm.add(createLoopAllocaUpperbound());
int LoopUnrollThreshold = pContext->m_DriverInfo.GetLoopUnrollThreshold();
// override the LoopUnrollThreshold if the registry key is set
if (IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold) != 0) {
LoopUnrollThreshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold);
} else if (pContext->getModuleMetaData()->compOpt.SetLoopUnrollThreshold > 0) {
LoopUnrollThreshold = pContext->getModuleMetaData()->compOpt.SetLoopUnrollThreshold;
}
// if the shader contains indexable_temp, we'll keep unrolling
bool unroll =
!pContext->getModuleMetaData()->compOpt.DisableLoopUnroll && IGC_IS_FLAG_DISABLED(DisableLoopUnroll);
bool hasIndexTemp = (pContext->m_indexableTempSize[0] > 0);
bool disableLoopUnrollStage1 =
IsStage1FastestCompile(pContext->m_CgFlag, pContext->m_StagingCtx) &&
(FastestS1Options(pContext) == FCEXP_NO_EXPRIMENT || (FastestS1Options(pContext) & FCEXP_DISABLE_UNROLL));
if ((LoopUnrollThreshold > 0 && unroll && !disableLoopUnrollStage1) || hasIndexTemp) {
mpm.add(llvm::createLoopUnrollPass(2, false, false, -1, -1, -1, -1, -1, -1));
}
// Due to what looks like a bug in LICM, we need to break the LoopPassManager between
// LoopUnroll and LICM.
mpm.add(createBarrierNoopPass());
if (allowLICM) {
mpm.add(createSpecialCasesDisableLICM());
mpm.add(llvm::createLICMPass(100, 500, true));
}
// Second unrolling with the same threshold.
unroll = !pContext->getModuleMetaData()->compOpt.DisableLoopUnroll && IGC_IS_FLAG_DISABLED(DisableLoopUnroll);
if (LoopUnrollThreshold > 0 && unroll) {
mpm.add(llvm::createLoopUnrollPass(2, false, false, -1, -1, -1, -1, -1, -1));
}
      // Should run after LICM so we can accurately reason about which
      // instructions are loop-dependent. It also needs to run before the
      // next LICM invocation, which will hoist the relevant intrinsics.
if (IGC_GET_FLAG_VALUE(allowDecompose2DBlockFuncs)) {
mpm.add(createDecompose2DBlockFuncsPass());
}
mpm.add(llvm::createLoopLoadEliminationPass());
if (!extensiveShader(pContext) && pContext->m_instrTypes.hasNonPrimitiveAlloca) {
if (pContext->m_DriverInfo.NeedCountSROA()) {
mpm.add(new InstrStatistic(pContext, SROA_PROMOTED, InstrStatStage::BEGIN, 300));
mpm.add(createSROAPass());
mpm.add(new InstrStatistic(pContext, SROA_PROMOTED, InstrStatStage::END, 300));
} else {
mpm.add(createSROAPass());
}
}
}
mpm.add(new SplitStructurePhisPass());
if (IGC_IS_FLAG_ENABLED(EnableRemoveLoopDependency)) {
mpm.add(new RemoveLoopDependency());
}
    // Note:
    // Run the reassociation pass before IGCConstProp(EnableSimplifyGEP)
    // to preserve the expression evaluation order that IGCConstProp
    // creates.
    // Limit this optimization to GPGPU (OpenCL) shaders only, because
    // they tend to have more address computation.
    // Do not apply reordering on vertex shaders, as CustomUnsafeOptPass
    // does.
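    // For illustration (hypothetical input, not taken from this pass):
    // reassociation can regroup (x + 4) + y into (x + y) + 4 so that the
    // constant term can be folded or shared across address computations.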
if (IGC_IS_FLAG_ENABLED(OCLEnableReassociate) && pContext->type == ShaderType::OPENCL_SHADER) {
mpm.add(createReassociatePass());
}
mpm.add(createPromoteConstantStructsPass());
if (IGC_IS_FLAG_ENABLED(EnableGVN)) {
mpm.add(llvm::createGVNPass());
}
mpm.add(createGenOptLegalizer());
mpm.add(llvm::createSCCPPass());
mpm.add(llvm::createDeadCodeEliminationPass());
if (!extensiveShader(pContext))
mpm.add(llvm::createAggressiveDCEPass());
mpm.add(new BreakConstantExpr());
mpm.add(new IGCConstProp(IGC_IS_FLAG_ENABLED(EnableSimplifyGEP)));
    // Now that constant propagation is largely complete, perform an
    // initial evaluation of freeze instructions. We need this to make
    // life easier for subsequent LLVM passes, as passes like
    // InstCombine/SimplifyCFG can sometimes be lazy in checking a freeze
    // operand's validity across more complex instruction chains and
    // simply assume it is safer to refrain from optimizations.
    // TODO: Check if LLVM 15+ provides improvements in that regard,
    // alleviating the need for early freeze evaluation.
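    // Illustrative example (LLVM IR sketch, not taken from this pass): once
    // %x is known to be a well-defined (non-undef, non-poison) value,
    //   %f = freeze i32 %x
    // can simply be replaced by %x, unblocking later folds that would
    // otherwise stop at the freeze.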
mpm.add(createEvaluateFreezePass());
if (IGC_IS_FLAG_DISABLED(DisableImmConstantOpt)) {
// If we have ICBs, need to emit clamp code so OOB access doesn't occur
if (pContext->getModuleMetaData()->immConstant.zeroIdxs.size()) {
mpm.add(createClampICBOOBAccess());
}
GFX_ONLY_PASS { mpm.add(createIGCIndirectICBPropagaionPass()); }
}
GFX_ONLY_PASS { mpm.add(new GenUpdateCB()); }
// Inserting PromoteToPredicatedMemoryAccess after GVN and several
// other passes, to not block optimizations changing LLVM
// load/stores, but before multiple SimplifyCFGs to allow more
// aggressive CFG simplification.
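    // Conceptual sketch (an assumption about the transformation, in pseudo-IR):
    //   if (cond) { v = load p; }  ...  phi [v, then], [prev, else]
    // becomes a single predicated load that yields 'prev' when 'cond' is
    // false, removing the branch around the memory access.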
if (IGC_IS_FLAG_ENABLED(EnablePromoteToPredicatedMemoryAccess)) {
mpm.add(new HoistConvOpToDom());
mpm.add(llvm::createCFGSimplificationPass());
mpm.add(new PromoteToPredicatedMemoryAccess());
}
if (IGC_IS_FLAG_ENABLED(EnableJumpThreading) && !pContext->m_instrTypes.hasAtomics &&
!extensiveShader(pContext)) {
if (pContext->type == ShaderType::OPENCL_SHADER) {
// Add CFGSimplification for clean-up before JumpThreading.
mpm.add(llvm::createCFGSimplificationPass());
}
      // Jump threading currently causes the atomic_flag test from the C11 conformance suite to fail. For now,
      // only do jump threading if we don't have atomics, as using atomics as locks seems to be the most common
      // case of violating the "no independent forward progress" clause from the spec.
      // We need to increase the default duplication threshold since the JumpThreading pass cost estimation does
      // not account for the fact that not all instructions need to be duplicated.
int BBDuplicateThreshold = (pContext->type == ShaderType::OPENCL_SHADER) ? 9 : -1;
#if LLVM_VERSION_MAJOR >= 15
// In LLVM-12.x an extra parameter InsertFreezeWhenUnfoldingSelect = false was added
// to JumpThreading pass, but since LLVM-15.x it was removed again.
mpm.add(llvm::createJumpThreadingPass(BBDuplicateThreshold));
#else // LLVM_VERSION_MAJOR
mpm.add(llvm::createJumpThreadingPass(false, BBDuplicateThreshold));
#endif // LLVM_VERSION_MAJOR
}
mpm.add(llvm::createCFGSimplificationPass());
mpm.add(llvm::createEarlyCSEPass());
if (pContext->m_instrTypes.hasNonPrimitiveAlloca) {
// run custom safe opts to potentially get rid of indirect
// addressing of private arrays, see visitLoadInst
mpm.add(new CustomSafeOptPass());
mpm.add(createSROAPass());
}
// Use CFGSimplification to do clean-up. Needs to be invoked before lowerSwitch.
mpm.add(llvm::createCFGSimplificationPass());
if (IGC_IS_FLAG_DISABLED(DisableFlattenSmallSwitch)) {
mpm.add(createFlattenSmallSwitchPass());
}
    // some optimizations can create switch statements we don't support
mpm.add(llvm::createLowerSwitchPass());
    // Preferably added after all LowerSwitch pass runs, as switch lowering can benefit
    // from an unreachable instruction in the default switch case.
mpm.add(new UnreachableHandling());
// Conditions apply just as above due to problems with atomics
// (see comment above for details).
if (IGC_IS_FLAG_ENABLED(EnableJumpThreading) && !pContext->m_instrTypes.hasAtomics &&
!extensiveShader(pContext)) {
// After lowering 'switch', run jump threading to remove redundant jumps.
mpm.add(llvm::createJumpThreadingPass());
}
// run instruction combining to clean up the code after CFG optimizations
mpm.add(createIGCInstructionCombiningPass());
mpm.add(llvm::createDeadCodeEliminationPass());
mpm.add(llvm::createEarlyCSEPass());
    // needs to run before code sinking
GFX_ONLY_PASS { mpm.add(createInsertBranchOptPass()); }
mpm.add(new CustomSafeOptPass());
if (!pContext->m_DriverInfo.WADisableCustomPass()) {
mpm.add(new CustomUnsafeOptPass());
}
} else {
if (pContext->m_instrTypes.hasMultipleBB) {
assert(disableGOPT);
      // disable loop unroll for excessively large shaders
if (pContext->m_instrTypes.numOfLoop) {
mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));
int LoopUnrollThreshold = pContext->m_DriverInfo.GetLoopUnrollThreshold();
// override the LoopUnrollThreshold if the registry key is set
if (IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold) != 0) {
LoopUnrollThreshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold);
} else if (pContext->getModuleMetaData()->compOpt.SetLoopUnrollThreshold > 0) {
LoopUnrollThreshold = pContext->getModuleMetaData()->compOpt.SetLoopUnrollThreshold;
}
        // if the shader contains indexable_temp, we'll keep unrolling enabled
bool unroll =
!pContext->getModuleMetaData()->compOpt.DisableLoopUnroll && IGC_IS_FLAG_DISABLED(DisableLoopUnroll);
bool hasIndexTemp = (pContext->m_indexableTempSize[0] > 0);
        // Enable loop unrolling for stage 1 for now due to persistent regressions
bool disableLoopUnrollStage1 = IsStage1FastestCompile(pContext->m_CgFlag, pContext->m_StagingCtx) &&
( // FastestS1Options(pContext) == FCEXP_NO_EXPRIMENT ||
(FastestS1Options(pContext) & FCEXP_DISABLE_UNROLL));
if ((LoopUnrollThreshold > 0 && unroll && !disableLoopUnrollStage1) || hasIndexTemp) {
mpm.add(llvm::createLoopUnrollPass(2, false, false, -1, -1, -1, -1, -1, -1));
}
}
if (IGC_IS_FLAG_ENABLED(EnableGVN)) {
mpm.add(llvm::createGVNPass());
}
}
if (IGC_IS_FLAG_DISABLED(DisableImmConstantOpt)) {
// If we have ICBs, need to emit clamp code so OOB access
// doesn't occur
if (pContext->getModuleMetaData()->immConstant.zeroIdxs.size()) {
mpm.add(createClampICBOOBAccess());
}
GFX_ONLY_PASS { mpm.add(createIGCIndirectICBPropagaionPass()); }
}
// single basic block
if (!pContext->m_DriverInfo.WADisableCustomPass()) {
mpm.add(llvm::createEarlyCSEPass());
mpm.add(new CustomSafeOptPass());
mpm.add(new CustomUnsafeOptPass());
}
mpm.add(createGenOptLegalizer());
GFX_ONLY_PASS { mpm.add(createInsertBranchOptPass()); }
}
// If we have ICBs, need to emit clamp code so OOB access doesn't occur
if (pContext->getModuleMetaData()->immConstant.zeroIdxs.size() && IGC_IS_FLAG_ENABLED(DisableImmConstantOpt)) {
mpm.add(createClampICBOOBAccess());
}
if (pContext->m_instrTypes.hasRuntimeValueVector) {
      // Optimize extracts from RuntimeValue vectors. This should be executed
      // after constant propagation and loop unrolling.
mpm.add(createVectorBitCastOptPass());
mpm.add(new RuntimeValueVectorExtractPass());
}
if (pContext->m_enableSubroutine && getFunctionControl(pContext) == FLAG_FCALL_DEFAULT) {
mpm.add(createEstimateFunctionSizePass(EstimateFunctionSize::AL_Kernel));
if (IGC_IS_FLAG_ENABLED(EnableLargeFunctionCallMerging)) {
mpm.add(new CallMerger());
}
mpm.add(createEstimateFunctionSizePass(EstimateFunctionSize::AL_Kernel));
mpm.add(createSubroutineInlinerPass());
} else {
// Inline all remaining functions with always inline attribute.
mpm.add(createAlwaysInlinerLegacyPass());
}
if ((pContext->m_DriverInfo.NeedExtraPassesAfterAlwaysInlinerPass() || pContext->m_enableSubroutine) &&
pContext->m_instrTypes.hasNonPrimitiveAlloca) {
mpm.add(createSROAPass());
}
if (pContext->type == ShaderType::COMPUTE_SHADER &&
(IGC_IS_FLAG_ENABLED(RemoveUnusedTGMFence) || pContext->getModuleMetaData()->enableRemoveUnusedTGMFence)) {
mpm.add(new TrivialUnnecessaryTGMFenceElimination());
}
mpm.add(createGenSimplificationPass());
if (pContext->m_instrTypes.hasLoadStore) {
mpm.add(llvm::createDeadStoreEliminationPass());
mpm.add(llvm::createMemCpyOptPass());
mpm.add(createLdShrinkPass());
}
mpm.add(llvm::createDeadCodeEliminationPass());
if (IGC_IS_FLAG_ENABLED(EnableWaveAllJointReduction)) {
mpm.add(createWaveAllJointReduction());
}
if (IGC_IS_FLAG_ENABLED(EnableIntDivRemCombine)) {
      // simplify rem if the quotient is available
      //
      // run GVN first so that cases like the following can be
      // reduced as well:
      //   = foo / (2*x + 1)
      //   = foo % (2*x + 1)
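      // Once the quotient q = foo / (2*x + 1) is available, the remainder can
      // be recomputed without a second divide as foo - q * (2*x + 1)
      // (a sketch of the idea; the pass decides the exact rewrite).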
if (IGC_IS_FLAG_ENABLED(EnableGVN)) {
mpm.add(llvm::createGVNPass());
}
//
mpm.add(createIntDivRemCombinePass());
}
if (IGC_IS_FLAG_ENABLED(EnableConstIntDivReduction)) {
      // reduce division/remainder with constant divisors/moduli to
      // more efficient sequences of multiplies, shifts, and adds
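      // Illustrative example (unsigned 32-bit, an assumption about the general
      // technique): x / 9 can be rewritten as a multiply by a precomputed
      // fixed-point reciprocal followed by a shift, roughly
      //   q = (uint64_t(x) * MAGIC) >> SHIFT;   // MAGIC/SHIFT chosen for 9
      // avoiding a hardware integer divide.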
mpm.add(createIntDivConstantReductionPass());
}
if (IGC_IS_FLAG_ENABLED(EnableIntDivRemIncrementReduction)) {
mpm.add(createIntDivRemIncrementReductionPass());
}
GFX_ONLY_PASS { mpm.add(createMergeMemFromBranchOptPass()); }
if (IGC_IS_FLAG_DISABLED(DisableLoadSinking) &&
!isOptDisabledForModule(pContext->getModuleMetaData(), IGCOpts::SinkLoadOptPass)) {
mpm.add(createSinkLoadOptPass());
}
mpm.add(createConstantMergePass());
GFX_ONLY_PASS { mpm.add(CreateMCSOptimization()); }
GFX_ONLY_PASS { mpm.add(CreateGatingSimilarSamples()); }
if (!IGC::ForceAlwaysInline(pContext)) {
mpm.add(new PurgeMetaDataUtils());
}
// mpm.add(llvm::createDeadCodeEliminationPass()); // this should be done both before/after constant propagation
if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_DISABLED(LateInlineUnmaskedFunc)) {
mpm.add(new InlineUnmaskedFunctionsPass());
}
if (pContext->m_instrTypes.numOfLoop) {
mpm.add(createDeadPHINodeEliminationPass());
}
if (IGC_IS_FLAG_ENABLED(EnableMadLoopSlice)) {
mpm.add(createMadLoopSlicePass());
}
if (IGC_IS_FLAG_ENABLED(EnableVectorizer)) {
mpm.add(new IGCVectorizer());
mpm.add(llvm::createAggressiveDCEPass());
if (IGC_IS_FLAG_ENABLED(VectorizerCheckScalarizer))
mpm.add(createScalarizerPass(SelectiveScalarizer::Auto));
}
mpm.run(*pContext->getModule());
} // end scope
COMPILER_TIME_END(pContext, TIME_OptimizationPasses);
// pContext->shaderEntry->viewCFG();
DumpLLVMIR(pContext, "optimized");
MEM_SNAPSHOT(IGC::SMS_AFTER_OPTIMIZER);
} // OptimizeIR
} // namespace IGC