intel-graphics-compiler/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp
/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2024 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "AdaptorOCL/MergeAllocasOCL.h"
#include "Compiler/Legalizer/PeepholeTypeLegalizer.hpp"
#include "Compiler/CISACodeGen/DropTargetBBs.hpp"
#include "Compiler/CISACodeGen/layout.hpp"
#include "Compiler/CISACodeGen/DeSSA.hpp"
#include "Compiler/CISACodeGen/GenCodeGenModule.h"
#include "Compiler/CISACodeGen/AdvCodeMotion.h"
#include "Compiler/CISACodeGen/RematAddressArithmetic.h"
#include "Compiler/CISACodeGen/VectorShuffleAnalysis.hpp"
#include "Compiler/CISACodeGen/IGCLivenessAnalysis.h"
#include "Compiler/CISACodeGen/IGCVectorizer.h"
#include "Compiler/CISACodeGen/AdvMemOpt.h"
#include "Compiler/CISACodeGen/Emu64OpsPass.h"
#include "Compiler/CISACodeGen/PushAnalysis.hpp"
#include "Compiler/CISACodeGen/ScalarizerCodeGen.hpp"
#include "Compiler/CISACodeGen/HoistCongruentPhi.hpp"
#include "Compiler/CISACodeGen/CodeScheduling.hpp"
#include "Compiler/CISACodeGen/CodeSinking.hpp"
#include "Compiler/CISACodeGen/AddressArithmeticSinking.hpp"
#include "Compiler/CISACodeGen/AtomicOptPass.hpp"
#include "Compiler/CISACodeGen/BlockMemOpAddrScalarizationPass.hpp"
#include "Compiler/CISACodeGen/SinkCommonOffsetFromGEP.h"
#include "Compiler/CISACodeGen/ConstantCoalescing.hpp"
#include "Compiler/CISACodeGen/CheckInstrTypes.hpp"
#include "Compiler/CISACodeGen/EstimateFunctionSize.h"
#include "Compiler/CISACodeGen/GenerateBlockMemOpsPass.hpp"
#include "Compiler/CISACodeGen/GenerateFrequencyData.hpp"
#include "Compiler/CISACodeGen/FixAddrSpaceCast.h"
#include "Compiler/CISACodeGen/FixupExtractValuePair.h"
#include "Compiler/CISACodeGen/GenIRLowering.h"
#include "Compiler/CISACodeGen/GenSimplification.h"
#include "Compiler/CISACodeGen/LoopDCE.h"
#include "Compiler/CISACodeGen/LdShrink.h"
#include "Compiler/CISACodeGen/MemOpt.h"
#include "Compiler/CISACodeGen/MemOpt2.h"
#include "Compiler/CISACodeGen/SplitLoads.h"
#include "Compiler/CISACodeGen/PreRARematFlag.h"
#include "Compiler/CISACodeGen/PromoteConstantStructs.hpp"
#include "Compiler/Optimizer/OpenCLPasses/Decompose2DBlockFuncs/Decompose2DBlockFuncs.hpp"
#include "Compiler/Optimizer/OpenCLPasses/GenericAddressResolution/GASResolving.h"
#include "Compiler/Optimizer/OpenCLPasses/PrivateMemory/LowerByValAttribute.hpp"
#include "Compiler/CISACodeGen/Simd32Profitability.hpp"
#include "Compiler/CISACodeGen/TimeStatsCounter.h"
#include "Compiler/CISACodeGen/TypeDemote.h"
#include "Compiler/CISACodeGen/UniformAssumptions.hpp"
#include "Compiler/CISACodeGen/ResourceLoopUnroll.hpp"
#include "Compiler/CISACodeGen/VectorProcess.hpp"
#include "Compiler/CISACodeGen/RuntimeValueLegalizationPass.h"
#include "Compiler/CISACodeGen/LowerGEPForPrivMem.hpp"
#include "Compiler/CISACodeGen/MatchCommonKernelPatterns.hpp"
#include "Compiler/CISACodeGen/POSH_RemoveNonPositionOutput.h"
#include "Compiler/CISACodeGen/RegisterEstimator.hpp"
#include "Compiler/CISACodeGen/RegisterPressureEstimate.hpp"
#include "Compiler/CISACodeGen/OpenCLKernelCodeGen.hpp"
#include "Compiler/CISACodeGen/RayTracingShaderLowering.hpp"
#include "Compiler/CISACodeGen/RayTracingStatefulPass.h"
#include "Compiler/CISACodeGen/LSCCacheOptimizationPass.h"
#include "Compiler/CISACodeGen/LSCControlsAnalysisPass.h"
#include "Compiler/ConvertMSAAPayloadTo16Bit.hpp"
#include "Compiler/MSAAInsertDiscard.hpp"
#include "Compiler/CISACodeGen/PromoteInt8Type.hpp"
#include "Compiler/CISACodeGen/PrepareLoadsStoresPass.h"
#include "Compiler/CISACodeGen/CallMergerPass.hpp"
#include "Compiler/CISACodeGen/EvaluateFreeze.hpp"
#include "Compiler/CISACodeGen/DpasScan.hpp"
#include "Compiler/CISACodeGen/FPRoundingModeCoalescing.hpp"
#include "Compiler/CISACodeGen/SLMConstProp.hpp"
#include "Compiler/Optimizer/OpenCLPasses/SplitStructurePhisPass/SplitStructurePhisPass.hpp"
#include "Compiler/Optimizer/OpenCLPasses/MergeScalarPhisPass/MergeScalarPhisPass.hpp"
#include "Compiler/Legalizer/AddRequiredMemoryFences.h"
#include "Compiler/Optimizer/OpenCLPasses/GenericAddressResolution/GenericAddressDynamicResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/GenericAddressResolution/GenericNullPtrPropagation.hpp"
#include "Compiler/Optimizer/OpenCLPasses/PrivateMemory/PrivateMemoryResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/PrivateMemory/PrivateMemoryToSLM.hpp"
#include "Compiler/Optimizer/OpenCLPasses/ProgramScopeConstants/ProgramScopeConstantResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/WIFuncs/WIFuncResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/BreakConstantExpr/BreakConstantExpr.hpp"
#include "Compiler/Optimizer/OpenCLPasses/ReplaceUnsupportedIntrinsics/ReplaceUnsupportedIntrinsics.hpp"
#include "Compiler/Optimizer/PreCompiledFuncImport.hpp"
#include "Compiler/Optimizer/OpenCLPasses/AddressSpaceAliasAnalysis/AddressSpaceAliasAnalysis.h"
#include "Compiler/Optimizer/OpenCLPasses/StatelessToStateful/StatelessToStateful.hpp"
#include "Compiler/Optimizer/OpenCLPasses/DisableLoopUnrollOnRetry/DisableLoopUnrollOnRetry.hpp"
#include "Compiler/Optimizer/OpenCLPasses/TransformUnmaskedFunctionsPass/TransformUnmaskedFunctionsPass.h"
#include "Compiler/Optimizer/OpenCLPasses/UnreachableHandling/UnreachableHandling.hpp"
#include "Compiler/Optimizer/OpenCLPasses/WIFuncs/WIFuncResolution.hpp"
#include "Compiler/Optimizer/OpenCLPasses/GEPLoopStrengthReduction/GEPLoopStrengthReduction.hpp"
#include "Compiler/Optimizer/OpenCLPasses/StackOverflowDetection/StackOverflowDetection.hpp"
#include "Compiler/Optimizer/OpenCLPasses/SubGroupReductionPattern/SubGroupReductionPattern.hpp"
#include "Compiler/Optimizer/MCSOptimization.hpp"
#include "Compiler/Optimizer/GatingSimilarSamples.hpp"
#include "Compiler/Optimizer/IntDivConstantReduction.hpp"
#include "Compiler/Optimizer/IntDivRemIncrementReduction.hpp"
#include "Compiler/Optimizer/IntDivRemCombine.hpp"
#include "Compiler/Optimizer/SynchronizationObjectCoalescing.hpp"
#include "Compiler/Optimizer/BarrierControlFlowOptimization.hpp"
#include "Compiler/Optimizer/RuntimeValueVectorExtractPass.h"
#include "Compiler/Optimizer/WaveShuffleIndexSinking.hpp"
#include "Compiler/Optimizer/WaveAllJointReduction.hpp"
#include "Compiler/Optimizer/InstructionHoistingOptimization.hpp"
#include "Compiler/Optimizer/WaveBallotCSE.hpp"
#include "Compiler/MetaDataApi/PurgeMetaDataUtils.hpp"
#include "Compiler/HandleLoadStoreInstructions.hpp"
#include "Compiler/CustomSafeOptPass.hpp"
#include "Compiler/CustomUnsafeOptPass.hpp"
#include "Compiler/CustomLoopOpt.hpp"
#include "Compiler/GenUpdateCB.h"
#include "Compiler/PromoteResourceToDirectAS.h"
#include "Compiler/PromoteStatelessToBindless.h"
#include "Compiler/ShrinkArrayAlloca.h"
#if defined(_DEBUG) && !defined(ANDROID)
#include "Compiler/VerificationPass.hpp"
#endif
#include "Compiler/FixInvalidFuncNamePass.hpp"
#include "Compiler/LegalizationPass.hpp"
#include "Compiler/LowPrecisionOptPass.hpp"
#include "Compiler/WorkaroundAnalysisPass.h"
#include "Compiler/MetaDataApi/MetaDataApi.h"
#include "Compiler/MetaDataUtilsWrapper.h"
#include "Compiler/CodeGenContextWrapper.hpp"
#include "Compiler/DynamicTextureFolding.h"
#include "Compiler/SampleMultiversioning.hpp"
#include "Compiler/InitializePasses.h"
#include "Compiler/GenRotate.hpp"
#include "Compiler/Optimizer/Scalarizer.h"
#include "Compiler/RemoveCodeAssumptions.hpp"
#include "common/igc_regkeys.hpp"
#include "common/debug/Dump.hpp"
#include "common/LLVMWarningsPush.hpp"
#include "llvm/Config/llvm-config.h"
#include <llvm/IR/DebugInfo.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Verifier.h>
#include <llvm/Analysis/Passes.h>
#include <llvm/Pass.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Scalar/GVN.h>
#include <llvm/IR/Function.h>
#include <llvm/Analysis/ScopedNoAliasAA.h>
#include <llvm/Analysis/TargetLibraryInfo.h>
#include <llvm/Support/ErrorHandling.h>
#include <llvm/Transforms/IPO/FunctionAttrs.h>
#include <llvm/Transforms/Utils.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Bitcode/BitcodeWriter.h>
#include "common/LLVMWarningsPop.hpp"
#include "Compiler/CISACodeGen/PatternMatchPass.hpp"
#include "Compiler/CISACodeGen/EmitVISAPass.hpp"
#include "Compiler/CISACodeGen/CoalescingEngine.hpp"
#include "Compiler/GenTTI.h"
#include "Compiler/GenRotate.hpp"
#include "Compiler/SampleCmpToDiscard.h"
#include "Compiler/Optimizer/IGCInstCombiner/IGCInstructionCombining.hpp"
#include "Compiler/Optimizer/HoistConvOpToDom.hpp"
#include "Compiler/Optimizer/PromoteToPredicatedMemoryAccess.hpp"
#include "AdaptorCommon/RayTracing/RayTracingPasses.hpp"
#include "AdaptorCommon/RayTracing/RayTracingAddressSpaceAliasAnalysis.h"
#include "AdaptorCommon/RayTracing/API/RayDispatchGlobalData.h"
#include "Compiler/SamplerPerfOptPass.hpp"
#include "Compiler/CISACodeGen/HalfPromotion.h"
#include "Compiler/CISACodeGen/CapLoopIterationsPass.h"
#include "Compiler/CISACodeGen/AnnotateUniformAllocas.h"
#include "Probe/Assertion.h"
#include "Compiler/CISACodeGen/PartialEmuI64OpsPass.h"
#include "Compiler/TranslateToProgrammableOffsetsPass.hpp"
#include "Compiler/CISACodeGen/RemoveLoopDependency.hpp"
#include <filesystem>
/***********************************************************************************
This file contains the generic code generation functions for all the shaders.
The class CShader is inherited by each specific type of shader to add
shader-specific information.
************************************************************************************/
using namespace llvm;
using namespace IGC;
using namespace IGC::IGCMD;
using namespace IGC::Debug;
namespace IGC {
const int LOOP_ROTATION_HEADER_INST_THRESHOLD = 32;
const int LOOP_NUM_THRESHOLD = 2000;
const int LOOP_INST_THRESHOLD = 65000;
const int INST_THRESHOLD = 80000;
void AddAnalysisPasses(CodeGenContext &ctx, IGCPassManager &mpm) {
COMPILER_TIME_START(&ctx, TIME_CG_Add_Analysis_Passes);
bool isOptDisabled = ctx.getModuleMetaData()->compOpt.OptDisable;
TODO("remove the following once all IGC passes are registered to PassRegistery in their constructor")
initializeLoopInfoWrapperPassPass(*PassRegistry::getPassRegistry());
initializeCastToGASInfoPass(*PassRegistry::getPassRegistry());
mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Analysis, STATS_COUNTER_START));
// transform pull constants and inputs into push constants and inputs
mpm.add(new PushAnalysis());
mpm.add(CreateSampleCmpToDiscardPass());
if (!isOptDisabled) {
mpm.add(llvm::createDeadCodeEliminationPass());
}
// The 1st thing we do when getting into the IGC middle end is to split critical-edges:
// PushAnalysis requires WIAnalysis
// WIAnalysis requires dominator and post-dominator analysis
// WIAnalysis also requires BreakCriticalEdge because it assumes that
// potential phi-moves will be placed at those blocks
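// (A critical edge is an edge from a block with multiple successors to a block
// with multiple predecessors; splitting it inserts a dedicated block, giving
// phi-related copies a well-defined place to land.)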
mpm.add(llvm::createBreakCriticalEdgesPass());
if (!isOptDisabled && IGC_IS_FLAG_DISABLED(DisableMemOpt2)) {
if (ctx.m_DriverInfo.WAEnableMemOpt2ForOCL())
mpm.add(createMemOpt2Pass(16));
}
if (!isOptDisabled) {
mpm.add(createSplitLoadsPass());
}
if (IGC_IS_FLAG_ENABLED(EnableScalarPhisMerger) && ctx.type == ShaderType::OPENCL_SHADER) {
mpm.add(new MergeScalarPhisPass());
}
// Only limited code-sinking is applied, and only to several shader types:
// VS input has the URB-reuse issue to be resolved, and we still need to
// understand the performance benefit better.
if (!isOptDisabled) {
mpm.add(new HoistCongruentPHI());
mpm.add(new CodeSinking());
if ((IGC_IS_FLAG_DISABLED(DisableLoopSink) || IGC_IS_FLAG_ENABLED(ForceLoopSink)) &&
ctx.type == ShaderType::OPENCL_SHADER && ctx.m_instrTypes.numOfLoop > 0 &&
ctx.m_instrTypes.numInsts >= IGC_GET_FLAG_VALUE(CodeLoopSinkingMinSize)) {
mpm.add(new CodeLoopSinking());
}
if (IGC_IS_FLAG_DISABLED(DisableCodeScheduling) && (ctx.type == ShaderType::OPENCL_SHADER) &&
(ctx.platform.isCoreChildOf(IGFX_XE_HPC_CORE) || ctx.platform.isCoreChildOf(IGFX_XE2_HPG_CORE))) {
if (IGC_IS_FLAG_DISABLED(CodeSchedulingOnlyRecompilation) || ctx.m_retryManager.AllowCodeScheduling()) {
mpm.add(new CodeScheduling());
}
}
}
// Run flag re-materialization if it's beneficial.
if (ctx.m_DriverInfo.benefitFromPreRARematFlag() && IGC_IS_FLAG_ENABLED(EnablePreRARematFlag)) {
mpm.add(createPreRARematFlagPass());
}
// Peephole framework for generic type legalization
mpm.add(new Legalizer::PeepholeTypeLegalizer());
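// Illustrative sketch (not the exact emitted IR): on targets without byte ALU
// support, the i8 promotion below widens i8 arithmetic, e.g.
//   %c = add i8 %a, %b
// becomes, roughly,
//   %a16 = sext i8 %a to i16
//   %b16 = sext i8 %b to i16
//   %c16 = add i16 %a16, %b16
//   %c   = trunc i16 %c16 to i8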
if (IGC_IS_FLAG_ENABLED(ForcePromoteI8) ||
(IGC_IS_FLAG_ENABLED(EnablePromoteI8) && !ctx.platform.supportByteALUOperation())) {
mpm.add(createPromoteInt8TypePass());
}
// Needed before WIAnalysis:
// insert phis to prevent later code motion from changing the WIAnalysis result
mpm.add(llvm::createLCSSAPass());
// Fixup extract value pairs.
mpm.add(createExtractValuePairFixupPass());
if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc)) {
mpm.add(new InlineUnmaskedFunctionsPass());
// Newly created memcpy intrinsics are lowered
mpm.add(createReplaceUnsupportedIntrinsicsPass());
// Split complex constant expression into 2 simple ones
mpm.add(new BreakConstantExpr());
// Expand newly created allocas
mpm.add(createSROAPass());
// Run legalization pass to expand non-supported instructions
// like shufflevector. The code below is just copied and
// pasted as is.
bool preserveNan = !ctx.getCompilerOption().NoNaNs;
mpm.add(new Legalization(preserveNan));
// Some clean up passes.
mpm.add(llvm::createEarlyCSEPass());
mpm.add(new BreakConstantExpr());
mpm.add(llvm::createCFGSimplificationPass());
mpm.add(createDeadCodeEliminationPass());
// Create functions groups after unmasked functions inlining
mpm.add(createGenXCodeGenModulePass());
// Allocate non-primitive allocas. This piece of code is copied
if (ctx.m_instrTypes.hasNonPrimitiveAlloca) {
mpm.add(createBreakCriticalEdgesPass());
mpm.add(createAnnotateUniformAllocasPass());
if (IGC_IS_FLAG_DISABLED(DisablePromotePrivMem) &&
!isOptDisabledForModule(ctx.getModuleMetaData(), IGCOpts::LowerGEPForPrivMemPass)) {
mpm.add(createPromotePrivateArrayToReg());
mpm.add(createCFGSimplificationPass());
}
}
mpm.add(createPromoteMemoryToRegisterPass());
if (IGC_IS_FLAG_DISABLED(DisableMergeAllocasPrivateMemory) && ctx.type == ShaderType::OPENCL_SHADER) {
mpm.add(createMergeAllocasOCL());
}
if (ctx.type == ShaderType::OPENCL_SHADER && !isOptDisabled && IGC_IS_FLAG_ENABLED(EnableExplicitCopyForByVal)) {
mpm.add(new LowerByValAttribute());
mpm.add(createReplaceUnsupportedIntrinsicsPass());
}
// Resolving private memory allocas
mpm.add(CreatePrivateMemoryResolution());
}
// Reorder FP instructions to minimize number of times rounding mode is switched.
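// For example (intent only): if round-toward-zero conversions are interleaved with
// default round-to-nearest arithmetic, grouping the RTZ operations together lets the
// emitter switch the rounding-mode state once per group rather than per instruction.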
mpm.add(createFPRoundingModeCoalescingPass());
// Expected to be the last ModulePass before EmitPass at this point.
// (Shall be after GenXCodeGenModulePass.)
//
// It uses CastToGASAnalysis and invalidates it by taking its result away.
// This means that after this point, no CastToGASAnalysis will be used,
// and the info should be accessed via CastToGASInfo immutable pass.
if (ctx.type == ShaderType::OPENCL_SHADER) {
mpm.add(new CastToGASInfoWrapper());
}
// Evaluates LLVM 10+ freeze instructions so EmitPass does not need to handle them.
// The pass first runs during optimization; however, new freeze instructions could
// have been inserted since then.
mpm.add(createEvaluateFreezePass());
// Clean up constant expressions after EarlyCSE
mpm.add(new BreakConstantExpr());
// This is for dumping register pressure info
if (IGC_IS_FLAG_ENABLED(ForceRPE)) {
mpm.add(new RegisterEstimator());
}
mpm.add(createFixInvalidFuncNamePass());
// collect stats after all the optimization. This info can be dumped to the cos file
mpm.add(new CheckInstrTypes(true, false));
if ((IGC_GET_FLAG_VALUE(StaticProfileGuidedSpillCostAnalysis) & FrequencyDataDS::PGSS_IGC_GEN) != 0) {
mpm.add(createGenerateFrequencyDataPass());
}
if (IGC_IS_FLAG_ENABLED(StackOverflowDetection)) {
mpm.add(new StackOverflowDetectionPass(StackOverflowDetectionPass::Mode::RemoveDummyCalls));
}
//
// Generally, passes that change IR should be prior to this place!
//
mpm.add(new DpasScan());
mpm.add(new MatchCommonKernelPatterns());
// let CleanPHINode be right before Layout
mpm.add(createCleanPHINodePass());
if (IGC_IS_FLAG_SET(DumpRegPressureEstimate))
mpm.add(new IGCRegisterPressurePrinter("final"));
// Let Layout be the last pass before Emit Pass
mpm.add(new Layout());
if (IGC_IS_FLAG_ENABLED(EnableDropTargetBBs)) {
mpm.add(new DropTargetBBs());
}
mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Analysis, STATS_COUNTER_END));
COMPILER_TIME_END(&ctx, TIME_CG_Add_Analysis_Passes);
} // AddAnalysisPasses
static void UpdateInstTypeHint(CodeGenContext &ctx) {
// WA: save original values as preRA heuristic is based on those
// we need to fix the preRA pass heuristic or get rid of preRA pass altogether
unsigned int numBB = ctx.m_instrTypes.numBB;
unsigned int numSample = ctx.m_instrTypes.numSample;
unsigned int numInsts = ctx.m_instrTypes.numInsts;
bool hasUnmaskedRegion = ctx.m_instrTypes.hasUnmaskedRegion;
IGCPassManager mpm(&ctx, "UpdateOptPre");
mpm.add(new CodeGenContextWrapper(&ctx));
mpm.add(new BreakConstantExpr());
mpm.add(new CheckInstrTypes(false, false));
mpm.run(*ctx.getModule());
ctx.m_instrTypes.numBB = numBB;
ctx.m_instrTypes.numSample = numSample;
ctx.m_instrTypes.numInsts = numInsts;
ctx.m_instrTypes.hasLoadStore = true;
ctx.m_instrTypes.hasUnmaskedRegion = hasUnmaskedRegion;
}
// forward declaration
llvm::ModulePass *createPruneUnusedArgumentsPass();
void AddLegalizationPasses(CodeGenContext &ctx, IGCPassManager &mpm, PSSignature *pSignature) {
COMPILER_TIME_START(&ctx, TIME_CG_Add_Legalization_Passes);
mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Legalization, STATS_COUNTER_START));
// update type of instructions to know what passes are needed.
UpdateInstTypeHint(ctx);
// check again after full inlining if subroutines are still present
ctx.CheckEnableSubroutine(*ctx.getModule());
MetaDataUtils *pMdUtils = ctx.getMetaDataUtils();
bool isOptDisabled = ctx.getModuleMetaData()->compOpt.OptDisable;
bool fastCompile = ctx.getModuleMetaData()->compOpt.FastCompilation;
bool highAllocaPressure = ctx.m_instrTypes.numAllocaInsts > IGC_GET_FLAG_VALUE(AllocaRAPressureThreshold);
bool isPotentialHPCKernel = (ctx.m_instrTypes.numInsts > IGC_GET_FLAG_VALUE(HPCInstNumThreshold)) ||
(ctx.m_instrTypes.numGlobalInsts > IGC_GET_FLAG_VALUE(HPCGlobalInstNumThreshold)) ||
IGC_GET_FLAG_VALUE(HPCFastCompilation);
highAllocaPressure = IGC_GET_FLAG_VALUE(DisableFastRAWA) ? false : highAllocaPressure;
isPotentialHPCKernel = IGC_GET_FLAG_VALUE(DisableFastRAWA) ? false : isPotentialHPCKernel;
if (highAllocaPressure || isPotentialHPCKernel) {
IGC_SET_FLAG_VALUE(FastCompileRA, 1);
IGC_SET_FLAG_VALUE(HybridRAWithSpill, 1);
}
// If Unmasked regions are present, disable loop-invariant code motion after
// Unmasked functions are inlined at the end of the optimization phase.
if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_DISABLED(LateInlineUnmaskedFunc) &&
ctx.m_instrTypes.hasUnmaskedRegion) {
IGC_SET_FLAG_VALUE(allowLICM, false);
}
bool disableConvergentInstructionsHoisting =
ctx.m_DriverInfo.DisableConvergentInstructionsHoisting() && ctx.m_instrTypes.numWaveIntrinsics > 0;
if (disableConvergentInstructionsHoisting || IGC_IS_FLAG_ENABLED(ForceAllPrivateMemoryToSLM) ||
IGC_IS_FLAG_ENABLED(ForcePrivateMemoryToSLMOnBuffers)) {
TargetIRAnalysis GenTTgetIIRAnalysis([&](const Function &F) {
GenIntrinsicsTTIImpl GTTI(&ctx);
return TargetTransformInfo(GTTI);
});
mpm.add(new TargetTransformInfoWrapperPass(std::move(GenTTgetIIRAnalysis)));
}
// Disable all target library functions.
// Right now we don't support any standard functions in codegen;
// maybe we will want to support some at some point to take advantage of LLVM optimizations.
TargetLibraryInfoImpl TLI;
TLI.disableAllFunctions();
mpm.add(new llvm::TargetLibraryInfoWrapperPass(TLI));
// Add Metadata API immutable pass
mpm.add(new MetaDataUtilsWrapper(pMdUtils, ctx.getModuleMetaData()));
// Add CodeGen Context Wrapper immutable pass
mpm.add(new CodeGenContextWrapper(&ctx));
// Add alias analysis pass
mpm.add(createAddressSpaceAAWrapperPass());
if (ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) {
if (IGC_IS_FLAG_DISABLED(DisableRTAliasAnalysis))
mpm.add(createRayTracingAddressSpaceAAWrapperPass());
}
mpm.add(createIGCExternalAAWrapper());
mpm.add(createScopedNoAliasAAWrapperPass());
TODO("remove the following once all IGC passes are registered to PassRegistery in their constructor")
initializeWIAnalysisPass(*PassRegistry::getPassRegistry());
initializeSimd32ProfitabilityAnalysisPass(*PassRegistry::getPassRegistry());
initializeGenXFunctionGroupAnalysisPass(*PassRegistry::getPassRegistry());
if (ctx.m_threadCombiningOptDone) {
mpm.add(createLoopCanonicalization());
mpm.add(llvm::createLoopDeletionPass());
mpm.add(llvm::createBreakCriticalEdgesPass());
mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));
mpm.add(llvm::createLowerSwitchPass());
int LoopUnrollThreshold = ctx.m_DriverInfo.GetLoopUnrollThreshold();
if (LoopUnrollThreshold > 0 && (ctx.m_tempCount < 64)) {
mpm.add(llvm::createLoopUnrollPass(2, false, false, LoopUnrollThreshold, -1, 1, -1, -1, -1));
}
mpm.add(createBarrierNoopPass());
if (IGC_IS_FLAG_ENABLED(allowLICM) && ctx.m_retryManager.AllowLICM()) {
mpm.add(createSpecialCasesDisableLICM());
mpm.add(llvm::createLICMPass(100, 500, true));
}
mpm.add(llvm::createLoopSimplifyPass());
}
// Lower/Resolve OCL inlined constants.
if (ctx.m_DriverInfo.NeedLoweringInlinedConstants()) {
// Run additional constant breaking which is assumed by the constant
// resolver.
mpm.add(new BreakConstantExpr());
mpm.add(new ProgramScopeConstantResolution());
}
// This is the condition under which double (FP64) emulation is used.
ctx.checkDPEmulationEnabled();
bool hasDPDivSqrtEmu =
!ctx.platform.hasNoFP64Inst() && !ctx.platform.hasCorrectlyRoundedMacros() && ctx.m_DriverInfo.NeedFP64DivSqrt();
uint32_t theEmuKind = (ctx.m_hasDPEmu ? EmuKind::EMU_DP : 0);
theEmuKind |= (hasDPDivSqrtEmu ? EmuKind::EMU_DP_DIV_SQRT : 0);
theEmuKind |= (ctx.m_hasDPConvEmu ? EmuKind::EMU_DP_CONV : 0);
theEmuKind |= (ctx.m_DriverInfo.NeedI64BitDivRem() ? EmuKind::EMU_I64DIVREM : 0);
theEmuKind |= (ctx.m_DriverInfo.NeedFP64toFP16Conv() && IGC_IS_FLAG_DISABLED(ForceDisableDPToHFConvEmu)
? EmuKind::EMU_FP64_FP16_CONV
: 0);
theEmuKind |= ((IGC_IS_FLAG_ENABLED(ForceSPDivEmulation) ||
(ctx.m_DriverInfo.NeedIEEESPDiv() && !ctx.platform.hasCorrectlyRoundedMacros()))
? EmuKind::EMU_SP_DIV
: 0);
if (ctx.platform.preferFP32IntDivRemEmu() && IGC_IS_FLAG_DISABLED(Force32BitIntDivRemEmu)) {
// Prefer using FP32 emulation even though DP support is available
theEmuKind |= EmuKind::EMU_I32DIVREM_SP;
} else if (!ctx.platform.hasNoFP64Inst() &&
(IGC_IS_FLAG_ENABLED(Force32BitIntDivRemEmu) || ctx.getCompilerOption().ForceInt32DivRemEmu ||
(ctx.platform.Enable32BitIntDivRemEmu() && !ctx.getCompilerOption().ForceInt32DivRemEmuSP &&
IGC_IS_FLAG_DISABLED(Force32BitIntDivRemEmuSP)))) {
// Use DP (and float) operations to emulate int32 div/rem
theEmuKind |= EmuKind::EMU_I32DIVREM;
} else if (ctx.platform.Enable32BitIntDivRemEmu() || ctx.getCompilerOption().ForceInt32DivRemEmuSP ||
IGC_IS_FLAG_ENABLED(Force32BitIntDivRemEmuSP)) {
// Use SP floating operations to emulate int32 div/rem
theEmuKind |= EmuKind::EMU_I32DIVREM_SP;
}
if (IGC_IS_FLAG_ENABLED(RayTracingKeepUDivRemWA)) {
theEmuKind &= ~EmuKind::EMU_I32DIVREM;
theEmuKind &= ~EmuKind::EMU_I32DIVREM_SP;
}
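// Illustrative composition of the mask (hypothetical configuration): on a platform
// with no native FP64 whose driver also requests 64-bit integer div/rem emulation,
// theEmuKind could end up as (EMU_DP | EMU_I64DIVREM), and PreCompiledFuncImport
// below would then link in the precompiled builtins for both kinds of emulation.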
if (theEmuKind > 0 || IGC_IS_FLAG_ENABLED(EnableTestIGCBuiltin)) {
// Need to break constant expr as PreCompiledFuncImport does not handle it.
mpm.add(new BreakConstantExpr());
mpm.add(new PreCompiledFuncImport(&ctx, theEmuKind));
mpm.add(createAlwaysInlinerLegacyPass());
// Using DCE here as AlwaysInliner does not completely remove dead functions.
// Once AlwaysInliner can delete all of them, this DCE is no longer needed.
// mpm.add(createDeadCodeEliminationPass());
//
// DCE doesn't remove dead control flow; ADCE does (currently).
// Otherwise you'd have to call createCFGSimplificationPass and DCE
// iteratively.
mpm.add(llvm::createAggressiveDCEPass());
// TODO: we probably should be running other passes on the result
if (!IGC::ForceAlwaysInline(&ctx)) {
mpm.add(new PurgeMetaDataUtils());
}
}
// Find rotate pattern.
// Invoked after DP emulation so that it'd handle emulation functions.
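// A typical pattern this matches (illustrative): "(x << n) | (x >> (32 - n))",
// which is rewritten into a single rotate-left operation when the platform
// supports rotate instructions.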
if (ctx.platform.supportRotateInstruction()) {
mpm.add(createGenRotatePass());
}
mpm.add(createReplaceUnsupportedIntrinsicsPass());
if (IGC_IS_FLAG_DISABLED(DisablePromoteToDirectAS) && !ctx.getModuleMetaData()->compOpt.IsLibraryCompilation) {
// Promotes indirect resource access to direct
mpm.add(new BreakConstantExpr());
mpm.add(new PromoteResourceToDirectAS());
}
if (!isOptDisabled) {
mpm.add(createPruneUnusedArgumentsPass());
}
if (ctx.m_instrTypes.hasReadOnlyArray) {
mpm.add(createDeadCodeEliminationPass());
mpm.add(createSROAPass());
}
if (ctx.m_instrTypes.hasGenericAddressSpacePointers) {
if (IGC_IS_FLAG_ENABLED(EnableGASResolver)) {
mpm.add(createSROAPass());
mpm.add(createFixAddrSpaceCastPass());
mpm.add(createResolveGASPass());
}
mpm.add(createGenericAddressDynamicResolutionPass());
mpm.add(createDeadCodeEliminationPass());
mpm.add(createGenericNullPtrPropagationPass());
}
// Promote private memory to registers
if (!isOptDisabled) {
// In case of late inlining of Unmasked functions, allocate non-primitive
// Allocas after inlining is done. Otherwise there is a possibility that
// RegAlloc cannot allocate registers for all virtual registers. This piece
// of code is copied at the place where inlining is done.
if (ctx.m_instrTypes.hasNonPrimitiveAlloca &&
!(IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc))) {
mpm.add(createBreakCriticalEdgesPass());
mpm.add(createAnnotateUniformAllocasPass());
if (IGC_IS_FLAG_DISABLED(DisablePromotePrivMem) &&
!isOptDisabledForModule(ctx.getModuleMetaData(), IGCOpts::LowerGEPForPrivMemPass)) {
mpm.add(createPromotePrivateArrayToReg());
mpm.add(createCFGSimplificationPass());
}
}
mpm.add(createPromoteMemoryToRegisterPass());
} else {
if (IGC_IS_FLAG_ENABLED(AllowMem2Reg))
mpm.add(createPromoteMemoryToRegisterPass());
}
if (ctx.type == ShaderType::OPENCL_SHADER || ctx.type == ShaderType::COMPUTE_SHADER) {
if (IGC_IS_FLAG_ENABLED(ForceAllPrivateMemoryToSLM)) {
mpm.add(new PrivateMemoryToSLM(IGC_IS_FLAG_ENABLED(EnableOptReportPrivateMemoryToSLM)));
mpm.add(createInferAddressSpacesPass());
} else if (IGC_IS_FLAG_ENABLED(ForcePrivateMemoryToSLMOnBuffers)) {
std::string forcedBuffers(IGC_GET_REGKEYSTRING(ForcePrivateMemoryToSLMOnBuffers));
mpm.add(new PrivateMemoryToSLM(std::move(forcedBuffers), IGC_IS_FLAG_ENABLED(EnableOptReportPrivateMemoryToSLM)));
mpm.add(createInferAddressSpacesPass());
}
}
if (ctx.m_instrTypes.numOfLoop) {
// need to run loop simplify to canonicalize loop and merge latches
mpm.add(createLoopCanonicalization());
mpm.add(createLoopSimplifyPass());
if (!IGC_IS_FLAG_ENABLED(DisableLoopSplitWidePHIs))
mpm.add(createLoopSplitWidePHIs());
}
if (IGC_IS_FLAG_ENABLED(StackOverflowDetection)) {
// Cleanup stack overflow detection calls if necessary.
mpm.add(new StackOverflowDetectionPass(StackOverflowDetectionPass::Mode::AnalyzeAndCleanup));
}
if (ctx.enableFunctionCall() || ctx.type == ShaderType::RAYTRACING_SHADER) {
// Sort functions if subroutine/indirect fcall is enabled.
mpm.add(llvm::createGlobalDCEPass());
mpm.add(new PurgeMetaDataUtils());
mpm.add(createGenXCodeGenModulePass());
}
// Remove all uses of implicit arg intrinsics after inlining by lowering them to kernel args
mpm.add(new LowerImplicitArgIntrinsics());
// Resolving private memory allocas
// In case of late inlining of Unmasked functions, postpone memory
// resolution until inlining is done, as new Allocas are created
// during inlining.
if (!(IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc))) {
if (IGC_IS_FLAG_DISABLED(DisableMergeAllocasPrivateMemory) && ctx.type == ShaderType::OPENCL_SHADER) {
mpm.add(createMergeAllocasOCL());
}
if (ctx.type == ShaderType::OPENCL_SHADER && !isOptDisabled && IGC_IS_FLAG_ENABLED(EnableExplicitCopyForByVal)) {
mpm.add(new LowerByValAttribute());
mpm.add(createReplaceUnsupportedIntrinsicsPass());
}
mpm.add(CreatePrivateMemoryResolution());
}
// Should help MemOpt pass to merge more loads
mpm.add(createSinkCommonOffsetFromGEPPass());
// Run MemOpt
if (!isOptDisabled && ctx.m_instrTypes.hasLoadStore && IGC_IS_FLAG_DISABLED(DisableMemOpt) &&
!ctx.getModuleMetaData()->disableMemOptforNegativeOffsetLoads) {
if ((ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) &&
IGC_IS_FLAG_DISABLED(DisablePrepareLoadsStores)) {
mpm.add(createPrepareLoadsStoresPass());
}
// run AdvMemOpt and MemOpt back-to-back so that we only
// need to run WIAnalysis once
if (IGC_IS_FLAG_ENABLED(EnableAdvMemOpt))
mpm.add(createAdvMemOptPass());
if (doLdStCombine(&ctx)) {
// Once it is stable, 64-bit stores/loads will no longer be split.
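// Illustrative example (assuming the pass proves the accesses contiguous and
// mergeable): two adjacent dword stores
//   store i32 %a, ptr %p         ; offset 0
//   store i32 %b, ptr %p.plus4   ; offset 4
// can be combined into a single wider (64-bit or vector) store.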
mpm.add(createLdStCombinePass());
}
bool AllowNegativeSymPtrsForLoad =
ctx.type == ShaderType::OPENCL_SHADER;
bool AllowVector8LoadStore =
IGC_IS_FLAG_ENABLED(EnableVector8LoadStore) ||
((ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) && ctx.platform.supports8DWLSCMessage());
mpm.add(createMemOptPass(AllowNegativeSymPtrsForLoad, AllowVector8LoadStore));
if ((ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) &&
IGC_IS_FLAG_ENABLED(EnableLSCCacheOptimization)) {
// Optimize store instructions for utilizing the LSC-L1 cache.
// This only runs for shaders with raytracing functionality.
mpm.add(createLSCCacheOptimizationPass());
}
mpm.add(createIGCInstructionCombiningPass());
}
if (ctx.hasSyncRTCalls()) {
mpm.add(createRaytracingStatefulPass());
}
if (ctx.type == ShaderType::OPENCL_SHADER &&
static_cast<OpenCLProgramContext &>(ctx).m_InternalOptions.PromoteStatelessToBindless) {
if (static_cast<OpenCLProgramContext &>(ctx).m_InternalOptions.UseBindlessLegacyMode) {
mpm.add(new PromoteStatelessToBindless());
} else if (!ctx.getModuleMetaData()->compOpt.GreaterThan4GBBufferRequired && !isOptDisabled) {
// Advanced bindless mode used by the regular OpenCL compilation path
mpm.add(new StatelessToStateful(TargetAddressing::BINDLESS));
}
}
if (!isOptDisabled && ctx.useStatelessToStateful()) {
mpm.add(new StatelessToStateful(TargetAddressing::BINDFUL));
}
// Light cleanup for subroutines after cloning. Note that the constant
// propagation order is reversed, compared to the opt sequence in
// OptimizeIR. There is a substantial gain with CFG simplification after
// interprocedural constant propagation.
if (ctx.m_enableSubroutine && !isOptDisabled) {
mpm.add(createPruneUnusedArgumentsPass());
const bool allowIPConstProp = !ctx.m_hasStackCalls && IGC_IS_FLAG_DISABLED(DisableIPConstantPropagation);
if (allowIPConstProp) {
mpm.add(createIPSCCPPass());
}
mpm.add(createDeadCodeEliminationPass());
mpm.add(createCFGSimplificationPass());
}
// Since we don't support switch statements, switch lowering is needed after the last CFG simplification
mpm.add(llvm::createLowerSwitchPass());
// This pass can create constant expression
if (ctx.m_DriverInfo.HasDoubleLoadStore()) {
mpm.add(new HandleLoadStoreInstructions());
}
// Split big vector & 3-element load/store, etc.
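// For instance (hedged sketch, not the exact split): a <3 x float> load may be
// broken into a <2 x float> load plus a scalar float load, and very wide vectors
// are split into legal-sized pieces before further lowering.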
mpm.add(createVectorPreProcessPass());
// Create Gen IR lowering.
// To replace SLM pointers if they are constants, break constant expressions first.
if (ctx.m_instrTypes.hasLocalLoadStore) {
mpm.add(new BreakConstantExpr());
}
bool KeepGEPs;
// In case of late inlining of Unmasked functions, postpone memory
// resolution until inlining is done, as new Allocas are created
// during inlining.
if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_ENABLED(LateInlineUnmaskedFunc)) {
KeepGEPs = true;
} else {
KeepGEPs = false;
}
mpm.add(createGenIRLowerPass());
if (KeepGEPs) {
mpm.add(createSeparateConstOffsetFromGEPPass());
} else {
// Also break and lower GEP constexpr.
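// Illustrative effect of GEP lowering: "getelementptr float, ptr %base, i32 %i"
// becomes explicit address arithmetic, roughly %addr = %base + (%i * 4), so later
// passes and the emitter work on plain integer address computations.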
mpm.add(new BreakConstantExpr());
mpm.add(createGEPLoweringPass());
}
mpm.add(new WorkaroundAnalysis());
if (!isOptDisabled) {
// Removing code assumptions can enable some InstructionCombining optimizations.
// Last instruction combining pass needs to be before Legalization pass, as it can produce illegal instructions.
mpm.add(new RemoveCodeAssumptions());
mpm.add(createIGCInstructionCombiningPass());
if (ctx.platform.doIntegerMad() && ctx.m_DriverInfo.EnableIntegerMad()) {
mpm.add(createCanonicalizeMulAddPass());
}
mpm.add(new GenSpecificPattern());
// Cases with DPDivSqrtEmu grow significantly.
// We can disable EarlyCSE when m_hasDPDivSqrtEmu is true,
// which causes the values to have shorter lifetimes so we can avoid spills.
if (!fastCompile && !highAllocaPressure && !isPotentialHPCKernel && !ctx.m_hasDPDivSqrtEmu) {
mpm.add(createEarlyCSEPass());
} else if (highAllocaPressure || isPotentialHPCKernel) {
mpm.add(createSinkingPass());
}
if (!fastCompile && !highAllocaPressure && !isPotentialHPCKernel && IGC_IS_FLAG_ENABLED(allowLICM) &&
ctx.m_retryManager.AllowLICM()) {
mpm.add(createSpecialCasesDisableLICM());
mpm.add(llvm::createLICMPass(100, 500, true));
mpm.add(llvm::createEarlyCSEPass());
}
mpm.add(createAggressiveDCEPass());
// As the DPC++ FE applies LICM, in some cases we cannot reduce register pressure
// just by turning off LICM in IGC, so apply address arithmetic sinking as well.
if ((IGC_IS_FLAG_ENABLED(ForceAddressArithSinking) ||
!isOptDisabledForModule(ctx.getModuleMetaData(), IGCOpts::AddressArithmeticSinkingPass)) &&
ctx.type == ShaderType::OPENCL_SHADER) {
mpm.add(new AddressArithmeticSinking());
}
}
// Enabling half promotion AIL for compute shaders only at this point.
// If needed, the ctx.type check can be removed to apply it to all shader types.
if (IGC_IS_FLAG_ENABLED(ForceHalfPromotion) ||
(ctx.getModuleMetaData()->compOpt.WaForceHalfPromotionComputeShader && ctx.type == ShaderType::COMPUTE_SHADER) ||
(ctx.getModuleMetaData()->compOpt.WaForceHalfPromotionPixelVertexShader &&
(ctx.type == ShaderType::PIXEL_SHADER || ctx.type == ShaderType::VERTEX_SHADER)) ||
(!ctx.platform.supportFP16() && IGC_IS_FLAG_ENABLED(EnableHalfPromotion))) {
mpm.add(new HalfPromotion());
if (IGC_IS_FLAG_ENABLED(EnableGVN)) {
mpm.add(createGVNPass());
}
mpm.add(createDeadCodeEliminationPass());
}
if (IGC_IS_FLAG_ENABLED(ForceNoInfiniteLoops)) {
mpm.add(createLoopSimplifyPass());
mpm.add(new CapLoopIterations(UINT_MAX));
}
// Run address remat after GVN as it may hoist address calculations and
// create PHI nodes with addresses.
if (IGC_IS_FLAG_ENABLED(RematEnable) ||
(ctx.m_retryManager.AllowCloneAddressArithmetic() && ctx.type == ShaderType::OPENCL_SHADER)) {
if (IGC_GET_FLAG_VALUE(RematInstCombineBefore))
mpm.add(createIGCInstructionCombiningPass());
// TODO: This is a workaround that helps to reduce the number of instructions for clone address arithmetic.
// It helps with chains of instructions like this:
// %remat12 = add i64 %baseArith, 100780848
// %remat13 = add i64 %remat12, %basePtr
// %remat14 = add i64 %remat13, %offsetI
// %remat15 = add i64 %remat14, %offsetJ
// load ...
// ....
// %remat21 = add i64 %baseArith, 201561696
// %remat22 = add i64 %remat21, %basePtr
// %remat23 = add i64 %remat22, %offsetI
// %remat24 = add i64 %remat23, %offsetJ
// load ...
// we can compress this chain of instructions into one "add" for each "load"
// this is achieved by combining reassoc + cse 3 times (each pair hoists one add)
// it should be replaced by a general pass when one is implemented
//
// For now it's accessible through a flag, for testing purposes
if (IGC_GET_FLAG_VALUE(RematReassocBefore)) {
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
}
mpm.add(createCloneAddressArithmeticPass());
// CloneAddressArithmetic leaves the old instructions unused;
// the DCE pass helps to clean that up
mpm.add(createDeadCodeEliminationPass());
if (IGC_IS_FLAG_SET(DumpRegPressureEstimate))
mpm.add(new IGCRegisterPressurePrinter("after_remat"));
} else if (ctx.m_retryManager.AllowCloneAddressArithmetic() && IGC_GET_FLAG_VALUE(RematOptionsForRetry) ||
ctx.platform.supportsVRT() && IGC_GET_FLAG_VALUE(RematOptionsForVRT)) {
if (IGC_GET_FLAG_VALUE(RematInstCombineBefore))
mpm.add(createIGCInstructionCombiningPass());
// see comment above
if (IGC_GET_FLAG_VALUE(RematReassocBefore)) {
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
mpm.add(llvm::createReassociatePass());
mpm.add(llvm::createEarlyCSEPass());
}
// if both retry and VRT checks go through, retry is more important
auto rematOptions = ctx.m_retryManager.AllowCloneAddressArithmetic() && IGC_GET_FLAG_VALUE(RematOptionsForRetry)
? static_cast<IGC::REMAT_OPTIONS>(IGC_GET_FLAG_VALUE(RematOptionsForRetry))
: static_cast<IGC::REMAT_OPTIONS>(IGC_GET_FLAG_VALUE(RematOptionsForVRT));
mpm.add(createCloneAddressArithmeticPassWithFlags(rematOptions));
// CloneAddressArithmetic leaves the old instructions unused;
// the DCE pass helps to clean that up
mpm.add(createDeadCodeEliminationPass());
if (IGC_IS_FLAG_SET(DumpRegPressureEstimate))
mpm.add(new IGCRegisterPressurePrinter("after_remat"));
}
mpm.add(createRematAddressArithmeticPass());
// Run type demotion if it's beneficial.
if (ctx.m_DriverInfo.benefitFromTypeDemotion() && IGC_IS_FLAG_ENABLED(EnableTypeDemotion)) {
mpm.add(createTypeDemotePass());
}
// Do GenX strength reduction (do things like fdiv -> inv + mul)
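// Illustrative example: "%q = fdiv float %a, %b" becomes a reciprocal of %b
// followed by "%q = fmul float %a, %inv", when precision requirements allow it.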
if (!isOptDisabled) {
mpm.add(createGenStrengthReductionPass());
mpm.add(createVectorBitCastOptPass());
}
bool forceUniformSurfaceSampler = ctx.getModuleMetaData()->compOpt.ForceUniformSurfaceSampler;
bool forceUniformBuffer = ctx.getModuleMetaData()->compOpt.ForceUniformBuffer;
if (ctx.m_instrTypes.hasUniformAssumptions || IGC_IS_FLAG_ENABLED(ForceUniformSurfaceSampler) ||
forceUniformSurfaceSampler || IGC_IS_FLAG_ENABLED(ForceUniformBuffer) || forceUniformBuffer) {
mpm.add(new UniformAssumptions(IGC_IS_FLAG_ENABLED(ForceUniformSurfaceSampler) || forceUniformSurfaceSampler,
IGC_IS_FLAG_ENABLED(ForceUniformBuffer) || forceUniformBuffer));
}
// NanHandlingPass needs to be before Legalization since it might make
// some changes and require Legalization to "legalize" them
if (IGC_IS_FLAG_DISABLED(DisableBranchSwaping) && ctx.m_DriverInfo.BranchSwapping()) {
mpm.add(createNanHandlingPass());
}
// TODO: move to use instruction flags
// to figure out if we need to preserve Nan
bool preserveNan = !ctx.getCompilerOption().NoNaNs;
// Legalizer does not handle constant expressions
mpm.add(new BreakConstantExpr());
mpm.add(new Legalization(preserveNan));
// Scalarizer in codegen to handle the vector instructions
mpm.add(new ScalarizerCodeGen());
// Coalesce scalar loads into loads of a larger quantity.
// This requires and preserves uniform analysis; we should keep other passes
// that use uniformness together to avoid re-running it several times.
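// Illustrative example: several scalar loads from consecutive constant-buffer
// offsets (say offsets 0, 4, 8 and 12 of the same buffer) can be coalesced into
// one wider load whose elements are then extracted, assuming uniformity and
// alignment permit it.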
if (IGC_IS_FLAG_DISABLED(DisableConstantCoalescing) && !ctx.getModuleMetaData()->compOpt.DisableConstantCoalescing &&
!isOptDisabledForModule(ctx.getModuleMetaData(), IGCOpts::ConstantCoalescingPass)) {
mpm.add(createBreakCriticalEdgesPass());
mpm.add(new ConstantCoalescing());
}
if (ctx.type == ShaderType::RAYTRACING_SHADER || ctx.hasSyncRTCalls()) {
if (IGC_IS_FLAG_DISABLED(DisableLSCControlsForRayTracing))
mpm.add(CreateLSCControlsAnalysisPass());
// We do raytracing lowering a little earlier than the others here
// to take advantage of the instruction simplification below.
mpm.add(CreateRayTracingShaderLowering());
}
// Instruction combining may merge instructions back into unsupported intrinsics.
// Therefore the last ReplaceUnsupportedIntrinsics pass must run after the last
// instruction combining pass.
// ReplaceUnsupportedIntrinsics may generate new 64-bit operations.
// Therefore the last 64-bit emulation pass must run after the last ReplaceUnsupportedIntrinsics pass.
mpm.add(createReplaceUnsupportedIntrinsicsPass());
if (!ctx.platform.hasFP32GlobalAtomicAdd()) {
mpm.add(new AtomicOptPass());
}
// When m_hasDPEmu is true, enable Emu64Ops as well for now until
// DPEmu is able to get rid of all 64bit integer ops fully.
if ((ctx.m_hasDPEmu && IGC_IS_FLAG_ENABLED(DPEmuNeedI64Emu)) ||
(ctx.m_DriverInfo.Enable64BitEmu() &&
(IGC_GET_FLAG_VALUE(Enable64BitEmulation) ||
(IGC_GET_FLAG_VALUE(Enable64BitEmulationOnSelectedPlatform) && ctx.platform.need64BitEmulation()))) ||
ctx.platform.hasPartialInt64Support()) {
mpm.add(new BreakConstantExpr());
// Emu64OpsPass requires that we are working on legal types, specifically
// that i128 uses are expanded to i64. This is why we need to run PeepholeTypeLegalizer
// beforehand.
mpm.add(new Legalizer::PeepholeTypeLegalizer());
// Lower all GEPs now as Emu64 doesn't know how to handle them.
if (KeepGEPs) {
mpm.add(createGEPLoweringPass());
mpm.add(llvm::createEarlyCSEPass());
}
// Run dead code elimination pass right before Emu64OpsPass,
// as legalization passes do not always clear unused (operating
// on illegal types) instructions.
mpm.add(llvm::createDeadCodeEliminationPass());
if (ctx.type == ShaderType::OPENCL_SHADER && IGC_IS_FLAG_ENABLED(EnableKernelCostInfo)) {
mpm.add(createLoopCountAnalysisPass());
}
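// Illustrative sketch of the emulation: an i64 add is expanded into two i32 adds
// plus a carry, and other i64 operations (mul, shifts, compares) become sequences
// of i32 operations; the partial variant below is meant to expand only the subset
// the platform lacks natively.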
if (ctx.platform.hasPartialEmuI64Enabled()) {
mpm.add(createPartialEmuI64OpsPass());
} else {
mpm.add(createEmu64OpsPass());
}
ctx.m_hasEmu64BitInsts = true;
if (!isOptDisabled) {
mpm.add(new GenSpecificPattern());
}
}
if (ctx.m_instrTypes.hasRuntimeValueVector) {
// Legalize RuntimeValue calls for push analysis
mpm.add(new RuntimeValueLegalizationPass());
}
if ((ctx.m_instrTypes.hasLocalLoadStore || ctx.m_instrTypes.hasLocalAtomics) && ctx.platform.hasLSC() &&
!ctx.platform.NeedsLSCFenceUGMBeforeEOT() && // VISA will add the fence
IGC_IS_FLAG_DISABLED(DisableAddRequiredMemoryFencesPass)) {
mpm.add(createAddRequiredMemoryFencesPass());
}
mpm.add(createInstSimplifyLegacyPass());
// This pass inserts bitcasts for vector loads/stores.
// This pass could be moved further toward EmitPass.
mpm.add(createVectorProcessPass());
// Handle constant expressions created by the VectorProcess pass
mpm.add(new BreakConstantExpr());
mpm.add(new LowPrecisionOpt());
mpm.add(new WAFMinFMax());
mpm.add(createTimeStatsCounterPass(&ctx, TIME_CG_Legalization, STATS_COUNTER_END));
COMPILER_TIME_END(&ctx, TIME_CG_Add_Legalization_Passes);
} // AddLegalizationPasses
void AddCodeGenPasses(CodeGenContext &ctx, CShaderProgram::KernelShaderMap &shaders, IGCPassManager &Passes,
SIMDMode simdMode, bool canAbortOnSpill, ShaderDispatchMode shaderMode, PSSignature *pSignature) {
// Generate CISA
COMPILER_TIME_START(&ctx, TIME_CG_Add_CodeGen_Passes);
Passes.add(new EmitPass(shaders, simdMode, canAbortOnSpill, shaderMode, pSignature));
COMPILER_TIME_END(&ctx, TIME_CG_Add_CodeGen_Passes);
}
// check based on performance measures.
bool SimdEarlyCheck(CodeGenContext *ctx) {
if (ctx->m_sampler < 11 || ctx->m_inputCount < 16 || ctx->m_tempCount < 40 || ctx->m_dxbcCount < 280 ||
ctx->m_ConstantBufferCount < 500) {
if (ctx->m_tempCount < 90 && ctx->m_ConstantBufferCount < 210) {
return true;
}
}
return false;
}
void destroyShaderMap(CShaderProgram::KernelShaderMap &shaders) {
for (const auto &i : shaders) {
CShaderProgram *shader = i.second;
COMPILER_SHADER_STATS_PRINT(shader->m_shaderStats, shader->GetContext()->type, shader->GetContext()->hash, "");
COMPILER_SHADER_STATS_SUM(shader->GetContext()->m_sumShaderStats, shader->m_shaderStats,
shader->GetContext()->type);
COMPILER_SHADER_STATS_DEL(shader->m_shaderStats);
delete shader;
}
}
void unify_opt_PreProcess(CodeGenContext *pContext) {
TODO("hasBuiltin should be calculated based on module");
if (IGC_IS_FLAG_ENABLED(DisableLLVMGenericOptimizations)) {
pContext->getModuleMetaData()->compOpt.OptDisable = true;
}
if (IGC_GET_FLAG_VALUE(StripDebugInfo) == FLAG_DEBUG_INFO_STRIP_ALL) {
StripDebugInfo(*pContext->getModule());
} else if (IGC_GET_FLAG_VALUE(StripDebugInfo) == FLAG_DEBUG_INFO_STRIP_NONLINE) {
stripNonLineTableDebugInfo(*pContext->getModule());
}
IGCPassManager mpm(pContext, "OPTPre");
mpm.add(new CodeGenContextWrapper(pContext));
mpm.add(new CheckInstrTypes(false, true));
if (pContext->isPOSH()) {
mpm.add(createRemoveNonPositionOutputPass());
}
mpm.run(*pContext->getModule());
// If the module does not contain called function declarations,
// indirect calls are the only way to detect function pointer usage.
if (pContext->m_instrTypes.hasIndirectCall)
pContext->m_enableFunctionPointer = true;
if (pContext->getMetaDataUtils()->size_FunctionsInfo() == 1 && !pContext->m_instrTypes.hasSubroutines) {
pContext->m_instrTypes.numBB = pContext->getMetaDataUtils()->begin_FunctionsInfo()->first->size();
pContext->m_instrTypes.hasMultipleBB = (pContext->m_instrTypes.numBB != 1);
} else {
pContext->m_instrTypes.hasMultipleBB = true;
}
pContext->m_instrTypes.hasLoadStore = true;
pContext->m_instrTypes.CorrelatedValuePropagationEnable =
(pContext->m_instrTypes.hasMultipleBB &&
(pContext->m_instrTypes.hasSel || pContext->m_instrTypes.hasCmp || pContext->m_instrTypes.hasSwitch ||
pContext->m_instrTypes.hasLoadStore));
}
static bool extensiveShader(CodeGenContext *pContext) {
return (pContext->type == ShaderType::OPENCL_SHADER && pContext->m_instrTypes.numInsts > INST_THRESHOLD &&
pContext->m_instrTypes.numLoopInsts > LOOP_INST_THRESHOLD &&
pContext->m_instrTypes.numOfLoop > LOOP_NUM_THRESHOLD && pContext->m_instrTypes.numBB == 0 &&
pContext->m_instrTypes.numSample == 0 && pContext->m_instrTypes.hasSubroutines);
}
// When we do not run optimizations, we still need to run the always-inline
// pass, otherwise codegen will fail.
static void alwaysInlineForNoOpt(CodeGenContext *pContext, bool NoOpt) {
if (NoOpt) {
MetaDataUtils *pMdUtils = pContext->getMetaDataUtils();
IGCPassManager mpm(pContext, "OPTPost");
mpm.add(new MetaDataUtilsWrapper(pMdUtils, pContext->getModuleMetaData()));
mpm.add(new CodeGenContextWrapper(pContext));
mpm.add(createAlwaysInlinerLegacyPass());
mpm.add(new PurgeMetaDataUtils());
mpm.run(*pContext->getModule());
}
}
#define GFX_ONLY_PASS if (pContext->type != ShaderType::OPENCL_SHADER)
void OptimizeIR(CodeGenContext *const pContext) {
IGC_ASSERT(nullptr != pContext);
MetaDataUtils *pMdUtils = pContext->getMetaDataUtils();
IGC_ASSERT(nullptr != pContext->getModuleMetaData());
bool NoOpt = pContext->getModuleMetaData()->compOpt.OptDisable;
DumpHashToOptions(pContext->hash, pContext->type);
alwaysInlineForNoOpt(pContext, NoOpt);
// Insert per-func optimization metadata
for (auto &F : *pContext->getModule()) {
if (!F.empty()) {
IGC::InsertOptsMetadata(pContext, &F);
}
}
if (NoOpt) {
return;
}
IGCPassManager mpm(pContext, "OPT");
#if !defined(_DEBUG)
if (IGC_IS_FLAG_ENABLED(EnableDebugging))
#endif
// do verifyModule for debug/release_internal only.
if (false == pContext->m_hasLegacyDebugInfo) {
IGC_ASSERT(nullptr != pContext->getModule());
IGC_ASSERT(false == llvm::verifyModule(*pContext->getModule(), &dbgs()));
}
COMPILER_TIME_START(pContext, TIME_OptimizationPasses);
// scope to force destructors before mem usage sampling
{
unify_opt_PreProcess(pContext);
/// Keeps track of the Dump objects so that we can free them after the pass manager has been run
// right now we don't support any standard function in the code gen
// maybe we want to support some at some point to take advantage of LLVM optimizations
TargetLibraryInfoImpl TLI;
TLI.disableAllFunctions();
mpm.add(new MetaDataUtilsWrapper(pMdUtils, pContext->getModuleMetaData()));
mpm.add(new CodeGenContextWrapper(pContext));
TargetIRAnalysis GenTTgetIIRAnalysis([&](const Function &F) {
GenIntrinsicsTTIImpl GTTI(pContext);
return TargetTransformInfo(GTTI);
});
mpm.add(new TargetTransformInfoWrapperPass(GenTTgetIIRAnalysis));
#if defined(_DEBUG) && !defined(__ANDROID__)
// IGC IR Verification pass checks that we get a correct IR after the Unification.
mpm.add(new VerificationPass());
#endif
mpm.add(new llvm::TargetLibraryInfoWrapperPass(TLI));
initializeWIAnalysisPass(*PassRegistry::getPassRegistry());
// Do inter-procedural constant propagation early.
if (pContext->m_enableSubroutine) {
// Here, we propagate function attributes across calls. Remaining
// function calls that were conservatively marked as 'convergent'
// in ProcessBuiltinMetaData can have that attribute stripped if
// possible which potentially allows late stage code sinking of
// those calls by the instruction combiner.
mpm.add(createPostOrderFunctionAttrsLegacyPass());
// Don't run IPConstantProp if there are stackcalls
const bool allowIPConstProp = !pContext->m_hasStackCalls && IGC_IS_FLAG_DISABLED(DisableIPConstantPropagation);
if (allowIPConstProp) {
mpm.add(createIPSCCPPass());
}
// Note / todo: LLVM < 12 also runs simple constant propagation pass
// regardless of IPSCCP in this case. This pass is not available on
// >= 12 version, but maybe SCCP pass would be suitable here.
}
if (IGC_IS_FLAG_ENABLED(MSAA16BitPayloadEnable) && pContext->platform.support16bitMSAAPayload()) {
mpm.add(new ConvertMSAAPayloadTo16Bit());
}
if (IGC_GET_FLAG_VALUE(MSAAClearedKernel) > 0) {
mpm.add(new MSAAInsertDiscard());
}
mpm.add(createSamplerPerfOptPass());
if ((!IGC_IS_FLAG_ENABLED(DisableDynamicTextureFolding) &&
pContext->getModuleMetaData()->inlineDynTextures.size() > 0) ||
(!IGC_IS_FLAG_ENABLED(DisableDynamicResInfoFolding))) {
mpm.add(new DynamicTextureFolding());
}
if (pContext->m_DriverInfo.CodeSinkingBeforeCFGSimplification()) {
mpm.add(new HoistCongruentPHI());
mpm.add(new CodeSinking());
}
mpm.add(llvm::createCFGSimplificationPass(SimplifyCFGOptions().hoistCommonInsts(true)));
mpm.add(llvm::createBasicAAWrapperPass());
mpm.add(createAddressSpaceAAWrapperPass());
if (pContext->type == ShaderType::RAYTRACING_SHADER || pContext->hasSyncRTCalls()) {
if (IGC_IS_FLAG_DISABLED(DisableRTAliasAnalysis))
mpm.add(createRayTracingAddressSpaceAAWrapperPass());
}
mpm.add(createIGCExternalAAWrapper());
mpm.add(createScopedNoAliasAAWrapperPass());
if (pContext->m_instrTypes.hasLoadStore) {
mpm.add(llvm::createDeadStoreEliminationPass());
mpm.add(createMarkReadOnlyLoadPass());
}
mpm.add(createLogicalAndToBranchPass());
mpm.add(llvm::createEarlyCSEPass());
if (pContext->m_instrTypes.CorrelatedValuePropagationEnable) {
mpm.add(llvm::createCorrelatedValuePropagationPass());
}
mpm.add(new BreakConstantExpr());
mpm.add(new IGCConstProp());
GFX_ONLY_PASS { mpm.add(createTranslateToProgrammableOffsetsPass()); }
mpm.add(new CustomSafeOptPass());
if (!pContext->m_DriverInfo.WADisableCustomPass()) {
mpm.add(new CustomUnsafeOptPass());
}
mpm.add(createSubGroupReductionPatternPass());
if (IGC_IS_FLAG_ENABLED(EmulateFDIV)) {
mpm.add(createGenFDIVEmulation());
}
mpm.add(createIGCInstructionCombiningPass());
if (IGC_IS_FLAG_ENABLED(EnableWaveShuffleIndexSinking)) {
mpm.add(createWaveShuffleIndexSinking());
}
mpm.add(new FCmpPaternMatch());
mpm.add(llvm::createDeadCodeEliminationPass()); // this should be done both before/after constant propagation
if (pContext->m_instrTypes.hasGenericAddressSpacePointers && IGC_IS_FLAG_ENABLED(EnableGASResolver)) {
mpm.add(createSROAPass());
mpm.add(createFixAddrSpaceCastPass());
mpm.add(createResolveGASPass());
}
if (pContext->m_instrTypes.hasNonPrimitiveAlloca && IGC_IS_FLAG_DISABLED(DisableShrinkArrayAllocaPass)) {
mpm.add(new ShrinkArrayAllocaPass());
}
if (IGC_IS_FLAG_ENABLED(SampleMultiversioning) || pContext->m_enableSampleMultiversioning) {
if (pContext->m_instrTypes.numOfLoop == 0)
mpm.add(new SampleMultiversioning(pContext));
}
bool disableGOPT =
((IsStage1FastestCompile(pContext->m_CgFlag, pContext->m_StagingCtx) || IGC_GET_FLAG_VALUE(ForceFastestSIMD)) &&
((FastestS1Options(pContext) & FCEXP_DISABLE_GOPT) || FastestS1Options(pContext) == FCEXP_NO_EXPRIMENT ||
pContext->getModuleMetaData()->compOpt.DisableFastestGopt));
// EnableBarrierControlFlowOptimizationPass: enable BCF optimization
// UseBarrierControlFlowOptimization: UMD AIL option to use BCF optimization
// supportBarrierControlFlowOptimization: API control (D3D12, Vulkan, OCL)
if ((IGC_IS_FLAG_ENABLED(EnableBarrierControlFlowOptimizationPass) ||
pContext->getModuleMetaData()->compOpt.UseBarrierControlFlowOptimization) &&
pContext->m_DriverInfo.supportBarrierControlFlowOptimization() &&
pContext->platform.hasBarrierControlFlowOpt() && !pContext->hasSyncRTCalls() &&
(pContext->type != ShaderType::PIXEL_SHADER)) {
mpm.add(createBarrierControlFlowOptimization());
}
if (pContext->m_instrTypes.hasMultipleBB && !disableGOPT) {
// disable loop unroll for excessively large shaders
if (pContext->m_instrTypes.numOfLoop) {
mpm.add(createLoopDeadCodeEliminationPass());
mpm.add(createLoopCanonicalization());
mpm.add(llvm::createLoopDeletionPass());
mpm.add(llvm::createBreakCriticalEdgesPass());
mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));
mpm.add(llvm::createLCSSAPass());
mpm.add(llvm::createLoopSimplifyPass());
}
}
// This pass needs to be extended for other devices
if (pContext->platform.getPlatformInfo().eProductFamily == IGFX_PVC) {
mpm.add(new GenerateBlockMemOpsPass());
}
mpm.add(new BlockMemOpAddrScalarizationPass());
if (pContext->m_instrTypes.hasMultipleBB && !disableGOPT) {
if (pContext->m_instrTypes.numOfLoop) {
bool allowLICM = IGC_IS_FLAG_ENABLED(allowLICM) && pContext->m_retryManager.AllowLICM();
bool runGEPLSR = IGC_IS_FLAG_ENABLED(EnableGEPLSR) && pContext->type == ShaderType::OPENCL_SHADER &&
pContext->platform.getPlatformInfo().eProductFamily == IGFX_PVC &&
!pContext->useStatelessToStateful() &&
pContext->m_retryManager.IsFirstTry();
if (runGEPLSR && IGC_IS_FLAG_DISABLED(RunGEPLSRAfterLICM)) {
mpm.add(createGEPLoopStrengthReductionPass(allowLICM));
}
if (allowLICM) {
mpm.add(createSpecialCasesDisableLICM());
int licmTh = IGC_GET_FLAG_VALUE(LICMStatThreshold);
mpm.add(new InstrStatistic(pContext, LICM_STAT, InstrStatStage::BEGIN, licmTh));
mpm.add(llvm::createLICMPass(100, 500, true));
mpm.add(new InstrStatistic(pContext, LICM_STAT, InstrStatStage::END, licmTh));
}
if (runGEPLSR && IGC_IS_FLAG_ENABLED(RunGEPLSRAfterLICM)) {
mpm.add(createGEPLoopStrengthReductionPass(allowLICM));
}
if (!pContext->m_retryManager.IsFirstTry() && pContext->type == ShaderType::OPENCL_SHADER) {
mpm.add(new DisableLoopUnrollOnRetry());
}
mpm.add(createIGCInstructionCombiningPass());
if (IGC_IS_FLAG_ENABLED(EnableIndVarSimplification) && pContext->type == ShaderType::OPENCL_SHADER) {
mpm.add(llvm::createIndVarSimplifyPass());
}
if (IGC_IS_FLAG_ENABLED(EnableLoopHoistConstant)) {
mpm.add(createLoopHoistConstant());
}
if (IGC_IS_FLAG_ENABLED(EnableAdvCodeMotion) && pContext->type == ShaderType::OPENCL_SHADER &&
!pContext->m_instrTypes.hasSwitch) {
mpm.add(createAdvCodeMotionPass(IGC_GET_FLAG_VALUE(AdvCodeMotionControl)));
}
mpm.add(createLoopAllocaUpperbound());
int LoopUnrollThreshold = pContext->m_DriverInfo.GetLoopUnrollThreshold();
// override the LoopUnrollThreshold if the registry key is set
if (IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold) != 0) {
LoopUnrollThreshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold);
} else if (pContext->getModuleMetaData()->compOpt.SetLoopUnrollThreshold > 0) {
LoopUnrollThreshold = pContext->getModuleMetaData()->compOpt.SetLoopUnrollThreshold;
}
// if the shader contains indexable_temp, we'll keep unrolling
bool unroll =
!pContext->getModuleMetaData()->compOpt.DisableLoopUnroll && IGC_IS_FLAG_DISABLED(DisableLoopUnroll);
bool hasIndexTemp = (pContext->m_indexableTempSize[0] > 0);
bool disableLoopUnrollStage1 =
IsStage1FastestCompile(pContext->m_CgFlag, pContext->m_StagingCtx) &&
(FastestS1Options(pContext) == FCEXP_NO_EXPRIMENT || (FastestS1Options(pContext) & FCEXP_DISABLE_UNROLL));
if ((LoopUnrollThreshold > 0 && unroll && !disableLoopUnrollStage1) || hasIndexTemp) {
mpm.add(llvm::createLoopUnrollPass(2, false, false, -1, -1, -1, -1, -1, -1));
}
// Due to what looks like a bug in LICM, we need to break the LoopPassManager between
// LoopUnroll and LICM.
mpm.add(createBarrierNoopPass());
if (allowLICM) {
mpm.add(createSpecialCasesDisableLICM());
mpm.add(llvm::createLICMPass(100, 500, true));
}
// Second unrolling with the same threshold.
unroll = !pContext->getModuleMetaData()->compOpt.DisableLoopUnroll && IGC_IS_FLAG_DISABLED(DisableLoopUnroll);
if (LoopUnrollThreshold > 0 && unroll) {
mpm.add(llvm::createLoopUnrollPass(2, false, false, -1, -1, -1, -1, -1, -1));
}
      // Should run after LICM so we can accurately reason about which
      // instructions are loop-dependent. It also needs to run before the
      // next LICM invocation, which will hoist the relevant intrinsics.
if (IGC_GET_FLAG_VALUE(allowDecompose2DBlockFuncs)) {
mpm.add(createDecompose2DBlockFuncsPass());
}
mpm.add(llvm::createLoopLoadEliminationPass());
if (!extensiveShader(pContext) && pContext->m_instrTypes.hasNonPrimitiveAlloca) {
if (pContext->m_DriverInfo.NeedCountSROA()) {
mpm.add(new InstrStatistic(pContext, SROA_PROMOTED, InstrStatStage::BEGIN, 300));
mpm.add(createSROAPass());
mpm.add(new InstrStatistic(pContext, SROA_PROMOTED, InstrStatStage::END, 300));
} else {
mpm.add(createSROAPass());
}
}
}
mpm.add(new SplitStructurePhisPass());
if (IGC_IS_FLAG_ENABLED(EnableRemoveLoopDependency)) {
mpm.add(new RemoveLoopDependency());
}
    // Note:
    // Run the reassociation pass before IGCConstProp(EnableSimplifyGEP)
    // to preserve the expression evaluation order that IGCConstProp
    // creates.
    // Limit this optimization to GPGPU (OpenCL) shaders only, because
    // they tend to have more address computation.
    // Do not apply reordering on vertex shaders, as CustomUnsafeOptPass
    // does.
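    // For illustration (hypothetical input, not taken from this pass):
    // reassociation can regroup (x + 4) + y into (x + y) + 4 so that the
    // constant term can be folded or shared across address computations.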
if (IGC_IS_FLAG_ENABLED(OCLEnableReassociate) && pContext->type == ShaderType::OPENCL_SHADER) {
mpm.add(createReassociatePass());
}
mpm.add(createPromoteConstantStructsPass());
if (IGC_IS_FLAG_ENABLED(EnableGVN)) {
mpm.add(llvm::createGVNPass());
}
mpm.add(createGenOptLegalizer());
mpm.add(llvm::createSCCPPass());
mpm.add(llvm::createDeadCodeEliminationPass());
if (!extensiveShader(pContext))
mpm.add(llvm::createAggressiveDCEPass());
mpm.add(new BreakConstantExpr());
mpm.add(new IGCConstProp(IGC_IS_FLAG_ENABLED(EnableSimplifyGEP)));
    // Now that constant propagation is largely complete, perform an
    // initial evaluation of freeze instructions. We need this to make
    // life easier for subsequent LLVM passes, as passes like
    // InstCombine/SimplifyCFG can sometimes be lazy in checking a freeze
    // operand's validity across more complex instruction chains and
    // simply assume it is safer to refrain from optimizations.
    // TODO: Check if LLVM 15+ provides improvements in that regard,
    // alleviating the need for early freeze evaluation.
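    // Illustrative example (LLVM IR sketch, not taken from this pass): once
    // %x is known to be a well-defined (non-undef, non-poison) value,
    //   %f = freeze i32 %x
    // can simply be replaced by %x, unblocking later folds that would
    // otherwise stop at the freeze.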
mpm.add(createEvaluateFreezePass());
if (IGC_IS_FLAG_DISABLED(DisableImmConstantOpt)) {
// If we have ICBs, need to emit clamp code so OOB access doesn't occur
if (pContext->getModuleMetaData()->immConstant.zeroIdxs.size()) {
mpm.add(createClampICBOOBAccess());
}
GFX_ONLY_PASS { mpm.add(createIGCIndirectICBPropagaionPass()); }
}
GFX_ONLY_PASS { mpm.add(new GenUpdateCB()); }
// Inserting PromoteToPredicatedMemoryAccess after GVN and several
// other passes, to not block optimizations changing LLVM
// load/stores, but before multiple SimplifyCFGs to allow more
// aggressive CFG simplification.
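    // Conceptual sketch (an assumption about the transformation, in pseudo-IR):
    //   if (cond) { v = load p; }  ...  phi [v, then], [prev, else]
    // becomes a single predicated load that yields 'prev' when 'cond' is
    // false, removing the branch around the memory access.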
if (IGC_IS_FLAG_ENABLED(EnablePromoteToPredicatedMemoryAccess)) {
mpm.add(new HoistConvOpToDom());
mpm.add(llvm::createCFGSimplificationPass());
mpm.add(new PromoteToPredicatedMemoryAccess());
}
if (IGC_IS_FLAG_ENABLED(EnableJumpThreading) && !pContext->m_instrTypes.hasAtomics &&
!extensiveShader(pContext)) {
if (pContext->type == ShaderType::OPENCL_SHADER) {
// Add CFGSimplification for clean-up before JumpThreading.
mpm.add(llvm::createCFGSimplificationPass());
}
      // Jump threading currently causes the atomic_flag test from the C11 conformance suite to fail. For now,
      // only do jump threading if we don't have atomics, as using atomics as locks seems to be the most common
      // case of violating the "no independent forward progress" clause from the spec.
      // We need to increase the default duplication threshold since the JumpThreading pass cost estimation does
      // not account for the fact that not all instructions need to be duplicated.
int BBDuplicateThreshold = (pContext->type == ShaderType::OPENCL_SHADER) ? 9 : -1;
#if LLVM_VERSION_MAJOR >= 15
// In LLVM-12.x an extra parameter InsertFreezeWhenUnfoldingSelect = false was added
// to JumpThreading pass, but since LLVM-15.x it was removed again.
mpm.add(llvm::createJumpThreadingPass(BBDuplicateThreshold));
#else // LLVM_VERSION_MAJOR
mpm.add(llvm::createJumpThreadingPass(false, BBDuplicateThreshold));
#endif // LLVM_VERSION_MAJOR
}
mpm.add(llvm::createCFGSimplificationPass());
mpm.add(llvm::createEarlyCSEPass());
if (pContext->m_instrTypes.hasNonPrimitiveAlloca) {
// run custom safe opts to potentially get rid of indirect
// addressing of private arrays, see visitLoadInst
mpm.add(new CustomSafeOptPass());
mpm.add(createSROAPass());
}
// Use CFGSimplification to do clean-up. Needs to be invoked before lowerSwitch.
mpm.add(llvm::createCFGSimplificationPass());
if (IGC_IS_FLAG_DISABLED(DisableFlattenSmallSwitch)) {
mpm.add(createFlattenSmallSwitchPass());
}
    // some optimizations can create switch statements we don't support
mpm.add(llvm::createLowerSwitchPass());
    // Preferably added after all LowerSwitch pass runs, as switch lowering can benefit
    // from an unreachable instruction in the default switch case.
mpm.add(new UnreachableHandling());
// Conditions apply just as above due to problems with atomics
// (see comment above for details).
if (IGC_IS_FLAG_ENABLED(EnableJumpThreading) && !pContext->m_instrTypes.hasAtomics &&
!extensiveShader(pContext)) {
// After lowering 'switch', run jump threading to remove redundant jumps.
mpm.add(llvm::createJumpThreadingPass());
}
// run instruction combining to clean up the code after CFG optimizations
mpm.add(createIGCInstructionCombiningPass());
mpm.add(llvm::createDeadCodeEliminationPass());
mpm.add(llvm::createEarlyCSEPass());
    // needs to run before code sinking
GFX_ONLY_PASS { mpm.add(createInsertBranchOptPass()); }
mpm.add(new CustomSafeOptPass());
if (!pContext->m_DriverInfo.WADisableCustomPass()) {
mpm.add(new CustomUnsafeOptPass());
}
} else {
if (pContext->m_instrTypes.hasMultipleBB) {
assert(disableGOPT);
      // disable loop unroll for excessively large shaders
if (pContext->m_instrTypes.numOfLoop) {
mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));
int LoopUnrollThreshold = pContext->m_DriverInfo.GetLoopUnrollThreshold();
// override the LoopUnrollThreshold if the registry key is set
if (IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold) != 0) {
LoopUnrollThreshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold);
} else if (pContext->getModuleMetaData()->compOpt.SetLoopUnrollThreshold > 0) {
LoopUnrollThreshold = pContext->getModuleMetaData()->compOpt.SetLoopUnrollThreshold;
}
        // if the shader contains indexable_temp, we'll keep unrolling enabled
bool unroll =
!pContext->getModuleMetaData()->compOpt.DisableLoopUnroll && IGC_IS_FLAG_DISABLED(DisableLoopUnroll);
bool hasIndexTemp = (pContext->m_indexableTempSize[0] > 0);
        // Enable loop unrolling for stage 1 for now due to persistent regressions
bool disableLoopUnrollStage1 = IsStage1FastestCompile(pContext->m_CgFlag, pContext->m_StagingCtx) &&
( // FastestS1Options(pContext) == FCEXP_NO_EXPRIMENT ||
(FastestS1Options(pContext) & FCEXP_DISABLE_UNROLL));
if ((LoopUnrollThreshold > 0 && unroll && !disableLoopUnrollStage1) || hasIndexTemp) {
mpm.add(llvm::createLoopUnrollPass(2, false, false, -1, -1, -1, -1, -1, -1));
}
}
if (IGC_IS_FLAG_ENABLED(EnableGVN)) {
mpm.add(llvm::createGVNPass());
}
}
if (IGC_IS_FLAG_DISABLED(DisableImmConstantOpt)) {
// If we have ICBs, need to emit clamp code so OOB access
// doesn't occur
if (pContext->getModuleMetaData()->immConstant.zeroIdxs.size()) {
mpm.add(createClampICBOOBAccess());
}
GFX_ONLY_PASS { mpm.add(createIGCIndirectICBPropagaionPass()); }
}
// single basic block
if (!pContext->m_DriverInfo.WADisableCustomPass()) {
mpm.add(llvm::createEarlyCSEPass());
mpm.add(new CustomSafeOptPass());
mpm.add(new CustomUnsafeOptPass());
}
mpm.add(createGenOptLegalizer());
GFX_ONLY_PASS { mpm.add(createInsertBranchOptPass()); }
}
// If we have ICBs, need to emit clamp code so OOB access doesn't occur
if (pContext->getModuleMetaData()->immConstant.zeroIdxs.size() && IGC_IS_FLAG_ENABLED(DisableImmConstantOpt)) {
mpm.add(createClampICBOOBAccess());
}
if (pContext->m_instrTypes.hasRuntimeValueVector) {
      // Optimize extracts from RuntimeValue vectors. This should be executed
      // after constant propagation and loop unrolling.
mpm.add(createVectorBitCastOptPass());
mpm.add(new RuntimeValueVectorExtractPass());
}
if (pContext->m_enableSubroutine && getFunctionControl(pContext) == FLAG_FCALL_DEFAULT) {
mpm.add(createEstimateFunctionSizePass(EstimateFunctionSize::AL_Kernel));
if (IGC_IS_FLAG_ENABLED(EnableLargeFunctionCallMerging)) {
mpm.add(new CallMerger());
}
mpm.add(createEstimateFunctionSizePass(EstimateFunctionSize::AL_Kernel));
mpm.add(createSubroutineInlinerPass());
} else {
// Inline all remaining functions with always inline attribute.
mpm.add(createAlwaysInlinerLegacyPass());
}
if ((pContext->m_DriverInfo.NeedExtraPassesAfterAlwaysInlinerPass() || pContext->m_enableSubroutine) &&
pContext->m_instrTypes.hasNonPrimitiveAlloca) {
mpm.add(createSROAPass());
}
if (pContext->type == ShaderType::COMPUTE_SHADER &&
(IGC_IS_FLAG_ENABLED(RemoveUnusedTGMFence) || pContext->getModuleMetaData()->enableRemoveUnusedTGMFence)) {
mpm.add(new TrivialUnnecessaryTGMFenceElimination());
}
mpm.add(createGenSimplificationPass());
if (pContext->m_instrTypes.hasLoadStore) {
mpm.add(llvm::createDeadStoreEliminationPass());
mpm.add(llvm::createMemCpyOptPass());
mpm.add(createLdShrinkPass());
}
mpm.add(llvm::createDeadCodeEliminationPass());
if (IGC_IS_FLAG_ENABLED(EnableWaveAllJointReduction)) {
mpm.add(createWaveAllJointReduction());
}
if (IGC_IS_FLAG_ENABLED(EnableIntDivRemCombine)) {
      // simplify rem if the quotient is available
      //
      // run GVN first so that cases like the following can be
      // reduced as well:
      //   = foo / (2*x + 1)
      //   = foo % (2*x + 1)
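      // Once the quotient q = foo / (2*x + 1) is available, the remainder can
      // be recomputed without a second divide as foo - q * (2*x + 1)
      // (a sketch of the idea; the pass decides the exact rewrite).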
if (IGC_IS_FLAG_ENABLED(EnableGVN)) {
mpm.add(llvm::createGVNPass());
}
//
mpm.add(createIntDivRemCombinePass());
}
if (IGC_IS_FLAG_ENABLED(EnableConstIntDivReduction)) {
      // reduce division/remainder with constant divisors/moduli to
      // more efficient sequences of multiplies, shifts, and adds
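      // Illustrative example (unsigned 32-bit, an assumption about the general
      // technique): x / 9 can be rewritten as a multiply by a precomputed
      // fixed-point reciprocal followed by a shift, roughly
      //   q = (uint64_t(x) * MAGIC) >> SHIFT;   // MAGIC/SHIFT chosen for 9
      // avoiding a hardware integer divide.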
mpm.add(createIntDivConstantReductionPass());
}
if (IGC_IS_FLAG_ENABLED(EnableIntDivRemIncrementReduction)) {
mpm.add(createIntDivRemIncrementReductionPass());
}
GFX_ONLY_PASS { mpm.add(createMergeMemFromBranchOptPass()); }
if (IGC_IS_FLAG_DISABLED(DisableLoadSinking) &&
!isOptDisabledForModule(pContext->getModuleMetaData(), IGCOpts::SinkLoadOptPass)) {
mpm.add(createSinkLoadOptPass());
}
mpm.add(createConstantMergePass());
GFX_ONLY_PASS { mpm.add(CreateMCSOptimization()); }
GFX_ONLY_PASS { mpm.add(CreateGatingSimilarSamples()); }
if (!IGC::ForceAlwaysInline(pContext)) {
mpm.add(new PurgeMetaDataUtils());
}
// mpm.add(llvm::createDeadCodeEliminationPass()); // this should be done both before/after constant propagation
if (IGC_IS_FLAG_ENABLED(EnableUnmaskedFunctions) && IGC_IS_FLAG_DISABLED(LateInlineUnmaskedFunc)) {
mpm.add(new InlineUnmaskedFunctionsPass());
}
if (pContext->m_instrTypes.numOfLoop) {
mpm.add(createDeadPHINodeEliminationPass());
}
if (IGC_IS_FLAG_ENABLED(EnableMadLoopSlice)) {
mpm.add(createMadLoopSlicePass());
}
if (IGC_IS_FLAG_ENABLED(EnableVectorizer)) {
mpm.add(new IGCVectorizer());
mpm.add(llvm::createAggressiveDCEPass());
if (IGC_IS_FLAG_ENABLED(VectorizerCheckScalarizer))
mpm.add(createScalarizerPass(SelectiveScalarizer::Auto));
}
mpm.run(*pContext->getModule());
} // end scope
COMPILER_TIME_END(pContext, TIME_OptimizationPasses);
// pContext->shaderEntry->viewCFG();
DumpLLVMIR(pContext, "optimized");
MEM_SNAPSHOT(IGC::SMS_AFTER_OPTIMIZER);
} // OptimizeIR
} // namespace IGC