Mirror of https://github.com/intel/intel-graphics-compiler.git (synced 2025-10-30 08:18:26 +08:00)
			
		
		
		
b29a434970

Now we can have CmpInst as the user of the SLM pointer. Previously there was always a bitcast in between.
		
			
				
	
	
		
1210 lines · 43 KiB · C++
/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

// vim:ts=2:sw=2:et:
#include "common/LLVMUtils.h"
#include "Compiler/CISACodeGen/GenIRLowering.h"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/CodeGenContextWrapper.hpp"
#include "Compiler/MetaDataUtilsWrapper.h"
#include "Compiler/IGCPassSupport.h"
#include "common/LLVMWarningsPush.hpp"
#include <llvm/Pass.h>
#include <llvm/IR/DataLayout.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/PatternMatch.h>
#include <llvm/Analysis/ScalarEvolution.h>
#include <llvm/Analysis/ScalarEvolutionExpressions.h>
#include <llvm/Analysis/TargetFolder.h>
#include <llvm/Analysis/ValueTracking.h>
#include <llvm/IR/GetElementPtrTypeIterator.h>
#include <llvm/Support/KnownBits.h>
#include <llvm/Transforms/Utils/ScalarEvolutionExpander.h>
#include <llvm/Transforms/Utils/Local.h>
#include "llvmWrapper/IR/Intrinsics.h"
#include "llvmWrapper/IR/DerivedTypes.h"
#include "common/LLVMWarningsPop.hpp"
#include "GenISAIntrinsics/GenIntrinsics.h"
#include "common/IGCIRBuilder.h"
#include "Probe/Assertion.h"

using namespace llvm;
using namespace IGC;
using namespace IGC::IGCMD;
using IGCLLVM::FixedVectorType;

namespace {
class GenIRLowering : public FunctionPass {
  using BuilderTy = IGCIRBuilder<TargetFolder>;
  BuilderTy *Builder = nullptr;

public:
  static char ID;

  GenIRLowering() : FunctionPass(ID) { initializeGenIRLoweringPass(*PassRegistry::getPassRegistry()); }

  StringRef getPassName() const override { return "GenIR Lowering"; }

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<CodeGenContextWrapper>();
    AU.addRequired<MetaDataUtilsWrapper>();
    AU.addRequired<LoopInfoWrapperPass>();
  }

private:
  // Helpers
  Value *rearrangeAdd(Value *, Loop *) const;

  bool combineFMaxFMin(CallInst *GII, BasicBlock::iterator &BBI) const;
  bool combineSelectInst(SelectInst *Sel, BasicBlock::iterator &BBI) const;
  bool combinePack4i8Or2i16(Instruction *inst, uint64_t numBits) const;

  bool constantFoldFMaxFMin(CallInst *GII, BasicBlock::iterator &BBI) const;
};

char GenIRLowering::ID = 0;

// Pattern match helpers.

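// Illustrative note (a schematic sketch, not verbatim compiler output):
// FMaxMinCast_match recognizes a select over an fcmp whose arms are the
// compared values, possibly routed through a matching cast, e.g.
//   %cmp = fcmp ogt float %x, %y
//   %xi  = fptosi float %x to i32
//   %yi  = fptosi float %y to i32
//   %sel = select i1 %cmp, i32 %xi, i32 %yi   ; == (i32)fmax(%x, %y)
// and reports the common cast opcode through CastOpcode.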
template <typename LHS_t, typename RHS_t, typename Pred_t> struct FMaxMinCast_match {
  unsigned &CastOpcode;
  LHS_t L;
  RHS_t R;

  FMaxMinCast_match(unsigned &Opcode, const LHS_t &LHS, const RHS_t &RHS) : CastOpcode(Opcode), L(LHS), R(RHS) {}

  bool isEqualOrCasted(Value *V, Value *Orig, unsigned Opcode) const {
    if (V == Orig)
      return true;
    // Check V is casted from Orig.
    CastInst *Cast = dyn_cast<CastInst>(V);
    if (Cast && Cast->getOpcode() == Opcode && Cast->getOperand(0) == Orig)
      return true;
    if (Constant *C = dyn_cast<Constant>(Orig)) {
      if (!CastInst::castIsValid(Instruction::CastOps(Opcode), C, V->getType()))
        return false;
      // TODO: Need to check isExact for FPToSI/FPToUI.
      Constant *Casted = ConstantExpr::getCast(Opcode, C, V->getType());
      if (V == Casted)
        return true;
    }
    return false;
  }

  template <typename OpTy> bool match(OpTy *V) {
    SelectInst *SI = dyn_cast<SelectInst>(V);
    if (!SI)
      return false;
    FCmpInst *Cmp = dyn_cast<FCmpInst>(SI->getCondition());
    if (!Cmp)
      return false;
    Value *TVal = SI->getTrueValue();
    Value *FVal = SI->getFalseValue();

    // Check cast op if any. If both operands use cast op, they should match.
    unsigned Opcode = Instruction::UserOp1;
    if (CastInst *Cast = dyn_cast<CastInst>(TVal))
      Opcode = Cast->getOpcode();
    if (CastInst *Cast = dyn_cast<CastInst>(FVal)) {
      unsigned Op = Cast->getOpcode();
      if (Opcode != Instruction::UserOp1 && Opcode != Op)
        return false;
      Opcode = Op;
    }

    Value *LHS = Cmp->getOperand(0);
    Value *RHS = Cmp->getOperand(1);
    if ((!isEqualOrCasted(TVal, LHS, Opcode) || !isEqualOrCasted(FVal, RHS, Opcode)) &&
        (!isEqualOrCasted(TVal, RHS, Opcode) || !isEqualOrCasted(FVal, LHS, Opcode)))
      return false;

    FCmpInst::Predicate Pred = Cmp->getPredicate();
    if (!isEqualOrCasted(TVal, LHS, Opcode)) {
      Pred = Cmp->getSwappedPredicate();
      std::swap(TVal, FVal);
    }
    if (!Pred_t::match(Pred))
      return false;

    if (L.match(LHS) && R.match(RHS)) {
      CastOpcode = Opcode;
      return true;
    }
    return false;
  }
};

template <typename LHS, typename RHS>
inline FMaxMinCast_match<LHS, RHS, llvm::PatternMatch::ofmax_pred_ty> m_OrdFMaxCast(unsigned &Opcode, const LHS &L,
                                                                                    const RHS &R) {
  return FMaxMinCast_match<LHS, RHS, llvm::PatternMatch::ofmax_pred_ty>(Opcode, L, R);
}

template <typename LHS, typename RHS>
inline FMaxMinCast_match<LHS, RHS, llvm::PatternMatch::ofmin_pred_ty> m_OrdFMinCast(unsigned &Opcode, const LHS &L,
                                                                                    const RHS &R) {
  return FMaxMinCast_match<LHS, RHS, llvm::PatternMatch::ofmin_pred_ty>(Opcode, L, R);
}

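// Illustrative note (a schematic sketch): ClampWithConstants_match recognizes
// a clamp built from a min call feeding a max call (or vice versa) with
// constant bounds, i.e. shapes like max(min(x, CMax), CMin) or
// min(max(x, CMin), CMax), and binds the two constants to CMin/CMax.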
template <typename Op_t, typename ConstTy> struct ClampWithConstants_match {
  typedef ConstTy *ConstPtrTy;

  Op_t Op;
  ConstPtrTy &CMin, &CMax;

  ClampWithConstants_match(const Op_t &OpMatch, ConstPtrTy &Min, ConstPtrTy &Max) : Op(OpMatch), CMin(Min), CMax(Max) {}

  template <typename OpTy> bool match(OpTy *V) {
    CallInst *GII = dyn_cast<CallInst>(V);
    if (!GII)
      return false;

    EOPCODE GIID = GetOpCode(GII);
    if (GIID != llvm_max && GIID != llvm_min)
      return false;

    Value *X = GII->getOperand(0);
    Value *C = GII->getOperand(1);
    if (isa<ConstTy>(X))
      std::swap(X, C);

    ConstPtrTy C0 = dyn_cast<ConstTy>(C);
    if (!C0)
      return false;

    CallInst *GII2 = dyn_cast<CallInst>(X);
    if (!GII2)
      return false;

    EOPCODE GIID2 = GetOpCode(GII2);
    if (!(GIID == llvm_min && GIID2 == llvm_max) && !(GIID == llvm_max && GIID2 == llvm_min))
      return false;

    X = GII2->getOperand(0);
    C = GII2->getOperand(1);
    if (isa<ConstTy>(X))
      std::swap(X, C);

    ConstPtrTy C1 = dyn_cast<ConstTy>(C);
    if (!C1)
      return false;

    if (!Op.match(X))
      return false;

    CMin = (GIID2 == llvm_min) ? C0 : C1;
    CMax = (GIID2 == llvm_min) ? C1 : C0;
    return true;
  }
};

template <typename OpTy, typename ConstTy>
inline ClampWithConstants_match<OpTy, ConstTy> m_ClampWithConstants(const OpTy &Op, ConstTy *&Min, ConstTy *&Max) {
  return ClampWithConstants_match<OpTy, ConstTy>(Op, Min, Max);
}

// This pass lowers GEP into primitive ones (i.e. addition and/or
// multiplication, converted to shift if applicable) to expose address
// calculation to LLVM optimizations, such as CSE and LICM.
//
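// For illustration (a schematic sketch, not verbatim pass output), an inbounds
// GEP such as
//   %p = getelementptr inbounds float, float addrspace(1)* %base, i64 %idx
// is lowered roughly into
//   %b = ptrtoint float addrspace(1)* %base to i64
//   %o = shl i64 %idx, 2              ; element size 4 == 1 << 2
//   %s = add i64 %b, %o
//   %p = inttoptr i64 %s to float addrspace(1)*
// so CSE/LICM can operate on the integer address arithmetic directly.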
class GEPLowering : public FunctionPass {
  const DataLayout *DL = nullptr;
  CodeGenContext *m_ctx = nullptr;
  using BuilderTy = IGCIRBuilder<TargetFolder>;
  BuilderTy *Builder = nullptr;
  llvm::LoopInfo *m_LI = nullptr;
  ModuleMetaData *modMD = nullptr;
  ScalarEvolution *SE = nullptr;

public:
  static char ID;

  GEPLowering() : FunctionPass(ID) { initializeGEPLoweringPass(*PassRegistry::getPassRegistry()); }

  StringRef getPassName() const override { return "GEP Lowering"; }

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<CodeGenContextWrapper>();
    AU.addRequired<MetaDataUtilsWrapper>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
  }

protected:
  // Helpers
  Value *getSExtOrTrunc(Value *, Type *) const;
  Value *truncExpr(Value *, Type *) const;

  bool simplifyGEP(BasicBlock &BB) const;
  bool lowerGetElementPtrInst(GetElementPtrInst *GEP) const;
};

char GEPLowering::ID = 0;

} // End anonymous namespace

bool GenIRLowering::runOnFunction(Function &F) {
  // Skip non-kernel functions.
  MetaDataUtils *MDU = nullptr;
  MDU = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
  ModuleMetaData *modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
  auto FII = MDU->findFunctionsInfoItem(&F);
  if (FII == MDU->end_FunctionsInfo())
    return false;

  auto &DL = F.getParent()->getDataLayout();

  BuilderTy TheBuilder(F.getContext(), TargetFolder(DL));
  Builder = &TheBuilder;

  bool Changed = false;

  // Replace an SLM PtrToInt with the assigned immediate offset.
  // Later optimizations (InstCombine) can then fold away some address computation.
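  // Schematic example (a sketch, assuming @slm was assigned immediate offset 64):
  //   %a = ptrtoint i32 addrspace(3)* @slm to i32    ; replaced by i32 64
  // and pointer-typed users of @slm within this kernel are rebased onto
  //   inttoptr (i32 64 to i32 addrspace(3)*)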

  FunctionMetaData *funcMD = &modMD->FuncMD[&F];

  for (auto localOffsetsItr = funcMD->localOffsets.begin(), localOffsetsEnd = funcMD->localOffsets.end();
       localOffsetsItr != localOffsetsEnd; ++localOffsetsItr) {
    LocalOffsetMD localOffset = *localOffsetsItr;

    // look up the value-to-offset mapping
    Value *V = localOffset.m_Var;
    unsigned Offset = localOffset.m_Offset;

    // Skip non-pointer values.
    if (!V->getType()->isPointerTy())
      continue;
    // Skip non-local pointers.
    unsigned AS = V->getType()->getPointerAddressSpace();
    if (AS != ADDRESS_SPACE_LOCAL)
      continue;

    // It is possible that a global (slm) is used in more than one kernel,
    // and each kernel might have a different offset for this global. Thus,
    // we can only replace the uses within this kernel function. We will check
    // instructions only, as the constant expressions have been broken up
    // before this pass.
    PointerType *PTy = cast<PointerType>(V->getType());
    Constant *CO = ConstantInt::get(Type::getInt32Ty(F.getContext()), Offset);
    Constant *NewBase = ConstantExpr::getIntToPtr(CO, PTy);
    auto NI = V->user_begin();
    for (auto I = NI, E = V->user_end(); I != E; I = NI) {
      ++NI;
      Instruction *Inst = dyn_cast<Instruction>(*I);
      if (!Inst || Inst->getParent()->getParent() != &F) {
        continue;
      }

      // As constant exprs have been broken up, we need to check insts only.
      if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Inst)) {
        // sanity check
        if (GEPI->getOperand(0) == V) {
          // operand 0 is the pointer operand
          GEPI->setOperand(0, NewBase);
          Changed = true;
        }
      } else if (PtrToIntInst *PI = dyn_cast<PtrToIntInst>(Inst)) {
        Value *CI = ConstantInt::get(PI->getType(), Offset);
        PI->replaceAllUsesWith(CI);
        PI->eraseFromParent();
        Changed = true;
      } else if (BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
        BCI->setOperand(0, NewBase);
        Changed = true;
      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
        LI->setOperand(0, NewBase);
        Changed = true;
      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
        if (SI->getPointerOperand() == V) {
          // the pointer operand is operand 1!
          SI->setOperand(1, NewBase);
          Changed = true;
        }
      } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
        CI->replaceUsesOfWith(V, NewBase);
        Changed = true;
      } else if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
        CI->replaceUsesOfWith(V, NewBase);
        Changed = true;
      }
    }
  }

  for (auto &BB : F) {
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE;) {
      Instruction *Inst = &(*BI++);
      Builder->SetInsertPoint(Inst);

      switch (Inst->getOpcode()) {
      default: // By default, DO NOTHING
        break;
      case Instruction::Call:
        if (CallInst *GII = dyn_cast<CallInst>(Inst)) {
          switch (GetOpCode(GII)) {
          case llvm_max:
          case llvm_min:
            Changed |= combineFMaxFMin(GII, BI);
            break;
          default:
            break;
          }
        }
        break;
      case Instruction::Select:
        // Enable the pattern match only when NaNs can be ignored.
        if (modMD->compOpt.NoNaNs || modMD->compOpt.FiniteMathOnly) {
          Changed |= combineSelectInst(cast<SelectInst>(Inst), BI);
        }
        break;
      case Instruction::Or:
        if (Inst->getType()->isIntegerTy(32)) {
          // Detect packing of 4 i8 values and convert it to a pattern that is
          // matched by CodeGenPatternMatch::MatchPack4i8.
          Changed |= combinePack4i8Or2i16(Inst, 8 /*numBits*/);
          // TODO: also detect <2 x i16> packing once PatternMatch is updated
          // to support packing of 16-bit values.
        }
        break;
      }
    }
  }

  Builder = nullptr;

  return Changed;
}

// For each basic block, simplify GEPs based on the analysis result from SCEV.
bool GEPLowering::simplifyGEP(BasicBlock &BB) const {
  // Pointers of the form base + zext/sext(idx).
  struct PointerExpr {
    GetElementPtrInst *GEP;
    const SCEV *Idx;
    GetElementPtrInst *Base = nullptr; // A simplified base if any.
    const SCEV *Offset = nullptr;      // A simplified offset if any.
  };
  // Each visited base pointer has a collection of pointer exprs.
  DenseMap<Value *, SmallVector<PointerExpr, 128>> Pointers;

  auto IsUsedByBindless = [](const GetElementPtrInst *GEP) {
    for (auto *U : GEP->users())
      if (auto *P2I = dyn_cast<PtrToIntInst>(U))
        if (P2I->getType()->isIntegerTy(32))
          return true;
    return false;
  };

  bool Changed = false;
  for (auto BI = BB.begin(), BE = BB.end(); BI != BE; ++BI) {
    auto *GEP = dyn_cast<GetElementPtrInst>(BI);
    // So far, for simplicity, consider GEPs on the generic/global address
    // space with a single index only. It should be straightforward to extend
    // the support to other cases, where multiple indices are present.
    if (!GEP || !GEP->isInBounds() || GEP->getNumIndices() != 1 ||
        (GEP->getAddressSpace() != ADDRESS_SPACE_GLOBAL && GEP->getAddressSpace() != ADDRESS_SPACE_GENERIC))
      continue;
    if (IsUsedByBindless(GEP))
      continue;
    auto *Idx = GEP->getOperand(1);
    if (auto *ZExt = dyn_cast<ZExtInst>(Idx)) {
      Idx = ZExt->getOperand(0);
    } else if (auto *SExt = dyn_cast<SExtInst>(Idx)) {
      Idx = SExt->getOperand(0);
      Operator *Opr = dyn_cast<Operator>(Idx);
      if (Opr && Opr->getOpcode() == BinaryOperator::BinaryOps::SDiv) {
        // Skip if it is SDiv. This special check is needed as
        // OverflowingBinaryOperator does not include SDiv.
        continue;
      }
      auto *Op = dyn_cast<OverflowingBinaryOperator>(Idx);
      if (Op && !Op->hasNoSignedWrap())
        continue;
    }

    const SCEV *E = SE->getSCEV(Idx);
    // Skip if the offset to the base is already a constant.
    if (isa<SCEVConstant>(E))
      continue;
    Value *Base = GEP->getPointerOperand();
    auto &Exprs = Pointers[Base];

    auto EI = Exprs.begin();
    auto EE = Exprs.end();
    const SCEV *Offset = nullptr;

    // Let GEP_a be one GEP from Pointers[Base];
    // GEP (the 'GEP' var in this loop iteration) reuses GEP_a's address
    // as its base
    //   1. if GEP_a is the first in Pointers[Base] such that the diff of
    //      GEP_a and GEP is constant; otherwise
    //   2. if GEP_a is the first in Pointers[Base] such that the diff of
    //      GEP_a and GEP has expression size 1 (a single value); otherwise
    //   3. if GEP_a has the smallest diff, or, if more than one GEP has the
    //      same diff, GEP_a is the last one in Pointers[Base].
    // Both 1 and 2 may potentially save a few instructions. 3 is a
    // heuristic and may be further tuned.
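    // Schematic example (a sketch): given two inbounds GEPs off the same base
    //   %p0 = getelementptr inbounds float, float addrspace(1)* %base, i64 %i
    //   %p1 = getelementptr inbounds float, float addrspace(1)* %base, i64 %j
    // where SCEV proves %j - %i == 4, %p1 is rebuilt to reuse %p0's address:
    //   %p1 = getelementptr inbounds float, float addrspace(1)* %p0, i64 4
    // so the common part of the address computation is shared.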
    constexpr unsigned DIFF_SIZE_THRESHOLD = 3;
    unsigned MinDiff = DIFF_SIZE_THRESHOLD;
    bool isDiffOne = false;
    GetElementPtrInst *BaseWithMinDiff = nullptr;
    for (/*EMPTY*/; EI != EE; ++EI) {
      // Skip if the result types do not match.
      if (EI->GEP->getType() != GEP->getType() || E->getType() != EI->Idx->getType())
        continue;

      auto *Diff = SE->getMinusSCEV(E, EI->Idx);
      unsigned exprSize = Diff->getExpressionSize();
      if (exprSize <= MinDiff) {
        if (isa<SCEVConstant>(Diff)) {
          BaseWithMinDiff = EI->GEP;
          Offset = Diff;
          MinDiff = exprSize;
          break;
        }
        if (!isDiffOne) {
          BaseWithMinDiff = EI->GEP;
          Offset = Diff;
          MinDiff = exprSize;
          isDiffOne = (MinDiff == 1);
        }
      }
    }
    // Not found; add this GEP as a potential base expr.
    if (!Offset) {
      Exprs.emplace_back(PointerExpr{GEP, E, nullptr, nullptr});
      continue;
    }
    Exprs.emplace_back(PointerExpr{GEP, E, BaseWithMinDiff, Offset});
  }
  std::vector<Instruction *> DeadInsts;
  for (const auto &B : Pointers) {
    for (auto PI = B.second.rbegin(), PE = B.second.rend(); PI != PE; ++PI) {
      auto &P = *PI;
      if (P.Offset) {
        SCEVExpander E(*SE, *DL, "gep-simplification");
        Value *V = E.expandCodeFor(P.Offset, P.Idx->getType(), P.GEP);
        Builder->SetInsertPoint(P.GEP);
        auto *NewGEP = Builder->CreateInBoundsGEP(P.Base->getResultElementType(), P.Base,
                                                  Builder->CreateSExt(V, P.GEP->getOperand(1)->getType()));
        P.GEP->replaceAllUsesWith(NewGEP);
        DeadInsts.push_back(P.GEP);
        Changed = true;
      }
    }
  }
  for (auto *I : DeadInsts)
    RecursivelyDeleteTriviallyDeadInstructions(I);
  return Changed;
}

bool GEPLowering::runOnFunction(Function &F) {
  // Skip non-kernel functions.
  modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
  m_LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  MetaDataUtils *MDU = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
  auto FII = MDU->findFunctionsInfoItem(&F);
  if (FII == MDU->end_FunctionsInfo())
    return false;

  CodeGenContextWrapper *pCtxWrapper = &getAnalysis<CodeGenContextWrapper>();
  m_ctx = pCtxWrapper->getCodeGenContext();

  DL = &F.getParent()->getDataLayout();
  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();

  BuilderTy TheBuilder(F.getContext(), TargetFolder(*DL));
  Builder = &TheBuilder;

  bool Changed = false;

  if (IGC_IS_FLAG_ENABLED(EnableGEPSimplification)) {
    for (auto &BB : F)
      Changed |= simplifyGEP(BB);

    if (IGC_IS_FLAG_ENABLED(TestGEPSimplification))
      return Changed;
  }

  for (auto &BB : F) {
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE;) {
      Instruction *Inst = &(*BI++);
      Builder->SetInsertPoint(Inst);

      switch (Inst->getOpcode()) {
      default: // By default, DO NOTHING
        break;
      // Lower GEPs to inttoptr/ptrtoint with offsets.
      case Instruction::GetElementPtr:
        Changed |= lowerGetElementPtrInst(cast<GetElementPtrInst>(Inst));
        break;
      }
    }
  }

  return Changed;
}

Value *GEPLowering::getSExtOrTrunc(Value *Val, Type *NewTy) const {
  Type *OldTy = Val->getType();
  unsigned OldWidth;
  unsigned NewWidth;

  IGC_ASSERT_MESSAGE(OldTy->isIntOrIntVectorTy(), "Index should be Integer or vector of Integer!");

  if (auto OldVecTy = dyn_cast<IGCLLVM::FixedVectorType>(OldTy)) {
    OldWidth = (unsigned)OldVecTy->getNumElements() * OldVecTy->getElementType()->getIntegerBitWidth();
    NewWidth = (unsigned)OldVecTy->getNumElements() * NewTy->getIntegerBitWidth();
  } else {
    OldWidth = OldTy->getIntegerBitWidth();
    NewWidth = NewTy->getIntegerBitWidth();
  }

  if (OldWidth < NewWidth) { // SExt
    return Builder->CreateSExt(Val, NewTy);
  }

  if (OldWidth > NewWidth) { // Trunc
    return truncExpr(Val, NewTy);
  }

  return Val;
}

Value *GEPLowering::truncExpr(Value *Val, Type *NewTy) const {
  // Truncation on Gen could be as cheap as a NOP by creating the proper region.
  // Instead of truncating the value itself, try to truncate how it's
  // calculated.
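  // Schematic example (a sketch): instead of emitting
  //   %s = add i64 %a, %b
  //   %t = trunc i64 %s to i32
  // the recursion rewrites the computation as
  //   %a32 = trunc i64 %a to i32
  //   %b32 = trunc i64 %b to i32
  //   %t   = add i32 %a32, %b32
  // pushing the truncation down to the leaves of the expression.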
  if (Constant *C = dyn_cast<Constant>(Val))
    return Builder->CreateIntCast(C, NewTy, false);

  if (!isa<Instruction>(Val))
    return Builder->CreateTrunc(Val, NewTy);

  Instruction *I = cast<Instruction>(Val);
  unsigned Opc = I->getOpcode();
  switch (Opc) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    BinaryOperator *BO = cast<BinaryOperator>(I);
    Value *LHS = truncExpr(BO->getOperand(0), NewTy);
    Value *RHS = truncExpr(BO->getOperand(1), NewTy);
    return Builder->CreateBinOp(BO->getOpcode(), LHS, RHS);
  }
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt: {
    Value *Opnd = I->getOperand(0);
    if (Opnd->getType() == NewTy)
      return Opnd;
    return Builder->CreateIntCast(Opnd, NewTy, Opc == Instruction::SExt);
  }
  case Instruction::Select: {
    Value *TVal = truncExpr(I->getOperand(1), NewTy);
    Value *FVal = truncExpr(I->getOperand(2), NewTy);
    return Builder->CreateSelect(I->getOperand(0), TVal, FVal);
  }
#if 0
  // TODO: Rewrite truncExpr into an iterative one instead of a recursive one
  // to easily find the loops formed by phi-nodes.
  case Instruction::PHI: {
    PHINode *PN = cast<PHINode>(I);
    PHINode *Res = PHINode::Create(NewTy, PN->getNumIncomingValues());
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
      Value *V = truncExpr(PN->getIncomingValue(i), NewTy);
      Res->addIncoming(V, PN->getIncomingBlock(i));
    }
    return Res;
  }
#endif
  default:
    // We don't know how to truncate its calculation safely; fall back to the
    // regular way.
    break;
  }

  return Builder->CreateTrunc(Val, NewTy);
}

//
// Reassociate a chain of address adds so that the loop-invariant terms appear
// on the RHS of the tree.
//
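// Schematic example (a sketch): with LI loop-invariant and a, b loop-variant,
//   LI + (a + b)  -->  a + (b + LI)
//   (a + LI) + b  -->  a + (b + LI)
// so invariant terms sink to the right of the add tree, where they can be
// grouped together.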
Value *GenIRLowering::rearrangeAdd(Value *val, Loop *loop) const {
  BinaryOperator *binOp = dyn_cast<BinaryOperator>(val);
  if (!binOp || binOp->getOpcode() != Instruction::Add) {
    return val;
  }

  Value *LHS = binOp->getOperand(0);
  Value *RHS = binOp->getOperand(1);

  if (loop->isLoopInvariant(LHS)) {
    Value *newRHS = rearrangeAdd(binOp->getOperand(1), loop);
    if (!loop->isLoopInvariant(newRHS)) {
      BinaryOperator *RHSBinOp = dyn_cast<BinaryOperator>(newRHS);
      if (RHSBinOp && RHSBinOp->getOpcode() == Instruction::Add) {
        // LI + (a + b) --> a + (b + LI)
        Value *LHSofNewRHS = RHSBinOp->getOperand(0);
        Value *RHSofNewRHS = RHSBinOp->getOperand(1);
        return Builder->CreateAdd(LHSofNewRHS, Builder->CreateAdd(RHSofNewRHS, LHS));
      }
    }

    // LI + a --> a + LI
    return Builder->CreateAdd(newRHS, LHS);
  } else {
    Value *newLHS = rearrangeAdd(LHS, loop);
    BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(newLHS);
    if (LHSBinOp && LHSBinOp->getOpcode() == Instruction::Add) {
      Value *LHSofLHS = LHSBinOp->getOperand(0);
      Value *RHSofLHS = LHSBinOp->getOperand(1);
      if (loop->isLoopInvariant(RHSofLHS)) {
        // (a + LI) + b --> a + (b + LI)
        return Builder->CreateAdd(LHSofLHS, rearrangeAdd(Builder->CreateAdd(RHS, RHSofLHS), loop));
      }
    }

    return Builder->CreateAdd(newLHS, rearrangeAdd(RHS, loop));
  }
}

bool GEPLowering::lowerGetElementPtrInst(GetElementPtrInst *GEP) const {
  Value *const PtrOp = GEP->getPointerOperand();
  IGC_ASSERT(nullptr != PtrOp);
  PointerType *const PtrTy = dyn_cast<PointerType>(PtrOp->getType());
  IGC_ASSERT_MESSAGE(nullptr != PtrTy, "Only accept scalar pointer!");

  unsigned pointerSizeInBits = m_ctx->getRegisterPointerSizeInBits(PtrTy->getAddressSpace());
  unsigned pointerMathSizeInBits = pointerSizeInBits;
  bool reducePointerArith = false;
  bool canReduceNegativeOffset = false;

  // Detect if we can do the intermediate pointer arithmetic in 32 bits.
  if (pointerMathSizeInBits == 64 && GEP->isInBounds()) {
    if (!modMD->compOpt.GreaterThan4GBBufferRequired) {
      bool gepProducesPositivePointer = true;

      // Prove that the offset from the base pointer will be positive. If we
      // cannot prove that all parameters to the GEP increase the address in
      // the final calculation, we can't fall back to 32-bit math.
      for (auto U = GEP->idx_begin(), E = GEP->idx_end(); U != E; ++U) {
        Value *Idx = U->get();

        if (Idx != GEP->getPointerOperand()) {
          gepProducesPositivePointer &= valueIsPositive(Idx, DL);
        }
      }

      if (gepProducesPositivePointer) {
        pointerMathSizeInBits = 32;
        reducePointerArith = true;
      }
    } else if (GEP->getAddressSpace() == ADDRESS_SPACE_CONSTANT || !modMD->compOpt.GreaterThan2GBBufferRequired) {
      canReduceNegativeOffset = true;
      pointerMathSizeInBits = m_ctx->platform.hasLargeMaxConstantBufferSize() ? 64 : 32;
      reducePointerArith = true;
    }
  }

  IntegerType *IntPtrTy = IntegerType::get(Builder->getContext(), pointerSizeInBits);
  IntegerType *PtrMathTy = IntegerType::get(Builder->getContext(), pointerMathSizeInBits);

  Value *BasePointer = nullptr;
  // Check if the pointer itself is created from IntToPtr. If it is, and if
  // the int is the same size, we can use the int directly. Otherwise, we
  // need to add a PtrToInt.
  if (IntToPtrInst *I2PI = dyn_cast<IntToPtrInst>(PtrOp)) {
    Value *IntOp = I2PI->getOperand(0);
    if (IntOp->getType() == IntPtrTy) {
      BasePointer = IntOp;
    }
  }
  if (!BasePointer) {
    BasePointer = Builder->CreatePtrToInt(PtrOp, IntPtrTy);
  }
  // This is the value of the pointer, which will ultimately replace
  // the getelementptr.
  Value *PointerValue = nullptr;
  if (reducePointerArith) {
    // In case the pointer arithmetic is done in a narrower type, postpone
    // adding the base until the end.
    PointerValue = ConstantInt::get(PtrMathTy, 0);
  } else {
    PointerValue = BasePointer;
  }

  gep_type_iterator GTI = gep_type_begin(GEP);
  for (auto OI = GEP->op_begin() + 1, E = GEP->op_end(); OI != E; ++OI, ++GTI) {
    Value *Idx = *OI;
    if (StructType *StTy = GTI.getStructTypeOrNull()) {
      unsigned Field = int_cast<unsigned>(cast<ConstantInt>(Idx)->getZExtValue());
      if (Field) {
        uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field);
        Value *OffsetValue = Builder->getInt(APInt(pointerMathSizeInBits, Offset));
        PointerValue = Builder->CreateAdd(PointerValue, OffsetValue);
      }
    } else {
      Type *Ty = GTI.getIndexedType();

      if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
        if (!CI->isZero()) {
          uint64_t Offset = DL->getTypeAllocSize(Ty) * CI->getSExtValue();
          Value *OffsetValue = Builder->getInt(APInt(pointerMathSizeInBits, Offset));
          PointerValue = Builder->CreateAdd(PointerValue, OffsetValue);
        }
      } else {
        Value *NewIdx = getSExtOrTrunc(Idx, PtrMathTy);
        APInt ElementSize = APInt(pointerMathSizeInBits, DL->getTypeAllocSize(Ty));

        ConstantInt *COffset = nullptr;
        if (IGC_IS_FLAG_ENABLED(EnableSimplifyGEP) && NewIdx->hasOneUse()) {
          // When EnableSimplifyGEP is on, the GEP's index can be of the form
          // V + C, where C is a constant. If so, we keep pushing C up to the
          // top so CSE can do a better job.
          //
          // Replace
          //   %nswAdd = add nsw i32 %49, 195
          //   %NewIdx = sext i32 %nswAdd to i64
          //   %PointerValue = %NewIdx * 4 + %Base
          // with
          //   %NewIdx = sext i32 %49 to i64
          //   %PointerValue = (%NewIdx * 4 + %Base) + (4 * 195)
          // for later CSE.
          //

          bool performSExt = false;
          if (SExtInst *I = dyn_cast<SExtInst>(NewIdx)) {
            if (OverflowingBinaryOperator *nswAdd = dyn_cast<OverflowingBinaryOperator>(I->getOperand(0))) {
              if ((nswAdd->getOpcode() == Instruction::Add) && nswAdd->hasNoSignedWrap() &&
                  isa<ConstantInt>(nswAdd->getOperand(1))) {
                performSExt = true;
                NewIdx = nswAdd;
              }
            }
          }
          if (Instruction *Inst = dyn_cast<Instruction>(NewIdx)) {
            if (Inst->getOpcode() == Instruction::Add) {
              COffset = dyn_cast<ConstantInt>(Inst->getOperand(1));
              if (COffset) {
                NewIdx = Inst->getOperand(0);
                int64_t cval = COffset->getSExtValue() * ElementSize.getZExtValue();
                COffset = ConstantInt::get(PtrMathTy, cval);
              }
            }
          }
          if (performSExt) {
            NewIdx = Builder->CreateSExt(NewIdx, PtrMathTy);
          }
        }

        if (BinaryOperator *binaryOp = dyn_cast<BinaryOperator>(NewIdx)) {
          // Detect the pattern
          //   GEP base, a + b
          // where base and a are both loop-invariant (but not b), so we can
          // rearrange the lowered code into
          //   (base + (a << shftAmt)) + (b << shftAmt)
          // For now we only look at one level.
          Loop *loop = m_LI ? m_LI->getLoopFor(binaryOp->getParent()) : nullptr;
          if (loop != nullptr && loop->isLoopInvariant(PtrOp) && binaryOp->getOpcode() == Instruction::Add) {

            Value *LHS = binaryOp->getOperand(0);
            Value *RHS = binaryOp->getOperand(1);
            bool isLHSLI = loop->isLoopInvariant(LHS);
            bool isRHSLI = loop->isLoopInvariant(RHS);

            auto reassociate = [&](Value *invariant, Value *other) {
              Value *invariantVal = nullptr;
              if (ElementSize == 1) {
                invariantVal = invariant;
              } else if (ElementSize.isPowerOf2()) {
                invariantVal = Builder->CreateShl(invariant, APInt(pointerMathSizeInBits, ElementSize.logBase2()));
              } else {
                invariantVal = Builder->CreateMul(invariant, Builder->getInt(ElementSize));
              }
              PointerValue = Builder->CreateAdd(PointerValue, invariantVal);
              NewIdx = other;
            };
            if (isLHSLI && !isRHSLI) {
              reassociate(LHS, RHS);
            } else if (!isLHSLI && isRHSLI) {
              reassociate(RHS, LHS);
            }
          }
        }
        if (ElementSize == 1) {
          // DO NOTHING.
        } else if (ElementSize.isPowerOf2()) {
          APInt ShiftAmount = APInt(pointerMathSizeInBits, ElementSize.logBase2());
          NewIdx = Builder->CreateShl(NewIdx, ShiftAmount);
        } else {
          NewIdx = Builder->CreateMul(NewIdx, Builder->getInt(ElementSize));
        }

        Loop *loop = m_LI ? m_LI->getLoopFor(GEP->getParent()) : nullptr;

        if (loop && loop->isLoopInvariant(PtrOp)) {
          // Add COffset to the pointer base first so LICM can kick in later.
          // Note that PointerValue is guaranteed to be LI since both PtrOp and
          // whatever we've added to it during reassociation must be LI.
          if (COffset) {
            PointerValue = Builder->CreateAdd(PointerValue, COffset);
          }
          PointerValue = Builder->CreateAdd(PointerValue, NewIdx);
        } else {
          if (auto NewIdxVT = dyn_cast<IGCLLVM::FixedVectorType>(NewIdx->getType())) {
            Value *result =
                llvm::UndefValue::get(FixedVectorType::get(PtrMathTy, (unsigned)NewIdxVT->getNumElements()));
            for (uint32_t j = 0; j < (uint32_t)NewIdxVT->getNumElements(); j++) {
              result = Builder->CreateInsertElement(result, PointerValue, Builder->getInt32(j));
            }
            PointerValue = result;
          }
          PointerValue = Builder->CreateAdd(PointerValue, NewIdx);
          if (COffset) {
            PointerValue = Builder->CreateAdd(PointerValue, COffset);
          }
        }
      }
    }
  }

  if (reducePointerArith) {
    IGC_ASSERT_MESSAGE(GEP->isInBounds(), "we can only do a zext if the GEP is inbounds");
    if (!canReduceNegativeOffset) {
      PointerValue = Builder->CreateZExt(PointerValue, BasePointer->getType());
    } else {
      PointerValue = Builder->CreateSExt(PointerValue, BasePointer->getType());
    }
    PointerValue = Builder->CreateAdd(BasePointer, PointerValue);
  }
  PointerValue = Builder->CreateIntToPtr(PointerValue, GEP->getType());
  GEP->replaceAllUsesWith(PointerValue);
  GEP->eraseFromParent();

  return true;
}

bool GenIRLowering::constantFoldFMaxFMin(CallInst *GII, BasicBlock::iterator &BBI) const {
  // Constant fold fmax/fmin only.
  EOPCODE GIID = GetOpCode(GII);
  if (GIID != llvm_max && GIID != llvm_min)
    return false;

  // Skip fmax/fmin with a non-constant operand.
  ConstantFP *CFP0 = dyn_cast<ConstantFP>(GII->getOperand(0));
  ConstantFP *CFP1 = dyn_cast<ConstantFP>(GII->getOperand(1));
  if (!CFP0 || !CFP1)
    return false;

  // Fold fmax/fmin following the OpenCL spec.
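  // (Per the OpenCL spec, fmin/fmax return the non-NaN operand when exactly
  // one operand is NaN, which matches the minnum/maxnum semantics used below.)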
  const APFloat &A = CFP0->getValueAPF();
  const APFloat &B = CFP1->getValueAPF();
  APFloat Result = (GIID == llvm_min) ? minnum(A, B) : maxnum(A, B);
  Constant *C = ConstantFP::get(GII->getContext(), Result);

  GII->replaceAllUsesWith(C);
  GII->eraseFromParent();

  return true;
}

bool GenIRLowering::combineFMaxFMin(CallInst *GII, BasicBlock::iterator &BBI) const {
  using namespace llvm::PatternMatch; // Scoped namespace using.

  // Fold fmax/fmin with all-constant operands.
  if (constantFoldFMaxFMin(GII, BBI))
    return true;

  ConstantFP *CMin, *CMax;
  Value *X = nullptr;

  if (!match(GII, m_ClampWithConstants(m_Value(X), CMin, CMax)))
    return false;

  // Optimize a chained clamp, i.e. combine
  // (clamp (clamp x, MIN, MAX), MIN, MAX) into
  // (clamp x, MIN, MAX)
  ConstantFP *CMin2, *CMax2;
  Value *X2 = nullptr;
  if (match(X, m_ClampWithConstants(m_Value(X2), CMin2, CMax2)) && CMin == CMin2 && CMax == CMax2) {
    GII->replaceAllUsesWith(X);
    GII->eraseFromParent();

    return true;
  }

  // TODO: The following case should be combined as well:
  //  (clamp (clamp x, MIN, MAX), MIN2, MAX2) into
  //  (clamp x, MIN3, MAX3), where
  // MIN3 = max(MIN, MIN2) and MAX3 = min(MAX, MAX2). The above case is just a
  // special case of this general form.

  if (!CMin->isZero() || !CMax->isExactlyValue(1.f))
    return false;

  // TODO: optimize chained fsat, i.e. combine
  // (fsat (fsat x)) into (fsat x)
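  // At this point the clamp bounds are exactly [0.0, 1.0], so the whole clamp
  // collapses to the hardware saturate, schematically:
  //   clamp(x, 0.0f, 1.0f)  -->  GenISA_fsat(x)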

  auto M = GII->getParent()->getParent()->getParent();
  GenISAIntrinsic::ID IID = GenISAIntrinsic::GenISA_fsat;
  Function *IFunc = GenISAIntrinsic::getDeclaration(M, IID, GII->getType());

  Instruction *I = Builder->CreateCall(IFunc, X);
  GII->replaceAllUsesWith(I);
  GII->eraseFromParent();

  BBI = llvm::BasicBlock::iterator(I);
  ++BBI;

  return true;
}

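// Schematic example (a sketch): with NaNs ignorable,
//   %cmp = fcmp ogt float %a, %b
//   %sel = select i1 %cmp, float %a, float %b
// is combined into
//   %sel = call float @llvm.maxnum.f32(float %a, float %b)
// with the analogous rewrite to @llvm.minnum.f32 for the min pattern.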
bool GenIRLowering::combineSelectInst(SelectInst *Sel, BasicBlock::iterator &BBI) const {
  using namespace llvm::PatternMatch; // Scoped namespace using.

  Value *LHS = nullptr;
  Value *RHS = nullptr;
  bool IsMax = false;
  unsigned Opcode = Instruction::UserOp1;

  if (Sel->getType()->isIntegerTy()) {
    IsMax = match(Sel, m_OrdFMaxCast(Opcode, m_Value(LHS), m_Value(RHS)));
    if (!IsMax && !match(Sel, m_OrdFMinCast(Opcode, m_Value(LHS), m_Value(RHS))))
      return false;

    switch (Opcode) {
    default:
      return false;
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::BitCast:
      break;
    }
  } else {
    IsMax = match(Sel, m_OrdFMax(m_Value(LHS), m_Value(RHS)));
    if (!IsMax && !match(Sel, m_OrdFMin(m_Value(LHS), m_Value(RHS))))
      return false;
  }

  IGCLLVM::Intrinsic IID = IsMax ? Intrinsic::maxnum : Intrinsic::minnum;
  Function *IFunc = Intrinsic::getDeclaration(Sel->getParent()->getParent()->getParent(), IID, LHS->getType());

  Instruction *I = Builder->CreateCall2(IFunc, LHS, RHS);
  BBI = BasicBlock::iterator(I); // Don't move to the next one. We still need to combine for saturation.
  if (Opcode != Instruction::UserOp1) {
    I = cast<Instruction>(Builder->CreateCast(static_cast<Instruction::CastOps>(Opcode), I, Sel->getType()));
  }
  Sel->replaceAllUsesWith(I);
  Sel->eraseFromParent();

  return false;
}

////////////////////////////////////////////////////////////////////////////////
// Detect complex patterns that pack 2 16-bit or 4 8-bit integers into a 32-bit
// value. Generate an equivalent sequence of instructions that is later matched
// by CodeGenPatternMatch::MatchPack4i8().
// Pattern example for <4 x i8> packing:
//   %x1 = and i32 %x, 127
//   %x2 = lshr i32 %x, 24
//   %x3 = and i32 %x2, 128
//   %x4 = or i32 %x3, %x1
//   %y1 = and i32 %y, 127
//   %y2 = lshr i32 %y, 24
//   %y3 = and i32 %y2, 128
//   %y4 = or i32 %y3, %y1
//   %y5 = shl nuw nsw i32 %y4, 8
//   %xy = or i32 %x4, %y5
//   %z1 = and i32 %z, 127
//   %z2 = lshr i32 %z, 24
//   %z3 = and i32 %z2, 128
//   %z4 = or i32 %z3, %z1
//   %z5 = shl nuw nsw i32 %z4, 16
//   %xyz = or i32 %xy, %z5
//   %w1 = shl nsw i32 %w, 24
//   %w2 = and i32 %w1, 2130706432
//   %w3 = and i32 %w, -2147483648
//   %w4 = or i32 %w2, %w3
//   %xyzw = or i32 %xyz, %w4
// and generate:
//   %0 = trunc i32 %x to i8
//   %1 = insertelement <4 x i8> poison, i8 %0, i32 0
//   %2 = trunc i32 %y to i8
//   %3 = insertelement <4 x i8> %1, i8 %2, i32 1
//   %4 = trunc i32 %z to i8
//   %5 = insertelement <4 x i8> %3, i8 %4, i32 2
//   %6 = trunc i32 %w to i8
//   %7 = insertelement <4 x i8> %5, i8 %6, i32 3
//   %8 = bitcast <4 x i8> %7 to i32
bool GenIRLowering::combinePack4i8Or2i16(Instruction *inst, uint64_t numBits) const {
  using namespace llvm::PatternMatch;

  const DataLayout &DL = inst->getModule()->getDataLayout();
  // Vector of 4 or 2 values that will be packed into a single 32-bit value.
  // The std::pair contains the 32-bit value that contains the element
  // to pack and the LSB where the packed value starts in the 32-bit value.
  SmallVector<std::pair<Value *, uint64_t>, 4> toPack;
  IGC_ASSERT(numBits == 8 || numBits == 16);
  uint64_t packedVecSize = 32 / numBits;
  toPack.resize(packedVecSize);
  uint64_t cSignMask = QWBIT(numBits - 1);
  uint64_t cMagnMask = BITMASK(numBits - 1);
  // The std::pair contains the 32-bit value that contains the element
  // to pack and the left-shift bits that indicate the element position
  // in the packed vector.
  SmallVector<std::pair<Value *, uint64_t>, 4> args;
  args.push_back({isa<BitCastInst>(inst) ? inst->getOperand(0) : inst, 0});
  // In the first step, traverse the chain of `or` and `shl` instructions
  // and find all elements of the packed vector.
  while (!args.empty()) {
    auto [v, prevShlBits] = args.pop_back_val();
    Value *lOp = nullptr;
    Value *rOp = nullptr;

    // Detect a left shift by a multiple of `numBits`. The `shl` operation sets
    // the `index` argument in the corresponding InsertElement instruction in
    // the final packing sequence. This operation can also be viewed as
    // repacking of an already packed vector into another packed vector.
    uint64_t shlBits = 0;
    if (match(v, m_Shl(m_Value(lOp), m_ConstantInt(shlBits))) && (shlBits % numBits) == 0) {
      args.push_back({lOp, shlBits + prevShlBits});
      continue;
    }
    // Detect values that fit into `numBits` bits - a single element of
    // the packed vector.
    KnownBits kb = computeKnownBits(v, DL);
    uint32_t nonZeroBits = ~(static_cast<uint32_t>(kb.Zero.getZExtValue()));
    uint32_t lsb = findFirstSet(nonZeroBits);
    uint32_t msb = findLastSet(nonZeroBits);
    if (msb != lsb && (msb / numBits) == (lsb / numBits)) {
      uint32_t idx = (prevShlBits / numBits) + (lsb / numBits);
      if (idx < packedVecSize && toPack[idx].first == nullptr) {
        toPack[idx] = std::make_pair(v, alignDown(lsb, numBits));
        continue;
      }
    }
    // Detect packing of two disjoint values. This `or` operation corresponds
    // to an InsertElement instruction in the final packing sequence.
    if (match(v, m_Or(m_Value(lOp), m_Value(rOp)))) {
      KnownBits kbL = computeKnownBits(lOp, DL);
      KnownBits kbR = computeKnownBits(rOp, DL);
      uint32_t nonZeroBitsL = ~(static_cast<uint32_t>(kbL.Zero.getZExtValue()));
      uint32_t nonZeroBitsR = ~(static_cast<uint32_t>(kbR.Zero.getZExtValue()));
      if ((nonZeroBitsL & nonZeroBitsR) == 0) {
        args.push_back({lOp, prevShlBits});
        args.push_back({rOp, prevShlBits});
      }
      continue;
    }
    if (std::all_of(toPack.begin(), toPack.end(), [](const auto &c) { return c.first != nullptr; })) {
      break;
    }
    // Unsupported pattern.
    return false;
  }
  if (std::any_of(toPack.begin(), toPack.end(), [](const auto &c) { return c.first == nullptr; })) {
    return false;
  }
  // In the second step, match the pattern that packs sign and magnitude parts
  // and simple masking with an `and` instruction.
  for (uint32_t i = 0; i < packedVecSize; ++i) {
    auto [v, lsb] = toPack[i];
    Value *lOp = nullptr;
    Value *rOp = nullptr;
    uint64_t lMask = 0;
    uint64_t rMask = 0;
    // Match patterns that pack the sign and magnitude parts.
    if (match(v, m_Or(m_And(m_Value(lOp), m_ConstantInt(lMask)), m_And(m_Value(rOp), m_ConstantInt(rMask)))) &&
        (countPopulation(rMask) == 1 || countPopulation(lMask) == 1)) {
      Value *signOp = countPopulation(rMask) == 1 ? rOp : lOp;
      Value *magnOp = countPopulation(rMask) == 1 ? lOp : rOp;
      uint64_t signMask = countPopulation(rMask) == 1 ? rMask : lMask;
      uint64_t magnMask = countPopulation(rMask) == 1 ? lMask : rMask;
      uint64_t shlBits = 0;
      uint64_t shrBits = 0;
      // %b = shl nsw i32 %a, 24
      // %c = and i32 %b, 2130706432
      // %sign = and i32 %a, -2147483648
      // %e = or i32 %sign, %c
      if (match(magnOp, m_Shl(m_Value(v), m_ConstantInt(shlBits))) && v == signOp && (shlBits % numBits) == 0 &&
          shlBits == (i * numBits) && (cSignMask << shlBits) == signMask && (cMagnMask << shlBits) == magnMask &&
          lsb == shlBits) {
        toPack[i] = std::make_pair(v, 0);
        continue;
      }
      // %b = and i32 %a, 127
      // %c = lshr i32 %a, 24
      // %sign = and i32 %c, 128
      // %e = or i32 %sign, %b
      if (match(signOp, m_LShr(m_Value(v), m_ConstantInt(shrBits))) && v == magnOp && shrBits == (32 - numBits) &&
          cSignMask == signMask && cMagnMask == magnMask && lsb == 0) {
        toPack[i] = std::make_pair(v, 0);
        continue;
      }
    }
    uint64_t andMask = 0;
    if (match(v, m_And(m_Value(lOp), m_ConstantInt(andMask))) && andMask == BITMASK(numBits) && lsb == 0) {
      toPack[i] = std::make_pair(lOp, 0);
      continue;
    }
    if (lsb > 0) {
      return false;
    }
  }

  // Create the packing sequence that is matched in the PatternMatch later.
  Type *elemTy = Builder->getIntNTy(numBits);
  Value *packed = PoisonValue::get(IGCLLVM::FixedVectorType::get(elemTy, packedVecSize));
  for (uint32_t i = 0; i < packedVecSize; ++i) {
    auto [elem, lsb] = toPack[i];
    IGC_ASSERT(lsb == 0);
    elem = Builder->CreateTrunc(elem, elemTy);
    packed = Builder->CreateInsertElement(packed, elem, Builder->getInt32(i));
  }
  packed = Builder->CreateBitCast(packed, inst->getType());
  inst->replaceAllUsesWith(packed);
  inst->eraseFromParent();
  return true;
}

FunctionPass *IGC::createGenIRLowerPass() { return new GenIRLowering(); }

// Register the pass with igc-opt
#define PASS_FLAG "igc-gen-ir-lowering"
#define PASS_DESCRIPTION "Lowers GEP into primitive ones"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(GenIRLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_END(GenIRLowering, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

FunctionPass *IGC::createGEPLoweringPass() { return new GEPLowering(); }

// Register the pass with igc-opt
#define PASS_FLAG2 "igc-gep-lowering"
#define PASS_DESCRIPTION2 "Lowers GEP into primitive ones"
#define PASS_CFG_ONLY2 false
#define PASS_ANALYSIS2 false
IGC_INITIALIZE_PASS_BEGIN(GEPLowering, PASS_FLAG2, PASS_DESCRIPTION2, PASS_CFG_ONLY2, PASS_ANALYSIS2)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
IGC_INITIALIZE_PASS_END(GEPLowering, PASS_FLAG2, PASS_DESCRIPTION2, PASS_CFG_ONLY2, PASS_ANALYSIS2)