intel-graphics-compiler/IGC/Compiler/CISACodeGen/PatternMatchPass.cpp

/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2024 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "Compiler/CISACodeGen/PatternMatchPass.hpp"
#include "Compiler/CISACodeGen/DeSSA.hpp"
#include "Compiler/CISACodeGen/EmitVISAPass.hpp"
#include "common/igc_regkeys.hpp"
#include "common/LLVMWarningsPush.hpp"
#include <llvm/Analysis/ValueTracking.h>
#include <llvm/IR/InlineAsm.h>
#include <llvm/IR/Dominators.h>
#include <llvm/IR/Constants.h>
#include <llvm/IR/Instruction.h>
#include <llvm/IR/IntrinsicInst.h>
#include <llvm/IR/PatternMatch.h>
#include "llvm/Support/KnownBits.h"
#include <llvmWrapper/IR/Instructions.h>
#include "llvmWrapper/Support/Alignment.h"
#include "common/LLVMWarningsPop.hpp"
#include "Compiler/IGCPassSupport.h"
#include "Compiler/InitializePasses.h"
#include "Compiler/DebugInfo/ScalarVISAModule.h"
#include "GenISAIntrinsics/GenIntrinsicInst.h"
#include "Probe/Assertion.h"

using namespace llvm;
using namespace IGC;
using namespace IGC::IGCMD;

char CodeGenPatternMatch::ID = 0;
#define PASS_FLAG "CodeGenPatternMatch"
#define PASS_DESCRIPTION "Does pattern matching"
#define PASS_CFG_ONLY true
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(CodeGenPatternMatch, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(LiveVarsAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(PositionDepAnalysis)
IGC_INITIALIZE_PASS_END(CodeGenPatternMatch, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

namespace IGC {
CodeGenPatternMatch::CodeGenPatternMatch()
    : FunctionPass(ID), m_rootIsSubspanUse(false), m_blocks(nullptr), m_numBlocks(0), m_root(nullptr),
      m_currentPattern(nullptr), m_Platform(), m_AllowContractions(true), m_NeedVMask(false),
      m_samplertoRenderTargetEnable(false), m_ctx(nullptr), DT(nullptr), PDT(nullptr), LI(nullptr), m_DL(0),
      m_WI(nullptr), m_LivenessInfo(nullptr), m_PosDep(nullptr) {
  initializeCodeGenPatternMatchPass(*PassRegistry::getPassRegistry());
}

CodeGenPatternMatch::~CodeGenPatternMatch() { delete[] m_blocks; }

void CodeGenPatternMatch::CodeGenNode(llvm::DomTreeNode *node) {
  struct NodeRange {
    NodeRange(llvm::DomTreeNode *node, llvm::DomTreeNode::iterator current, llvm::DomTreeNode::iterator end)
        : m_Node(node), m_Current(current), m_End(end) {}
    llvm::DomTreeNode *m_Node;
    llvm::DomTreeNode::iterator m_Current;
    llvm::DomTreeNode::iterator m_End;
  };
  std::list<NodeRange> m_NodeContainer;
  m_NodeContainer.push_front(NodeRange(node, node->begin(), node->end()));

  // Process blocks by processing the dominance tree depth first
  while (!m_NodeContainer.empty()) {
    NodeRange &currNodeRange = m_NodeContainer.front();
    if (currNodeRange.m_Current == currNodeRange.m_End) {
      llvm::BasicBlock *bb = currNodeRange.m_Node->getBlock();
      CodeGenBlock(bb);
      m_NodeContainer.pop_front();
      continue;
    }
    llvm::DomTreeNode *child = *currNodeRange.m_Current;
    m_NodeContainer.push_front(NodeRange(child, child->begin(), child->end()));
    currNodeRange.m_Current++;
  }
}

bool CodeGenPatternMatch::runOnFunction(llvm::Function &F) {
  m_blockMap.clear();
  ConstantPlacement.clear();
  PairOutputMap.clear();
  UniformBools.clear();

  delete[] m_blocks;
  m_blocks = nullptr;

  m_ctx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();

  MetaDataUtils *pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
  ModuleMetaData *modMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
  if (pMdUtils->findFunctionsInfoItem(&F) == pMdUtils->end_FunctionsInfo()) {
    return false;
  }

  m_AllowContractions = true;
  if (m_ctx->m_DriverInfo.NeedCheckContractionAllowed()) {
    m_AllowContractions = modMD->compOpt.FastRelaxedMath || modMD->compOpt.MadEnable;
  }
  m_Platform = m_ctx->platform;

  DT = &getAnalysis<llvm::DominatorTreeWrapperPass>().getDomTree();
  PDT = &getAnalysis<llvm::PostDominatorTreeWrapperPass>().getPostDomTree();
  LI = &getAnalysis<llvm::LoopInfoWrapperPass>().getLoopInfo();
  m_DL = &F.getParent()->getDataLayout();
  m_WI = &getAnalysis<WIAnalysis>();
  m_PosDep = &getAnalysis<PositionDepAnalysis>();
  // pattern match will update liveness held by LiveVar, which needs
  // WIAnalysis result for uniform variable
  m_LivenessInfo = &getAnalysis<LiveVarsAnalysis>().getLiveVars();
  CreateBasicBlocks(&F);
  CodeGenNode(DT->getRootNode());
  return false;
}

inline bool HasSideEffect(llvm::Instruction &inst) {
  if (inst.mayWriteToMemory() || inst.isTerminator()) {
    return true;
  }
  return false;
}

inline bool HasPhiUse(llvm::Value &inst) {
  for (auto UI = inst.user_begin(), E = inst.user_end(); UI != E; ++UI) {
    llvm::User *U = *UI;
    if (llvm::isa<llvm::PHINode>(U)) {
      return true;
    }
  }
  return false;
}

// If "v" is a canonicalize instruction, return its source argument. Return
// "v" otherwise.
inline Value *SkipCanonicalize(Value *v) {
  IntrinsicInst *intr = dyn_cast<IntrinsicInst>(v);
  if (intr && intr->getIntrinsicID() == Intrinsic::canonicalize) {
    return intr->getOperand(0);
  }
  return v;
}

bool CodeGenPatternMatch::IsDbgInst(llvm::Instruction &inst) const {
  if (llvm::isa<llvm::DbgInfoIntrinsic>(&inst)) {
    // FIXME: We probably don't need that.
    return true;
  }
  // perThreadOffset is special variable we should be alive for O0 runs.
  // This is intended for debug, but we do it for non-debug too to
  // avoid altering generated code by adding -g.
  if (m_ctx->getModuleMetaData()->compOpt.OptDisable && inst.getMetadata("perThreadOffset")) {
    return true;
  }
  return false;
}

bool CodeGenPatternMatch::IsConstOrSimdConstExpr(Value *C) {
  if (isa<ConstantInt>(C)) {
    return true;
  }
  if (Instruction *inst = dyn_cast<Instruction>(C)) {
    return SIMDConstExpr(inst);
  }
  return false;
}

// Checks if denorms are flushed on instruction's output.
// When denorm mode in CR0 is set to flush to zero, denorms are flushed on
// input and output of any floating-point mathematical operation with the
// following exceptions:
// - raw mov retains denorms
// - conversion instructions retain denorms (includes mix-mode instructions)
// - extended math instructions retain half float denorms
bool CodeGenPatternMatch::FlushesDenormsOnOutput(Instruction &I) {
  bool flushesDenorms = false;
  if ((m_ctx->getModuleMetaData()->compOpt.FloatDenormMode16 == IGC::FLOAT_DENORM_FLUSH_TO_ZERO &&
       I.getType()->isHalfTy()) ||
      (m_ctx->getModuleMetaData()->compOpt.FloatDenormMode32 == IGC::FLOAT_DENORM_FLUSH_TO_ZERO &&
       I.getType()->isFloatTy()) ||
      (m_ctx->getModuleMetaData()->compOpt.FloatDenormMode64 == IGC::FLOAT_DENORM_FLUSH_TO_ZERO &&
       I.getType()->isDoubleTy())) {
    switch (GetOpCode(&I)) {
    case llvm_select:
      flushesDenorms = true;
      for (uint i = 1; i < I.getNumOperands(); ++i) {
        Instruction *inst = dyn_cast<Instruction>(I.getOperand(i));
        if (inst && !FlushesDenormsOnOutput(*inst)) {
          flushesDenorms = false;
          break;
        }
      }
      break;
    case llvm_max:
    case llvm_min:
      flushesDenorms = true;
      for (uint i = 0; i < I.getNumOperands(); ++i) {
        Instruction *inst = dyn_cast<Instruction>(I.getOperand(i));
        if (inst && !FlushesDenormsOnOutput(*inst)) {
          flushesDenorms = false;
          break;
        }
      }
      break;
    case llvm_fsat:
    case llvm_fabs:
      if (isa<Instruction>(I.getOperand(0))) {
        flushesDenorms = FlushesDenormsOnOutput(*(cast<Instruction>(I.getOperand(0))));
      }
      break;
    case llvm_canonicalize:
    case llvm_fadd:
    case llvm_fadd_rtz:
    case llvm_fsub:
    case llvm_fmul:
    case llvm_fmul_rtz:
    case llvm_fma:
    case llvm_roundne:
    case llvm_round_z:
    case llvm_floor:
    case llvm_ceil:
    case llvm_frc:
      flushesDenorms = true;
      break;
    case llvm_fdiv:
    case llvm_frem:
    case llvm_cos:
    case llvm_sin:
    case llvm_log:
    case llvm_exp:
    case llvm_pow:
    case llvm_sqrt:
    case llvm_rsq:
      // extended math retain half denorms
      flushesDenorms = !I.getType()->isHalfTy();
      break;
    default:
      break;
    }
  }
  return flushesDenorms;
}
bool CodeGenPatternMatch::FlushesDenormsOnInput(Instruction &I) {
  bool flushesDenorms = false;
  if ((m_ctx->getModuleMetaData()->compOpt.FloatDenormMode16 == IGC::FLOAT_DENORM_FLUSH_TO_ZERO &&
       I.getType()->isHalfTy()) ||
      (m_ctx->getModuleMetaData()->compOpt.FloatDenormMode32 == IGC::FLOAT_DENORM_FLUSH_TO_ZERO &&
       I.getType()->isFloatTy()) ||
      (m_ctx->getModuleMetaData()->compOpt.FloatDenormMode64 == IGC::FLOAT_DENORM_FLUSH_TO_ZERO &&
       I.getType()->isDoubleTy())) {
    switch (GetOpCode(&I)) {
    case llvm_canonicalize:
    case llvm_fadd:
    case llvm_fadd_rtz:
    case llvm_fsub:
    case llvm_fmul:
    case llvm_fmul_rtz:
    case llvm_fma:
    case llvm_roundne:
    case llvm_round_z:
    case llvm_floor:
    case llvm_ceil:
    case llvm_frc:
      flushesDenorms = true;
      break;
    case llvm_fdiv:
    case llvm_frem:
    case llvm_cos:
    case llvm_sin:
    case llvm_log:
    case llvm_exp:
    case llvm_pow:
    case llvm_sqrt:
    case llvm_rsq:
      // extended math retain half denorms
      flushesDenorms = !I.getType()->isHalfTy();
      break;
    default:
      break;
    }
  }
  return flushesDenorms;
}

bool CodeGenPatternMatch::ContractionAllowed(llvm::Instruction &I) const {
  if (m_AllowContractions || (m_ctx->m_DriverInfo.RespectPerInstructionContractFlag() && I.hasAllowContract())) {
    return true;
  }
  return false;
}

// this function need to be in sync with CShader::EvaluateSIMDConstExpr on what can be supported
bool CodeGenPatternMatch::SIMDConstExpr(Instruction *C) {
  auto it = m_IsSIMDConstExpr.find(C);
  if (it != m_IsSIMDConstExpr.end()) {
    return it->second;
  }
  bool isConstExpr = false;
  if (BinaryOperator *op = dyn_cast<BinaryOperator>(C)) {
    switch (op->getOpcode()) {
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::Mul:
    case Instruction::Shl:
    case Instruction::Or:
    case Instruction::And:
      isConstExpr = IsConstOrSimdConstExpr(op->getOperand(0)) && IsConstOrSimdConstExpr(op->getOperand(1));
      break;
    default:
      break;
    }
  } else if (llvm::GenIntrinsicInst *genInst = dyn_cast<GenIntrinsicInst>(C)) {
    if (genInst->getIntrinsicID() == GenISAIntrinsic::GenISA_simdSize) {
      isConstExpr = true;
    }
  }
  m_IsSIMDConstExpr.insert(std::make_pair(C, isConstExpr));
  return isConstExpr;
}

bool CodeGenPatternMatch::NeedInstruction(llvm::Instruction &I) {
  if (SIMDConstExpr(&I)) {
    return false;
  }
  if (HasPhiUse(I) || HasSideEffect(I) || IsDbgInst(I) || (m_usedInstructions.find(&I) != m_usedInstructions.end())) {
    return true;
  }
  return false;
}

void CodeGenPatternMatch::AddToConstantPool(llvm::BasicBlock *UseBlock, llvm::Value *Val) {
  Constant *C = dyn_cast_or_null<Constant>(Val);
  if (!C)
    return;

  BasicBlock *LCA = UseBlock;
  // Determine where we put the constant initialization.
  // Choose loop pre-header as LICM.
  // XXX: Further investigation/tuning is needed to see whether
  // we need to hoist constant initialization out of the
  // top-level loop within a nested loop. So far, we only hoist
  // one level up.
  if (Loop *L = LI->getLoopFor(LCA)) {
    if (BasicBlock *Preheader = L->getLoopPreheader())
      LCA = Preheader;
  }
  // Find the common dominator as CSE.
  if (BasicBlock *BB = ConstantPlacement.lookup(C))
    LCA = DT->findNearestCommonDominator(LCA, BB);
  IGC_ASSERT_MESSAGE(LCA, "LCA always exists for reachable BBs within a function!");
  ConstantPlacement[C] = LCA;
}

bool CodeGenPatternMatch::supportsLSCImmediateGlobalBaseOffset() {
  bool res = IGC_GET_FLAG_VALUE(LscImmOffsMatch) > 1 || m_Platform.matchImmOffsetsLSC();
  return res;
}

// Check bool values that can be emitted as a single element predicate.
void CodeGenPatternMatch::gatherUniformBools(Value *Val) {
  if (!isUniform(Val) || Val->getType()->getScalarType()->isIntegerTy(1))
    return;

  // Only starts from select instruction for now.
  // It is more complicate for uses in terminators.
  if (SelectInst *SI = dyn_cast<SelectInst>(Val)) {
    Value *Cond = SI->getCondition();
    if (Cond->getType()->isVectorTy() || !Cond->hasOneUse())
      return;

    // All users of bool values.
    DenseSet<Value *> Vals;
    Vals.insert(SI);

    // Grow the list of bool values to be checked.
    std::vector<Value *> ValList;
    ValList.push_back(Cond);

    bool IsLegal = true;
    while (!ValList.empty()) {
      Value *V = ValList.back();
      ValList.pop_back();
      IGC_ASSERT(nullptr != V);
      IGC_ASSERT(nullptr != V->getType());
      IGC_ASSERT(isUniform(V));
      IGC_ASSERT(V->getType()->isIntegerTy(1));

      // Check uses.
      for (auto UI = V->user_begin(), UE = V->user_end(); UI != UE; ++UI) {
        Value *U = *UI;
        if (!Vals.count(U))
          goto FAIL;
      }

      // Check defs.
      Vals.insert(V);
      if (auto CI = dyn_cast<CmpInst>(V)) {
        IGC_ASSERT(isUniform(CI->getOperand(0)));
        IGC_ASSERT(isUniform(CI->getOperand(1)));
        if (CI->getOperand(0)->getType()->getScalarSizeInBits() == 1)
          goto FAIL;
        continue;
      } else if (auto BI = dyn_cast<BinaryOperator>(V)) {
        IGC_ASSERT(isUniform(BI->getOperand(0)));
        IGC_ASSERT(isUniform(BI->getOperand(1)));
        if (isa<Instruction>(BI->getOperand(0)))
          ValList.push_back(BI->getOperand(0));
        if (isa<Instruction>(BI->getOperand(1)))
          ValList.push_back(BI->getOperand(1));
        continue;
      }

    FAIL:
      IsLegal = false;
      break;
    }

    // Populate all boolean values if legal.
    if (IsLegal) {
      for (auto V : Vals) {
        if (V->getType()->isIntegerTy(1))
          UniformBools.insert(V);
      }
    }
  }
}

void CodeGenPatternMatch::CodeGenBlock(llvm::BasicBlock *bb) {
  llvm::BasicBlock::InstListType::reverse_iterator I, E;
  auto it = m_blockMap.find(bb);
  IGC_ASSERT(it != m_blockMap.end());
  SBasicBlock *block = it->second;

  // loop through instructions bottom up
  for (I = bb->rbegin(), E = bb->rend(); I != E; ++I) {
    llvm::Instruction &inst = (*I);

    if (NeedInstruction(inst)) {
      SetPatternRoot(inst);
      Pattern *pattern = Match(inst);
      if (pattern) {
        block->m_dags.push_back(SDAG(pattern, m_root));
        gatherUniformBools(m_root);
      }
    }
  }
}

void CodeGenPatternMatch::CreateBasicBlocks(llvm::Function *pLLVMFunc) {
  m_numBlocks = pLLVMFunc->size();
  m_blocks = new SBasicBlock[m_numBlocks];
  uint i = 0;
  for (BasicBlock &bb : *pLLVMFunc) {
    m_blocks[i].id = i;
    m_blocks[i].bb = &bb;
    m_blockMap.insert(std::pair<llvm::BasicBlock *, SBasicBlock *>(&bb, &m_blocks[i]));
    i++;
  }
}
Pattern *CodeGenPatternMatch::Match(llvm::Instruction &inst) {
  m_currentPattern = nullptr;
  visit(inst);
  return m_currentPattern;
}

void CodeGenPatternMatch::SetPatternRoot(llvm::Instruction &inst) {
  m_root = &inst;
  m_rootIsSubspanUse = IsSubspanUse(m_root);
}

template <typename Op_t, typename ConstTy> struct ClampWithConstants_match {
  typedef ConstTy *ConstPtrTy;

  Op_t Op;
  ConstPtrTy &CMin, &CMax;

  ClampWithConstants_match(const Op_t &OpMatch, ConstPtrTy &Min, ConstPtrTy &Max) : Op(OpMatch), CMin(Min), CMax(Max) {}

  template <typename OpTy> bool match(OpTy *V) {
    CallInst *GII = dyn_cast<CallInst>(V);
    if (!GII)
      return false;

    EOPCODE op = GetOpCode(GII);

    if (op != llvm_max && op != llvm_min)
      return false;

    Value *X = GII->getOperand(0);
    Value *C = GII->getOperand(1);
    if (isa<ConstTy>(X))
      std::swap(X, C);

    ConstPtrTy C0 = dyn_cast<ConstTy>(C);
    if (!C0)
      return false;

    CallInst *GII2 = dyn_cast<CallInst>(X);
    if (!GII2)
      return false;

    EOPCODE op2 = GetOpCode(GII2);
    if (!(op == llvm_min && op2 == llvm_max) && !(op == llvm_max && op2 == llvm_min))
      return false;

    X = GII2->getOperand(0);
    C = GII2->getOperand(1);
    if (isa<ConstTy>(X))
      std::swap(X, C);

    ConstPtrTy C1 = dyn_cast<ConstTy>(C);
    if (!C1)
      return false;

    if (!Op.match(X))
      return false;

    CMin = (op2 == llvm_min) ? C0 : C1;
    CMax = (op2 == llvm_min) ? C1 : C0;
    return true;
  }
};

template <typename OpTy, typename ConstTy>
inline ClampWithConstants_match<OpTy, ConstTy> m_ClampWithConstants(const OpTy &Op, ConstTy *&Min, ConstTy *&Max) {
  return ClampWithConstants_match<OpTy, ConstTy>(Op, Min, Max);
}

template <typename Op_t> struct IsNaN_match {
  Op_t Op;

  IsNaN_match(const Op_t &OpMatch) : Op(OpMatch) {}

  template <typename OpTy> bool match(OpTy *V) {
    using namespace llvm::PatternMatch;

    FCmpInst *FCI = dyn_cast<FCmpInst>(V);
    if (!FCI)
      return false;

    switch (FCI->getPredicate()) {
    case FCmpInst::FCMP_UNE:
      return FCI->getOperand(0) == FCI->getOperand(1) && Op.match(FCI->getOperand(0));
    case FCmpInst::FCMP_UNO:
      return m_Zero().match(FCI->getOperand(1)) && Op.match(FCI->getOperand(0));
    default:
      break;
    }

    return false;
  }
};

template <typename OpTy> inline IsNaN_match<OpTy> m_IsNaN(const OpTy &Op) { return IsNaN_match<OpTy>(Op); }

std::tuple<Value *, unsigned, VISA_Type> CodeGenPatternMatch::isFPToIntegerSatWithExactConstant(llvm::CastInst *I) {
  using namespace llvm::PatternMatch; // Scoped using declaration.

  unsigned Opcode = I->getOpcode();
  IGC_ASSERT(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI);

  unsigned BitWidth = I->getDestTy()->getIntegerBitWidth();
  APFloat FMin(I->getSrcTy()->getFltSemantics());
  APFloat FMax(I->getSrcTy()->getFltSemantics());
  if (Opcode == Instruction::FPToSI) {
    if (FMax.convertFromAPInt(APInt::getSignedMaxValue(BitWidth), true, APFloat::rmNearestTiesToEven) != APFloat::opOK)
      return std::make_tuple(nullptr, 0, ISA_TYPE_F);
    if (FMin.convertFromAPInt(APInt::getSignedMinValue(BitWidth), true, APFloat::rmNearestTiesToEven) != APFloat::opOK)
      return std::make_tuple(nullptr, 0, ISA_TYPE_F);
  } else {
    if (FMax.convertFromAPInt(APInt::getMaxValue(BitWidth), false, APFloat::rmNearestTiesToEven) != APFloat::opOK)
      return std::make_tuple(nullptr, 0, ISA_TYPE_F);
    if (FMin.convertFromAPInt(APInt::getMinValue(BitWidth), false, APFloat::rmNearestTiesToEven) != APFloat::opOK)
      return std::make_tuple(nullptr, 0, ISA_TYPE_F);
  }

  llvm::ConstantFP *CMin = nullptr;
  llvm::ConstantFP *CMax = nullptr;
  llvm::Value *X = nullptr;

  if (!match(I->getOperand(0), m_ClampWithConstants(m_Value(X), CMin, CMax)))
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);

  if (!CMin->isExactlyValue(FMin) || !CMax->isExactlyValue(FMax))
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);

  return std::make_tuple(X, Opcode, GetType(I->getType(), m_ctx));
}

// The following pattern matching is targeted to the conversion from FP values
// to INTEGER values with saturation where the MAX and/or MIN INTEGER values
// cannot be represented in FP values exactly. E.g., UINT_MAX (2**32-1) in
// 'unsigned' cannot be represented in 'float', where only 23 significant bits
// are available but UINT_MAX needs 32 significant bits. We cannot simply
// express that conversion with saturation as
//
//  o := fptoui(clamp(x, float(UINT_MIN), float(UINT_MAX));
//
// as, in LLVM, fptoui is undefined when the 'unsigned' source cannot fit in
// 'float', where clamp(x, MIN, MAX) is defined as max(min(x, MAX), MIN),
//
// Hence, OCL use the following sequence (over-simplified by excluding the NaN
// case.)
//
//  o := select(fptoui(x), UINT_MIN, x < float(UINT_MIN));
//  o := select(o,         UINT_MAX, x > float(UINT_MAX));
//
// (We SHOULD use 'o := select(o, UINTMAX, x >= float(UINT_MAX))' as
// 'float(UINT_MAX)' will be rounded to UINT_MAX+1, i.e. 2 ** 32, and the next
// smaller value than float(UINT_MAX) in 'float' is (2 ** 24 - 1) << 8. For
// 'int', that's also true for INT_MIN.)

std::tuple<Value *, unsigned, VISA_Type>
CodeGenPatternMatch::isFPToSignedIntSatWithInexactConstant(llvm::SelectInst *SI) {
  using namespace llvm::PatternMatch; // Scoped using declaration.

  // TODO
  return std::make_tuple(nullptr, 0, ISA_TYPE_F);
}

std::tuple<Value *, unsigned, VISA_Type>
CodeGenPatternMatch::isFPToUnsignedIntSatWithInexactConstant(llvm::SelectInst *SI) {
  using namespace llvm::PatternMatch; // Scoped using declaration.

  Constant *C0 = dyn_cast<Constant>(SI->getTrueValue());
  if (!C0)
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);
  if (!isa<ConstantFP>(C0) && !isa<ConstantInt>(C0))
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);
  Value *Cond = SI->getCondition();

  SelectInst *SI2 = dyn_cast<SelectInst>(SI->getFalseValue());
  if (!SI2)
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);
  Constant *C1 = dyn_cast<Constant>(SI2->getTrueValue());
  if (!C1)
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);
  if (!isa<ConstantFP>(C1) && !isa<ConstantInt>(C1))
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);
  Value *Cond2 = SI2->getCondition();

  Value *X = SI2->getFalseValue();
  Type *Ty = X->getType();
  if (Ty->isFloatTy()) {
    BitCastInst *BC = dyn_cast<BitCastInst>(X);
    if (!BC)
      return std::make_tuple(nullptr, 0, ISA_TYPE_F);
    X = BC->getOperand(0);
    Ty = X->getType();
    C1 = ConstantExpr::getBitCast(C1, Ty);
    C0 = ConstantExpr::getBitCast(C0, Ty);
  }
  IntegerType *ITy = dyn_cast<IntegerType>(Ty);
  if (!ITy)
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);
  unsigned BitWidth = ITy->getBitWidth();
  FPToUIInst *CI = dyn_cast<FPToUIInst>(X);
  if (!CI)
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);
  Ty = CI->getSrcTy();
  if (!(Ty->isFloatTy() && BitWidth == 32) && !(Ty->isDoubleTy() && BitWidth == 64))
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);
  X = CI->getOperand(0);

  ConstantInt *CMin = dyn_cast<ConstantInt>(C0);
  ConstantInt *CMax = dyn_cast<ConstantInt>(C1);
  if (!CMax || !CMin || !CMax->isMaxValue(false) || !CMin->isMinValue(false))
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);

  Constant *FMin = ConstantExpr::getUIToFP(CMin, Ty);
  Constant *FMax = ConstantExpr::getUIToFP(CMax, Ty);

  FCmpInst::Predicate Pred = FCmpInst::FCMP_FALSE;
  if (!match(Cond2, m_FCmp(Pred, m_Specific(X), m_Specific(FMax))))
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);
  if (Pred != FCmpInst::FCMP_OGT) // FIXME: We should use OGE instead of OGT.
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);

  FCmpInst::Predicate Pred2 = FCmpInst::FCMP_FALSE;
  if (!match(Cond, m_Or(m_FCmp(Pred, m_Specific(X), m_Specific(FMin)), m_FCmp(Pred2, m_Specific(X), m_Specific(X))))) {
    if (!match(Cond, m_Or(m_FCmp(Pred, m_Specific(X), m_Specific(FMin)), m_Zero()))) {
      return std::make_tuple(nullptr, 0, ISA_TYPE_F);
    }
    // Special case where the staturatured result is bitcasted into float
    // again (due to typedwrite only accepts `float`. So the isNaN(X) is
    // reduced to `false`.
    Pred2 = FCmpInst::FCMP_UNE;
  }
  if (Pred != FCmpInst::FCMP_OLT || Pred2 != FCmpInst::FCMP_UNE)
    return std::make_tuple(nullptr, 0, ISA_TYPE_F);

  VISA_Type type = GetType(CI->getType(), m_ctx);

  // Fold extra clamp.
  Value *X2 = nullptr;
  ConstantFP *CMin2 = nullptr;
  ConstantFP *CMax2 = nullptr;
  if (match(X, m_ClampWithConstants(m_Value(X2), CMin2, CMax2))) {
    if (CMin2 == FMin) {
      if (CMax2->isExactlyValue(255.0)) {
        X = X2;
        type = ISA_TYPE_B;
      } else if (CMax2->isExactlyValue(65535.0)) {
        X = X2;
        type = ISA_TYPE_W;
      }
    }
  }

  return std::make_tuple(X, Instruction::FPToUI, type);
}

bool CodeGenPatternMatch::MatchFPToIntegerWithSaturation(llvm::Instruction &I) {
  Value *X = nullptr;
  unsigned Opcode = 0;
  VISA_Type type = ISA_TYPE_NUM;

  if (CastInst *CI = dyn_cast<CastInst>(&I)) {
    std::tie(X, Opcode, type) = isFPToIntegerSatWithExactConstant(CI);
    if (!X)
      return false;
  } else if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
    std::tie(X, Opcode, type) = isFPToSignedIntSatWithInexactConstant(SI);
    if (!X) {
      std::tie(X, Opcode, type) = isFPToUnsignedIntSatWithInexactConstant(SI);
      if (!X)
        return false;
    }
  } else {
    return false;
  }

  // Match!
  IGC_ASSERT(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI);

  struct FPToIntegerWithSaturationPattern : public Pattern {
    bool isUnsigned, needBitCast;
    VISA_Type type;
    SSource src;
    virtual void Emit(EmitPass *pass, const DstModifier &dstMod) {
      pass->EmitFPToIntWithSat(isUnsigned, needBitCast, type, src, dstMod);
    }
  };

  bool isUnsigned = (Opcode == Instruction::FPToUI);
  FPToIntegerWithSaturationPattern *pat = new (m_allocator) FPToIntegerWithSaturationPattern();
  pat->isUnsigned = isUnsigned;
  pat->needBitCast = !I.getType()->isIntegerTy();
  pat->type = type;
  pat->src = GetSource(X, !isUnsigned, false, IsSourceOfSample(&I));
  AddPattern(pat);

  return true;
}

std::tuple<Value *, bool, bool> CodeGenPatternMatch::isIntegerSatTrunc(llvm::SelectInst *SI) {
  using namespace llvm::PatternMatch; // Scoped using declaration.

  ICmpInst *Cmp = dyn_cast<ICmpInst>(SI->getOperand(0));
  if (!Cmp)
    return std::make_tuple(nullptr, false, false);

  ICmpInst::Predicate Pred = Cmp->getPredicate();
  if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_UGT)
    return std::make_tuple(nullptr, false, false);

  ConstantInt *CI = dyn_cast<ConstantInt>(Cmp->getOperand(1));
  if (!CI)
    return std::make_tuple(nullptr, false, false);

  // Truncate into unsigned integer by default.
  bool isSignedDst = false;
  unsigned DstBitWidth = SI->getType()->getIntegerBitWidth();
  unsigned SrcBitWidth = Cmp->getOperand(0)->getType()->getIntegerBitWidth();
  APInt UMax = APInt::getMaxValue(DstBitWidth);
  APInt UMin = APInt::getMinValue(DstBitWidth);
  APInt SMax = APInt::getSignedMaxValue(DstBitWidth);
  APInt SMin = APInt::getSignedMinValue(DstBitWidth);
  if (SrcBitWidth > DstBitWidth) {
    UMax = UMax.zext(SrcBitWidth);
    UMin = UMin.zext(SrcBitWidth);
    SMax = SMax.sext(SrcBitWidth);
    SMin = SMin.sext(SrcBitWidth);
  } else {
    // SrcBitwidth should be always wider than DstBitwidth,
    // since src is a source of a trunc instruction, and dst
    // have the same width as its destination.
    return std::make_tuple(nullptr, false, false);
  }

  if (CI->getValue() != UMax && CI->getValue() != SMax)
    return std::make_tuple(nullptr, false, false);
  if (CI->getValue() == SMax) // Truncate into signed integer.
    isSignedDst = true;

  APInt MinValue = isSignedDst ? SMin : UMin;
  CI = dyn_cast<ConstantInt>(SI->getOperand(1));
  if (!CI || !CI->isMaxValue(isSignedDst))
    return std::make_tuple(nullptr, false, false);

  TruncInst *TI = dyn_cast<TruncInst>(SI->getOperand(2));
  if (!TI)
    return std::make_tuple(nullptr, false, false);

  Value *Val = TI->getOperand(0);
  if (Val != Cmp->getOperand(0))
    return std::make_tuple(nullptr, false, false);

  // Truncate from unsigned integer.
  if (Pred == ICmpInst::ICMP_UGT)
    return std::make_tuple(Val, isSignedDst, false);

  // Truncate from signed integer. Need to check further for lower bound.
  Value *LHS = nullptr, *RHS = nullptr;
  if (!match(Val, m_SMax(m_Value(LHS), m_Value(RHS))))
    return std::make_tuple(nullptr, false, false);

  if (isa<ConstantInt>(LHS))
    std::swap(LHS, RHS);

  CI = dyn_cast<ConstantInt>(RHS);
  if (!CI || CI->getValue() != MinValue)
    return std::make_tuple(nullptr, false, false);

  return std::make_tuple(LHS, isSignedDst, true);
}

bool CodeGenPatternMatch::MatchIntegerTruncSatModifier(llvm::SelectInst &I) {
  // Only match BYTE or WORD.
  if (!I.getType()->isIntegerTy(8) && !I.getType()->isIntegerTy(16))
    return false;
  auto [Src, isSignedDst, isSignedSrc] = isIntegerSatTrunc(&I);
  if (!Src)
    return false;

  struct IntegerSatTruncPattern : public Pattern {
    SSource src;
    bool isSignedDst;
    bool isSignedSrc;
    virtual void Emit(EmitPass *pass, const DstModifier &dstMod) {
      pass->EmitIntegerTruncWithSat(isSignedDst, isSignedSrc, src, dstMod);
    }
  };

  IntegerSatTruncPattern *pat = new (m_allocator) IntegerSatTruncPattern();
  pat->src = GetSource(Src, isSignedSrc, false, IsSourceOfSample(&I));
  pat->isSignedDst = isSignedDst;
  pat->isSignedSrc = isSignedSrc;
  AddPattern(pat);

  return true;
}

bool CodeGenPatternMatch::MatchShrSatModifier(llvm::SelectInst &I) {
  struct ShrSatPattern : public Pattern {
    SSource sources[2];
    Instruction *inst;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      DstModifier mod = modifier;
      mod.sat = true;
      pass->EmitSimpleAlu(GetOpCode(inst), sources, mod, true /*isUnsigned*/);
    }
  };

  Value *LHS = nullptr, *RHS = nullptr;
  if (isMax(&I, LHS, RHS)) {
    if (isa<ConstantInt>(LHS) || isa<ConstantInt>(RHS)) {
      ConstantInt *c = isa<ConstantInt>(LHS) ? cast<ConstantInt>(LHS) : cast<ConstantInt>(RHS);
      if (c->getZExtValue() == 0) {
        Value *v = isa<ConstantInt>(LHS) ? RHS : LHS;
        if (isa<LShrOperator>(v) || isa<AShrOperator>(v)) {
          Instruction *inst = cast<Instruction>(v);
          ShrSatPattern *pattern = new (m_allocator) ShrSatPattern();
          pattern->inst = inst;
          pattern->sources[0] = GetSource(inst->getOperand(0), false, false, IsSourceOfSample(&I));
          pattern->sources[1] = GetSource(inst->getOperand(1), false, false, IsSourceOfSample(&I));
          AddPattern(pattern);

          return true;
        }
      }
    }
  }
  return false;
}

void CodeGenPatternMatch::visitFPToSIInst(llvm::FPToSIInst &I) {
  bool match = MatchFPToIntegerWithSaturation(I) || MatchModifier(I);
  IGC_ASSERT_MESSAGE(match, "Pattern match Failed");
}

void CodeGenPatternMatch::visitFPToUIInst(llvm::FPToUIInst &I) {
  bool match = MatchFPToIntegerWithSaturation(I) || MatchModifier(I);
  IGC_ASSERT_MESSAGE(match, "Pattern match Failed");
}

bool CodeGenPatternMatch::MatchSIToFPZExt(llvm::SIToFPInst *S2FI) {
  ZExtInst *ZEI = dyn_cast<ZExtInst>(S2FI->getOperand(0));
  if (!ZEI)
    return false;
  if (!ZEI->getSrcTy()->isIntegerTy(1))
    return false;

  struct SIToFPExtPattern : public Pattern {
    SSource src;
    virtual void Emit(EmitPass *pass, const DstModifier &dstMod) { pass->EmitSIToFPZExt(src, dstMod); }
  };

  SIToFPExtPattern *pat = new (m_allocator) SIToFPExtPattern();
  pat->src = GetSource(ZEI->getOperand(0), false, false, IsSourceOfSample(S2FI));
  AddPattern(pat);

  return true;
}

void CodeGenPatternMatch::visitCastInst(llvm::CastInst &I) {
  bool match = 0;
  if (I.getOpcode() == Instruction::SExt) {
    match = MatchUnpack4i8(I) || MatchCmpSext(I) || MatchModifier(I);
  } else if (I.getOpcode() == Instruction::ZExt) {
    match = MatchUnpack4i8(I) || MatchModifier(I);
  } else if (I.getOpcode() == Instruction::SIToFP) {
    match = MatchSIToFPZExt(cast<SIToFPInst>(&I)) || MatchModifier(I);
  } else if (I.getOpcode() == Instruction::Trunc) {
    match = MatchModifier(I);
  } else {
    match = MatchModifier(I);
  }
}

bool CodeGenPatternMatch::NeedVMask() { return m_NeedVMask; }

bool CodeGenPatternMatch::HasUseOutsideLoop(llvm::Value *v) {
  if (Instruction *inst = dyn_cast<Instruction>(v)) {
    if (Loop *L = LI->getLoopFor(inst->getParent())) {
      for (auto UI = inst->user_begin(), E = inst->user_end(); UI != E; ++UI) {
        if (!L->contains(cast<Instruction>(*UI))) {
          return true;
        }
      }
    }
  }
  return false;
}

void CodeGenPatternMatch::HandleSubspanUse(llvm::Value *v, bool isSampleSource) {
  IGC_ASSERT(m_root != nullptr);
  if (m_ctx->type != ShaderType::PIXEL_SHADER) {
    return;
  }
  if (!isa<Constant>(v) && !m_WI->isUniform(v)) {
    if (isa<PHINode>(v) || HasUseOutsideLoop(v)) {
      // If a phi is used in a subspan we cannot propagate the subspan use and need to use VMask
      m_NeedVMask = true;

      if (isSampleSource && isa<PHINode>(v)) {
        m_sampleSource.insert(v);
      }

      if (Instruction *I = dyn_cast<Instruction>(v)) {
        // this is WA for situation where application has early return (not discard) from shader
        // in this case phi node will not set sample coords properly on all simd lanes
        // WA is to initialize register used for phi node with zero, which eliminates corruptions
        // here we collect suspected phi nodes to be resolved later in EmitVISAPass
        if (m_ctx->getModuleMetaData()->compOpt.initializePhiSampleSourceWA && isSampleSource &&
            !PDT->dominates(I->getParent(), &I->getFunction()->getEntryBlock())) {
          m_sampleUnderCFPHIsource.insert(v);
        }
      }
    } else {
      if (isa<SampleIntrinsic>(v)) {
        // Also, we want to apply vector mask instead of no mask to sample instructions,
        // because it should have better performance on workloads with many disabled subspans
        m_NeedVMask = true;
      }

      if (Instruction *I = dyn_cast<Instruction>(v)) {
        if (isSampleSource) {
          m_sampleSource.insert(v);
          if (!PDT->dominates(I->getParent(), &I->getFunction()->getEntryBlock())) {
            m_sampleUnderCFsource.insert(v);
          }

          if (Loop *L = LI->getLoopFor(I->getParent())) {
            if (L->contains(I)) {
              m_sampleUnderCFsource.insert(v);
            }
          }
        }
      }

      m_subSpanUse.insert(v);
      if (LoadInst *load = dyn_cast<LoadInst>(v)) {
        if (load->getPointerAddressSpace() == ADDRESS_SPACE_PRIVATE) {
          m_NeedVMask = true;
        }
      }
      if (HasPhiUse(*v) && m_WI->insideDivergentCF(m_root)) {
        // \todo, more accurate condition for force-isolation
        ForceIsolate(v);
      }
    }
  }
}

bool CodeGenPatternMatch::MatchMinMax(llvm::SelectInst &SI) {
  // Pattern to emit.
  struct MinMaxPattern : public Pattern {
    SSource srcs[2];
    bool isMin, isUnsigned;
    virtual void Emit(EmitPass *pass, const DstModifier &dstMod) {
      // FIXME: We should tell umax/umin from max/min as integers in LLVM
      // have no sign!
      pass->EmitMinMax(isMin, isUnsigned, srcs, dstMod);
    }
  };

  // Skip min/max pattern matching on FP, which needs to either explicitly
  // use intrinsics or convert them into intrinsic in GenIRLower pass.
  if (SI.getType()->isFloatingPointTy())
    return false;

  bool isMin = false, isUnsigned = false;
  llvm::Value *LHS = nullptr, *RHS = nullptr;

  if (!isMinOrMax(&SI, LHS, RHS, isMin, isUnsigned))
    return false;

  MinMaxPattern *pat = new (m_allocator) MinMaxPattern();
  // FIXME: We leave unsigned operand without source modifier so far. When
  // its behavior is detailed and correcty modeled, consider to add source
  // modifier support.
  pat->srcs[0] = GetSource(LHS, !isUnsigned, false, IsSourceOfSample(&SI));
  pat->srcs[1] = GetSource(RHS, !isUnsigned, false, IsSourceOfSample(&SI));
  pat->isMin = isMin;
  pat->isUnsigned = isUnsigned;
  AddPattern(pat);

  return true;
}

void CodeGenPatternMatch::visitSelectInst(SelectInst &I) {
  bool match = MatchFloatingPointSatModifier(I) || MatchIntegerTruncSatModifier(I) || MatchShrSatModifier(I) ||
               MatchAbsNeg(I) || MatchFPToIntegerWithSaturation(I) || MatchMinMax(I) || MatchCmpSelect(I) ||
               MatchSelectModifier(I);
  IGC_ASSERT_MESSAGE(match, "Pattern Match failed");
}

void CodeGenPatternMatch::visitBinaryOperator(llvm::BinaryOperator &I) {

  bool match = false;
  switch (I.getOpcode()) {
  case Instruction::FSub:
    match = MatchFloor(I) || MatchFrc(I) || MatchLrp(I) || MatchPredAdd(I) || MatchFMad(I) || MatchAbsNeg(I) ||
            MatchModifier(I);
    break;
  case Instruction::Sub:
    match = MatchIMad(I) || MatchAdd3(I) || MatchAbsNeg(I) || MatchMulAdd16(I) || MatchModifier(I);
    break;
  case Instruction::Mul:
    match = MatchFullMul32(I) ||
            MatchMulAdd16(I) || MatchModifier(I);
    break;
  case Instruction::Add:
    match = MatchIMad(I) || MatchAdd3(I) || MatchMulAdd16(I) || MatchModifier(I);
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
    match = MatchAvg(I) || MatchModifier(I);
    break;
  case Instruction::AShr:
    match = MatchAvg(I) || MatchBinaryUnpack4i8(I) || MatchModifier(I);
    break;
  case Instruction::FMul:
    match = MatchArcpFdiv(I) || MatchModifier(I);
    break;
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
    match = MatchModifier(I);
    break;
  case Instruction::Shl:
    match = MatchBinaryUnpack4i8(I) || MatchModifier(I);
    break;
  case Instruction::LShr:
    match = MatchBinaryUnpack4i8(I) || MatchModifier(I, false);
    break;
  case Instruction::FDiv:
    match = MatchRsqrt(I) || MatchModifier(I);
    break;
  case Instruction::FAdd:
    match = MatchLrp(I) || MatchPredAdd(I) || MatchFMad(I) || MatchSimpleAdd(I) || MatchModifier(I);
    break;
  case Instruction::And:
    match = MatchBfn(I) || MatchBoolOp(I) || MatchLogicAlu(I);
    break;
  case Instruction::Or:
    match = MatchPack4i8(I) || MatchBfn(I) || MatchBoolOp(I) || MatchLogicAlu(I);
    break;
  case Instruction::Xor:
    match = MatchBfn(I) || MatchLogicAlu(I);
    break;
  default:
    IGC_ASSERT_MESSAGE(0, "unknown binary instruction");
    break;
  }
  IGC_ASSERT(match == true);
}

void CodeGenPatternMatch::visitCmpInst(llvm::CmpInst &I) {
  bool match = MatchCondModifier(I) || MatchModifier(I);
  IGC_ASSERT(match);
}

void CodeGenPatternMatch::visitBranchInst(llvm::BranchInst &I) { MatchBranch(I); }

void CodeGenPatternMatch::visitCallInst(CallInst &I) {
  bool match = false;
  using namespace GenISAIntrinsic;
  if (GenIntrinsicInst *GII = llvm::dyn_cast<GenIntrinsicInst>(&I)) {
    switch (GII->getIntrinsicID()) {
    case GenISAIntrinsic::GenISA_ROUNDNE:
    case GenISAIntrinsic::GenISA_imulH:
    case GenISAIntrinsic::GenISA_umulH:
    case GenISAIntrinsic::GenISA_uaddc:
    case GenISAIntrinsic::GenISA_usubb:
    case GenISAIntrinsic::GenISA_bfrev:
    case GenISAIntrinsic::GenISA_IEEE_Sqrt:
    case GenISAIntrinsic::GenISA_IEEE_Divide:
    case GenISAIntrinsic::GenISA_rsq:
    case GenISAIntrinsic::GenISA_inv:
      match = MatchModifier(I);
      break;
    case GenISAIntrinsic::GenISA_intatomicraw:
    case GenISAIntrinsic::GenISA_floatatomicraw:
    case GenISAIntrinsic::GenISA_intatomicrawA64:
    case GenISAIntrinsic::GenISA_floatatomicrawA64:
    case GenISAIntrinsic::GenISA_icmpxchgatomicraw:
    case GenISAIntrinsic::GenISA_fcmpxchgatomicraw:
    case GenISAIntrinsic::GenISA_icmpxchgatomicrawA64:
    case GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64:
    case GenISAIntrinsic::GenISA_dwordatomicstructured:
    case GenISAIntrinsic::GenISA_floatatomicstructured:
    case GenISAIntrinsic::GenISA_cmpxchgatomicstructured:
    case GenISAIntrinsic::GenISA_fcmpxchgatomicstructured:
    case GenISAIntrinsic::GenISA_intatomictyped:
    case GenISAIntrinsic::GenISA_icmpxchgatomictyped:
    case GenISAIntrinsic::GenISA_typedread:
    case GenISAIntrinsic::GenISA_typedwrite:
    case GenISAIntrinsic::GenISA_typedreadMS:
    case GenISAIntrinsic::GenISA_typedwriteMS:
    case GenISAIntrinsic::GenISA_floatatomictyped:
    case GenISAIntrinsic::GenISA_fcmpxchgatomictyped:
    case GenISAIntrinsic::GenISA_ldlptr:
    case GenISAIntrinsic::GenISA_ldstructured:
    case GenISAIntrinsic::GenISA_storestructured1:
    case GenISAIntrinsic::GenISA_storestructured2:
    case GenISAIntrinsic::GenISA_storestructured3:
    case GenISAIntrinsic::GenISA_storestructured4:
    case GenISAIntrinsic::GenISA_atomiccounterinc:
    case GenISAIntrinsic::GenISA_atomiccounterpredec:
    case GenISAIntrinsic::GenISA_ldptr:
      if (supportsLSCImmediateGlobalBaseOffset()) {
        match = MatchImmOffsetLSC(I);
        if (match)
          return;
      }
      match = MatchSingleInstruction(I);
      break;
    case GenISAIntrinsic::GenISA_ldrawvector_indexed:
    case GenISAIntrinsic::GenISA_ldraw_indexed:
    case GenISAIntrinsic::GenISA_storerawvector_indexed:
    case GenISAIntrinsic::GenISA_storeraw_indexed:
      if (supportsLSCImmediateGlobalBaseOffset()) {
        match = MatchImmOffsetLSC(I);
        if (match)
          return;
      }
      match = MatchSingleInstruction(I);
      break;
    case GenISAIntrinsic::GenISA_GradientX:
    case GenISAIntrinsic::GenISA_GradientY:
    case GenISAIntrinsic::GenISA_GradientXfine:
    case GenISAIntrinsic::GenISA_GradientYfine:
      match = MatchGradient(*GII);
      break;
    case GenISAIntrinsic::GenISA_sampleptr:
    case GenISAIntrinsic::GenISA_sampleBptr:
    case GenISAIntrinsic::GenISA_sampleBCptr:
    case GenISAIntrinsic::GenISA_sampleCptr:
    case GenISAIntrinsic::GenISA_gather4Bptr:
    case GenISAIntrinsic::GenISA_gather4BPOptr:
    case GenISAIntrinsic::GenISA_gather4Iptr:
    case GenISAIntrinsic::GenISA_gather4IPOptr:
    case GenISAIntrinsic::GenISA_gather4ICptr:
    case GenISAIntrinsic::GenISA_gather4ICPOptr:
    case GenISAIntrinsic::GenISA_sampleMlodptr:
    case GenISAIntrinsic::GenISA_sampleCMlodptr:
    case GenISAIntrinsic::GenISA_sampleBCMlodptr:
    case GenISAIntrinsic::GenISA_samplePOptr:
    case GenISAIntrinsic::GenISA_samplePOBptr:
    case GenISAIntrinsic::GenISA_samplePOCptr:
    case GenISAIntrinsic::GenISA_gather4POPackedBptr:
    case GenISAIntrinsic::GenISA_gather4POPackedIptr:
    case GenISAIntrinsic::GenISA_gather4POPackedICptr:
    case GenISAIntrinsic::GenISA_lodptr:
    case GenISAIntrinsic::GenISA_sampleKillPix:
      match = MatchSampleDerivative(*GII);
      break;
    case GenISAIntrinsic::GenISA_fsat:
      match = MatchFloatingPointSatModifier(I);
      break;
    case GenISAIntrinsic::GenISA_usat:
    case GenISAIntrinsic::GenISA_isat:
      match = MatchIntegerSatModifier(I);
      break;
    case GenISAIntrinsic::GenISA_WaveShuffleIndex:
    case GenISAIntrinsic::GenISA_WaveBroadcast:
      match = MatchRegisterRegion(*GII) || MatchShuffleBroadCast(*GII) || MatchWaveShuffleIndex(*GII);
      break;
    case GenISAIntrinsic::GenISA_WaveBallot:
    case GenISAIntrinsic::GenISA_WaveInverseBallot:
    case GenISAIntrinsic::GenISA_WaveClusteredBallot:
    case GenISAIntrinsic::GenISA_WaveAll:
    case GenISAIntrinsic::GenISA_WaveClustered:
    case GenISAIntrinsic::GenISA_WaveInterleave:
    case GenISAIntrinsic::GenISA_WaveClusteredInterleave:
    case GenISAIntrinsic::GenISA_WavePrefix:
    case GenISAIntrinsic::GenISA_WaveClusteredPrefix:
      match = MatchWaveInstruction(*GII);
      break;
    case GenISAIntrinsic::GenISA_simdBlockRead:
    case GenISAIntrinsic::GenISA_simdBlockWrite:
      match = MatchBlockReadWritePointer(*GII) || MatchSingleInstruction(*GII);
      break;
    case GenISAIntrinsic::GenISA_UnmaskedRegionBegin:
      match = MatchUnmaskedRegionBoundary(I, true);
      break;
    case GenISAIntrinsic::GenISA_UnmaskedRegionEnd:
      match = MatchUnmaskedRegionBoundary(I, false);
      break;
    case GenISAIntrinsic::GenISA_sub_group_dpas:
    case GenISAIntrinsic::GenISA_dpas:
      match = MatchDpas(*GII);
      break;
    case GenISAIntrinsic::GenISA_dp4a_ss:
    case GenISAIntrinsic::GenISA_dp4a_su:
    case GenISAIntrinsic::GenISA_dp4a_us:
    case GenISAIntrinsic::GenISA_dp4a_uu:
      match = MatchDp4a(*GII);
      break;
    default:
      match = MatchSingleInstruction(I);
      // no pattern for the rest of the intrinsics
      break;
    }
    IGC_ASSERT_MESSAGE(match, "no pattern found for GenISA intrinsic");
  } else {
    Function *Callee = I.getCalledFunction();

    // Match inline asm
    if (I.isInlineAsm()) {
      if (getAnalysis<CodeGenContextWrapper>().getCodeGenContext()->m_DriverInfo.SupportInlineAssembly()) {
        match = MatchSingleInstruction(I);
      }
    }
    // Match indirect call, support declarations for indirect funcs
    else if (!Callee || Callee->hasFnAttribute("referenced-indirectly") ||
             Callee->hasFnAttribute("invoke_simd_target")) {
      match = MatchSingleInstruction(I);
    }
    // Match direct call, skip declarations
    else if (!Callee->isDeclaration()) {
      match = MatchSingleInstruction(I);
    }
  }
  IGC_ASSERT_MESSAGE(match, "no match for this call");
}

void CodeGenPatternMatch::visitUnaryInstruction(llvm::UnaryInstruction &I) {
  bool match = false;
  switch (I.getOpcode()) {
  case Instruction::Alloca:
  case Instruction::Load:
    match = MatchSingleInstruction(I);
    break;
  case Instruction::FNeg:
    match = MatchAbsNeg(I);
    break;
  }
  IGC_ASSERT(match);
}

void CodeGenPatternMatch::visitIntrinsicInst(llvm::IntrinsicInst &I) {
  bool match = false;
  switch (I.getIntrinsicID()) {
  case Intrinsic::sqrt:
  case Intrinsic::log2:
  case Intrinsic::cos:
  case Intrinsic::sin:
  case Intrinsic::pow:
  case Intrinsic::powi:
  case Intrinsic::floor:
  case Intrinsic::ceil:
  case Intrinsic::trunc:
  case Intrinsic::ctpop:
  case Intrinsic::ctlz:
  case Intrinsic::cttz:
    match = MatchModifier(I);
    break;
  case Intrinsic::exp2:
    match = MatchPow(I) || MatchModifier(I);
    break;
  case Intrinsic::abs:
  case Intrinsic::fabs:
    match = MatchAbsNeg(I);
    break;
  case Intrinsic::fma:
    match = MatchFMA(I);
    break;
  case Intrinsic::maxnum:
  case Intrinsic::minnum:
    match = MatchFloatingPointSatModifier(I) || MatchModifier(I);
    break;
  case Intrinsic::fshl:
  case Intrinsic::fshr:
    match = MatchFunnelShiftRotate(I);
    break;
  case Intrinsic::canonicalize:
    match = MatchCanonicalizeInstruction(I);
    break;
  default:
    match = MatchSingleInstruction(I);
    // no pattern for the rest of the intrinsics
    break;
  }
  IGC_ASSERT_MESSAGE(match, "no pattern found");
}

void CodeGenPatternMatch::visitStoreInst(StoreInst &I) {
  bool match = false;
  if (supportsLSCImmediateGlobalBaseOffset()) {
    match = MatchImmOffsetLSC(I);
    if (match)
      return;
  }
  match = MatchSingleInstruction(I);
  IGC_ASSERT(match);
}

void CodeGenPatternMatch::visitLoadInst(LoadInst &I) {
  bool match = false;
  if (supportsLSCImmediateGlobalBaseOffset()) {
    match = MatchImmOffsetLSC(I);
    if (match)
      return;
  }
  match = MatchSingleInstruction(I);
  IGC_ASSERT(match);
}

void CodeGenPatternMatch::visitInstruction(llvm::Instruction &I) {
  // use default pattern
  MatchSingleInstruction(I);
}

void CodeGenPatternMatch::visitExtractElementInst(llvm::ExtractElementInst &I) {
  Value *VecOpnd = I.getVectorOperand();
  if (isa<Constant>(VecOpnd)) {
    const Function *F = I.getParent()->getParent();
    unsigned NUse = 0;
    for (auto User : VecOpnd->users()) {
      if (auto Inst = dyn_cast<Instruction>(User)) {
        NUse += (Inst->getParent()->getParent() == F);
      }
    }

    // Only add it to pool when there are multiple uses within this
    // function; otherwise no benefit but to hurt RP.
    if (NUse > 1)
      AddToConstantPool(I.getParent(), VecOpnd);
  }
  MatchSingleInstruction(I);
}

void CodeGenPatternMatch::visitPHINode(PHINode &I) {
  struct PHIPattern : Pattern {
    // bool isInDivergentLoop = false;
    llvm::PHINode *phi = nullptr;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      IGC_ASSERT(modifier.sat == false);
      IGC_ASSERT(modifier.flag == nullptr);
      pass->EmitInitializePHI(phi);
    }
  };

  if (Loop *L = LI->getLoopFor(I.getParent())) {
    SmallVector<BasicBlock *, 4> exitingBlocks;
    SmallPtrSet<Instruction *, 8> visitedInstructions;
    L->getExitingBlocks(exitingBlocks);
    for (auto exitingBlock : exitingBlocks) {
      if (!isUniform(exitingBlock->getTerminator())) {
        std::vector<Instruction *> worklist;
        worklist.push_back(&I);
        while (!worklist.empty()) {
          Instruction *currentInst = worklist.back();
          worklist.pop_back();
          for (auto operand : currentInst->operand_values()) {
            if (auto nextInst = dyn_cast<Instruction>(operand)) {
              if (nextInst == &I) {
                return;
              }
              if (L->contains(nextInst->getParent()) && !visitedInstructions.contains(nextInst)) {
                worklist.push_back(nextInst);
                visitedInstructions.insert(nextInst);
              }
            }
          }
        }
        break;
      }
    }
  }
  if (IsSourceOfSample(&I)) {
    PHIPattern *pattern = new (m_allocator) PHIPattern();
    pattern->phi = &I;
    // pattern->isInDivergentLoop = isInDivergentLoop;
    AddPattern(pattern);
  }
}

void CodeGenPatternMatch::visitBitCastInst(BitCastInst &I) {
  // detect
  // %66 = insertelement <2 x i32> <i32 0, i32 undef>, i32 %xor19.i, i32 1
  // %67 = bitcast <2 x i32> % 66 to i64
  // and replace it with a shl 32
  struct Shl32Pattern : public Pattern {
    SSource sources[2];
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Binary(EOPCODE_SHL, sources, modifier); }
  };

  if (m_Platform.hasPartialInt64Support() && I.getType()->isIntegerTy(64) && I.getOperand(0)->getType()->isVectorTy() &&
      cast<VectorType>(I.getOperand(0)->getType())->getElementType()->isIntegerTy(32)) {
    if (auto IEI = dyn_cast<InsertElementInst>(I.getOperand(0))) {
      auto vec = dyn_cast<ConstantVector>(IEI->getOperand(0));
      bool isCandidate =
          vec && vec->getNumOperands() == 2 && IsZero(vec->getOperand(0)) && isa<UndefValue>(vec->getOperand(1));
      auto index = dyn_cast<ConstantInt>(IEI->getOperand(2));
      isCandidate &= index && index->getZExtValue() == 1;
      if (isCandidate) {
        Shl32Pattern *Pat = new (m_allocator) Shl32Pattern();
        Pat->sources[0] = GetSource(IEI->getOperand(1), false, false, IsSourceOfSample(&I));
        Pat->sources[1] =
            GetSource(ConstantInt::get(Type::getInt32Ty(I.getContext()), 32), false, false, IsSourceOfSample(&I));
        AddPattern(Pat);
        return;
      }
    }
  }
  bool match = false;
  match = MatchRepack4i8(I) || MatchPack4i8(I) || MatchSingleInstruction(I);
  IGC_ASSERT_MESSAGE(match, "Unsupported BitCast instruction");
}

void CodeGenPatternMatch::visitIntToPtrInst(IntToPtrInst &I) { MatchSingleInstruction(I); }

void CodeGenPatternMatch::visitPtrToIntInst(PtrToIntInst &I) { MatchSingleInstruction(I); }

void CodeGenPatternMatch::visitAddrSpaceCast(AddrSpaceCastInst &I) { MatchSingleInstruction(I); }

void CodeGenPatternMatch::visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { MatchDbgInstruction(I); }

void CodeGenPatternMatch::visitExtractValueInst(ExtractValueInst &I) {
  bool Match = false;

  // Ignore the extract value instruction. Handled in the call inst.
  if (CallInst *call = dyn_cast<CallInst>(I.getOperand(0))) {
    if (call->isInlineAsm() && call->getType()->isStructTy()) {
      MarkAsSource(call, IsSourceOfSample(&I));
      return;
    }
  }

  Match = matchAddPair(&I) || matchSubPair(&I) || matchMulPair(&I) || matchPtrToPair(&I) || MatchSingleInstruction(I);

  IGC_ASSERT_MESSAGE(Match, "Unknown `extractvalue` instruction!");
}

bool CodeGenPatternMatch::matchAddPair(ExtractValueInst *Ex) {
  Value *V = Ex->getOperand(0);
  GenIntrinsicInst *GII = dyn_cast<GenIntrinsicInst>(V);
  if (!GII || GII->getIntrinsicID() != GenISAIntrinsic::GenISA_add_pair)
    return false;

  if (Ex->getNumIndices() != 1)
    return false;
  unsigned Idx = *Ex->idx_begin();
  if (Idx != 0 && Idx != 1)
    return false;

  struct AddPairPattern : public Pattern {
    GenIntrinsicInst *GII;
    SSource Sources[4]; // L0, H0, L1, H1
    virtual void Emit(EmitPass *Pass, const DstModifier &DstMod) { Pass->EmitAddPair(GII, Sources, DstMod); }
  };

  struct AddPairSubPattern : public Pattern {
    virtual void Emit(EmitPass *Pass, const DstModifier &Mod) {
      // DO NOTHING. Dummy pattern.
    }
  };

  auto [MI, New] = PairOutputMap.insert(std::make_pair(GII, PairOutputTy()));
  if (New) {
    AddPairPattern *Pat = new (m_allocator) AddPairPattern();
    Pat->GII = GII;
    Pat->Sources[0] = GetSource(GII->getOperand(0), false, false, IsSourceOfSample(Ex));
    Pat->Sources[1] = GetSource(GII->getOperand(1), false, false, IsSourceOfSample(Ex));
    Pat->Sources[2] = GetSource(GII->getOperand(2), false, false, IsSourceOfSample(Ex));
    Pat->Sources[3] = GetSource(GII->getOperand(3), false, false, IsSourceOfSample(Ex));
    AddPattern(Pat);
  } else {
    AddPairSubPattern *Pat = new (m_allocator) AddPairSubPattern();
    AddPattern(Pat);
  }
  if (Idx == 0)
    MI->second.first = Ex;
  else
    MI->second.second = Ex;

  return true;
}

bool CodeGenPatternMatch::matchSubPair(ExtractValueInst *Ex) {
  Value *V = Ex->getOperand(0);
  GenIntrinsicInst *GII = dyn_cast<GenIntrinsicInst>(V);
  if (!GII || GII->getIntrinsicID() != GenISAIntrinsic::GenISA_sub_pair)
    return false;

  if (Ex->getNumIndices() != 1)
    return false;
  unsigned Idx = *Ex->idx_begin();
  if (Idx != 0 && Idx != 1)
    return false;

  struct SubPairPattern : public Pattern {
    GenIntrinsicInst *GII;
    SSource Sources[4]; // L0, H0, L1, H1
    virtual void Emit(EmitPass *Pass, const DstModifier &DstMod) { Pass->EmitSubPair(GII, Sources, DstMod); }
  };

  struct SubPairSubPattern : public Pattern {
    virtual void Emit(EmitPass *Pass, const DstModifier &Mod) {
      // DO NOTHING. Dummy pattern.
    }
  };

  auto [MI, New] = PairOutputMap.insert(std::make_pair(GII, PairOutputTy()));
  if (New) {
    SubPairPattern *Pat = new (m_allocator) SubPairPattern();
    Pat->GII = GII;
    Pat->Sources[0] = GetSource(GII->getOperand(0), false, false, IsSourceOfSample(Ex));
    Pat->Sources[1] = GetSource(GII->getOperand(1), false, false, IsSourceOfSample(Ex));
    Pat->Sources[2] = GetSource(GII->getOperand(2), false, false, IsSourceOfSample(Ex));
    Pat->Sources[3] = GetSource(GII->getOperand(3), false, false, IsSourceOfSample(Ex));
    AddPattern(Pat);
  } else {
    SubPairSubPattern *Pat = new (m_allocator) SubPairSubPattern();
    AddPattern(Pat);
  }
  if (Idx == 0)
    MI->second.first = Ex;
  else
    MI->second.second = Ex;

  return true;
}

bool CodeGenPatternMatch::matchMulPair(ExtractValueInst *Ex) {
  Value *V = Ex->getOperand(0);
  GenIntrinsicInst *GII = dyn_cast<GenIntrinsicInst>(V);
  if (!GII || GII->getIntrinsicID() != GenISAIntrinsic::GenISA_mul_pair)
    return false;

  if (Ex->getNumIndices() != 1)
    return false;
  unsigned Idx = *Ex->idx_begin();
  if (Idx != 0 && Idx != 1)
    return false;

  struct MulPairPattern : public Pattern {
    GenIntrinsicInst *GII;
    SSource Sources[4]; // L0, H0, L1, H1
    virtual void Emit(EmitPass *Pass, const DstModifier &DstMod) { Pass->EmitMulPair(GII, Sources, DstMod); }
  };

  struct MulPairSubPattern : public Pattern {
    virtual void Emit(EmitPass *Pass, const DstModifier &Mod) {
      // DO NOTHING. Dummy pattern.
    }
  };

  auto [MI, New] = PairOutputMap.insert(std::make_pair(GII, PairOutputTy()));
  if (New) {
    MulPairPattern *Pat = new (m_allocator) MulPairPattern();
    Pat->GII = GII;
    Pat->Sources[0] = GetSource(GII->getOperand(0), false, false, IsSourceOfSample(Ex));
    Pat->Sources[1] = GetSource(GII->getOperand(1), false, false, IsSourceOfSample(Ex));
    Pat->Sources[2] = GetSource(GII->getOperand(2), false, false, IsSourceOfSample(Ex));
    Pat->Sources[3] = GetSource(GII->getOperand(3), false, false, IsSourceOfSample(Ex));
    AddPattern(Pat);
  } else {
    MulPairSubPattern *Pat = new (m_allocator) MulPairSubPattern();
    AddPattern(Pat);
  }
  if (Idx == 0)
    MI->second.first = Ex;
  else
    MI->second.second = Ex;

  return true;
}

bool CodeGenPatternMatch::matchPtrToPair(ExtractValueInst *Ex) {
  Value *V = Ex->getOperand(0);
  GenIntrinsicInst *GII = dyn_cast<GenIntrinsicInst>(V);
  if (!GII || GII->getIntrinsicID() != GenISAIntrinsic::GenISA_ptr_to_pair)
    return false;

  if (Ex->getNumIndices() != 1)
    return false;
  unsigned Idx = *Ex->idx_begin();
  if (Idx != 0 && Idx != 1)
    return false;

  struct PtrToPairPattern : public Pattern {
    GenIntrinsicInst *GII;
    SSource Sources[1]; // Ptr
    virtual void Emit(EmitPass *Pass, const DstModifier &DstMod) { Pass->EmitPtrToPair(GII, Sources, DstMod); }
  };

  struct PtrToPairSubPattern : public Pattern {
    virtual void Emit(EmitPass *Pass, const DstModifier &Mod) {
      // DO NOTHING. Dummy pattern.
    }
  };

  auto [MI, New] = PairOutputMap.insert(std::make_pair(GII, PairOutputTy()));
  if (New) {
    PtrToPairPattern *Pat = new (m_allocator) PtrToPairPattern();
    Pat->GII = GII;
    Pat->Sources[0] = GetSource(GII->getOperand(0), false, false, IsSourceOfSample(Ex));
    AddPattern(Pat);
  } else {
    PtrToPairSubPattern *Pat = new (m_allocator) PtrToPairSubPattern();
    AddPattern(Pat);
  }
  if (Idx == 0)
    MI->second.first = Ex;
  else
    MI->second.second = Ex;

  return true;
}

bool CodeGenPatternMatch::MatchAbsNeg(llvm::Instruction &I) {
  struct MovModifierPattern : public Pattern {
    SSource source;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Mov(source, modifier); }
  };
  bool match = false;
  e_modifier mod{};
  Value *source = nullptr;
  if (GetModifier(I, mod, source)) {
    MovModifierPattern *pattern = new (m_allocator) MovModifierPattern();
    pattern->source = GetSource(source, mod, false, IsSourceOfSample(&I));
    match = true;
    AddPattern(pattern);
  }
  return match;
}

bool CodeGenPatternMatch::MatchFrc(llvm::BinaryOperator &I) {
  if (!ContractionAllowed(I))
    return false;
  struct FrcPattern : public Pattern {
    SSource source;
    void Emit(EmitPass *pass, const DstModifier &modifier) override { pass->Frc(source, modifier); }

    bool supportsSaturate() override { return false; }
  };
  IGC_ASSERT(I.getOpcode() == Instruction::FSub);
  llvm::Value *source0 = I.getOperand(0);
  llvm::IntrinsicInst *source1 = llvm::dyn_cast<llvm::IntrinsicInst>(I.getOperand(1));
  bool found = false;
  if (source1 && source1->getIntrinsicID() == Intrinsic::floor) {
    if (source1->getOperand(0) == source0) {
      found = true;
    }
  }
  if (found) {
    FrcPattern *pattern = new (m_allocator) FrcPattern();
    pattern->source = GetSource(source0, true, false, IsSourceOfSample(&I));
    AddPattern(pattern);
  }
  return found;
}

/*
below pass handles x - frac(x) = floor(x) pattern. Refer below :

frc (8|M0) r20.0<1>:f r19.0<8;8,1>:f {Compacted, @1}
add (8|M0) (ge)f0.1 r19.0<1>:f r19.0<8;8,1>:f -r20.0<8;8,1>:f {@1}
rndd (8|M0) r21.0<1>:f (abs)r19.0<8;8,1>:f {Compacted, @1}
*/

bool CodeGenPatternMatch::MatchFloor(llvm::BinaryOperator &I) {
  if (IGC_IS_FLAG_ENABLED(DisableMatchFloor)) {
    return false;
  }
  struct FloorPattern : public Pattern {
    SSource source;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Floor(source, modifier); }
  };
  IGC_ASSERT(I.getOpcode() == Instruction::FSub);
  llvm::Value *source0 = I.getOperand(0);
  GenIntrinsicInst *source1 = dyn_cast<GenIntrinsicInst>(I.getOperand(1));
  bool found = false;
  if (source1 && source1->getIntrinsicID() == GenISAIntrinsic::GenISA_frc) {
    if (source1->getOperand(0) == source0) {
      found = true;
    }
  }
  if (found) {
    FloorPattern *pattern = new (m_allocator) FloorPattern();
    pattern->source = GetSource(source0, true, false, IsSourceOfSample(&I));
    AddPattern(pattern);
  }
  return found;
}

SSource CodeGenPatternMatch::GetSource(llvm::Value *value, bool modifier, bool regioning, bool isSampleDerivative) {
  llvm::Value *sourceValue = value;
  e_modifier mod = EMOD_NONE;
  if (modifier) {
    GetModifier(*sourceValue, mod, sourceValue);
  }
  return GetSource(sourceValue, mod, regioning, isSampleDerivative);
}

SSource CodeGenPatternMatch::GetSource(llvm::Value *value, e_modifier mod, bool regioning, bool isSampleDerivative) {
  SSource source;
  GetRegionModifier(source, value, regioning);
  source.value = value;
  source.mod = mod;
  MarkAsSource(value, isSampleDerivative);
  return source;
}

void CodeGenPatternMatch::MarkAsSource(llvm::Value *v, bool isSampleDerivative) {
  // update liveness of the sources
  if (IsConstOrSimdConstExpr(v)) {
    // skip constant
    return;
  }
  if (isa<Instruction>(v) || isa<Argument>(v)) {
    m_LivenessInfo->HandleVirtRegUse(v, m_root->getParent(), m_root);
  }
  // mark the source as used so that we know we need to generate this value
  if (llvm::Instruction *inst = llvm::dyn_cast<Instruction>(v)) {
    m_usedInstructions.insert(inst);
  }
  if (m_rootIsSubspanUse) {
    HandleSubspanUse(v, isSampleDerivative);
  }
}

bool CodeGenPatternMatch::IsSubspanUse(llvm::Value *v) { return m_subSpanUse.find(v) != m_subSpanUse.end(); }

bool CodeGenPatternMatch::IsSourceOfSample(llvm::Value *v) { return m_sampleSource.find(v) != m_sampleSource.end(); }

bool CodeGenPatternMatch::IsSourceOfSampleUnderCF(llvm::Value *v) {
  return m_sampleUnderCFsource.find(v) != m_sampleUnderCFsource.end();
}

bool CodeGenPatternMatch::IsPHISourceOfSampleUnderCF(llvm::Value *v) {
  return m_sampleUnderCFPHIsource.find(v) != m_sampleUnderCFPHIsource.end();
}

bool CodeGenPatternMatch::MatchFMA(llvm::IntrinsicInst &I) {
  struct FMAPattern : Pattern {
    SSource sources[3];
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Mad(sources, modifier); }
  };

  FMAPattern *pattern = new (m_allocator) FMAPattern();
  for (int i = 0; i < 3; i++) {
    llvm::Value *V = I.getOperand(i);
    pattern->sources[i] = GetSource(V, true, false, IsSourceOfSample(&I));
    if (isa<Constant>(V) &&
        (!m_Platform.support16BitImmSrcForMad() || V->getType()->getTypeID() != llvm::Type::HalfTyID || i == 1)) {
      // CNL+ mad instruction allows 16 bit immediate for src0 and src2
      AddToConstantPool(I.getParent(), V);
      pattern->sources[i].fromConstantPool = true;
    }
  }
  AddPattern(pattern);

  return true;
}

bool CodeGenPatternMatch::MatchPredAdd(llvm::BinaryOperator &I) {
  struct PredAddPattern : Pattern {
    SSource sources[2];
    SSource pred;
    e_predMode predMode;
    bool invertPred;
    void Emit(EmitPass *pass, const DstModifier &modifier) override {
      DstModifier modf = modifier;
      modf.predMode = predMode;
      pass->PredAdd(pred, invertPred, sources, modf);
    }

    // Cannot use the ".sat" modifier as it is applied only to data lanes enabled by the predication
    // while saturation operation matched MatchFloatingPointSatModifier should be executed
    // irrespective of the predicate value matched by MatchPredAdd.
    bool supportsSaturate() override { return false; }
  };

  if (m_ctx->getModuleMetaData()->isPrecise) {
    return false;
  }

  if (m_ctx->type == ShaderType::VERTEX_SHADER || !m_ctx->m_DriverInfo.SupportMatchPredAdd()) {
    return false;
  }

  bool found = false;

  llvm::Value *sources[2] = {nullptr, nullptr};
  llvm::Value *pred = nullptr;
  e_modifier src_mod[2] = {e_modifier::EMOD_NONE, e_modifier::EMOD_NONE};
  e_modifier pred_mod = e_modifier::EMOD_NONE;
  bool invertPred = false;
  if (!ContractionAllowed(I) || IGC_IS_FLAG_ENABLED(DisableMatchPredAdd)) {
    return false;
  }

  // Skip the pattern match if FPTrunc/FPEXt is used right after fadd
  if (I.hasOneUse()) {
    FPTruncInst *FPTrunc = llvm::dyn_cast<llvm::FPTruncInst>(*I.user_begin());
    FPExtInst *FPExt = llvm::dyn_cast<llvm::FPExtInst>(*I.user_begin());

    if (FPTrunc || FPExt) {
      return false;
    }
  }

  IGC_ASSERT(I.getOpcode() == Instruction::FAdd || I.getOpcode() == Instruction::FSub);
  for (uint iAdd = 0; iAdd < 2 && !found; iAdd++) {
    Value *src = I.getOperand(iAdd);
    llvm::BinaryOperator *mul = llvm::dyn_cast<llvm::BinaryOperator>(src);
    if (mul && mul->getOpcode() == Instruction::FMul) {
      if (!mul->hasOneUse() || !ContractionAllowed(*mul)) {
        continue;
      }

      for (uint iMul = 0; iMul < 2; iMul++) {
        if (llvm::SelectInst *selInst = dyn_cast<SelectInst>(mul->getOperand(iMul))) {
          ConstantFP *C1 = dyn_cast<ConstantFP>(selInst->getOperand(1));
          ConstantFP *C2 = dyn_cast<ConstantFP>(selInst->getOperand(2));
          if (C1 && C2 && selInst->hasOneUse()) {
            // select i1 %res_s48, float 1.000000e+00, float 0.000000e+00
            if ((C2->isZero() && C1->isExactlyValue(1.f))) {
              invertPred = false;
            }
            // select i1 %res_s48, float 0.000000e+00, float 1.000000e+00
            else if (C1->isZero() && C2->isExactlyValue(1.f)) {
              invertPred = true;
            } else {
              continue;
            }
          } // if (C1 && C2 && selInst->hasOneUse())
          else {
            continue;
          }

          //   % 97 = select i1 %res_s48, float 1.000000e+00, float 0.000000e+00
          //   %102 = fmul fast float %97 %98

          // case 1 (add)
          // Before match
          //   %105 = fadd %102 %103
          // After match
          //              %105 = %103
          //   (%res_s48) %105 = fadd %105 %98

          // case 2 (fsub match @ iAdd = 0)
          // Before match
          //   %105 = fsub %102 %103
          // After match
          //              %105 = -%103
          //   (%res_s48) %105 = fadd %105 %98

          // case 3 (fsub match @ iAdd = 1)
          // Before match
          //   %105 = fsub %103 %102
          // After match
          //              %105 = %103
          //   (%res_s48) %98 = fadd %105 -%98

          // sources[0]: store add operand (i.e. %103 above)
          // sources[1]: store mul operand (i.e. %98 above)

          sources[0] = I.getOperand(1 ^ iAdd);
          sources[1] = mul->getOperand(1 ^ iMul);
          pred = selInst->getOperand(0);

          GetModifier(*sources[0], src_mod[0], sources[0]);
          GetModifier(*sources[1], src_mod[1], sources[1]);
          GetModifier(*pred, pred_mod, pred);

          if (I.getOpcode() == Instruction::FSub) {
            src_mod[iAdd] = CombineModifier(EMOD_NEG, src_mod[iAdd]);
          }

          found = true;
          break;
        } //  if (llvm::SelectInst* selInst = dyn_cast<SelectInst>(mul->getOperand(iMul)))
      } // for (uint iMul = 0; iMul < 2; iMul++)
    } // if (mul && mul->getOpcode() == Instruction::FMul)
  } // for (uint iAdd = 0; iAdd < 2; iAdd++)

  if (found) {
    PredAddPattern *pattern = new (m_allocator) PredAddPattern();
    pattern->predMode = EPRED_NORMAL;
    pattern->sources[0] = GetSource(sources[0], src_mod[0], false, IsSourceOfSample(&I));
    pattern->sources[1] = GetSource(sources[1], src_mod[1], false, IsSourceOfSample(&I));
    pattern->pred = GetSource(pred, pred_mod, false, IsSourceOfSample(&I));
    pattern->invertPred = invertPred;

    AddPattern(pattern);
  }
  return found;
}

// we match the following pattern
// %c = fcmp %1 %2
// %g = sext %c to i32
// %h = and i32 %g 1065353216
// %m = bitcast i32 %h to float
// %p = fadd %m %n
bool CodeGenPatternMatch::MatchSimpleAdd(llvm::BinaryOperator &I) {
  struct SimpleAddPattern : public Pattern {
    SSource sources[2];
    SSource pred;

    e_predMode predMode;
    bool invertPred;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      DstModifier modf = modifier;
      modf.predMode = predMode;
      pass->PredAdd(pred, invertPred, sources, modf);
    }
  };

  IGC_ASSERT(I.getOpcode() == Instruction::FAdd);

  if (IGC_IS_FLAG_ENABLED(DisableMatchSimpleAdd)) {
    return false;
  }

  unsigned int repAddOperand = 0;
  BitCastInst *bitCastInst0 = llvm::dyn_cast<llvm::BitCastInst>(I.getOperand(0));
  BitCastInst *bitCastInst1 = llvm::dyn_cast<llvm::BitCastInst>(I.getOperand(1));
  BitCastInst *bitCastInst = NULL;

  for (auto UI = I.user_begin(); UI != I.user_end(); ++UI) {
    if (isa<GenIntrinsicInst>(*UI, GenISAIntrinsic::GenISA_fsat)) {
      return false;
    }
  }

  if (!bitCastInst0 && !bitCastInst1) {
    return false;
  }

  if (bitCastInst1) {
    bitCastInst = bitCastInst1;
    repAddOperand = 1;
  } else {
    bitCastInst = bitCastInst0;
    repAddOperand = 0;
  }

  if (!bitCastInst->hasOneUse()) {
    return false;
  }

  Instruction *andInst = dyn_cast<Instruction>(bitCastInst->getOperand(0));
  if (!andInst || (andInst->getOpcode() != Instruction::And)) {
    return false;
  }

  // check %h = and i32 %g 1065353216
  if (!andInst->getType()->isIntegerTy(32)) {
    return false;
  }

  ConstantInt *CInt = dyn_cast<ConstantInt>(andInst->getOperand(1));
  if (!CInt || (CInt->getZExtValue() != 0x3f800000)) {
    return false;
  }

  SExtInst *SExt = llvm::dyn_cast<llvm::SExtInst>(andInst->getOperand(0));
  if (!SExt) {
    return false;
  }

  CmpInst *cmp = llvm::dyn_cast<CmpInst>(SExt->getOperand(0));
  if (!cmp) {
    return false;
  }

  // match found
  SimpleAddPattern *pattern = new (m_allocator) SimpleAddPattern();
  llvm::Value *sources[2], *pred;
  e_modifier src_mod[2], pred_mod;

  sources[0] = I.getOperand(1 - repAddOperand);
  sources[1] = ConstantFP::get(I.getType(), 1.0);
  pred = cmp;

  GetModifier(*sources[0], src_mod[0], sources[0]);
  GetModifier(*sources[1], src_mod[1], sources[1]);
  GetModifier(*pred, pred_mod, pred);

  pattern->predMode = EPRED_NORMAL;
  pattern->sources[0] = GetSource(sources[0], src_mod[0], false, IsSourceOfSample(&I));
  pattern->sources[1] = GetSource(sources[1], src_mod[1], false, IsSourceOfSample(&I));
  pattern->pred = GetSource(pred, pred_mod, false, IsSourceOfSample(&I));
  pattern->invertPred = false;
  AddPattern(pattern);

  return true;
}

bool CodeGenPatternMatch::CanMatchMad(llvm::BinaryOperator &I) const {
  IGC_ASSERT(I.getOpcode() == Instruction::Add || I.getOpcode() == Instruction::Sub ||
             I.getOpcode() == Instruction::FAdd || I.getOpcode() == Instruction::FSub);

  // General restrictions
  if (IGC_IS_FLAG_ENABLED(DisableMatchMad)) {
    return false;
  }

  if (m_ctx->type == ShaderType::VERTEX_SHADER && m_ctx->m_DriverInfo.DisableMatchMad()) {
    return false;
  }

  if (m_ctx->type == ShaderType::COMPUTE_SHADER && m_ctx->getModuleMetaData()->disableMatchMadOptimizationForCS) {
    return false;
  }

  // Instruction specific restrictions
  bool isFpMad = I.getType()->isFloatingPointTy() ||
                 (I.getType()->isVectorTy() && I.getType()->getScalarType()->isFloatingPointTy() &&
                  IGC_IS_FLAG_ENABLED(VectorizerAllowFMADMatching));

  if (I.getOperand(0) == I.getOperand(1)) {
    // requirement, cannot match mad instruction if both ops of add/sub are the same, but can just do shl or mul by 2
    return false;
  }

  if (isFpMad &&
      (m_ctx->getModuleMetaData()->isPrecise || m_ctx->getModuleMetaData()->compOpt.disableMathRefactoring)) {
    return false;
  }

  if (I.getType()->isIntOrIntVectorTy() && !(m_Platform.doIntegerMad() && m_ctx->m_DriverInfo.EnableIntegerMad())) {
    return false;
  }

  if (m_ctx->type == ShaderType::VERTEX_SHADER && m_ctx->m_DriverInfo.PreventZFighting() &&
      !(m_ctx->getModuleMetaData()->allowMatchMadOptimizationforVS ||
        IGC_IS_FLAG_ENABLED(WaAllowMatchMadOptimizationforVS))) {
    if (m_PosDep->PositionDependsOnInst(&I)) {
      return false;
    }
  }

  if (isFpMad && !ContractionAllowed(I)) {
    return false;
  }

  return true;
}

bool CodeGenPatternMatch::MatchFMad(llvm::BinaryOperator &I) {
  struct FMadPattern : Pattern {
    SSource sources[3];
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      if (IGC_IS_FLAG_ENABLED(EnableVectorEmitter) && sources[0].value->getType()->isVectorTy() &&
          sources[1].value->getType()->isVectorTy() && sources[2].value->getType()->isVectorTy())
        pass->VectorMad(sources, modifier);
      else
        pass->Mad(sources, modifier);
    }
  };

  if (!CanMatchMad(I))
    return false;

  bool found = false;
  llvm::Value *sources[3];
  e_modifier src_mod[3];

  IGC_ASSERT(I.getOpcode() == Instruction::FAdd || I.getOpcode() == Instruction::FSub);
  for (uint i = 0; i < 2; i++) {
    Value *src = SkipCanonicalize(I.getOperand(i));
    if (FPExtInst *fpextInst = llvm::dyn_cast<llvm::FPExtInst>(src)) {
      if (!m_Platform.supportMixMode() && fpextInst->getSrcTy()->getTypeID() == llvm::Type::HalfTyID) {
        // no mix mode instructions
      } else if (fpextInst->getSrcTy()->getTypeID() != llvm::Type::DoubleTyID &&
                 fpextInst->getDestTy()->getTypeID() != llvm::Type::DoubleTyID) {
        src = fpextInst->getOperand(0);
      }
    }
    llvm::BinaryOperator *mul = llvm::dyn_cast<llvm::BinaryOperator>(src);

    if (mul && mul->getOpcode() == Instruction::FMul) {
      // in case we know we won't be able to remove the mul we don't merge it
      if (!m_PosDep->PositionDependsOnInst(mul) && NeedInstruction(*mul))
        continue;

      // Check either contraction allowed globally or fmul has allow contract flag (at this point fadd/fsub already has
      // contract flag if disabled at global level)
      if (!ContractionAllowed(*mul))
        continue;

      sources[2] = SkipCanonicalize(I.getOperand(1 - i));
      sources[1] = SkipCanonicalize(mul->getOperand(0));
      sources[0] = SkipCanonicalize(mul->getOperand(1));

      GetModifier(*sources[0], src_mod[0], sources[0]);
      GetModifier(*sources[1], src_mod[1], sources[1]);
      GetModifier(*sources[2], src_mod[2], sources[2]);

      sources[0] = SkipCanonicalize(sources[0]);
      sources[1] = SkipCanonicalize(sources[1]);
      sources[2] = SkipCanonicalize(sources[2]);
      if (I.getOpcode() == Instruction::FSub) {
        if (i == 0) {
          src_mod[2] = CombineModifier(EMOD_NEG, src_mod[2]);
        } else {
          if (llvm::isa<llvm::Constant>(sources[0])) {
            src_mod[1] = CombineModifier(EMOD_NEG, src_mod[1]);
          } else {
            src_mod[0] = CombineModifier(EMOD_NEG, src_mod[0]);
          }
        }
      }
      found = true;
      break;
    }
  }

  if (found) {
    FMadPattern *pattern = new (m_allocator) FMadPattern();
    for (int i = 0; i < 3; i++) {
      pattern->sources[i] = GetSource(sources[i], src_mod[i], false, IsSourceOfSample(&I));
      if (isa<ConstantFP>(sources[i]) &&
          (!m_Platform.support16BitImmSrcForMad() || !sources[i]->getType()->isHalfTy() || i == 1)) {
        // CNL+ mad instruction allows 16 bit immediate for src0 and src2
        AddToConstantPool(I.getParent(), sources[i]);
        pattern->sources[i].fromConstantPool = true;
      }
    }
    AddPattern(pattern);
  }
  return found;
}

bool CodeGenPatternMatch::IsMulCandidateForIMad(llvm::BinaryOperator *Mul, llvm::BinaryOperator *I) {
  IGC_ASSERT(Mul->getOpcode() == Instruction::Mul);
  IGC_ASSERT(I->getOpcode() == Instruction::Add || I->getOpcode() == Instruction::Sub);

  // in case we know we won't be able to remove the mul we don't merge it
  if (!m_PosDep->PositionDependsOnInst(Mul) && NeedInstruction(*Mul))
    return false;


  auto isValueNumBitsWide = [](Value *V, unsigned bitWidth) {
    // Look through casts because LLVM cannot represent mixed type mul and add that may match a mad instruction
    // i.e.
    // a64 = zext i32 a to i64
    // b64 = zext i32 b to i64
    // temp = mul i64 a64, b64
    // res = add i64 temp, c64
    while (isa<ZExtInst>(V) || isa<SExtInst>(V) || isa<BitCastInst>(V))
      V = cast<Instruction>(V)->getOperand(0);
    Type *T = V->getType();
    return (T->isIntegerTy() && T->getPrimitiveSizeInBits() == bitWidth);
  };

  // Mad instruction doesn't support QW type for mul operands
  if (isValueNumBitsWide(Mul->getOperand(1), 64) || isValueNumBitsWide(Mul->getOperand(0), 64))
    return false;

  if (IsConstOrSimdConstExpr(Mul->getOperand(1)) && IsConstOrSimdConstExpr(Mul->getOperand(0))) {
    // If both Mul operands are constants or simdSize intrinsic, do not match MAD, just match ADD/ADD3 with folded
    // constant.
    return false;
  }

  // In cases where mul has multiple users, do additional check.
  // Always allow single user case, even if cross block.
  if (Mul->getUniqueUndroppableUser() != I) {
    if (IGC_IS_FLAG_DISABLED(AllowMultipleMulUsesMatchMad)) {
      // If Mul has other users, do not match mad, just compute mul and use for the add/sub since it will likely need to
      // be computed to be used in the other instructions.
      return false;
    }

    // Mul has other users and is allowed. However, if mul is not in the same block as I, then matching mad may increase
    // reg pressure. Lifetime of Mul may get extended, matching something else like add3 may reduce reg pressure.
    if (IGC_IS_FLAG_DISABLED(AllowCrossBlockMatchMad) && Mul->getParent() != I->getParent()) {
      return false;
    }
  }

  auto *otherOp = (I->getOperand(0) == Mul) ? I->getOperand(1) : I->getOperand(0);
  // Mad instruction does not support qword type for add/sub operand
  if (isValueNumBitsWide(otherOp, 64)
  ) {
    return false;
  }
  return true;
}

bool CodeGenPatternMatch::MatchIMad(llvm::BinaryOperator &I) {
  struct IMadPattern : Pattern {
    SSource sources[3];
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Mad(sources, modifier); }
  };

  if (!CanMatchMad(I)) {
    return false;
  }

  if (!I.getType()->isIntegerTy()) {
    // TODO: Support vector IMad
    return false;
  }

  for (int i = 0; i < 2; i++) {
    auto *mul = llvm::dyn_cast<llvm::BinaryOperator>(I.getOperand(i));

    if (mul && mul->getOpcode() == Instruction::Mul) {
      if (!IsMulCandidateForIMad(mul, &I)) {
        continue;
      }

      // Helper to hold relevant info about how to emit each source operand
      struct {
        Value *src = nullptr;         // LLVM operand
        Value *originalSrc = nullptr; // original LLVM operand, before any modification for matching
        e_modifier mod = EMOD_NONE;
        bool isSigned = false;
        bool isConstant16BitEq = false;
        bool isUpcastSrc = false;
        bool isCandidate() { return needsRegioning || isConstant16BitEq || isUpcastSrc; }
        void reset() {
          src = originalSrc;
          mod = EMOD_NONE;
          isSigned = false;
          isConstant16BitEq = false;
          isUpcastSrc = false;
          needsRegioning = false;
          useLower = true;
        }

        // below properties relevant when src is 32-bit but can be represented in 16 bits
        bool needsRegioning = false; // needs regioning to reduce bit width
        bool useLower = true;        // for setting offset
      } oprdInfo[3];

      oprdInfo[2].src = I.getOperand(1 - i);
      oprdInfo[2].originalSrc = oprdInfo[2].src;
      oprdInfo[1].src = mul->getOperand(0);
      oprdInfo[1].originalSrc = oprdInfo[1].src;
      oprdInfo[0].src = mul->getOperand(1); // usually a constant LLVM operand will be 2nd mul op at this point
      oprdInfo[0].originalSrc = oprdInfo[0].src;

      // Check whether we can replace some/all of 32-bit MUL+ADD operands with partial 16-bit MAD operands
      // TODO: Can this also extend to replacing 16-bit operands with 8 bits? Need to be careful for byte regioning,
      // seems like there are more restrictions in vISA for matching 8-bit MAD
      if (IGC_IS_FLAG_ENABLED(EnableMixIntOperands) && I.getType()->isIntegerTy(32)) {
        using namespace llvm::PatternMatch;
        for (int opI = 0; opI < 3; ++opI) {
          Value *L = nullptr;
          if (IsConstOrSimdConstExpr(oprdInfo[opI].src)) {
            if (auto *CI = dyn_cast<ConstantInt>(oprdInfo[opI].src)) {
              int64_t val = CI->isNegative() ? CI->getSExtValue() : CI->getZExtValue();
              if (INT16_MIN <= val && val <= INT16_MAX) {
                oprdInfo[opI].isSigned = true;
                oprdInfo[opI].isConstant16BitEq = true;
              } else if (0 <= val && val <= UINT16_MAX) {
                oprdInfo[opI].isConstant16BitEq = true;
              }
            } else {
              // If the source is a derivative of simdSize intrinsic, compute the known bits
              // Not guaranteed to fit in 16 or 8 bits, because of intermediate operations modifying var range
              KnownBits knownBits = computeKnownBits(oprdInfo[opI].src, I.getModule()->getDataLayout());
              // rudimentary check since this doesn't account for negative values as a result of operations performed on
              // SIMDSize, may not catch all cases
              if (knownBits.countMaxActiveBits() <= 16) {
                oprdInfo[opI].isConstant16BitEq = true;
              }
            }
            // When doing matches for m_Value(L), if L is a captured 32-bit variable, then we set needsRegioning to true
            // to know to set regioning later
          } else if (match(oprdInfo[opI].src, m_And(m_Value(L), m_SpecificInt(0xFFFF)))) {
            oprdInfo[opI].src = L;
            oprdInfo[opI].needsRegioning = true;
          } else if (match(oprdInfo[opI].src, m_LShr(m_Value(L), m_SpecificInt(16)))) {
            oprdInfo[opI].src = L;
            oprdInfo[opI].needsRegioning = true;
            oprdInfo[opI].useLower = false;
          } else if (match(oprdInfo[opI].src, m_AShr(m_Shl(m_Value(L), m_SpecificInt(16)), m_SpecificInt(16)))) {
            oprdInfo[opI].src = L;
            oprdInfo[opI].isSigned = true;
            oprdInfo[opI].needsRegioning = true;
          } else if (PatternMatch::match(oprdInfo[opI].src, m_AShr(m_Value(L), m_SpecificInt(16)))) {
            oprdInfo[opI].src = L;
            oprdInfo[opI].isSigned = true;
            oprdInfo[opI].needsRegioning = true;
            oprdInfo[opI].useLower = false;
          } else if (match(oprdInfo[opI].src, m_ZExt(m_Value(L))) && L->getType()->isIntegerTy(16)) {
            oprdInfo[opI].src = L;
            oprdInfo[opI].isUpcastSrc = true;
          } else if (match(oprdInfo[opI].src, m_SExt(m_Value(L))) && L->getType()->isIntegerTy(16)) {
            oprdInfo[opI].src = L;
            oprdInfo[opI].isSigned = true;
            oprdInfo[opI].isUpcastSrc = true;
          }
        }
      }

      // Preemptive swapping of mul operands to try to get better candidates for src0 and src1 of mad
      // Order of if statements in increasing order of priority
      if (isUniform(oprdInfo[1].src) && !isUniform(oprdInfo[0].src) &&
          !(oprdInfo[1].needsRegioning && !oprdInfo[1].useLower)) {
        // put non-uniform value in src1, since src0 (src2 in gen) has regioning restrictions, but only if new value
        // in src0 will still be GRF aligned
        std::swap(oprdInfo[0], oprdInfo[1]);
      }
      if (IsConstOrSimdConstExpr(oprdInfo[1].src)) {
        // put the constant in src0
        std::swap(oprdInfo[0], oprdInfo[1]);
      }
      if (mul->getType()->isIntegerTy(32) && oprdInfo[1].isCandidate() && !oprdInfo[0].isCandidate()) {
        // put the wider operand in src1
        std::swap(oprdInfo[0], oprdInfo[1]);
      }

      // Need GRF-aligned src0 (in vISA, src2 in gen) for mad. Since preemptive swapping happened earlier, if conditions
      // are not satisfied just don't match mad. Alignment and regioning are limited for src0
      // - dword aligned but not using lower bits
      if (oprdInfo[0].needsRegioning && !oprdInfo[0].useLower
      ) {
        continue;
      }

      // - word aligned
      if (oprdInfo[0].isUpcastSrc) {
        oprdInfo[0].reset();
      }

      // Get source modifiers for all operands if not using regioning
      // Source modifier not supported on src1 for DW and lower precision integer multiply
      bool skipModifierSrc1 = false;
      if (m_Platform.supportsSourceModifierForMixedIntMad() ||
          !(mul->getType()->isIntegerTy(32) && (oprdInfo[0].isCandidate() ^ oprdInfo[1].isCandidate()))) {
        // Source modifier supported, but will only get applied properly if not using special regioning to specify
        // reduced values
        if (!oprdInfo[1].needsRegioning) {
          bool mod = GetModifier(*oprdInfo[1].src, oprdInfo[1].mod, oprdInfo[1].src);
          // Special case: NEG, ABS, or NEGABS modifier on unsigned upcasted source
          // Happens when:
          // B = llvm.abs(A) OR B = sub 0, A OR B = sub 0, llvm.abs(A)
          // C = zext i16 B to i32
          // origSrc=C, src=B, isSigned=false (from zext), mod=NONE -> src=A, isSigned=false, mod=ABS/NEG/NEGABS
          // Mod on A, which is guaranteed signed 16-bit since abs or neg or negabs was applied, undef for unsigned
          // values
          if (mod && oprdInfo[1].isUpcastSrc &&
              (oprdInfo[1].mod == EMOD_NEG || oprdInfo[1].mod == EMOD_ABS || oprdInfo[1].mod == EMOD_NEGABS)) {
            oprdInfo[1].isSigned = true;
          }
        }
      } else {
        skipModifierSrc1 = true;
      }
      if (!oprdInfo[0].needsRegioning) {
        bool mod = GetModifier(*oprdInfo[0].src, oprdInfo[0].mod, oprdInfo[0].src);
        if (mod && oprdInfo[0].isUpcastSrc &&
            (oprdInfo[0].mod == EMOD_NEG || oprdInfo[0].mod == EMOD_ABS || oprdInfo[0].mod == EMOD_NEGABS)) {
          oprdInfo[0].isSigned = true;
        }
      }
      if (!oprdInfo[2].needsRegioning) {
        bool mod = GetModifier(*oprdInfo[2].src, oprdInfo[2].mod, oprdInfo[2].src);
        if (mod && oprdInfo[2].isUpcastSrc &&
            (oprdInfo[2].mod == EMOD_NEG || oprdInfo[2].mod == EMOD_ABS || oprdInfo[2].mod == EMOD_NEGABS)) {
          oprdInfo[2].isSigned = true;
        }
      }

      if (I.getOpcode() == Instruction::Sub) {
        if (i == 0) {
          // Matching sub to mad instruction by trying to apply negative modifier to subtrahend
          if (auto *CI = dyn_cast<ConstantInt>(oprdInfo[2].src)) {
            int64_t val = CI->isNegative() ? CI->getSExtValue() : CI->getZExtValue();
            if (CI->isNegative()) {
              // flip without overflow concerns
              oprdInfo[2].src = ConstantInt::get(CI->getType(), -val);
              oprdInfo[2].isSigned = false;
            } else {
              if (val > (int64_t)(INT32_MAX) + 1
              ) {
                // negating will overflow
                return false;
              } else if (val > (int64_t)(INT16_MAX) + 1) {
                // negating will invalidate status
                oprdInfo[2].isConstant16BitEq = false;
              }
              oprdInfo[2].src = ConstantInt::get(CI->getType(), -val, true);
              oprdInfo[2].isSigned = true;
            }
          } else {
            // If src2 is candidate and unsigned, neg modifier will do 2s complement
            // This will result in incorrect values if src2 value falls between INT16_MAX and UINT16_MAX.
            // Unless NEG already present, fallback to full dword operand
            if (oprdInfo[2].isCandidate() && !oprdInfo[2].isSigned && oprdInfo[2].mod != EMOD_NEG) {
              oprdInfo[2].reset();
              GetModifier(*oprdInfo[2].src, oprdInfo[2].mod, oprdInfo[2].src);
              // fallthrough to combine neg modifier from sub instruction
            }
            oprdInfo[2].mod = CombineModifier(EMOD_NEG, oprdInfo[2].mod);
          }
        } else {
          // Matching sub to mad instruction by trying to apply negative modifier to minuend
          // try to apply neg to src0 if possible
          if (auto *CI = dyn_cast<ConstantInt>(oprdInfo[0].src)) {
            int64_t val = CI->isNegative() ? CI->getSExtValue() : CI->getZExtValue();
            // if src0 is a constant, need to make sure to negate it properly
            if (CI->isNegative()) {
              // flip without overflow concerns
              oprdInfo[0].src = ConstantInt::get(CI->getType(), -val);
              oprdInfo[0].isSigned = false;
            } else {
              if (val > (int64_t)(INT32_MAX) + 1
              ) {
                // negating will overflow
                return false;
              } else if (val > (int64_t)(INT16_MAX) + 1) {
                // can flip to negative, need to recheck 16-bit candidacy
                if (oprdInfo[0].isConstant16BitEq && !skipModifierSrc1 &&
                    (!oprdInfo[1].isCandidate() || oprdInfo[1].isSigned)) {
                  // if src0 was never a candidate, no consequence for flipping. Only consider flipping src1 if src0 is
                  // a candidate not modifier restricted for src1 and won't mess with src1 type/regioning by adding neg
                  // modifier, put modifier on src1 without making src0 non-16-bit candidate
                  oprdInfo[1].mod = CombineModifier(EMOD_NEG, oprdInfo[1].mod);
                } else {
                  // cannot put modifier on src1 or will need to reset src1, just negate src0
                  // this will likely result in mad no longer getting matched, but there is no other way
                  oprdInfo[0].src = ConstantInt::get(CI->getType(), -val, true);
                  oprdInfo[0].isSigned = true;
                  oprdInfo[0].isConstant16BitEq = false;
                }
              } else {
                // negative constant fits in 16 bits
                oprdInfo[0].src = ConstantInt::get(CI->getType(), -val, true);
                oprdInfo[0].isSigned = true;
              }
            }
          } else {
            // if src0 is candidate and unsigned, neg modifier will do 2s complement
            // this will result in incorrect values if src0 value falls between INT16_MAX and UINT16_MAX
            // unless NEG already present, fallback to full dword operand
            if (oprdInfo[0].isCandidate() && !oprdInfo[0].isSigned && oprdInfo[0].mod != EMOD_NEG) {
              if (!skipModifierSrc1) {
                oprdInfo[1].mod = CombineModifier(EMOD_NEG, oprdInfo[1].mod);
              } else {
                // this will likely cause mad not to be matched, but no other way
                oprdInfo[0].reset();
                GetModifier(*oprdInfo[0].src, oprdInfo[0].mod, oprdInfo[0].src);
                // combine neg modifier from sub instruction
                oprdInfo[0].mod = CombineModifier(EMOD_NEG, oprdInfo[0].mod);
              }
            } else {
              oprdInfo[0].mod = CombineModifier(EMOD_NEG, oprdInfo[0].mod);
            }
          }
        }
      }

      // Mad instruction does not support dword * dword, no operands were able to get reduced.
      if (mul->getType()->isIntegerTy(32) && !oprdInfo[0].isCandidate() &&
          !oprdInfo[1].isCandidate()
      ) {
        return false;
      }

      auto isValueConstantInt16Bit = [&](Value *V) {
        if (auto *CI = dyn_cast<ConstantInt>(V)) {
          return CI->getValue().isNegative() ? CI->getValue().sge(INT16_MIN) && CI->getValue().sle(INT16_MAX)
                                             : CI->getValue().uge(0) && CI->getValue().ule(UINT16_MAX);
        } else if (isa<Instruction>(V) && SIMDConstExpr(cast<Instruction>(V))) {
          KnownBits knownBits = computeKnownBits(V, I.getModule()->getDataLayout());
          // rudimentary check since this doesn't account for negative values as a result of operations performed on
          // SIMDSize
          return knownBits.countMaxActiveBits() <= 16;
        }
        return false;
      };

      unsigned numConst = 0;
      unsigned num16BitImm = 0;
      // At this point check whether constants can be immediate or they need to be in constant pool
      for (int i = 0; i < 3; ++i) {
        if (IsConstOrSimdConstExpr(oprdInfo[i].src)) {
          numConst++;
          if (m_Platform.support16BitImmSrcForMad() && isValueConstantInt16Bit(oprdInfo[i].src)) {
            num16BitImm++;
          }
        }
      }

      // control whether to allow any constant operand >16-bit to be generated to be used from the constant pool
      // if AllowConstMadOpMovToReg is set, we allow one such constant, otherwise all constants must be 16-bit
      const unsigned constLimit = IGC_IS_FLAG_ENABLED(AllowConstMadOpMovToReg)
                                      ? 1
                                      : 0;
      if ((numConst - num16BitImm) > constLimit) {
        continue;
      }

      IMadPattern *pattern = new (m_allocator) IMadPattern();

      bool addToConstantPool = false;
      for (int i = 0; i < 3; ++i) {
        pattern->sources[i] = GetSource(oprdInfo[i].src, oprdInfo[i].mod, false, IsSourceOfSample(&I));
        if (oprdInfo[i].isCandidate()) {
          SSource &thisSrc = pattern->sources[i];

          // for now, Use W/UW only if region_set is false or the src is scalar
          if (!thisSrc.region_set || (thisSrc.region[0] == 0 && thisSrc.region[1] == 1 && thisSrc.region[2] == 0)) {
            // Note that SSource's type, if set by GetSource(), should be 32bit type. It's safe to override it with
            // either UW or W. But for SSource's offset, need to re-calculate in term of 16bit, not 32bit.
            thisSrc.type = oprdInfo[i].isSigned ? ISA_TYPE_W : ISA_TYPE_UW;
            if (oprdInfo[i].needsRegioning) {
              thisSrc.elementOffset = (2 * thisSrc.elementOffset) + (oprdInfo[i].useLower ? 0 : 1);
            }
          }
          if (oprdInfo[i].needsRegioning && !isUniform(oprdInfo[i].src)) {
            thisSrc.region_set = true;
            thisSrc.region[0] = 16;
            thisSrc.region[1] = 8;
            thisSrc.region[2] = 2;
          }
        }

        if (isa<ConstantInt>(oprdInfo[i].src)) {
          if (!m_Platform.support16BitImmSrcForMad() || !isValueConstantInt16Bit(oprdInfo[i].src) ||
              addToConstantPool) {
            // Add all constants that are unable to fit in 16-bits to constant pool.
            AddToConstantPool(I.getParent(), oprdInfo[i].src);
            pattern->sources[i].fromConstantPool = true;
          } else if (!addToConstantPool
          ) {
            addToConstantPool = true;
          }
        }
      }
      AddPattern(pattern);
      return true;
    }
  }

  return false;
}

// match simdblockRead/Write with preceding inttoptr if possible
// to save a copy move
bool CodeGenPatternMatch::MatchBlockReadWritePointer(llvm::GenIntrinsicInst &I) {
  struct BlockReadWritePointerPattern : public Pattern {
    GenIntrinsicInst *inst;
    Value *offset;
    explicit BlockReadWritePointerPattern(GenIntrinsicInst *I, Value *ofst) : inst(I), offset(ofst) {}
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      switch (inst->getIntrinsicID()) {
      case GenISAIntrinsic::GenISA_simdBlockRead:
        pass->emitSimdBlockRead(inst, offset);
        break;
      case GenISAIntrinsic::GenISA_simdBlockWrite:
        pass->emitSimdBlockWrite(inst, offset);
        break;
      default:
        IGC_ASSERT_MESSAGE(0, "unexpected intrinsic");
        break;
      }
    }
  };

  if (I.getIntrinsicID() != GenISAIntrinsic::GenISA_simdBlockRead &&
      I.getIntrinsicID() != GenISAIntrinsic::GenISA_simdBlockWrite) {
    return false;
  }

  // check the address is inttoptr with same dst and src type width
  auto ptrVal = I.getOperand(0);
  auto I2P = dyn_cast<IntToPtrInst>(ptrVal);
  if (I2P && I2P->getOperand(0)->getType()->getIntegerBitWidth() ==
                 m_ctx->getRegisterPointerSizeInBits(I2P->getAddressSpace())) {
    auto addrBase = I2P->getOperand(0);
    BlockReadWritePointerPattern *pattern = new (m_allocator) BlockReadWritePointerPattern(&I, addrBase);
    MarkAsSource(addrBase, IsSourceOfSample(&I));
    // for write mark data ptr as well
    if (I.getIntrinsicID() == GenISAIntrinsic::GenISA_simdBlockWrite) {
      MarkAsSource(I.getOperand(1), IsSourceOfSample(&I));
    }

    AddPattern(pattern);
    return true;
  }
  return false;
}


// Pattern matching to detect and handle immediate offsets in load/store
// instructions.  It detects offsets of the form "add dst, var, imm"
// The add instruction is removed, and imm included in the LSC message descriptor.
//
// OpenCL Load/Store launder the address through an inttoptr....
//   %2 = add i32 %0, 64
//   %3 = inttoptr i32 %2 to i32 addrspace(131072)*
//   store i32 16, i32 addrspace(131072)* %3, align 8
// Fold to vISA lsc_store... [%0 + 64].
//
// Stuff like 3D GenISA_ldrawvector_indexed directly reference the add.
//   %723 = add i32 %713, 16
//   %724 = call <4 x i32> @llvm.genx.GenISA.ldrawvector.indexed.v4i32.p2621448v4f32(
//            <4 x float> addrspace(2621448)* %runtime_value_1, i32 %723, i32 4, i1 false)
// Fold to lsc_load [%713 + 16].
static uint GetNbSources(llvm::Instruction &v) {
  if (llvm::CallInst *intrin = llvm::dyn_cast<llvm::CallInst>(&v)) {
    return IGCLLVM::getNumArgOperands(intrin);
  }
  return v.getNumOperands();
}

// Returns true if the value is an integer constant that can be encoded in
// 32 bits.
inline bool Is32BitConstInt(llvm::Value *val) {
  llvm::ConstantInt *imm = llvm::dyn_cast<ConstantInt>(val);
  if (imm && imm->getSExtValue() <= std::numeric_limits<int32_t>::max() &&
      imm->getSExtValue() >= std::numeric_limits<int32_t>::min()) {
    return true;
  }
  return false;
}

bool CodeGenPatternMatch::MatchImmOffsetLSC(llvm::Instruction &I) {
  struct LSCImmOffsetPattern : public Pattern {
    explicit LSCImmOffsetPattern(Instruction *I, llvm::Value *varOffset, llvm::ConstantInt *immOffset)
        : m_inst(I), m_varOff(varOffset), m_immOff(immOffset) {}

    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      if (isa<LoadInst>(m_inst)) {
        pass->emitLoad(cast<LoadInst>(m_inst), m_varOff, m_immOff);
      } else if (isa<StoreInst>(m_inst)) {
        pass->emitStore(cast<StoreInst>(m_inst), m_varOff, m_immOff);
      } else if (isa<LdRawIntrinsic>(m_inst)) {
        pass->emitLoadRawIndexed(cast<LdRawIntrinsic>(m_inst), m_varOff, nullptr, m_immOff);
      } else if (isa<StoreRawIntrinsic>(m_inst)) {
        pass->emitStoreRawIndexed(cast<StoreRawIntrinsic>(m_inst), m_varOff, nullptr, m_immOff);
      } else {
        IGC_ASSERT_MESSAGE(false, "unmatched imm off pattern");
      }
    }

  private:
    llvm::Instruction *m_inst;
    llvm::Value *m_varOff;
    llvm::ConstantInt *m_immOff;
  }; // LSCImmOffsetPattern

  // match:
  //   %addr = (add/sub %var, %imm) | (add %imm, %var)
  //   [ %addr = inttoptr %addr, ... ]
  //   load/store/whatever ... %addr
  int addrOpnd = -1;
  if (llvm::isa<LoadInst>(&I) || llvm::isa<StoreInst>(&I)) {
    // buffer load/store instructions in compute languages are usually
    // buried behind a inttoptr op
    addrOpnd = llvm::isa<LoadInst>(&I) ? 0 : 1;
  } else if (llvm::isa<llvm::GenIntrinsicInst>(I)) {
    llvm::GenIntrinsicInst &GI = llvm::cast<llvm::GenIntrinsicInst>(I);
    switch (GI.getIntrinsicID()) {
    // TODO: incrementally enable others
    //
    // case GenISAIntrinsic::GenISA_intatomicraw:
    // case GenISAIntrinsic::GenISA_floatatomicraw:
    // case GenISAIntrinsic::GenISA_intatomicrawA64:
    // case GenISAIntrinsic::GenISA_floatatomicrawA64:
    // case GenISAIntrinsic::GenISA_icmpxchgatomicraw:
    // case GenISAIntrinsic::GenISA_icmpxchgatomicrawA64:
    // case GenISAIntrinsic::GenISA_fcmpxchgatomicrawA64:
    //  all these will target => emitAtomicRaw
    //
    case GenISAIntrinsic::GenISA_ldrawvector_indexed:
    case GenISAIntrinsic::GenISA_ldraw_indexed:
    case GenISAIntrinsic::GenISA_storerawvector_indexed:
    case GenISAIntrinsic::GenISA_storeraw_indexed:
      addrOpnd = 1;
      break;
      // return false;
    // TODO: are these even reachable here???
    // case GenISAIntrinsic::GenISA_ldstructured:
    // case GenISAIntrinsic::GenISA_storestructured1:
    // case GenISAIntrinsic::GenISA_storestructured2:
    // case GenISAIntrinsic::GenISA_storestructured3:
    // case GenISAIntrinsic::GenISA_storestructured4:
    //    addrOpnd = 2;
    //    break;
    default:
      return false;
    }
  } else {
    return false;
  }

  if (addrOpnd < 0) {
    return false;
  }

  llvm::Instruction *addSubInst = llvm::dyn_cast<llvm::Instruction>(I.getOperand((unsigned)addrOpnd));
  if (!addSubInst) {
    // e.g. the address is a constant
    return false;
  }

  Type *addInstType = addSubInst->getType();
  // Note that the A64 addressing mode can be only connected with load and store instructions
  bool isA64AddressingModel = addInstType->isPointerTy() && IGC::isA64Ptr(cast<PointerType>(addInstType), m_ctx);

  llvm::Instruction *intToPtrInst = nullptr;
  if (addSubInst->getOpcode() == llvm::Instruction::IntToPtr) {
    intToPtrInst = addSubInst;
    addSubInst = llvm::dyn_cast<llvm::Instruction>(addSubInst->getOperand(0));
  }
  if (!addSubInst) {
    return false; // e.g. intToPtr of constant
  } else if (addSubInst->getOpcode() != llvm::Instruction::Add && addSubInst->getOpcode() != llvm::Instruction::Sub) {
    return false;
  }

  // VISA interface accepts 32-bit immediate offset values
  const bool isConstant0 = Is32BitConstInt(addSubInst->getOperand(0));
  const bool isConstant1 = Is32BitConstInt(addSubInst->getOperand(1));
  if (isConstant1 || (isConstant0 && addSubInst->getOpcode() == llvm::Instruction::Add)) {
    // YES: var + imm
    // YES: var - imm
    // YES: imm + var
    // NO:  imm - var  (IGC drops the negation otherwise)
    IGC_ASSERT_MESSAGE(!isConstant0 || !isConstant1,
                       "Both operands are immediate - constants should be folded elsewhere.");

    llvm::Value *varOffset = isConstant0 ? addSubInst->getOperand(1) : addSubInst->getOperand(0);


    // HW does an early bounds check on varOffset for A32 messages. Thus, if varOffset
    // is negative, then the bounds check fails early even though the immediate offset
    // would bring the final calculation to a positive number.
    bool disableA32ImmediateGlobalBaseOffset =
        !isA64AddressingModel && !UsedWithoutImmInMemInst(varOffset) && !valueIsPositive(varOffset, m_DL) &&
        IGC_GET_FLAG_VALUE(LscImmOffsMatch) < 3;

    if (disableA32ImmediateGlobalBaseOffset)
      return false;

    MarkAsSource(varOffset, IsSourceOfSample(&I));

    unsigned numSources = GetNbSources(I);
    for (unsigned i = 0; i < numSources; i++) {
      if (I.getOperand(i) != intToPtrInst && I.getOperand(i) != addSubInst) {
        MarkAsSource(I.getOperand(i), IsSourceOfSample(&I));
      }
    }

    llvm::Value *immOffset = isConstant0 ? addSubInst->getOperand(0) : addSubInst->getOperand(1);

    LSCImmOffsetPattern *pattern =
        new (m_allocator) LSCImmOffsetPattern(&I, varOffset, llvm::dyn_cast<llvm::ConstantInt>(immOffset));
    AddPattern(pattern);
    return true;
  }

  return false;
}

bool CodeGenPatternMatch::MatchLrp(llvm::BinaryOperator &I) {
  struct LRPPattern : public Pattern {
    SSource sources[3];
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Lrp(sources, modifier); }
  };

  if (!I.getType()->isFloatTy())
    return false;
  if (!m_Platform.supportLRPInstruction())
    return false;

  if (m_ctx->getModuleMetaData()->isPrecise) {
    return false;
  }

  bool found = false;
  llvm::Value *sources[3];
  e_modifier src_mod[3];

  if (!ContractionAllowed(I)) {
    return false;
  }

  IGC_ASSERT((I.getOpcode() == Instruction::FAdd) || (I.getOpcode() == Instruction::FSub));

  bool startPatternIsAdd = false;
  if (I.getOpcode() == Instruction::FAdd) {
    startPatternIsAdd = true;
  }

  // match the case: dst = src0 (src1 - src2)  + src2;
  for (uint i = 0; i < 2; i++) {
    llvm::BinaryOperator *mul = llvm::dyn_cast<llvm::BinaryOperator>(I.getOperand(i));
    if (mul && mul->getOpcode() == Instruction::FMul && ContractionAllowed(*mul)) {
      for (uint j = 0; j < 2; j++) {
        llvm::BinaryOperator *sub = llvm::dyn_cast<llvm::BinaryOperator>(mul->getOperand(j));
        if (sub && ContractionAllowed(*sub)) {
          llvm::ConstantFP *zero = llvm::dyn_cast<llvm::ConstantFP>(sub->getOperand(0));
          if (zero && zero->isExactlyValue(0.f)) {
            // in this case we can optimize the pattern into fmad and give better result
            continue;
          }

          if ((startPatternIsAdd && sub->getOpcode() == Instruction::FSub) ||
              (!startPatternIsAdd && i == 0 && sub->getOpcode() == Instruction::FAdd)) {
            if (sub->getOperand(1) == I.getOperand(1 - i) && mul->getOperand(0) != mul->getOperand(1)) {
              sources[0] = mul->getOperand(1 - j);
              sources[1] = sub->getOperand(0);
              sources[2] = sub->getOperand(1);
              GetModifier(*sources[0], src_mod[0], sources[0]);
              GetModifier(*sources[1], src_mod[1], sources[1]);
              GetModifier(*sources[2], src_mod[2], sources[2]);

              if (!startPatternIsAdd && i == 0) {
                // handle patterns like this:
                // dst = src0 (src1 + src2) - src2;
                src_mod[2] = CombineModifier(EMOD_NEG, src_mod[2]);
              }

              found = true;
              break;
            }
          }
        }
      }
    }
    if (found) {
      break;
    }
  }

  // match the case: dst = src0 * src1 + src2 * (1.0 - src0);
  if (!found) {
    llvm::BinaryOperator *mul[2];
    mul[0] = llvm::dyn_cast<llvm::BinaryOperator>(I.getOperand(0));
    mul[1] = llvm::dyn_cast<llvm::BinaryOperator>(I.getOperand(1));
    if (mul[0] && mul[0]->getOpcode() == Instruction::FMul && ContractionAllowed(*mul[0]) && mul[1] &&
        mul[1]->getOpcode() == Instruction::FMul && ContractionAllowed(*mul[1]) &&
        !llvm::isa<llvm::ConstantFP>(mul[0]->getOperand(0)) && !llvm::isa<llvm::ConstantFP>(mul[0]->getOperand(1)) &&
        !llvm::isa<llvm::ConstantFP>(mul[1]->getOperand(0)) && !llvm::isa<llvm::ConstantFP>(mul[1]->getOperand(1))) {
      for (uint i = 0; i < 2; i++) {
        for (uint j = 0; j < 2; j++) {
          llvm::BinaryOperator *sub = llvm::dyn_cast<llvm::BinaryOperator>(mul[i]->getOperand(j));
          if (sub && sub->getOpcode() == Instruction::FSub && ContractionAllowed(*sub)) {
            llvm::ConstantFP *one = llvm::dyn_cast<llvm::ConstantFP>(sub->getOperand(0));
            if (one && one->isExactlyValue(1.f)) {
              for (uint k = 0; k < 2; k++) {
                if (sub->getOperand(1) == mul[1 - i]->getOperand(k)) {
                  sources[0] = sub->getOperand(1);
                  sources[1] = mul[1 - i]->getOperand(1 - k);
                  sources[2] = mul[i]->getOperand(1 - j);
                  GetModifier(*sources[0], src_mod[0], sources[0]);
                  GetModifier(*sources[1], src_mod[1], sources[1]);
                  GetModifier(*sources[2], src_mod[2], sources[2]);
                  if (!startPatternIsAdd) {
                    if (i == 1) {
                      // handle patterns like this:
                      // dst = (src1 * src0) - (src2 * (1.0 - src0))
                      src_mod[2] = CombineModifier(EMOD_NEG, src_mod[2]);
                    } else {
                      // handle patterns like this:
                      // dst = (src2 * (1.0 - src0)) - (src1 * src0)
                      src_mod[1] = CombineModifier(EMOD_NEG, src_mod[1]);
                    }
                  }
                  found = true;
                  break;
                }
              }
            }
          }
          if (found) {
            break;
          }
        }
        if (found) {
          break;
        }
      }
    }
  }

  if (!found) {
    // match the case: dst = src2 - (src0 * src2) + (src0 * src1);
    // match the case: dst = (src0 * src1) + src2 - (src0 * src2);
    // match the case: dst = src2 + (src0 * src1) - (src0 * src2);
    if (I.getOpcode() == Instruction::FAdd || I.getOpcode() == Instruction::FSub) {
      // dst = op[0] +/- op[1] +/- op[2]
      llvm::Instruction *op[3];
      llvm::Instruction *addSub1 = llvm::dyn_cast<llvm::Instruction>(I.getOperand(0));
      if (addSub1 && (addSub1->getOpcode() == Instruction::FSub || addSub1->getOpcode() == Instruction::FAdd)) {
        op[0] = llvm::dyn_cast<llvm::Instruction>(addSub1->getOperand(0));
        op[1] = llvm::dyn_cast<llvm::Instruction>(addSub1->getOperand(1));
        op[2] = llvm::dyn_cast<llvm::Instruction>(I.getOperand(1));

        if (op[0] && op[1] && op[2]) {
          for (uint casei = 0; casei < 3; casei++) {
            // i, j, k are the index for op[]
            uint i = (casei == 2 ? 1 : 0);
            uint j = (casei == 0 ? 1 : 2);
            uint k = 2 - casei;

            // op[i] and op[j] should be fMul, and op[k] is src2
            if (op[i]->getOpcode() == Instruction::FMul && op[j]->getOpcode() == Instruction::FMul &&
                ContractionAllowed(*op[i]) && ContractionAllowed(*op[j])) {
              for (uint srci = 0; srci < 2; srci++) {
                for (uint srcj = 0; srcj < 2; srcj++) {
                  // op[i] and op[j] needs to have one common source. this common source will be src0
                  if (op[i]->getOperand(srci) == op[j]->getOperand(srcj)) {
                    // one of the non-common source from op[i] and op[j] needs to be the same as op[k], which is src2
                    if (op[k] == op[i]->getOperand(1 - srci) || op[k] == op[j]->getOperand(1 - srcj)) {
                      // disable if any of the sources is immediate
                      if (llvm::isa<llvm::ConstantFP>(op[i]->getOperand(srci)) ||
                          llvm::isa<llvm::ConstantFP>(op[i]->getOperand(1 - srci)) ||
                          llvm::isa<llvm::ConstantFP>(op[j]->getOperand(srcj)) ||
                          llvm::isa<llvm::ConstantFP>(op[j]->getOperand(1 - srcj)) ||
                          llvm::isa<llvm::ConstantFP>(op[k])) {
                        break;
                      }

                      // check the add/sub cases and add negate to the sources when needed.
                      /*
                      ( src0src1, -src0src2, src2 )   okay
                      ( src0src1, -src0src2, -src2 )  skip
                      ( src0src1, src0src2, src2 )    skip
                      ( src0src1, src0src2, -src2 )   negate src2
                      ( -src0src1, -src0src2, src2 )  negate src1
                      ( -src0src1, -src0src2, -src2 ) skip
                      ( -src0src1, src0src2, src2 )   skip
                      ( -src0src1, src0src2, -src2 )  negate src1 src2
                      */

                      bool SignPositiveOp[3];
                      SignPositiveOp[0] = true;
                      SignPositiveOp[1] = (addSub1->getOpcode() == Instruction::FAdd);
                      SignPositiveOp[2] = (I.getOpcode() == Instruction::FAdd);

                      uint mulSrc0Src1Index = op[k] == op[i]->getOperand(1 - srci) ? j : i;
                      uint mulSrc0Src2Index = op[k] == op[i]->getOperand(1 - srci) ? i : j;

                      if (SignPositiveOp[mulSrc0Src2Index] == SignPositiveOp[k]) {
                        // abort the cases marked as "skip" in the comment above
                        break;
                      }

                      sources[0] = op[i]->getOperand(srci);
                      sources[1] = op[k] == op[i]->getOperand(1 - srci) ? op[j]->getOperand(1 - srcj)
                                                                        : op[i]->getOperand(1 - srci);
                      sources[2] = op[k];
                      GetModifier(*sources[0], src_mod[0], sources[0]);
                      GetModifier(*sources[1], src_mod[1], sources[1]);
                      GetModifier(*sources[2], src_mod[2], sources[2]);

                      if (SignPositiveOp[mulSrc0Src1Index] == false) {
                        src_mod[1] = CombineModifier(EMOD_NEG, src_mod[1]);
                      }
                      if (SignPositiveOp[k] == false) {
                        src_mod[2] = CombineModifier(EMOD_NEG, src_mod[2]);
                      }

                      found = true;
                    }
                  }
                }
                if (found) {
                  break;
                }
              }
            }
            if (found) {
              break;
            }
          }
        }
      }
    }
  }

  if (found) {
    LRPPattern *pattern = new (m_allocator) LRPPattern();
    for (int i = 0; i < 3; i++) {
      pattern->sources[i] = GetSource(sources[i], src_mod[i], false, IsSourceOfSample(&I));
    }
    AddPattern(pattern);
  }
  return found;
}

bool CodeGenPatternMatch::MatchCmpSext(llvm::Instruction &I) {
  /*
      %res_s42 = icmp eq i32 %src1_s41, 0
      %17 = sext i1 %res_s42 to i32
          to
      %res_s42 (i32) = icmp eq i32 %src1_s41, 0


      %res_s73 = fcmp oge float %res_s61, %42
      %46 = sext i1 %res_s73 to i32
          to
      %res_s73 (i32) = fcmp oge float %res_s61, %42
  */

  struct CmpSextPattern : Pattern {
    llvm::CmpInst *inst;
    SSource sources[2];
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      pass->Cmp(inst->getPredicate(), sources, modifier);
    }
  };
  bool match = false;

  if (CmpInst *cmpInst = dyn_cast<CmpInst>(I.getOperand(0))) {
    if (cmpInst->getOperand(0)->getType()->getPrimitiveSizeInBits() == I.getType()->getPrimitiveSizeInBits()) {
      CmpSextPattern *pattern = new (m_allocator) CmpSextPattern();
      bool supportModifier = SupportsModifier(cmpInst, m_Platform);

      pattern->inst = cmpInst;
      pattern->sources[0] = GetSource(cmpInst->getOperand(0), supportModifier, false, IsSourceOfSample(&I));
      pattern->sources[1] = GetSource(cmpInst->getOperand(1), supportModifier, false, IsSourceOfSample(&I));
      AddPattern(pattern);
      match = true;
    }
  }

  return match;
}

// Matches the following sequence:
//  %src0v4i8 = bitcast i32 %x to <4 x i8>
//  %elem1 = extractelement <4 x i8> %src0v4i8, i32 1
//  %dst = {s|z}ext i8 %elem1 to i32
bool CodeGenPatternMatch::MatchUnpack4i8(Instruction &I) {
  struct UnpackPattern : public Pattern {
    SSource source;
    uint32_t index;
    bool isUnsigned;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      pass->EmitUnpack4i8(source, index, isUnsigned, modifier);
    }
  };
  if (I.getType()->isIntegerTy(32) && (isa<ZExtInst>(&I) || isa<SExtInst>(&I))) {
    auto extract = dyn_cast<ExtractElementInst>(I.getOperand(0));
    if (extract && isa<ConstantInt>(extract->getIndexOperand()) && isa<BitCastInst>(extract->getVectorOperand()) &&
        cast<BitCastInst>(extract->getVectorOperand())->getType()->isIntOrIntVectorTy(8) &&
        !cast<BitCastInst>(extract->getVectorOperand())->getOperand(0)->getType()->isVectorTy() &&
        cast<BitCastInst>(extract->getVectorOperand())->getOperand(0)->getType()->getPrimitiveSizeInBits() == 32) {
      UnpackPattern *pattern = new (m_allocator) UnpackPattern();
      auto bitcast = cast<BitCastInst>(extract->getVectorOperand());
      uint32_t index = int_cast<uint32_t>(cast<ConstantInt>(extract->getIndexOperand())->getZExtValue());
      pattern->source = GetSource(bitcast->getOperand(0), false, false, IsSourceOfSample(&I));
      pattern->index = index;
      pattern->isUnsigned = isa<ZExtInst>(&I);
      AddPattern(pattern);
      return true;
    }
  }
  return false;
}

// Matches the following patterns
//
// Pattern 1, pack 4 i8 into i32
// Match the following sequence:
//  %x8 = trunc i32 %x to i8
//  %y8 = trunc i32 %y to i8
//  %z8 = trunc i32 %z to i8
//  %w8 = trunc i32 %w to i8
//  %vec0 = insertelement <4 x i8> undef, i8 %x8, i64 0
//  %vec1 = insertelement <4 x i8> %vec0, i8 %y8, i64 1
//  %vec2 = insertelement <4 x i8> %vec1, i8 %z8, i64 2
//  %vec3 = insertelement <4 x i8> %vec2, i8 %w8, i64 3
//  %dst = bitcast <4 x i8> %vec3 to i32
// And generate:
//  mov (M1_NM, 1) dst_0(0,0)<4> x(0,0)<0;1,0>
//  mov (M1_NM, 1) dst_0(0,1)<4> y(0,0)<0;1,0>
//  mov (M1_NM, 1) dst_0(0,2)<4> z(0,0)<0;1,0>
//  mov (M1_NM, 1) dst_0(0,3)<4> w(0,0)<0;1,0>
//
// Pattern 2, pack 4 i8 into i32 with saturation
// Match the following sequence:
//  %x1 = max i32 %x0, 0
//  %x2 = min i32 %x1, 127
//  %y1 = max i32 %y0, 0
//  %y2 = min i32 %y1, 127
//  %z1 = max i32 %z0, 0
//  %z2 = min i32 %z1, 127
//  %w1 = max i32 %w0, 0
//  %w2 = min i32 %w1, 127
//  %x8 = trunc i32 %x2 to i8
//  %y8 = trunc i32 %y2 to i8
//  %z8 = trunc i32 %z2 to i8
//  %w8 = trunc i32 %w2 to i8
//  %vec0 = insertelement <4 x i8> undef, i8 %x8, i64 0
//  %vec1 = insertelement <4 x i8> %vec0, i8 %y8, i64 1
//  %vec2 = insertelement <4 x i8> %vec1, i8 %z8, i64 2
//  %vec3 = insertelement <4 x i8> %vec2, i8 %w8, i64 3
//  %dst = bitcast <4 x i8> %vec3 to i32
// And generate:
//  max.sat (M1_NM, 1) xyzw_0(0,0)<4> x(0,0)<0;1,0> 0x0:d
//  max.sat (M1_NM, 1) xyzw_0(0,1)<4> y(0,0)<0;1,0> 0x0:d
//  max.sat (M1_NM, 1) xyzw_0(0,2)<4> z(0,0)<0;1,0> 0x0:d
//  max.sat (M1_NM, 1) xyzw_0(0,3)<4> w(0,0)<0;1,0> 0x0:d
//
// Pattern 3, pack 4 i8 into i32 with saturation
// Match the following sequence:
//  %x1 = max i32 %x0, 0
//  %x2 = min i32 %x1, 127
//  %y1 = max i32 %y0, 0
//  %y2 = min i32 %y1, 127
//  %z1 = max i32 %z0, 0
//  %z2 = min i32 %z1, 127
//  %w1 = max i32 %w0, 0
//  %w2 = min i32 %w1, 127
//  %y8 = shl nuw nsw i32 %y2, 8
//  %xy = or i32 %x2, %y8
//  %z8 = shl nuw nsw i32 %z2, 16
//  %xyz = or i32 %z8, %xy
//  %w8 = shl nuw nsw i32 %w2, 24
//  %dst = or i32 %xyz, %w8
// And generate:
//  max.sat (M1_NM, 1) xyzw_0(0,0)<4> x(0,0)<0;1,0> 0x0:d
//  max.sat (M1_NM, 1) xyzw_0(0,1)<4> y(0,0)<0;1,0> 0x0:d
//  max.sat (M1_NM, 1) xyzw_0(0,2)<4> z(0,0)<0;1,0> 0x0:d
//  max.sat (M1_NM, 1) xyzw_0(0,3)<4> w(0,0)<0;1,0> 0x0:d
//
bool CodeGenPatternMatch::MatchPack4i8(Instruction &I) {
  using namespace llvm::PatternMatch; // Scoped using declaration.

  struct PackPattern : public Pattern {
    std::array<EOPCODE, 4> opcodes;
    std::array<SSource, 4> sources0;
    std::array<SSource, 4> sources1;
    std::array<bool, 4> isSat;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      pass->EmitPack4i8(opcodes, sources0, sources1, isSat, modifier);
    }
  };

  // Lambda matches a `min` or `max` operation with a specific immediate
  // constant value (equal to `imm`) and returns the other source value
  // in the `otherSrc`.
  auto MatchMinMaxWithImm = [this](Value *v, uint64_t imm, bool matchMin, Value *&otherSrc) -> bool {
    bool isUnsigned;
    bool isMin;
    Value *src[2];
    if (isMinOrMax(v, src[0], src[1], isMin, isUnsigned) && isMin == matchMin &&
        (isa<ConstantInt>(src[0]) || isa<ConstantInt>(src[1]))) {
      ConstantInt *c = isa<ConstantInt>(src[0]) ? cast<ConstantInt>(src[0]) : cast<ConstantInt>(src[1]);
      if (c->getZExtValue() == imm) {
        otherSrc = isa<ConstantInt>(src[0]) ? src[1] : src[0];
        return true;
      }
    }
    return false;
  };
  // Lambda matches clamp(x, MIN, MAX) pattern.
  // If the pattern is found `x` is returned in the `clampedVal` reference.
  auto MatchClampWithImm = [&MatchMinMaxWithImm](Value *v, Value *&clampedVal, uint32_t minVal, uint32_t maxVal) -> bool {
    bool matchMin = true;
    bool matchMax = false;
    Value *src[2];
    // Match either of:
    // v = min(max(x, MIN), MAX)
    // v = max(min(x, MIN), MAX)
    if ((MatchMinMaxWithImm(v, maxVal, matchMin, src[0]) && MatchMinMaxWithImm(src[0], minVal, matchMax, src[1])) ||
        (MatchMinMaxWithImm(v, minVal, matchMax, src[0]) && MatchMinMaxWithImm(src[0], maxVal, matchMin, src[1]))) {
      clampedVal = src[1];
      return true;
    }
    return false;
  };
  // Lambda matches clamp(x, 0, 127) pattern.
  // If the pattern is found `x` is returned in the `clampedVal` reference.
  auto MatchClamp0_127 = [&MatchClampWithImm](Value *v, Value *&clampedVal) -> bool {
    return MatchClampWithImm(v, clampedVal, 0, 127);
  };

  EOPCODE opcodes[4] = {};
  Value *sources0[4] = {};
  Value *sources1[4] = {};
  bool isSat[4] = {};

  uint32_t elemsFound = 0;

  // Match patterns 1 and 2
  if (isa<BitCastInst>(I) && !I.getType()->isVectorTy() && I.getType()->getPrimitiveSizeInBits() == 32 &&
      I.getOperand(0)->getType()->isIntOrIntVectorTy(8) && isa<InsertElementInst>(I.getOperand(0))) {
    auto ie = cast<InsertElementInst>(I.getOperand(0));
    // Match sequences of InsertElementInsts, e.g.:
    //  %vec0 = insertelement <4 x i8> undef, i8 %x8, i64 0
    //  %vec1 = insertelement <4 x i8> %vec0, i8 %y8, i64 1
    //  %vec2 = insertelement <4 x i8> %vec1, i8 %z8, i64 2
    //  %vec3 = insertelement <4 x i8> %vec2, i8 %w8, i64 3
    while (ie) {
      auto idxVal = dyn_cast<ConstantInt>(ie->getOperand(2));
      auto idx = idxVal ? idxVal->getZExtValue() : 0;
      if (idxVal && sources0[idx] == nullptr) {
        sources0[idx] = ie->getOperand(1);
        ++elemsFound;
      } else {
        // Non-constant index is not supported.
        break;
      }
      ie = dyn_cast<InsertElementInst>(ie->getOperand(0));
    }
    if (elemsFound == 4) {
      // Match sequences integer trunc, e.g.:
      //  %x8 = trunc i32 %x2 to i8
      elemsFound = 0;
      for (uint32_t i = 0; i < 4; ++i) {
        auto trunc = dyn_cast<TruncInst>(sources0[i]);
        if (trunc) {
          opcodes[i] = llvm_bitcast;
          sources0[i] = trunc->getOperand(0);
          ++elemsFound;
        }
      }
    }
    if (elemsFound == 4) {
      // Match pattern 2
      for (uint32_t i = 0; i < 4; ++i) {
        Value *srcToSat;
        // Match clamping of values to 0..127 range, e.g.:
        //  %x1 = max i32 %x0, 0
        //  %x2 = min i32 %x1, 127
        if (MatchClamp0_127(sources0[i], srcToSat)) {
          opcodes[i] = llvm_max;
          sources0[i] = srcToSat;
          sources1[i] = ConstantInt::get(srcToSat->getType(), 0);
          isSat[i] = true;
        // Match clamping of values to -128..127 range, e.g.:
        //  %x1 = max i32 %x0, -128
        //  %x2 = min i32 %x1, 127
        } else if (MatchClampWithImm(sources0[i], srcToSat, -128, 127)) {
          opcodes[i] = llvm_fptosi;
          sources0[i] = srcToSat;
          isSat[i] = true;
        }
      }
    }
  }
  // Match pattern 3
  else if (I.getType()->getPrimitiveSizeInBits() == 32 && isa<BinaryOperator>(&I) &&
           cast<BinaryOperator>(&I)->getOpcode() == Instruction::Or) {
    auto orInst = cast<BinaryOperator>(&I);
    while (orInst) {
      // Match sequences of or + shl instructions, e.g.
      //  %y8 = shl nuw nsw i32 %y2, 8
      //  %xy = or i32 %x2, %y8
      ConstantInt *shlImm = nullptr;
      Value *shlSrc = nullptr;
      Value *orSrc = nullptr;
      if (match(orInst, m_Or(m_Value(orSrc), m_Shl(m_Value(shlSrc), m_ConstantInt(shlImm)))) ||
          match(orInst, m_Or(m_Shl(m_Value(shlSrc), m_ConstantInt(shlImm)), m_Value(orSrc)))) {
        // Lambda sets source values of a `sel` instruction.
        auto SetSource = [&](uint32_t i, Value *src0) {
          IGC_ASSERT(sources0[i] == nullptr);
          IGC_ASSERT(sources1[i] == nullptr);
          opcodes[i] = llvm_max;
          sources0[i] = src0;
          sources1[i] = ConstantInt::get(src0->getType(), 0);
          isSat[i] = true;
        };
        // Match clamping of the `shl` source to 0..127 range, e.g.:
        //  %x1 = max i32 %x0, 0
        //  %x2 = min i32 %x1, 127
        //  %x8 = shl nuw nsw i32 %x2, 8
        Value *srcToSat;
        uint32_t i = int_cast<uint32_t>(shlImm->getZExtValue());
        if (MatchClamp0_127(shlSrc, srcToSat) && ((i % 8) == 0 && (i / 8) < 4)) {
          SetSource(i / 8, srcToSat);
          ++elemsFound;
        }
        // Match clamping of the `or` source to 0..127 range, e.g.:
        //  %x1 = max i32 %x0, 0
        //  %x2 = min i32 %x1, 127
        //  %xy = or i32 %x2, %y8
        if (MatchClamp0_127(orSrc, srcToSat)) {
          SetSource(0, srcToSat);
          ++elemsFound;
        }
      }
      if (elemsFound != 4 && orSrc && isa<BinaryOperator>(orSrc) &&
          cast<BinaryOperator>(orSrc)->getOpcode() == Instruction::Or) {
        // Continue checking other remaining channels
        orInst = cast<BinaryOperator>(orSrc);
      } else {
        // All elements found, or an unsupported pattern.
        orInst = nullptr;
      }
    }
  }
  // TODO: consider adding support for patterns, where not all 4 i8 values
  // are packed, for example:
  //   %x8 = trunc i32 %x to i8
  //   %y8 = trunc i32 %y to i8
  //   %z8 = trunc i32 %z to i8
  //   %vec0 = insertelement <4 x i8> zeroinitializer, i8 %x8, i64 0
  //   %vec1 = insertelement <4 x i8> %vec0, i8 %y8, i64 1
  //   %vec2 = insertelement <4 x i8> %vec1, i8 %z8, i64 2
  //   %dst = bitcast <4 x i8> %vec3 to i32
  if (elemsFound == 4) {
    PackPattern *pattern = new (m_allocator) PackPattern();
    for (uint32_t i = 0; i < 4; ++i) {
      pattern->opcodes[i] = opcodes[i];
      pattern->sources0[i] = GetSource(sources0[i], false, false, IsSourceOfSample(&I));
      if (sources1[i] != nullptr) {
        pattern->sources1[i] = GetSource(sources1[i], false, false, IsSourceOfSample(&I));
      }
      pattern->isSat[i] = isSat[i];
    }
    AddPattern(pattern);
    return true;
  }
  return false;
}

// Matches repacking of packed <4 x i8> values, for example:
//   %src0v4i8 = bitcast i32 %x to <4 x i8>
//   %src1v4i8 = bitcast i32 %y to <4 x i8>
//   %elem1 = extractelement <4 x i8> %src0v4i8, i32 1
//   %dst0 = insertelement <4 x i8> %src0v4i8, i8 %elem1, i32 1
//   %elem2 = extractelement <4 x i8> %src1v4i8, i32 0
//   %dst1 = insertelement <4 x i8> %dst0, i8 %elem2, i32 2
//   %elem3 = extractelement <4 x i8> %src1v4i8, i32 1
//   %dst2 = insertelement <4 x i8> %dst1, i8 %elem3, i32 3
//   %dst3 = bitcast <4 x i8> %dst2 to i32
bool CodeGenPatternMatch::MatchRepack4i8(BitCastInst &I) {
  struct RepackPattern : public Pattern {
    std::array<SSource, 4> sources;
    std::array<uint32_t, 4> mappings;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->EmitRepack4i8(sources, mappings, modifier); }
  };

  if (!I.getType()->isVectorTy() && I.getType()->getPrimitiveSizeInBits() == 32 &&
      I.getOperand(0)->getType()->isIntOrIntVectorTy(8) && isa<InsertElementInst>(I.getOperand(0))) {
    Value *sources[4] = {};
    uint32_t mappings[4] = {};
    auto ie = cast<InsertElementInst>(I.getOperand(0));
    Value *baseVec = nullptr;
    uint32_t elemsFound = 0;
    while (ie) {
      auto idxVal = dyn_cast<ConstantInt>(ie->getOperand(2));
      if (!idxVal) {
        return false;
      }
      auto idx = idxVal->getZExtValue();
      if (idxVal && sources[idx] == nullptr) {
        sources[idx] = ie->getOperand(1);
        ++elemsFound;
      }
      baseVec = ie->getOperand(0);
      ie = dyn_cast<InsertElementInst>(baseVec);
    }

    for (uint32_t i = 0; i < 4; ++i) {
      if (sources[i]) {
        auto ee = dyn_cast<ExtractElementInst>(sources[i]);
        if (!ee || !isa<ConstantInt>(ee->getOperand(1)) || !isa<BitCastInst>(ee->getOperand(0)) ||
            (cast<BitCastInst>(ee->getOperand(0)))->getOperand(0)->getType()->isVectorTy() ||
            ((cast<BitCastInst>(ee->getOperand(0)))->getOperand(0)->getType()->getPrimitiveSizeInBits() != 32)) {
          return false;
        }
        auto bitcast = cast<BitCastInst>(ee->getOperand(0));
        mappings[i] = int_cast<uint32_t>(cast<ConstantInt>(ee->getOperand(1))->getZExtValue());
        sources[i] = bitcast->getOperand(0);
      } else {
        if (!isa<UndefValue>(baseVec) &&
            (!isa<BitCastInst>(baseVec) || (cast<BitCastInst>(baseVec))->getOperand(0)->getType()->isVectorTy() ||
             (cast<BitCastInst>(baseVec))->getOperand(0)->getType()->getPrimitiveSizeInBits() != 32)) {
          return false;
        }
        mappings[i] = i;
        auto bitcast = dyn_cast<BitCastInst>(baseVec);
        sources[i] = bitcast ? bitcast->getOperand(0) : baseVec;
      }
    }
    RepackPattern *pattern = new (m_allocator) RepackPattern();
    for (uint32_t i = 0; i < 4; ++i) {
      pattern->sources[i] = GetSource(sources[i], false, false, IsSourceOfSample(&I));
      pattern->sources[i].type = ISA_TYPE_UB;
      pattern->mappings[i] = mappings[i];
    }
    AddPattern(pattern);
    return true;
  }
  return false;
}

// Matches binary operations on packed <4 x i8> values.
// For example, the following patterns are matched:
//
// Shl+Asr pattern:
//   %x0 = shl i32 %x, 24
//   %dst = ashr i32 %x0, 28
//
// Unpack + shl pattern:
//   %src0v4i8 = bitcast i32 %x to <4 x i8>
//   %elem1 = extractelement <4 x i8> %src0v4i8, i32 1
//   %elem132 = sext i8 %elem1 to i32
//   %dst = shl i32 %elem132, 4
bool CodeGenPatternMatch::MatchBinaryUnpack4i8(Instruction &I) {
  struct BinaryUnpackPattern : public Pattern {
    SSource sources[2];
    Instruction *inst;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->BinaryUnary(inst, sources, modifier); }
  };

  Instruction *instToEmit = &I;
  if (I.getType()->isIntegerTy(32)) {
    std::tuple<bool, bool, Value *, uint32_t> sources[2];
    bool foundUnpack = false;
    // Shl+Asr pattern:
    //   %x0 = shl i32 %x, 24
    //   %dst = ashr i32 %x0, 28
    if ((isa<LShrOperator>(&I) || isa<AShrOperator>(&I)) && isa<ConstantInt>(cast<Instruction>(&I)->getOperand(1))) {
      auto shr = cast<Instruction>(&I);
      auto shl = dyn_cast<ShlOperator>(shr->getOperand(0));
      if (shl && isa<ConstantInt>(shl->getOperand(1)) && cast<ConstantInt>(shl->getOperand(1))->getZExtValue() == 24) {
        sources[0] = std::make_tuple(true, isa<LShrOperator>(&I), shl->getOperand(0), 0);
        auto shift = cast<ConstantInt>(shr->getOperand(1))->getZExtValue();
        bool isShl = shift < 24;
        if (isShl) {
          instToEmit = cast<Instruction>(shl);
          shift = 24 - shift;
        } else {
          shift = shift - 24;
        }

        sources[1] = std::make_tuple(false, false, ConstantInt::get(shr->getOperand(1)->getType(), shift), 0);
        foundUnpack = true;
      }
    } else {
      for (uint32_t i = 0; i < 2; ++i) {
        Value *op = I.getOperand(i);
        sources[i] = std::make_tuple(false, false, op, 0);
        if (isa<ZExtInst>(op) || isa<SExtInst>(op)) {
          auto ext = cast<Instruction>(op);
          if (auto extract = dyn_cast<ExtractElementInst>(ext->getOperand(0))) {
            if (isa<ConstantInt>(extract->getIndexOperand()) && isa<BitCastInst>(extract->getVectorOperand()) &&
                cast<BitCastInst>(extract->getVectorOperand())->getType()->isIntOrIntVectorTy(8) &&
                !cast<BitCastInst>(extract->getVectorOperand())->getOperand(0)->getType()->isVectorTy() &&
                cast<BitCastInst>(extract->getVectorOperand())->getOperand(0)->getType()->getPrimitiveSizeInBits() ==
                    32) {
              sources[i] = std::make_tuple(
                  true, isa<ZExtInst>(op), cast<BitCastInst>(extract->getVectorOperand())->getOperand(0),
                  int_cast<uint32_t>(cast<ConstantInt>(extract->getIndexOperand())->getZExtValue()));
              foundUnpack = true;
            }
          }
        }
      }
    }
    if (foundUnpack) {
      BinaryUnpackPattern *pattern = new (m_allocator) BinaryUnpackPattern();
      pattern->inst = instToEmit;
      for (uint32_t i = 0; i < 2; ++i) {
        auto [isUnpack, isUnsigned, source, subreg] = sources[i];
        pattern->sources[i] = GetSource(source, false, false, IsSourceOfSample(&I));
        if (isUnpack) {
          pattern->sources[i].region_set = true;
          pattern->sources[i].elementOffset = subreg;
          pattern->sources[i].region[0] = 4;
          pattern->sources[i].region[1] = 1;
          pattern->sources[i].region[2] = 0;
          pattern->sources[i].type = isUnsigned ? ISA_TYPE_UB : ISA_TYPE_B;
        }
      }
      AddPattern(pattern);
      return true;
    }
  }
  return false;
}

// Match the pattern of 32 x 32 = 64, a full 32-bit multiplication.
bool CodeGenPatternMatch::MatchFullMul32(llvm::Instruction &I) {
  using namespace llvm::PatternMatch; // Scoped namespace using.

  struct FullMul32Pattern : public Pattern {
    SSource srcs[2];
    bool isUnsigned;
    virtual void Emit(EmitPass *pass, const DstModifier &dstMod) { pass->EmitFullMul32(isUnsigned, srcs, dstMod); }
  };

  IGC_ASSERT_MESSAGE(I.getOpcode() == llvm::Instruction::Mul, "Mul instruction is expected!");

  if (!I.getType()->isIntegerTy(64))
    return false;

  llvm::Value *LHS = I.getOperand(0);
  llvm::Value *RHS = I.getOperand(1);

  // Swap operand to ensure the constant is always RHS.
  if (isa<ConstantInt>(LHS))
    std::swap(LHS, RHS);

  bool IsUnsigned = false;
  llvm::Value *L;
  llvm::Value *R;

  // Check LHS
  if (match(LHS, m_SExt(m_Value(L)))) {
    // Bail out if there's non 32-bit integer.
    if (!L->getType()->isIntegerTy(32))
      return false;
  } else if (match(LHS, m_ZExt(m_Value(L)))) {
    // Bail out if there's non 32-bit integer.
    if (!L->getType()->isIntegerTy(32))
      return false;
    IsUnsigned = true;
  } else {
    // Bailout if it's unknown that LHS have less significant bits than the
    // product.
    // NOTE we don't assertion fail the case where LHS is an constant to prevent
    // the assertion in O0 mode. Otherwise, we expect there's at most 1
    // constant operand.
    return false;
  }

  // Check RHS
  if (match(RHS, m_SExt(m_Value(R)))) {
    // Bail out if there's signedness mismatch or non 32-bit integer.
    if (IsUnsigned || !R->getType()->isIntegerTy(32))
      return false;
  } else if (match(RHS, m_ZExt(m_Value(R)))) {
    // Bail out if there's signedness mismatch or non 32-bit integer.
    if (!IsUnsigned || !R->getType()->isIntegerTy(32))
      return false;
    IsUnsigned = true;
  } else if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
    APInt Val = CI->getValue();
    // 31-bit unsigned integer could be used as either signed or
    // unsigned one. Otherwise, we need special check how MSB is used.
    if (!Val.isIntN(31)) {
      if (!(IsUnsigned && Val.isIntN(32)) && !(!IsUnsigned && Val.isSignedIntN(32))) {
        return false;
      }
    }
    R = ConstantExpr::getTrunc(CI, L->getType());
  } else {
    // Bailout if it's unknown that RHS have less significant bits than the
    // product.
    return false;
  }

  FullMul32Pattern *Pat = new (m_allocator) FullMul32Pattern();
  Pat->srcs[0] = GetSource(L, !IsUnsigned, false, IsSourceOfSample(&I));
  Pat->srcs[1] = GetSource(R, !IsUnsigned, false, IsSourceOfSample(&I));
  Pat->isUnsigned = IsUnsigned;
  AddPattern(Pat);

  return true;
}

// For 32 bit integer mul/add/sub, use 16bit operands if possible. Thus,
// This will match 16x16->32, 16x32->32, the same for add/sub.
//
// For example:
//   1.  before:
//        %9 = ashr i32 %8, 16
//        %10 = mul nsw i32 %9, -1024
//        ( asr (16|M0)  r19.0<1>:d  r19.0<8;8,1>:d  16:w
//          mul (16|M0)  r19.0<1>:d  r19.0<8;8,1>:d  -1024:w )
//
//      after:
//      --> %10:d = mul %9.1<16;8:2>:w -1024:w
//          (  mul (16|M0)  r23.0<1>:d   r19.1<2;1,0>:w   -1024:w )
//
//  2. before:
//        %9  = lshr i32 %8, 16
//        %10 = and i32 %8, 65535
//        %11 = mul nuw i32 %9, %10
//        ( shr  (16|M0)   r14.0<1>:d  r12.0<8;8,1>:ud   16:w
//          and(16 | M0)   r12.0<1>:d  r12.0<8;8,1>:d  65535:d
//          mul(16 | M0)   r14.0<1>:d  r14.0<8;8,1>:d  r12.0<8;8,1>:d )
//
//     after:
//     --> %11:d = mul %8.1<16;8,2>:uw %8.0<16;8,2>:uw
//         ( mul (16|M0)  r14.0<1>:d   r12.1<2;1,0>:w   r12.0<2;1,0>:w )
//
bool CodeGenPatternMatch::MatchMulAdd16(Instruction &I) {
  using namespace llvm::PatternMatch;

  struct Oprd16Pattern : public Pattern {
    SSource srcs[2];
    Instruction *rootInst;
    virtual void Emit(EmitPass *pass, const DstModifier &dstMod) { pass->emitMulAdd16(rootInst, srcs, dstMod); }
  };

  // The code is under the control of registry key EnableMixIntOperands.
  if (IGC_IS_FLAG_DISABLED(EnableMixIntOperands)) {
    return false;
  }

  unsigned opc = I.getOpcode();
  IGC_ASSERT_MESSAGE((opc == Instruction::Mul) || (opc == Instruction::Add) || (opc == Instruction::Sub),
                     "Mul instruction is expected!");

  // Handle 32 bit integer mul/add/sub only.
  if (!I.getType()->isIntegerTy(32)) {
    return false;
  }

  // Try to replace any source operands with ones of type short if any. As vISA
  // allows the mix of any integer type, each operand is considered separately.
  struct {
    Value *src;
    bool useLower;
    bool isSigned;
  } oprdInfo[2];
  bool isCandidate = false;

  for (int i = 0; i < 2; ++i) {
    Value *oprd = I.getOperand(i);
    Value *L = nullptr;

    // oprdInfo[i].src == null --> no W operand replacement.
    oprdInfo[i].src = nullptr;
    if (ConstantInt *CI = dyn_cast<ConstantInt>(oprd)) {
      int64_t val = CI->isNegative() ? CI->getSExtValue() : CI->getZExtValue();
      // If src needs to be negated (y = x - a = x + (-a), as gen only
      // has add), need to check if the negated src fits into W/UW.
      bool isNegSrc = (opc == Instruction::Sub && i == 1);
      if (isNegSrc) {
        val = -val;
      }
      if (INT16_MIN <= val && val <= INT16_MAX) {
        oprdInfo[i].src = oprd;
        oprdInfo[i].useLower = true; // does not matter for const
        oprdInfo[i].isSigned = true;
        isCandidate = true;
      } else if (0 <= val && val <= UINT16_MAX) {
        oprdInfo[i].src = oprd;
        oprdInfo[i].useLower = true; // does not matter for const
        oprdInfo[i].isSigned = false;
        isCandidate = true;
      }
    } else if (match(oprd, m_And(m_Value(L), m_SpecificInt(0xFFFF)))) {
      oprdInfo[i].src = L;
      oprdInfo[i].useLower = true;
      oprdInfo[i].isSigned = false;
      isCandidate = true;
    } else if (match(oprd, m_LShr(m_Value(L), m_SpecificInt(16)))) {
      oprdInfo[i].src = L;
      oprdInfo[i].useLower = false;
      oprdInfo[i].isSigned = false;
      isCandidate = true;
    } else if (match(oprd, m_AShr(m_Shl(m_Value(L), m_SpecificInt(16)), m_SpecificInt(16)))) {
      oprdInfo[i].src = L;
      oprdInfo[i].useLower = true;
      oprdInfo[i].isSigned = true;
      isCandidate = true;
    } else if (match(oprd, m_AShr(m_Value(L), m_SpecificInt(16)))) {
      oprdInfo[i].src = L;
      oprdInfo[i].useLower = false;
      oprdInfo[i].isSigned = true;
      isCandidate = true;
    }
  }

  if (!isCandidate) {
    return false;
  }

  Oprd16Pattern *Pat = new (m_allocator) Oprd16Pattern();
  for (int i = 0; i < 2; ++i) {
    if (oprdInfo[i].src) {
      Pat->srcs[i] = GetSource(oprdInfo[i].src, false, false, IsSourceOfSample(&I));
      SSource &thisSrc = Pat->srcs[i];

      // for now, Use W/UW only if region_set is false or the src is scalar
      if (thisSrc.region_set && !(thisSrc.region[0] == 0 && thisSrc.region[1] == 1 && thisSrc.region[2] == 0)) {
        Pat->srcs[i] = GetSource(I.getOperand(i), true, false, IsSourceOfSample(&I));
      } else {
        // Note that SSource's type, if set by GetSource(), should be 32bit type. It's
        // safe to override it with either UW or W. But for SSource's offset, need to
        // re-calculate in term of 16bit, not 32bit.
        thisSrc.type = oprdInfo[i].isSigned ? ISA_TYPE_W : ISA_TYPE_UW;
        thisSrc.elementOffset = (2 * thisSrc.elementOffset) + (oprdInfo[i].useLower ? 0 : 1);
      }
    } else {
      Pat->srcs[i] = GetSource(I.getOperand(i), true, false, IsSourceOfSample(&I));
    }
  }
  Pat->rootInst = &I;
  AddPattern(Pat);

  return true;
}

bool CodeGenPatternMatch::BitcastSearch(SSource &source, llvm::Value *&value, bool broadcast) {
  if (auto elemInst = dyn_cast<ExtractElementInst>(value)) {
    if (auto bTInst = dyn_cast<BitCastInst>(elemInst->getOperand(0))) {
      // Pattern Matching (Instruction) + ExtractElem + (Vector)Bitcast
      //
      // In order to set the regioning for the ALU operand
      // I require three things:
      //      -The first is the source number of elements
      //      -The second is the destination number of elements
      //      -The third is the index from the extract element
      //
      // For example if I have <4 x i32> to <16 x i8> all I need is
      // the 4 (vstride) and the i8 (b) in this case the operand would look
      // like this -> r22.x <4;1,0>:b
      // x is calculated below and later on using the simdsize

      uint32_t index, srcNElts, dstNElts, nEltsRatio;
      llvm::Type *srcTy = bTInst->getOperand(0)->getType();
      llvm::Type *dstTy = bTInst->getType();

      srcNElts = (srcTy->isVectorTy()) ? (uint32_t)cast<IGCLLVM::FixedVectorType>(srcTy)->getNumElements() : 1;
      dstNElts = (dstTy->isVectorTy()) ? (uint32_t)cast<IGCLLVM::FixedVectorType>(dstTy)->getNumElements() : 1;

      if (srcNElts < dstNElts && srcTy->getScalarSizeInBits() < 64) {
        if (isa<ConstantInt>(elemInst->getIndexOperand())) {
          index = int_cast<uint>(cast<ConstantInt>(elemInst->getIndexOperand())->getZExtValue());
          nEltsRatio = dstNElts / srcNElts;
          source.value = bTInst->getOperand(0);
          source.SIMDOffset = iSTD::RoundDownNonPow2(index, nEltsRatio);
          source.elementOffset = source.elementOffset * nEltsRatio + index % nEltsRatio;
          value = source.value;
          if (!broadcast) {
            source.region_set = true;
            if (m_WI->isUniform(value))
              source.region[0] = 0;
            else
              source.region[0] = (unsigned char)nEltsRatio;
            source.region[1] = 1;
            source.region[2] = 0;
          }
          return true;
        }
      }
    }
  }
  return false;
}

bool CodeGenPatternMatch::MatchModifier(llvm::Instruction &I, bool SupportSrc0Mod) {
  struct ModifierPattern : public Pattern {
    SSource sources[2];
    llvm::Instruction *instruction;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      pass->BinaryUnary(instruction, sources, modifier);
    }
  };

  ModifierPattern *pattern = new (m_allocator) ModifierPattern();
  pattern->instruction = &I;
  uint nbSources = GetNbSources(I);

  llvm::SmallVector<llvm::Value *, 4> sources(I.op_begin(), I.op_end());
  if (FlushesDenormsOnInput(I)) {
    // Denorms are flushed at input - skip canonicalize
    std::transform(sources.begin(), sources.end(), sources.begin(),
                   [](llvm::Value *src) -> llvm::Value * { return SkipCanonicalize(src); });
  }
  bool supportModifierSrc0 = SupportsModifier(&I, m_Platform);
  bool supportRegioning = SupportsRegioning(&I);
  llvm::Instruction *src0Inst = llvm::dyn_cast<llvm::Instruction>(I.getOperand(0));
  if (I.getOpcode() == llvm::Instruction::UDiv && src0Inst && src0Inst->getOpcode() == llvm::Instruction::Sub) {
    supportModifierSrc0 = false;
  }
  pattern->sources[0] =
      GetSource(sources[0], supportModifierSrc0 && SupportSrc0Mod, supportRegioning, IsSourceOfSample(&I));
  if (nbSources > 1) {
    bool supportModifierSrc1 = SupportsModifier(&I, m_Platform);
    llvm::Instruction *src1Inst = llvm::dyn_cast<llvm::Instruction>(I.getOperand(1));
    if (I.getOpcode() == llvm::Instruction::UDiv && src1Inst && src1Inst->getOpcode() == llvm::Instruction::Sub) {
      supportModifierSrc1 = false;
    }
    pattern->sources[1] = GetSource(sources[1], supportModifierSrc1, supportRegioning, IsSourceOfSample(&I));

    // add df imm to constant pool for binary/ternary inst
    // we do 64-bit int imm bigger than 32 bits, since smaller may fit in D/W
    for (int i = 0, numSrc = (int)nbSources; i < numSrc; ++i) {
      Value *op = sources[i];
      if (isCandidateForConstantPool(op)) {
        AddToConstantPool(I.getParent(), op);
        pattern->sources[i].fromConstantPool = true;
      }
    }
  }

  AddPattern(pattern);

  return true;
}

bool CodeGenPatternMatch::MatchSingleInstruction(llvm::Instruction &I) {
  struct SingleInstPattern : Pattern {
    llvm::Instruction *inst;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      IGC_ASSERT(modifier.sat == false);
      IGC_ASSERT(modifier.flag == nullptr);
      pass->EmitNoModifier(inst);
    }
  };
  SingleInstPattern *pattern = new (m_allocator) SingleInstPattern();
  pattern->inst = &I;
  bool flushesDenorms = FlushesDenormsOnInput(I);
  uint numSources = GetNbSources(I);
  for (uint i = 0; i < numSources; i++) {
    Value *src = I.getOperand(i);
    if (flushesDenorms) {
      // Denorms are flushed at input - skip canonicalize
      src = SkipCanonicalize(src);
    }
    MarkAsSource(src, IsSourceOfSample(&I));
  }

  if (CallInst *callinst = dyn_cast<CallInst>(&I)) {
    // Mark the function pointer in indirect calls as a source
    if (!callinst->getCalledFunction()) {
      MarkAsSource(IGCLLVM::getCalledValue(callinst), IsSourceOfSample(&I));
    }
  }
  AddPattern(pattern);
  return true;
}

bool CodeGenPatternMatch::MatchCanonicalizeInstruction(llvm::Instruction &I) {
  struct CanonicalizeInstPattern : Pattern {
    CanonicalizeInstPattern(llvm::Instruction *pInst) : m_pInst(pInst) {}

    llvm::Instruction *m_pInst;
    Pattern *m_pPattern = nullptr;

    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      if (m_pPattern) {
        m_pPattern->Emit(pass, modifier);
      } else {
        pass->emitCanonicalize(m_pInst, modifier);
      }
    }
  };

  CanonicalizeInstPattern *pattern = new (m_allocator) CanonicalizeInstPattern(&I);
  IGC_ASSERT(isa<Instruction>(I.getOperand(0)) || isa<Argument>(I.getOperand(0)));
  // Current implementation assumes that mix mode is disabled if
  // half float or 32-bit float denorms must be flushed.
  if (m_ctx->getModuleMetaData()->compOpt.FloatDenormMode16 == IGC::FLOAT_DENORM_FLUSH_TO_ZERO ||
      m_ctx->getModuleMetaData()->compOpt.FloatDenormMode32 == IGC::FLOAT_DENORM_FLUSH_TO_ZERO) {
    IGC_ASSERT(!m_Platform.supportMixMode() || m_ctx->getModuleMetaData()->disableMixMode);
  }
  if (isa<Argument>(I.getOperand(0)) || !FlushesDenormsOnOutput(*(cast<Instruction>(I.getOperand(0))))) {
    MarkAsSource(I.getOperand(0), IsSourceOfSample(&I));
  } else {
    pattern->m_pPattern = Match(*llvm::cast<llvm::Instruction>(I.getOperand(0)));
  }

  AddPattern(pattern);
  return true;
}

bool CodeGenPatternMatch::MatchBranch(llvm::BranchInst &I) {
  using namespace llvm::PatternMatch;
  struct CondBrInstPattern : Pattern {
    SSource cond;
    llvm::BranchInst *inst;
    e_predMode predMode = EPRED_NORMAL;
    bool isDiscardBranch = false;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      if (isDiscardBranch) {
        pass->emitDiscardBranch(inst, cond);
      } else {
        pass->emitBranch(inst, cond, predMode);
      }
    }
  };
  CondBrInstPattern *pattern = new (m_allocator) CondBrInstPattern();
  pattern->inst = &I;

  if (!I.isUnconditional()) {
    Value *orSrc0 = nullptr;
    Value *orSrc1 = nullptr;
    Value *cond = I.getCondition();
    if (dyn_cast<GenIntrinsicInst>(cond, GenISAIntrinsic::GenISA_UpdateDiscardMask)) {
      pattern->isDiscardBranch = true;
    } else if (match(cond, m_Or(m_Value(orSrc0), m_Value(orSrc1)))) {
      // %6 = call i1 @llvm.genx.GenISA.UpdateDiscardMask(i1 %0, i1 %2)
      // %7 = or i1 %6, %2  (or: %7 = or i1 %2, %6)
      // br i1 %7, label %DiscardRet, label %PostDiscard
      if (auto intr = dyn_cast<GenIntrinsicInst>(orSrc0, GenISAIntrinsic::GenISA_UpdateDiscardMask)) {
        if (intr->getOperand(1) == orSrc1) {
          pattern->isDiscardBranch = true;
        }
      } else if (auto intr = dyn_cast<GenIntrinsicInst>(orSrc1, GenISAIntrinsic::GenISA_UpdateDiscardMask)) {
        if (intr->getOperand(1) == orSrc0) {
          pattern->isDiscardBranch = true;
        }
      }
    }
    pattern->cond = GetSource(I.getCondition(), false, false, IsSourceOfSample(&I));
  }
  AddPattern(pattern);
  return true;
}

bool CodeGenPatternMatch::MatchFloatingPointSatModifier(llvm::Instruction &I) {
  struct SatPattern : Pattern {
    Pattern *pattern;
    SSource source;
    bool isUnsigned;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      DstModifier mod = modifier;
      mod.sat = true;
      if (pattern) {
        pattern->Emit(pass, mod);
      } else {
        pass->Mov(source, mod);
      }
    }
  };
  bool match = false;
  llvm::Value *source = nullptr;
  bool isUnsigned = false;
  if (isSat(&I, source, isUnsigned)) {
    SatPattern *satPattern = new (m_allocator) SatPattern();
    if (llvm::Instruction *inst = llvm::dyn_cast<Instruction>(source)) {
      // As an heuristic we only match saturate if the instruction has one use
      // to avoid duplicating expensive instructions and increasing reg pressure
      // without improve code quality this may be refined in the future
      if (inst->hasOneUse() && SupportsSaturate(inst)) {
        bool isSourceOfSample = IsSourceOfSample(&I);
        if (isSourceOfSample) {
          m_sampleSource.insert(inst);
        }
        auto *pattern = Match(*inst);
        IGC_ASSERT_MESSAGE(pattern, "Failed to match pattern");
        // Even though the original `inst` may support saturate,
        // we need to know if the instruction(s) generated from
        // the pattern support it.
        if (pattern->supportsSaturate()) {
          satPattern->pattern = pattern;
          match = true;
        } else if (isSourceOfSample) {
          m_sampleSource.erase(inst);
        }
      }
    }
    if (!match) {
      satPattern->pattern = nullptr;
      satPattern->source = GetSource(source, true, false, IsSourceOfSample(&I));
      match = true;
    }
    if (isUniform(&I) && source->hasOneUse()) {
      gatherUniformBools(source);
    }
    AddPattern(satPattern);
  }
  return match;
}

bool CodeGenPatternMatch::MatchIntegerSatModifier(llvm::Instruction &I) {
  // a default pattern
  struct SatPattern : Pattern {
    Pattern *pattern;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      DstModifier mod = modifier;
      mod.sat = true;
      pattern->Emit(pass, mod);
    }
  };

  // a special pattern is required because of the fact that the instruction works on unsigned values
  // whereas the default type is signed for arithmetic instructions
  struct UAddPattern : Pattern {
    BinaryOperator *inst;

    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      DstModifier mod = modifier;
      mod.sat = true;
      pass->EmitUAdd(inst, mod);
    }
  };

  struct IntegerSatTruncPattern : public Pattern {
    SSource src;
    bool isSigned;
    virtual void Emit(EmitPass *pass, const DstModifier &dstMod) {
      pass->EmitIntegerTruncWithSat(isSigned, isSigned, src, dstMod);
    }
  };

  bool match = false;
  llvm::Value *source = nullptr;
  bool isUnsigned = false;
  if (isSat(&I, source, isUnsigned)) {
    IGC_ASSERT(llvm::isa<Instruction>(source));

    // As an heuristic we only match saturate if the instruction has one use
    // to avoid duplicating expensive instructions and increasing reg pressure
    // without improve code quality this may be refined in the future.
    if (llvm::Instruction *sourceInst = llvm::cast<llvm::Instruction>(source);
        sourceInst && sourceInst->hasOneUse() && SupportsSaturate(sourceInst)) {
      if (llvm::BinaryOperator *binaryOpInst = llvm::dyn_cast<llvm::BinaryOperator>(source);
          binaryOpInst && (binaryOpInst->getOpcode() == llvm::BinaryOperator::BinaryOps::Add) && isUnsigned) {
        match = true;

        uint numSources = GetNbSources(*sourceInst);
        for (uint i = 0; i < numSources; i++) {
          MarkAsSource(sourceInst->getOperand(i), IsSourceOfSample(&I));
        }

        UAddPattern *uAddPattern = new (m_allocator) UAddPattern();
        uAddPattern->inst = binaryOpInst;
        AddPattern(uAddPattern);
      } else if (binaryOpInst && (binaryOpInst->getOpcode() == llvm::BinaryOperator::BinaryOps::Add) && !isUnsigned) {
        match = true;
        SatPattern *satPattern = new (m_allocator) SatPattern();
        satPattern->pattern = Match(*sourceInst);
        AddPattern(satPattern);
      } else if (llvm::TruncInst *truncInst = llvm::dyn_cast<llvm::TruncInst>(source); truncInst) {
        match = true;
        IntegerSatTruncPattern *satPattern = new (m_allocator) IntegerSatTruncPattern();
        satPattern->isSigned = !isUnsigned;
        satPattern->src = GetSource(truncInst->getOperand(0), !isUnsigned, false, IsSourceOfSample(&I));
        AddPattern(satPattern);
      } else {
        IGC_ASSERT_MESSAGE(0, "An undefined pattern match");
      }
    }
  }
  return match;
}

bool CodeGenPatternMatch::MatchSelectModifier(llvm::SelectInst &I) {
  if (I.getType()->isAggregateType())
    return MatchSingleInstruction(I);

  struct SelectPattern : Pattern {
    SSource sources[3];
    e_predMode predMode;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      DstModifier modf = modifier;
      modf.predMode = predMode;
      pass->Select(sources, modf);
    }
  };
  SelectPattern *pattern = new (m_allocator) SelectPattern();
  pattern->predMode = EPRED_NORMAL;

  pattern->sources[0] = GetSource(I.getCondition(), false, false, IsSourceOfSample(&I));
  pattern->sources[1] = GetSource(I.getTrueValue(), true, false, IsSourceOfSample(&I));
  pattern->sources[2] = GetSource(I.getFalseValue(), true, false, IsSourceOfSample(&I));

  // try to add to constant pool whatever possible.
  if (isCandidateForConstantPool(I.getTrueValue())) {
    AddToConstantPool(I.getParent(), I.getTrueValue());
    pattern->sources[1].fromConstantPool = true;
  }

  if (isCandidateForConstantPool(I.getFalseValue())) {
    AddToConstantPool(I.getParent(), I.getFalseValue());
    pattern->sources[2].fromConstantPool = true;
  }

  AddPattern(pattern);
  return true;
}

static bool IsPositiveFloat(Value *v, unsigned int depth = 0) {
  if (depth > 3) {
    // limit the depth of recursion to avoid compile time problem
    return false;
  }
  if (ConstantFP *cst = dyn_cast<ConstantFP>(v)) {
    if (!cst->getValueAPF().isNegative()) {
      return true;
    }
  } else if (Instruction *I = dyn_cast<Instruction>(v)) {
    switch (I->getOpcode()) {
    case Instruction::FMul:
    case Instruction::FAdd:
      return IsPositiveFloat(I->getOperand(0), depth + 1) && IsPositiveFloat(I->getOperand(1), depth + 1);
    case Instruction::Call:
      if (IntrinsicInst *intrinsicInst = dyn_cast<IntrinsicInst>(I)) {
        if (intrinsicInst->getIntrinsicID() == Intrinsic::fabs) {
          return true;
        }
      } else if (isa<GenIntrinsicInst>(I, GenISAIntrinsic::GenISA_fsat)) {
        return true;
      }
      break;
    default:
      break;
    }
  }
  return false;
}

bool CodeGenPatternMatch::MatchPow(llvm::IntrinsicInst &I) {
  if (IGC_IS_FLAG_ENABLED(DisableMatchPow)) {
    return false;
  }

  // For this pattern match exp(log(x) * y) = pow
  // if x < 0 and y is an integer (ex: 1.0)
  // with pattern match : pow(x, 1.0) = x
  // without pattern match : exp(log(x) * 1.0) = NaN because log(x) is NaN.
  //
  // Since pow is 2x slower than exp/log, disabling this optimization might not hurt much.
  // Keep the code and disable MatchPow to track any performance change for now.
  struct PowPattern : public Pattern {
    SSource sources[2];
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Pow(sources, modifier); }
  };
  bool found = false;
  llvm::Value *source0 = NULL;
  llvm::Value *source1 = NULL;
  if (I.getIntrinsicID() == Intrinsic::exp2) {
    llvm::BinaryOperator *mul = dyn_cast<BinaryOperator>(I.getOperand((0)));
    if (mul && mul->getOpcode() == Instruction::FMul) {
      for (uint j = 0; j < 2; j++) {
        llvm::IntrinsicInst *log = dyn_cast<IntrinsicInst>(mul->getOperand(j));
        if (log && log->getIntrinsicID() == Intrinsic::log2) {
          if (IsPositiveFloat(log->getOperand(0))) {
            source0 = log->getOperand(0);
            source1 = mul->getOperand(1 - j);
            found = true;
            break;
          }
        }
      }
    }
  }
  if (found) {
    PowPattern *pattern = new (m_allocator) PowPattern();
    pattern->sources[0] = GetSource(source0, true, false, IsSourceOfSample(&I));
    pattern->sources[1] = GetSource(source1, true, false, IsSourceOfSample(&I));
    AddPattern(pattern);
  }
  return found;
}

bool CodeGenPatternMatch::MatchCondModifier(llvm::CmpInst &I) {
  struct CondModifierPattern : Pattern {
    Pattern *pattern;
    Instruction *alu;
    CmpInst *cmp;
    int aluOprdNum = 0; // 0: alu is src0; otherwise, alu is src1
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      IGC_ASSERT(modifier.flag == nullptr);
      IGC_ASSERT(modifier.sat == false);
      pass->emitAluConditionMod(pattern, alu, cmp, aluOprdNum);
    }
  };
  bool found = false;
  for (uint i = 0; i < 2; i++) {
    if (IsZero(I.getOperand(i))) {
      llvm::Instruction *alu = dyn_cast<Instruction>(I.getOperand(1 - i));
      if (alu && alu->hasOneUse() && SupportsCondModifier(alu)) {
        CondModifierPattern *pattern = new (m_allocator) CondModifierPattern();
        pattern->pattern = Match(*alu);
        IGC_ASSERT_MESSAGE(pattern->pattern, "Failed to match pattern");
        pattern->alu = alu;
        pattern->cmp = &I;
        pattern->aluOprdNum = 1 - i;
        AddPattern(pattern);
        found = true;
        break;
      }
    }
  }
  return found;
}

// we match the following pattern
// %f = cmp %1 %2
// %o = or/and %f %g
bool CodeGenPatternMatch::MatchBoolOp(llvm::BinaryOperator &I) {
  struct BoolOpPattern : public Pattern {
    Pattern *cmpPattern;
    llvm::BinaryOperator *boolOp;
    SSource binarySource;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      pass->CmpBoolOp(cmpPattern, boolOp, binarySource, modifier);
    }
  };

  IGC_ASSERT(I.getOpcode() == Instruction::Or || I.getOpcode() == Instruction::And);
  bool found = false;
  if (I.getType()->isIntegerTy(1)) {
    for (uint i = 0; i < 2; i++) {
      if (CmpInst *cmp = llvm::dyn_cast<CmpInst>(I.getOperand(i))) {
        // only beneficial if the other operand only have one use
        if (I.getOperand(1 - i)->hasOneUse()) {
          BoolOpPattern *pattern = new (m_allocator) BoolOpPattern();
          pattern->cmpPattern = Match(*cmp);
          pattern->boolOp = &I;
          pattern->binarySource = GetSource(I.getOperand(1 - i), false, false, IsSourceOfSample(&I));
          AddPattern(pattern);
          found = true;
          break;
        }
      }
    }
  }
  return found;
}

bool CodeGenPatternMatch::MatchFunnelShiftRotate(llvm::IntrinsicInst &I) {
  // Hanlde only funnel shift that can be turned into rotate.
  struct funnelShiftRotatePattern : public Pattern {
    SSource sources[2];
    llvm::IntrinsicInst *instruction;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      bool isShl = instruction->getIntrinsicID() == Intrinsic::fshl;
      pass->Binary(isShl ? EOPCODE_ROL : EOPCODE_ROR, sources, modifier);
    }
  };

  if (!m_Platform.supportRotateInstruction() || I.getType()->isVectorTy()) {
    return false;
  }

  Value *A = I.getOperand(0);
  Value *B = I.getOperand(1);
  Value *Amt = I.getOperand(2);
  uint32_t typebits = I.getType()->getScalarSizeInBits();
  if (A != B || (typebits != 16 && typebits != 32 && typebits != 64)) {
    return false;
  }
  if (typebits == 64 && !m_Platform.supportQWRotateInstructions()) {
    return false;
  }

  // Found the pattern.
  funnelShiftRotatePattern *pattern = new (m_allocator) funnelShiftRotatePattern();
  pattern->instruction = &I;
  pattern->sources[0] = GetSource(A, true, false, IsSourceOfSample(&I));
  pattern->sources[1] = GetSource(Amt, true, false, IsSourceOfSample(&I));

  AddPattern(pattern);
  return true;
}

bool CodeGenPatternMatch::MatchUnmaskedRegionBoundary(Instruction &I, bool start) {
  struct UnmaskedBoundaryPattern : public Pattern {
    bool start;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      (void)modifier;
      pass->emitUnmaskedRegionBoundary(start);
    }
  };

  UnmaskedBoundaryPattern *pattern = new (m_allocator) UnmaskedBoundaryPattern();
  pattern->start = start;
  AddPattern(pattern);
  return true;
}

bool CodeGenPatternMatch::MatchAdd3(Instruction &I) {
  using namespace llvm::PatternMatch;

  struct Add3Pattern : public Pattern {
    SSource sources[3];
    Instruction *instruction;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Tenary(EOPCODE_ADD3, sources, modifier); }
  };

  if (IGC_IS_FLAG_DISABLED(EnableAdd3) || !m_Platform.supportAdd3Instruction()) {
    return false;
  }

  // Only handle D & W integer types.
  Type *Ty = I.getType();
  if (!(Ty->isIntegerTy(16) || Ty->isIntegerTy(32))) {
    return false;
  }

  Value *s0 = I.getOperand(0);
  Value *s1 = nullptr, *s2 = nullptr;
  e_modifier Mod1 = EMOD_NONE, Mod2 = EMOD_NONE;
  Instruction *I0 = dyn_cast<Instruction>(s0);
  Instruction *Add2 = nullptr;
  if (I0) {
    if (I0->getOpcode() == Instruction::Sub) {
      s0 = I0->getOperand(0);
      s1 = I0->getOperand(1);
      Mod1 = EMOD_NEG;
      Add2 = I0;
    } else if (I0->getOpcode() == Instruction::Add) {
      s0 = I0->getOperand(0);
      s1 = I0->getOperand(1);
      Add2 = I0;
    }
  }

  bool isNeg = (I.getOpcode() == Instruction::Sub);
  if (s1 == nullptr) {
    s1 = I.getOperand(1);
    Instruction *I1 = dyn_cast<Instruction>(s1);
    if (I1) {
      if (I1->getOpcode() == Instruction::Sub) {
        s1 = I1->getOperand(0);
        s2 = I1->getOperand(1);
        Add2 = I1;
        if (isNeg) {
          Mod1 = EMOD_NEG;
        } else {
          Mod2 = EMOD_NEG;
        }
      } else if (I1->getOpcode() == Instruction::Add) {
        s1 = I1->getOperand(0);
        s2 = I1->getOperand(1);
        Add2 = I1;
        if (isNeg) {
          Mod1 = EMOD_NEG;
          Mod2 = EMOD_NEG;
        }
      }
    }
  } else {
    s2 = I.getOperand(1);
    if (isNeg) {
      Mod2 = EMOD_NEG;
    }
  }

  if (s2 == nullptr) {
    return false;
  }

  // If source operand corresponding to first add instruction
  // has many uses the add3 pattern match is unlikely to be profitable,
  // as it increases register pressure and makes register bank
  // conflicts more likely. Unless some of the operands are constants
  // then there is no increase of register pressure from add3 construction
  if (m_ctx->type == ShaderType::OPENCL_SHADER) {
    const int Threshold = 3;

    if (!isa<ConstantInt>(s0) && !isa<ConstantInt>(s1) && !isa<ConstantInt>(s2) &&
        (Add2->getOperand(0)->hasOneUse() || Add2->getOperand(1)->hasOneUse()) && Add2->hasNUsesOrMore(Threshold)) {
      // add3 is unlikely to be profitable.
      return false;
    }
  }

  // Found the pattern.
  // Make sure that the middle one is not constant
  if (isa<ConstantInt>(s1)) {
    std::swap(s1, s2);
    std::swap(Mod1, Mod2);
  }

  Add3Pattern *pattern = new (m_allocator) Add3Pattern();
  pattern->instruction = &I;
  pattern->sources[0] = GetSource(s0, true, false, IsSourceOfSample(&I));
  pattern->sources[1] = GetSource(s1, true, false, IsSourceOfSample(&I));
  pattern->sources[2] = GetSource(s2, true, false, IsSourceOfSample(&I));
  if (Mod1 != EMOD_NONE) {
    pattern->sources[1].mod = CombineModifier(Mod1, pattern->sources[1].mod);
  }
  if (Mod2 != EMOD_NONE) {
    pattern->sources[2].mod = CombineModifier(Mod2, pattern->sources[2].mod);
  }
  AddPattern(pattern);
  return true;
}

bool CodeGenPatternMatch::MatchBfn(Instruction &I) {
  using namespace llvm::PatternMatch;

  struct BfnPattern : public Pattern {
    uint8_t booleanFuncCtrl = 0;
    SSource sources[3];
    Instruction *instruction;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Bfn(booleanFuncCtrl, sources, modifier); }
  };

  if (IGC_IS_FLAG_DISABLED(EnableBfn) || !m_Platform.supportBfnInstruction()) {
    return false;
  }

  // Only handle D & W integer types.
  Type *Ty = I.getType();
  if (!(Ty->isIntegerTy(16) || Ty->isIntegerTy(32))) {
    return false;
  }

  struct CtrlCalculator {
    enum OP { NONE, AND, OR, XOR };
    enum SOURCE {
      S0 = 0, // s0, s1, s2
      S1 = 1,
      S2 = 2,
      NS0 = 3, // ~s0, ~s1, ~s2
      NS1 = 4,
      NS2 = 5
    };
    // Value of the sources for calculating BooleanFuncCtrl. The
    // index maps to enum SOURCE
    const uint8_t s[6] = {0xAA, 0xCC, 0xF0, 0x55, 0x33, 0x0F};

    typedef llvm::SmallVector<OP, 2> OpVecType;
    typedef llvm::SmallVector<SOURCE, 3> SourceVecType;

    // The sequence of matched operators, follow the calculation order
    OpVecType ops;
    // The sequence of matched sources, follow the calculation order
    SourceVecType source_idx;

    void addSource(SOURCE src) { source_idx.push_back(src); }
    void addOPFromLLVMOp(unsigned llvmOp) {
      if (llvmOp == Instruction::And)
        ops.push_back(AND);
      else if (llvmOp == Instruction::Or)
        ops.push_back(OR);
      else if (llvmOp == Instruction::Xor)
        ops.push_back(XOR);
      else {
        IGC_ASSERT_MESSAGE(0, "MatchBfn: inccorect opcode");
        return;
      }
    }

    uint8_t getBooleanFuncCtrl() {
      IGC_ASSERT_MESSAGE((source_idx.size() == (ops.size() + 1)), "MatchBfn: OPs and Sources length missmatched");
      SourceVecType::iterator s_it = source_idx.begin(), s_end = source_idx.end();
      // start from the first source
      uint8_t result = s[*s_it];
      ++s_it;

      // iterate through the matched sequence and compute the BooleanFuncCtrl
      OpVecType::iterator op_it = ops.begin();
      for (; s_it != s_end; ++s_it, ++op_it) {
        switch (*op_it) {
        case AND:
          result = result & s[*s_it];
          break;
        case OR:
          result = result | s[*s_it];
          break;
        case XOR:
          result = result ^ s[*s_it];
          break;
        default:
          IGC_ASSERT_MESSAGE(0, "MatchBfn:CtrlCalculator: inccorect OP");
          break;
        }
      }
      return result;
    }
  };

  auto isBinaryLogic = [](Instruction::BinaryOps op) {
    return op == Instruction::Or || op == Instruction::And || op == Instruction::Xor;
  };
  // Find BFN patterns. Matched patterns: (op0 and op1 are boolean operations)
  // ~s0  ~s1                  ~s1  ~s2   <-- extra check to matchNot on each found src
  //  |    |                    |    |
  // s0   s1  ~s2         ~s0   s1  s2
  //   \ /    |            |     \  /
  //    op    s2     OR    s0   op1
  //     \   /              \   /
  //      op0                op0   <-- match to BFN

  // if source operand has many uses the bfn pattern match is unlikely to be profitable,
  // as it increases register pressure and makes register bank conflicts more likely
  // ToDo: tune the value of N
  const int useThreshold = 4;
  // if both operands of the root is binary logic op, use simple heuristics
  // to fold one of them
  if (isa<BinaryOperator>(I.getOperand(0)) && isa<BinaryOperator>(I.getOperand(1))) {
    auto I0 = cast<BinaryOperator>(I.getOperand(0));
    auto I1 = cast<BinaryOperator>(I.getOperand(1));
    if (I0->hasNUsesOrMore(useThreshold) && I1->hasNUsesOrMore(useThreshold)) {
      // bfn is unlikely to be profitable.
      return false;
    } else if (I0->getNumUses() > I1->getNumUses()) {
      I.setOperand(0, I1);
      I.setOperand(1, I0);
    }
  }

  // Further match s to not operation. Update s if matched.
  auto matchNot = [](Value *&s) {
    BinaryOperator *bo = dyn_cast<BinaryOperator>(s);
    if (bo && match(s, m_Not(m_Specific(bo->getOperand(0))))) {
      s = bo->getOperand(0);
      return true;
    }
    return false;
  };

  CtrlCalculator ctrlcal;
  Value *s0 = I.getOperand(0);
  Value *s1 = nullptr, *s2 = nullptr;
  BinaryOperator *I0 = dyn_cast<BinaryOperator>(s0);
  if (I0) {
    if (isBinaryLogic(I0->getOpcode()) && !I0->hasNUsesOrMore(useThreshold)) {
      s0 = I0->getOperand(0);
      s1 = I0->getOperand(1);
    }
  }

  if (s1 == nullptr) {
    s1 = I.getOperand(1);
    BinaryOperator *I1 = dyn_cast<BinaryOperator>(s1);
    if (I1) {
      if (isBinaryLogic(I1->getOpcode()) && !I1->hasNUsesOrMore(useThreshold)) {
        s1 = I1->getOperand(0);
        s2 = I1->getOperand(1);

        // add ops and sources by execution order
        ctrlcal.addSource(matchNot(s1) ? CtrlCalculator::NS1 : CtrlCalculator::S1);
        ctrlcal.addOPFromLLVMOp(I1->getOpcode());
        ctrlcal.addSource(matchNot(s2) ? CtrlCalculator::NS2 : CtrlCalculator::S2);
        ctrlcal.addOPFromLLVMOp(I.getOpcode());
        ctrlcal.addSource(matchNot(s0) ? CtrlCalculator::NS0 : CtrlCalculator::S0);
      }
    }
  } else {
    s2 = I.getOperand(1);

    // add ops and sources by execution order
    ctrlcal.addSource(matchNot(s0) ? CtrlCalculator::NS0 : CtrlCalculator::S0);
    ctrlcal.addOPFromLLVMOp(I0->getOpcode());
    ctrlcal.addSource(matchNot(s1) ? CtrlCalculator::NS1 : CtrlCalculator::S1);
    ctrlcal.addOPFromLLVMOp(I.getOpcode());
    ctrlcal.addSource(matchNot(s2) ? CtrlCalculator::NS2 : CtrlCalculator::S2);
  }

  if (s2 == nullptr)
    return false;

  BfnPattern *pattern = new (m_allocator) BfnPattern();
  pattern->booleanFuncCtrl = ctrlcal.getBooleanFuncCtrl();
  pattern->instruction = &I;
  pattern->sources[0] = GetSource(s0, false, false, IsSourceOfSample(&I));
  pattern->sources[1] = GetSource(s1, false, false, IsSourceOfSample(&I));
  pattern->sources[2] = GetSource(s2, false, false, IsSourceOfSample(&I));

  // BFN can use imm16 in src0 and src2, check for those;
  // otherwise try to add to constant pool even int32.
  if (dyn_cast<ConstantInt>(s0) && !s0->getType()->isIntegerTy(16)) {
    AddToConstantPool(I.getParent(), s0);
    pattern->sources[0].fromConstantPool = true;
  }
  if (dyn_cast<ConstantInt>(s1)) {
    AddToConstantPool(I.getParent(), s1);
    pattern->sources[1].fromConstantPool = true;
  }
  if (dyn_cast<ConstantInt>(s2) && !s2->getType()->isIntegerTy(16)) {
    AddToConstantPool(I.getParent(), s2);
    pattern->sources[2].fromConstantPool = true;
  }

  AddPattern(pattern);
  return true;
}

// Match this pattern
// %1 = cmp %2 %3
// %6 = select %1 $4 %5
// to
// %1 = cmp %2 %3
// %6  = bfn %1 %4 %5
//
// For the original cmp-sel sequence, a flag-based sequence is generated.
// We instead want to generate a non-flag cmp-bfn sequence which has shorter latency.
bool CodeGenPatternMatch::MatchCmpSelect(llvm::SelectInst &I) {
  struct CmpSelectPattern : public Pattern {
    uint8_t execSize;
    llvm::CmpInst::Predicate predicate;
    SSource cmpSources[2];
    uint8_t booleanFuncCtrl = 0xD8; // represents s0&s1|~s0&s2
    SSource bfnSources[3];
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      pass->CmpBfn(predicate, cmpSources, booleanFuncCtrl, bfnSources, modifier);
    }
  };

  if (IGC_IS_FLAG_DISABLED(EnableBfn) || !m_Platform.supportBfnInstruction()) {
    return false;
  }

  if (llvm::CmpInst *cmp = llvm::dyn_cast<llvm::CmpInst>(I.getOperand(0))) {
    // handle one use for now
    if (!cmp->hasOneUse()) {
      return false;
    }

    // BFN only supports D & W types.
    Type *selTy = I.getType();
    if (!(selTy->isIntegerTy(16) || selTy->isIntegerTy(32))) {
      return false;
    }

    Type *cmpS0Ty = cmp->getOperand(0)->getType();
    if (selTy->getPrimitiveSizeInBits() != cmpS0Ty->getPrimitiveSizeInBits()) {
      return false;
    }

    llvm::Value *selSources[2] = {};
    e_modifier selMod[2] = {};
    selSources[0] = I.getOperand(1);
    selSources[1] = I.getOperand(2);

    // BFN only supports 16bit immediate
    if ((isa<Constant>(selSources[0]) && selSources[0]->getType()->isIntegerTy(32)) ||
        (isa<Constant>(selSources[1]) && selSources[1]->getType()->isIntegerTy(32))) {
      return false;
    }

    // As BFN doesn't support src modifier, it is not worth to generate the cmp-bfn
    // sequence if one of its sources will need an extra move.
    if (GetModifier(*selSources[0], selMod[0], selSources[0]) ||
        GetModifier(*selSources[1], selMod[1], selSources[1])) {
      return false;
    }

    CmpSelectPattern *pattern = new (m_allocator) CmpSelectPattern();
    pattern->predicate = cmp->getPredicate();
    bool supportsModifier = SupportsModifier(cmp, m_Platform);
    pattern->cmpSources[0] = GetSource(cmp->getOperand(0), supportsModifier, false, IsSourceOfSample(&I));
    pattern->cmpSources[1] = GetSource(cmp->getOperand(1), supportsModifier, false, IsSourceOfSample(&I));

    pattern->bfnSources[1] = GetSource(selSources[0], false, false, IsSourceOfSample(&I));
    pattern->bfnSources[2] = GetSource(selSources[1], false, false, IsSourceOfSample(&I));
    AddPattern(pattern);

    return true;
  }
  return false;
}

bool CodeGenPatternMatch::MatchDpas(GenIntrinsicInst &I) {
  struct DpasPattern : public Pattern {
    SSource source[3];
    GenIntrinsicInst *instruction;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      pass->emitDpas(instruction, source, modifier);
      // pass->BinaryUnary(instruction, source, modifier);
    }
  };

  GenISAIntrinsic::ID dpasID = I.getIntrinsicID();
  IGC_ASSERT_MESSAGE((dpasID == GenISAIntrinsic::GenISA_dpas || dpasID == GenISAIntrinsic::GenISA_sub_group_dpas),
                     "Unexpected DPAS intrinsic!");

  Value *src0 = I.getOperand(0); // input
  Value *src1 = I.getOperand(1); // activation. operand(3) is its precision
  Value *src2 = I.getOperand(2); // weight. operand(4) is its precision
  // ConstantInt* pa = dyn_cast<ConstantInt>(I.getOperand(3));
  // ConstantInt* pb = dyn_cast<ConstantInt>(I.getOperand(4));
  ConstantInt *sdepth = cast<ConstantInt>(I.getOperand(5));
  ConstantInt *rcount = cast<ConstantInt>(I.getOperand(6));
  int SD = (int)sdepth->getZExtValue();
  int RC = (int)rcount->getZExtValue();

  if (dpasID == GenISAIntrinsic::GenISA_dpas && RC == 1 && SD == 8) {
    // Special-handling of activation (src1 - uniform). The case handled is:
    //
    // %s0 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %5, i32 0)
    // %s1 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %5, i32 1)
    // %s2 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %5, i32 2)
    // %s3 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %5, i32 3)
    // %s4 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %5, i32 4)
    // %s5 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %5, i32 5)
    // %s6 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %5, i32 6)
    // %s7 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %5, i32 7)
    // %a0 = insertelement <8 x i32> undef, i32 %s0, i32 0
    // %a1 = insertelement <8 x i32> %a0,   i32 %s1, i32 1
    // %a2 = insertelement <8 x i32> %a1,   i32 %s2, i32 2
    // %a3 = insertelement <8 x i32> %a2,   i32 %s3, i32 3
    // %a4 = insertelement <8 x i32> %a3,   i32 %s4, i32 4
    // %a5 = insertelement <8 x i32> %a4,   i32 %s5, i32 5
    // %a6 = insertelement <8 x i32> %a5,   i32 %s6, i32 6
    // %a7 = insertelement <8 x i32> %a6,   i32 %s7, i32 7
    //
    // %c0 = call i32 @llvm.genx.GenISA.dpas.8(i32 %9, <8 x i32> %a7, i32 7, <8 x i32> %14, i32 7)
    //
    // %a7 will be replaced with %5 in vISA.
    //

    InsertElementInst *rootIEI = dyn_cast<InsertElementInst>(src1);
    if (rootIEI) {
      // Currently, the src1 can be at most int8.
      Value *srcVec[8];
      Value *WSVal = nullptr;
      for (int i = 0; i < 8; srcVec[i] = nullptr, ++i)
        ;
      InsertElementInst *IEI = rootIEI;
      while (IEI) {
        Value *elem = IEI->getOperand(1);
        ConstantInt *CI = dyn_cast<ConstantInt>(IEI->getOperand(2));
        if (!CI) {
          // Only handle constant index
          WSVal = nullptr;
          break;
        }
        uint32_t ix = (uint32_t)CI->getZExtValue();
        if (ix > 7 || srcVec[ix]) {
          // Assume each element is inserted once.
          WSVal = nullptr;
          break;
        }

        // Check if the inserted element is created
        // by WaveShuffleIndex intrinsic
        GenIntrinsicInst *WSI = dyn_cast<GenIntrinsicInst>(elem);
        if (!WSI || (WSI->getIntrinsicID() != GenISAIntrinsic::GenISA_WaveShuffleIndex &&
                     WSI->getIntrinsicID() != GenISAIntrinsic::GenISA_WaveBroadcast)) {
          WSVal = nullptr;
          break;
        }
        Value *shuffleVal = WSI->getOperand(0);
        Value *ixVal = WSI->getOperand(1);
        if (WSVal == nullptr) {
          WSVal = shuffleVal;
        } else if (WSVal != shuffleVal) {
          WSVal = nullptr;
          break;
        }
        if (ConstantInt *CIX = dyn_cast<ConstantInt>(ixVal)) {
          if ((uint32_t)CIX->getZExtValue() != ix) {
            WSVal = nullptr;
            break;
          }
        }
        srcVec[ix] = elem;
        IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0));
      }

      if (WSVal) {
        for (int i = 0; i < SD; ++i) {
          if (srcVec[i] == nullptr) {
            WSVal = nullptr;
            break;
          }
        }
      }

      // If WSVal is set at this point, it is one that will
      // replace src1.
      if (WSVal) {
        src1 = WSVal;
      }
    }
  }

  DpasPattern *pattern = new (m_allocator) DpasPattern();
  pattern->instruction = &I;
  pattern->source[0] = GetSource(src0, false, false, IsSourceOfSample(&I));
  pattern->source[1] = GetSource(src1, false, false, IsSourceOfSample(&I));
  pattern->source[2] = GetSource(src2, false, false, IsSourceOfSample(&I));
  AddPattern(pattern);

  return true;
}

bool CodeGenPatternMatch::MatchDp4a(GenIntrinsicInst &I) {
  struct MatchDp4a : public Pattern {
    SSource source[3];
    GenIntrinsicInst *instruction;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->emitDP4A(instruction, source, modifier); }
  };

  // Attempt to find a pattern like this:
  // %scalar52.6.2474 = extractelement <4 x i8> %145, i32 0
  // %scalar53.6.2475 = extractelement <4 x i8> % 145, i32 1
  // %scalar54.6.2476 = extractelement <4 x i8> % 145, i32 2
  // %scalar55.6.2477 = extractelement <4 x i8> % 145, i32 3
  // %simdShuffle.6.2480 = call i8 @llvm.genx.GenISA.WaveShuffleIndex.i8(i8 % scalar52.6.2474, i32 0, i32 0)
  // %simdShuffle33.6.2481 = call i8 @llvm.genx.GenISA.WaveShuffleIndex.i8(i8 % scalar53.6.2475, i32 0, i32 0)
  // %simdShuffle34.6.2482 = call i8 @llvm.genx.GenISA.WaveShuffleIndex.i8(i8 % scalar54.6.2476, i32 0, i32 0)
  // %simdShuffle35.6.2483 = call i8 @llvm.genx.GenISA.WaveShuffleIndex.i8(i8 % scalar55.6.2477, i32 0, i32 0)
  // %assembled.vect.6.2484 = insertelement <4 x i8> undef, i8 % simdShuffle.6.2480, i32 0
  // %assembled.vect56.6.2485 = insertelement <4 x i8> % assembled.vect.6.2484, i8 % simdShuffle33.6.2481, i32 1
  // %assembled.vect57.6.2486 = insertelement <4 x i8> % assembled.vect56.6.2485, i8 % simdShuffle34.6.2482, i32 2
  // %assembled.vect58.6.2487 = insertelement <4 x i8> % assembled.vect57.6.2486, i8 % simdShuffle35.6.2483, i32 3
  // %astype.i45.6.2489 = bitcast <4 x i8> % assembled.vect58.6.2487 to i32
  // %call.i4738.6.2490 = call i32 @llvm.genx.GenISA.dp4a.uu(i32 % call.i4738.6.3.1, i32 % astype.i45.6.2489, i32 % 135)
  // %call.i1239.6.2492 = call i32 @llvm.genx.GenISA.dp4a.uu(i32 % call.i1239.6.3.1, i32 % astype.i45.6.2489, i32
  // 16843009)
  //
  // If the pattern is found, we can avoid creating extra movs by changing the source of the dp4a
  // to use the input from the bitcast instead of using the result of the bitcast.
  // This pattern only happens if the WaveShuffleIndex uses 0,0 for the arguments.
  if (llvm::BitCastInst *bcInst = llvm::dyn_cast<BitCastInst>(I.getOperand(1))) {
    llvm::CallInst *waveInst[4] = {nullptr};
    llvm::InsertElementInst *insertInst[4] = {nullptr};

    // Find the 4 x insertelement
    insertInst[3] = llvm::dyn_cast<InsertElementInst>(bcInst->getOperand(0));
    if (insertInst[3]) {
      for (int i = 2; i >= 0; i--) {
        insertInst[i] = llvm::dyn_cast<InsertElementInst>(insertInst[i + 1]->getOperand(0));
        if (!insertInst[i])
          break;
      }
    }

    // Find the 4 x WaveShuffleIndex
    for (int i = 3; i >= 0; i--) {
      if (!insertInst[i])
        break;
      CallInst *temp = llvm::dyn_cast<CallInst>(insertInst[i]->getOperand(1));
      if (!temp)
        break;

      llvm::GenIntrinsicInst *intrin = llvm::dyn_cast<llvm::GenIntrinsicInst>(temp);
      if (!intrin || (intrin->getIntrinsicID() != GenISAIntrinsic::GenISA_WaveShuffleIndex &&
                      intrin->getIntrinsicID() != GenISAIntrinsic::GenISA_WaveBroadcast))
        break;
      waveInst[i] = temp;
    }

    // Check to see if the WaveShuffleIndex uses 0,0
    llvm::Constant *wavesrc1 = nullptr, *wavesrc2 = nullptr;
    if (waveInst[0]) {
      wavesrc1 = llvm::dyn_cast<llvm::Constant>(waveInst[0]->getOperand(1));
      wavesrc2 = llvm::dyn_cast<llvm::Constant>(waveInst[0]->getOperand(2));
    }

    if (wavesrc1 && wavesrc2 && wavesrc1->isZeroValue() && wavesrc2->isZeroValue()) {
      if (llvm::ExtractElementInst *wavesrc0 = llvm::dyn_cast<llvm::ExtractElementInst>(waveInst[0]->getOperand(0))) {
        llvm::ExtractElementInst *extractInst[4];

        // Find the 4 x extractelement
        extractInst[0] = llvm::dyn_cast<llvm::ExtractElementInst>(waveInst[0]->getOperand(0));
        extractInst[1] = llvm::dyn_cast<llvm::ExtractElementInst>(waveInst[1]->getOperand(0));
        extractInst[2] = llvm::dyn_cast<llvm::ExtractElementInst>(waveInst[2]->getOperand(0));
        extractInst[3] = llvm::dyn_cast<llvm::ExtractElementInst>(waveInst[3]->getOperand(0));

        if (extractInst[0] && extractInst[1] && extractInst[2] && extractInst[3] &&
            extractInst[0]->getOperand(0) == extractInst[1]->getOperand(0) &&
            extractInst[0]->getOperand(0) == extractInst[2]->getOperand(0) &&
            extractInst[0]->getOperand(0) == extractInst[3]->getOperand(0)) {
          MatchDp4a *pattern = new (m_allocator) MatchDp4a();
          pattern->instruction = &I;
          pattern->source[0] = GetSource(I.getOperand(0), false, false, IsSourceOfSample(&I));

          // set regioning to: <0;1,0>, as if we were still going to use the result of the WaveShuffleIndex
          llvm::BitCastInst *bitCast = llvm::dyn_cast<BitCastInst>(extractInst[0]->getOperand(0));
          if (bitCast) {
            pattern->source[1] = GetSource(bitCast->getOperand(0), false, false, IsSourceOfSample(&I));
            pattern->source[1].region[0] = 0;
            pattern->source[1].region[1] = 1;
            pattern->source[1].region[2] = 0;
            pattern->source[1].region_set = true;

            pattern->source[2] = GetSource(I.getOperand(2), false, false, IsSourceOfSample(&I));
            AddPattern(pattern);
            return true;
          }
        }
      }
    }
  }
  return MatchSingleInstruction(I);
}

bool CodeGenPatternMatch::MatchLogicAlu(llvm::BinaryOperator &I) {
  struct LogicInstPattern : public Pattern {
    SSource sources[2];
    llvm::Instruction *instruction;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      pass->BinaryUnary(instruction, sources, modifier);
    }
  };
  LogicInstPattern *pattern = new (m_allocator) LogicInstPattern();
  pattern->instruction = &I;
  for (unsigned int i = 0; i < 2; ++i) {
    e_modifier mod = EMOD_NONE;
    Value *src = I.getOperand(i);
    if (!I.getType()->isIntegerTy(1)) {
      if (BinaryOperator *notInst = dyn_cast<BinaryOperator>(src)) {
        if (notInst->getOpcode() == Instruction::Xor) {
          if (ConstantInt *minusOne = dyn_cast<ConstantInt>(notInst->getOperand(1))) {
            if (minusOne->isMinusOne()) {
              mod = EMOD_NOT;
              src = notInst->getOperand(0);
            }
          }
        }
      }
    }
    pattern->sources[i] = GetSource(src, mod, false, IsSourceOfSample(&I));

    if (isCandidateForConstantPool(src)) {
      AddToConstantPool(I.getParent(), src);
      pattern->sources[i].fromConstantPool = true;
    }
  }
  AddPattern(pattern);
  return true;
}

bool CodeGenPatternMatch::MatchRsqrt(llvm::BinaryOperator &I) {
  struct RsqrtPattern : public Pattern {
    SSource source;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Rsqrt(source, modifier); }
  };

  bool found = false;
  llvm::Value *source = NULL;
  if (I.getOpcode() == Instruction::FDiv) {
    // by vISA document, rsqrt doesn't support double type
    if (isOne(I.getOperand(0)) && I.getType()->getTypeID() != Type::DoubleTyID) {
      if (llvm::IntrinsicInst *sqrt = dyn_cast<IntrinsicInst>(I.getOperand(1))) {
        if (sqrt->getIntrinsicID() == Intrinsic::sqrt) {
          source = sqrt->getOperand(0);
          found = true;
        }
      }
    }
  }
  if (found) {
    RsqrtPattern *pattern = new (m_allocator) RsqrtPattern();
    pattern->source = GetSource(source, true, false, IsSourceOfSample(&I));
    AddPattern(pattern);
  }
  return found;
}

bool CodeGenPatternMatch::MatchArcpFdiv(llvm::BinaryOperator &I) {

  using namespace llvm::PatternMatch;

  struct ArcpFdivPattern : public Pattern {
    SSource sources[2];
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->FDiv(sources, modifier); }
  };

  if (!I.getType()->isDoubleTy() || !I.hasAllowReciprocal())
    return false;

  // Look for fdiv.
  Instruction *fdiv = nullptr;
  Value *dividend = nullptr, *divisor = nullptr;

  auto fdivPattern = m_OneUse(m_FDiv(m_FPOne(), m_Value(divisor)));

  if (match(I.getOperand(0), fdivPattern)) {
    fdiv = dyn_cast<Instruction>(I.getOperand(0));
    dividend = I.getOperand(1);
  } else if (match(I.getOperand(1), fdivPattern)) {
    fdiv = dyn_cast<Instruction>(I.getOperand(1));
    dividend = I.getOperand(0);
  }

  if (!fdiv || !fdiv->hasAllowReciprocal())
    return false;

  // Pattern found.
  ArcpFdivPattern *pattern = new (m_allocator) ArcpFdivPattern();
  Value *sources[2] = {dividend, divisor};
  e_modifier src_mod[2] = {};

  if (FlushesDenormsOnInput(*fdiv)) {
    sources[0] = SkipCanonicalize(sources[0]);
    sources[1] = SkipCanonicalize(sources[1]);
  }

  GetModifier(*sources[0], src_mod[0], sources[0]);
  GetModifier(*sources[1], src_mod[1], sources[1]);

  pattern->sources[0] = GetSource(sources[0], src_mod[0], false, IsSourceOfSample(&I));
  pattern->sources[1] = GetSource(sources[1], src_mod[1], false, IsSourceOfSample(&I));

  // Try to add to constant pool whatever possible.
  if (isCandidateForConstantPool(sources[0])) {
    AddToConstantPool(I.getParent(), sources[0]);
    pattern->sources[0].fromConstantPool = true;
  }
  if (isCandidateForConstantPool(sources[1])) {
    AddToConstantPool(I.getParent(), sources[1]);
    pattern->sources[1].fromConstantPool = true;
  }

  AddPattern(pattern);

  return true;
}

bool CodeGenPatternMatch::MatchGradient(llvm::GenIntrinsicInst &I) {
  struct GradientPattern : public Pattern {
    SSource sources[2];
    llvm::Instruction *instruction;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      pass->BinaryUnary(instruction, sources, modifier);
    }
  };
  GradientPattern *pattern = new (m_allocator) GradientPattern();
  pattern->instruction = &I;
  pattern->sources[0] = GetSource(I.getOperand(0), true, false, IsSourceOfSample(&I));
  pattern->sources[1] = {};
  AddPattern(pattern);
  // mark the source as subspan use
  HandleSubspanUse(pattern->sources[0].value, IsSourceOfSample(&I));
  return true;
}

bool CodeGenPatternMatch::MatchSampleDerivative(llvm::GenIntrinsicInst &I) {
  HandleSampleDerivative(I);
  return MatchSingleInstruction(I);
}

bool CodeGenPatternMatch::MatchDbgInstruction(llvm::DbgInfoIntrinsic &I) {
  struct DbgInstPattern : Pattern {
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      // Nothing to emit.
    }
  };
  DbgInstPattern *pattern = new (m_allocator) DbgInstPattern();
  AddPattern(pattern);
  return true;
}

bool CodeGenPatternMatch::MatchAvg(llvm::Instruction &I) {
  // "Average value" pattern:
  // (x + y + 1) / 2  -->  avg(x, y)
  //
  // We're looking for patterns like this:
  //    % 14 = add nsw i32 % 10, % 13
  //    % 15 = add nsw i32 % 14, 1
  //    % 16 = ashr i32 % 15, 1

  struct AvgPattern : Pattern {
    SSource sources[2];
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Avg(sources, modifier); }
  };

  bool found = false;
  llvm::Value *sources[2] = {};
  e_modifier src_mod[2] = {};

  IGC_ASSERT(I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv ||
             I.getOpcode() == Instruction::AShr);

  // We expect 2 for "div" and 1 for "right shift".
  int expectedVal = (I.getOpcode() == Instruction::SDiv ? 2 : 1);
  Value *opnd1 = I.getOperand(1); // Divisor or shift factor.
  if (!isa<ConstantInt>(opnd1) || (cast<ConstantInt>(opnd1))->getZExtValue() != expectedVal) {
    return false;
  }

  if (Instruction *divSrc = dyn_cast<Instruction>(I.getOperand(0))) {
    if (divSrc->getOpcode() == Instruction::Add && !NeedInstruction(*divSrc)) {
      Instruction *instAdd = cast<Instruction>(divSrc);
      for (int i = 0; i < 2; i++) {
        if (ConstantInt *cnst = dyn_cast<ConstantInt>(instAdd->getOperand(i))) {
          // "otherArg" is the second argument of "instAdd" (which is not constant).
          Value *otherArg = instAdd->getOperand(i == 0 ? 1 : 0);
          if (cnst->getZExtValue() == 1 && isa<AddOperator>(otherArg) &&
              !NeedInstruction(*cast<Instruction>(otherArg))) {
            Instruction *firstAdd = cast<Instruction>(otherArg);
            sources[0] = firstAdd->getOperand(0);
            sources[1] = firstAdd->getOperand(1);
            GetModifier(*sources[0], src_mod[0], sources[0]);
            GetModifier(*sources[1], src_mod[1], sources[1]);
            found = true;
            break;
          }
        }
      }
    }
  }

  if (found) {
    AvgPattern *pattern = new (m_allocator) AvgPattern();
    pattern->sources[0] = GetSource(sources[0], src_mod[0], false, IsSourceOfSample(&I));
    pattern->sources[1] = GetSource(sources[1], src_mod[1], false, IsSourceOfSample(&I));
    AddPattern(pattern);
  }
  return found;
}

bool CodeGenPatternMatch::MatchShuffleBroadCast(llvm::GenIntrinsicInst &I) {
  // Match cases like:
  //    %84 = bitcast <2 x i32> %vCastload to <4 x half>
  //    %scalar269 = extractelement <4 x half> % 84, i32 0
  //    %simdShuffle = call half @llvm.genx.GenISA.simdShuffle.f.f16(half %scalar269, i32 0)
  //
  // to mov with region and offset
  struct BroadCastPattern : public Pattern {
    SSource source;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) { pass->Mov(source, modifier); }
  };
  bool match = false;
  SSource source;
  Value *sourceV = &I;
  if (GetRegionModifier(source, sourceV, true)) {
    BroadCastPattern *pattern = new (m_allocator) BroadCastPattern();
    GetModifier(*sourceV, source.mod, sourceV);
    source.value = sourceV;
    pattern->source = source;
    MarkAsSource(sourceV, IsSourceOfSample(&I));
    match = true;
    AddPattern(pattern);
  }
  return match;
}

bool CodeGenPatternMatch::MatchWaveShuffleIndex(llvm::GenIntrinsicInst &I) {
  auto helperLaneMode = cast<ConstantInt>(I.getOperand(2));
  IGC_ASSERT(helperLaneMode);
  if (int_cast<int>(helperLaneMode->getSExtValue()) == 1) {
    // only if helperLaneMode==1, we enable helper lane under some shuffleindex cases (not for all cases).
    HandleSubspanUse(I.getArgOperand(0), IsSourceOfSample(&I));
    HandleSubspanUse(I.getArgOperand(1), IsSourceOfSample(&I));
    HandleSubspanUse(&I, IsSourceOfSample(&I));
  }
  return MatchSingleInstruction(I);
}

bool CodeGenPatternMatch::MatchWaveInstruction(llvm::GenIntrinsicInst &I) {
  if (subgroupIntrinsicHasHelperLanes(I)) {
    m_NeedVMask = true;
  }
  return MatchSingleInstruction(I);
}

bool CodeGenPatternMatch::MatchRegisterRegion(llvm::GenIntrinsicInst &I) {
  struct MatchRegionPattern : public Pattern {
    SSource source;
    virtual void Emit(EmitPass *pass, const DstModifier &modifier) {
      const bool isSimd32Dispatch = (pass->m_currShader->m_State.m_dispatchSize == SIMDMode::SIMD32);
      if (isSimd32Dispatch && pass->m_currShader->m_numberInstance == 2) {
        pass->emitCrossInstanceMov(source, modifier);
      } else {
        pass->Mov(source, modifier);
      }
    }
  };

  /*
  * Match case 1 - With SubReg Offset: Shuffle( data, (laneID << x) + y )
  *   %25 = call i16 @llvm.genx.GenISA.simdLaneId()
  *   %30 = zext i16 %25 to i32
  *   %31 = shl nuw nsw i32 %30, 1  - Current LaneID shifted by x
  *   %36 = add i32 %31, 1          - Current LaneID shifted by x + y  Shuffle( data, (laneID << x) + 1 )
  *   %37 = call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %21, i32 %36)

  * Match case 2(Special case of Match Case 1) - No SubReg Offset: Shuffle( data, (laneID << x) + 0 )
  *    %25 = call i16 @llvm.genx.GenISA.simdLaneId()
  *    %30 = zext i16 %25 to i32
  *    %31 = shl nuw nsw i32 %30, 1 - Current LaneID shifted by x
  *    %32 = call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %21, i32 %31)
  */

  Value *data = I.getOperand(0);
  Value *source = I.getOperand(1);
  uint typeByteSize = data->getType()->getScalarSizeInBits() / 8;
  bool isMatch = false;
  int subReg = 0;
  uint verticalStride = 1; // Default value for special case  Shuffle( data, (laneID << x) + y )  when x = 0

  if (auto binaryInst = dyn_cast<BinaryOperator>(source)) {
    // Will be skipped for match case 2
    if (binaryInst->getOpcode() == Instruction::Add) {
      if (llvm::ConstantInt *simDOffSetInst = llvm::dyn_cast<llvm::ConstantInt>(binaryInst->getOperand(1))) {
        subReg = int_cast<int>(cast<ConstantInt>(simDOffSetInst)->getSExtValue());

        // Subregister must be a number between 0 and 15 for a valid region
        //  We could support up to 31 but we need to handle reading from different SIMD16 var chunks
        if (subReg >= 0 && subReg < 16) {
          source = binaryInst->getOperand(0);
        }
      }
    }
  }

  if (auto binaryInst = dyn_cast<BinaryOperator>(source)) {
    if (binaryInst->getOpcode() == Instruction::Shl) {
      source = binaryInst->getOperand(0);

      if (llvm::ConstantInt *simDOffSetInst = llvm::dyn_cast<llvm::ConstantInt>(binaryInst->getOperand(1))) {
        uint shiftFactor = int_cast<uint>(simDOffSetInst->getZExtValue());
        // Check to make sure we dont end up with an invalid Vertical Stride.
        // Only 1, 2, 4, 8, 16 are supported.
        if (shiftFactor <= 4)
          verticalStride = (1U << shiftFactor);
        else
          return false;
      }
    }
  }

  if (auto zExtInst = llvm::dyn_cast<llvm::ZExtInst>(source)) {
    source = zExtInst->getOperand(0);
  }

  llvm::GenIntrinsicInst *intrin = llvm::dyn_cast<llvm::GenIntrinsicInst>(source);

  // Finally check for simLaneID intrisic
  if (intrin && (intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_simdLaneId)) {
    // To avoid compiler crash, pattern match with direct mov will be disable
    // Conservetively, we assum simd16 for 32 bytes GRF platforms and simd32 for 64 bytes GRF platforms
    bool cross2GRFs = typeByteSize * (subReg + verticalStride * (m_Platform.getGRFSize() > 32 ? 32 : 16)) >
                      (2 * m_Platform.getGRFSize());
    if (!cross2GRFs) {
      MatchRegionPattern *pattern = new (m_allocator) MatchRegionPattern();
      pattern->source.elementOffset = subReg;

      // Set Region Parameters <VerString;Width,HorzString>
      pattern->source.region_set = true;
      pattern->source.region[0] = verticalStride;
      pattern->source.region[1] = 1;
      pattern->source.region[2] = 0;

      pattern->source.value = data;
      MarkAsSource(data, IsSourceOfSample(&I));
      HandleSubspanUse(data, IsSourceOfSample(&I));
      AddPattern(pattern);

      isMatch = true;
    }
  }

  return isMatch;
}

bool CodeGenPatternMatch::GetRegionModifier(SSource &sourceMod, llvm::Value *&source, bool regioning) {
  bool found = false;
  Value *OrignalSource = source;
  if (llvm::BitCastInst *bitCast = llvm::dyn_cast<BitCastInst>(source)) {
    if (!bitCast->getType()->isVectorTy() && !bitCast->getOperand(0)->getType()->isVectorTy()) {
      source = bitCast->getOperand(0);
      found = true;
    }
  }

  if (llvm::GenIntrinsicInst *intrin = llvm::dyn_cast<llvm::GenIntrinsicInst>(source)) {
    GenISAIntrinsic::ID id = intrin->getIntrinsicID();
    if (id == GenISAIntrinsic::GenISA_WaveShuffleIndex || id == GenISAIntrinsic::GenISA_WaveBroadcast) {
      if (llvm::ConstantInt *channelVal = llvm::dyn_cast<llvm::ConstantInt>(intrin->getOperand(1))) {
        unsigned int offset = int_cast<unsigned int>(channelVal->getZExtValue());
        if (offset < 16 && !isUniform(intrin->getOperand(0))) {
          sourceMod.elementOffset = offset;
          // SIMD shuffle force region <0,1;0>
          sourceMod.region_set = true;
          sourceMod.region[0] = 0;
          sourceMod.region[1] = 1;
          sourceMod.region[2] = 0;
          sourceMod.instance = EINSTANCE_FIRST_HALF;
          source = intrin->getOperand(0);
          found = true;
          BitcastSearch(sourceMod, source, true);
        }
      }
    }
  }
  if (regioning && !sourceMod.region_set) {
    found |= BitcastSearch(sourceMod, source, false);
  }
  if (found && sourceMod.type == VISA_Type::ISA_TYPE_NUM) {
    // keep the original type
    sourceMod.type = GetType(OrignalSource->getType(), m_ctx);
  }
  return found;
}

void CodeGenPatternMatch::HandleSampleDerivative(llvm::GenIntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case GenISAIntrinsic::GenISA_sampleptr:
  case GenISAIntrinsic::GenISA_lodptr:
  case GenISAIntrinsic::GenISA_sampleKillPix:
    HandleSubspanUse(I.getOperand(0), true);
    HandleSubspanUse(I.getOperand(1), true);
    HandleSubspanUse(I.getOperand(2), true);
    break;
  case GenISAIntrinsic::GenISA_sampleBptr:
  case GenISAIntrinsic::GenISA_sampleCptr:
    HandleSubspanUse(I.getOperand(1), true);
    HandleSubspanUse(I.getOperand(2), true);
    HandleSubspanUse(I.getOperand(3), true);
    break;
  case GenISAIntrinsic::GenISA_sampleBCptr:
    HandleSubspanUse(I.getOperand(2), true);
    HandleSubspanUse(I.getOperand(3), true);
    HandleSubspanUse(I.getOperand(4), true);
    break;
  case GenISAIntrinsic::GenISA_gather4Iptr:
  case GenISAIntrinsic::GenISA_gather4IPOptr:
  case GenISAIntrinsic::GenISA_gather4POPackedIptr:
    HandleSubspanUse(I.getOperand(0), true);
    HandleSubspanUse(I.getOperand(1), true);
    HandleSubspanUse(I.getOperand(2), true);
    break;
  case GenISAIntrinsic::GenISA_samplePOptr:
    HandleSubspanUse(I.getOperand(0), true);
    HandleSubspanUse(I.getOperand(1), true);
    HandleSubspanUse(I.getOperand(2), true);
    break;
  case GenISAIntrinsic::GenISA_gather4Bptr:
  case GenISAIntrinsic::GenISA_gather4BPOptr:
  case GenISAIntrinsic::GenISA_gather4ICptr:
  case GenISAIntrinsic::GenISA_gather4ICPOptr:
  case GenISAIntrinsic::GenISA_gather4POPackedBptr:
  case GenISAIntrinsic::GenISA_gather4POPackedICptr:
    HandleSubspanUse(I.getOperand(1), true);
    HandleSubspanUse(I.getOperand(2), true);
    HandleSubspanUse(I.getOperand(3), true);
    break;
  case GenISAIntrinsic::GenISA_sampleMlodptr:
    HandleSubspanUse(I.getOperand(1), true);
    HandleSubspanUse(I.getOperand(2), true);
    HandleSubspanUse(I.getOperand(3), true);
    break;
  case GenISAIntrinsic::GenISA_sampleCMlodptr:
  case GenISAIntrinsic::GenISA_sampleBCMlodptr:
    HandleSubspanUse(I.getOperand(2), true);
    HandleSubspanUse(I.getOperand(3), true);
    HandleSubspanUse(I.getOperand(4), true);
    break;
  case GenISAIntrinsic::GenISA_samplePOBptr:
  case GenISAIntrinsic::GenISA_samplePOCptr:
    HandleSubspanUse(I.getOperand(1), true);
    HandleSubspanUse(I.getOperand(2), true);
    HandleSubspanUse(I.getOperand(3), true);
    break;
  default:
    break;
  }
}

// helper function for pattern match
static inline bool isLowerPredicate(llvm::CmpInst::Predicate pred) {
  switch (pred) {
  case llvm::CmpInst::FCMP_ULT:
  case llvm::CmpInst::FCMP_ULE:
  case llvm::CmpInst::FCMP_OLT:
  case llvm::CmpInst::FCMP_OLE:
  case llvm::CmpInst::ICMP_ULT:
  case llvm::CmpInst::ICMP_ULE:
  case llvm::CmpInst::ICMP_SLT:
  case llvm::CmpInst::ICMP_SLE:
    return true;
  default:
    break;
  }
  return false;
}

// helper function for pattern match
static inline bool isGreaterOrLowerPredicate(llvm::CmpInst::Predicate pred) {
  switch (pred) {
  case llvm::CmpInst::FCMP_UGT:
  case llvm::CmpInst::FCMP_UGE:
  case llvm::CmpInst::FCMP_ULT:
  case llvm::CmpInst::FCMP_ULE:
  case llvm::CmpInst::FCMP_OGT:
  case llvm::CmpInst::FCMP_OGE:
  case llvm::CmpInst::FCMP_OLT:
  case llvm::CmpInst::FCMP_OLE:
  case llvm::CmpInst::ICMP_UGT:
  case llvm::CmpInst::ICMP_UGE:
  case llvm::CmpInst::ICMP_ULT:
  case llvm::CmpInst::ICMP_ULE:
  case llvm::CmpInst::ICMP_SGT:
  case llvm::CmpInst::ICMP_SGE:
  case llvm::CmpInst::ICMP_SLT:
  case llvm::CmpInst::ICMP_SLE:
    return true;
  default:
    break;
  }
  return false;
}

static bool isIntegerAbs(SelectInst *SI, e_modifier &mod, Value *&source) {
  using namespace llvm::PatternMatch; // Scoped using declaration.

  Value *Cond = SI->getOperand(0);
  Value *TVal = SI->getOperand(1);
  Value *FVal = SI->getOperand(2);

  ICmpInst::Predicate IPred = FCmpInst::FCMP_FALSE;
  Value *LHS = nullptr;
  Value *RHS = nullptr;

  if (!match(Cond, m_ICmp(IPred, m_Value(LHS), m_Value(RHS))))
    return false;

  if (!ICmpInst::isSigned(IPred))
    return false;

  if (match(LHS, m_Zero())) {
    IPred = ICmpInst::getSwappedPredicate(IPred);
    std::swap(LHS, RHS);
  }

  if (!match(RHS, m_Zero()))
    return false;

  if (match(TVal, m_Neg(m_Specific(FVal)))) {
    IPred = ICmpInst::getInversePredicate(IPred);
    std::swap(TVal, FVal);
  }

  if (!match(FVal, m_Neg(m_Specific(TVal))))
    return false;

  if (LHS != TVal)
    return false;

  source = TVal;
  mod = (IPred == ICmpInst::ICMP_SGT || IPred == ICmpInst::ICMP_SGE) ? EMOD_ABS : EMOD_NEGABS;

  return true;
}

bool isAbs(llvm::Value *abs, e_modifier &mod, llvm::Value *&source) {
  bool found = false;

  if (IntrinsicInst *intrinsicInst = dyn_cast<IntrinsicInst>(abs)) {
    if (intrinsicInst->getIntrinsicID() == Intrinsic::fabs) {
      source = intrinsicInst->getOperand(0);
      mod = EMOD_ABS;
      return true;
    }
    if (intrinsicInst->getIntrinsicID() == Intrinsic::abs) {
      source = intrinsicInst->getOperand(0);
      mod = EMOD_ABS;
      return true;
    }
  }

  llvm::SelectInst *select = llvm::dyn_cast<llvm::SelectInst>(abs);
  if (!select)
    return false;

  // Try to find floating point abs first
  if (llvm::FCmpInst *cmp = llvm::dyn_cast<llvm::FCmpInst>(select->getOperand(0))) {
    llvm::CmpInst::Predicate pred = cmp->getPredicate();
    if (isGreaterOrLowerPredicate(pred)) {
      for (int zeroIndex = 0; zeroIndex < 2; zeroIndex++) {
        llvm::ConstantFP *zero = llvm::dyn_cast<llvm::ConstantFP>(cmp->getOperand(zeroIndex));
        if (zero && zero->isZero()) {
          llvm::Value *cmpSource = cmp->getOperand(1 - zeroIndex);
          for (int sourceIndex = 0; sourceIndex < 2; sourceIndex++) {
            if (cmpSource == select->getOperand(1 + sourceIndex)) {
              llvm::Instruction *opnd = llvm::dyn_cast<llvm::Instruction>(select->getOperand(1 + (1 - sourceIndex)));
              llvm::Value *negateSource = NULL;
              if (opnd && IsNegate(opnd, negateSource) && negateSource == cmpSource) {
                found = true;
                source = cmpSource;
                // depending on the order source in cmp/select it can abs() or -abs()
                bool isNegateAbs = (zeroIndex == 0) ^ isLowerPredicate(pred) ^ (sourceIndex == 1);
                mod = isNegateAbs ? EMOD_NEGABS : EMOD_ABS;
              }
              break;
            }
          }
          break;
        }
      }
    }
  }

  // If not found, try integer abs
  return found || isIntegerAbs(select, mod, source);
}

// combine two modifiers, this function is *not* communtative
e_modifier CombineModifier(e_modifier mod1, e_modifier mod2) {
  e_modifier mod = EMOD_NONE;
  switch (mod1) {
  case EMOD_ABS:
  case EMOD_NEGABS:
    mod = mod1;
    break;
  case EMOD_NEG:
    if (mod2 == EMOD_NEGABS) {
      mod = EMOD_ABS;
    } else if (mod2 == EMOD_ABS) {
      mod = EMOD_NEGABS;
    } else if (mod2 == EMOD_NEG) {
      mod = EMOD_NONE;
    } else {
      mod = EMOD_NEG;
    }
    break;
  default:
    mod = mod2;
  }
  return mod;
}

bool GetModifier(llvm::Value &modifier, e_modifier &mod, llvm::Value *&source) {
  mod = EMOD_NONE;
  if (llvm::Instruction *bin = llvm::dyn_cast<llvm::Instruction>(&modifier)) {
    return GetModifier(*bin, mod, source);
  }
  return false;
}

bool GetModifier(llvm::Instruction &modifier, e_modifier &mod, llvm::Value *&source) {
  llvm::Value *modifierSource = NULL;
  mod = EMOD_NONE;
  if (IsNegate(&modifier, modifierSource)) {
    e_modifier absModifier = EMOD_NONE;
    llvm::Value *absSource = NULL;
    if (isAbs(modifierSource, absModifier, absSource)) {
      source = absSource;
      mod = IGC::CombineModifier(EMOD_NEG, absModifier);
    } else {
      source = modifierSource;
      mod = EMOD_NEG;
    }
    return true;
  } else if (isAbs(&modifier, mod, modifierSource)) {
    source = modifierSource;
    return true;
  }
  return false;
}

bool IsNegate(llvm::Instruction *inst, llvm::Value *&negateSource) {
  BinaryOperator *binop = dyn_cast<BinaryOperator>(inst);
  if (binop && (inst->getOpcode() == Instruction::FSub || inst->getOpcode() == Instruction::Sub)) {
    if (IsZero(inst->getOperand(0))) {
      negateSource = inst->getOperand(1);
      return true;
    }
  }
  UnaryOperator *unop = dyn_cast<UnaryOperator>(inst);
  if (unop && inst->getOpcode() == Instruction::FNeg) {
    negateSource = inst->getOperand(0);
    return true;
  }
  return false;
}

bool IsZero(llvm::Value *zero) {
  if (llvm::ConstantFP *FCst = llvm::dyn_cast<llvm::ConstantFP>(zero)) {
    if (FCst->isZero()) {
      return true;
    }
  }
  if (llvm::ConstantInt *ICst = llvm::dyn_cast<llvm::ConstantInt>(zero)) {
    if (ICst->isZero()) {
      return true;
    }
  }
  return false;
}

inline bool isMinOrMax(llvm::Value *inst, llvm::Value *&source0, llvm::Value *&source1, bool &isMin, bool &isUnsigned) {
  bool found = false;
  llvm::Instruction *max = llvm::dyn_cast<llvm::Instruction>(inst);
  if (!max)
    return false;

  EOPCODE op = GetOpCode(max);
  if (op == llvm_min || op == llvm_max) {
    source0 = max->getOperand(0);
    source1 = max->getOperand(1);
    isUnsigned = false;
    isMin = (op == llvm_min);
    return true;
  } else if (op == llvm_select) {
    if (llvm::CmpInst *cmp = llvm::dyn_cast<llvm::CmpInst>(max->getOperand(0))) {
      auto SkipAsr = [max](uint32_t idx) {
        ConstantInt *c1 = dyn_cast<ConstantInt>(max->getOperand(1));
        ConstantInt *c2 = dyn_cast<ConstantInt>(max->getOperand(2));
        Instruction *op = dyn_cast<Instruction>(max->getOperand(idx));
        if (((c1 && c1->isZeroValue()) || (c2 && c2->isZeroValue())) && (op && op->getOpcode() == Instruction::AShr)) {
          return op->getOperand(0);
        }
        return max->getOperand(idx);
      };
      if (isGreaterOrLowerPredicate(cmp->getPredicate())) {
        if ((cmp->getOperand(0) == max->getOperand(1) && cmp->getOperand(1) == max->getOperand(2)) ||
            (cmp->getOperand(0) == max->getOperand(2) && cmp->getOperand(1) == max->getOperand(1))) {
          isMin = isLowerPredicate(cmp->getPredicate()) ^ (cmp->getOperand(0) == max->getOperand(2));
          found = true;
        } else if ((cmp->getOperand(0) == SkipAsr(1) && cmp->getOperand(1) == SkipAsr(2)) ||
                   (cmp->getOperand(0) == SkipAsr(2) && cmp->getOperand(1) == SkipAsr(1))) {
          isMin = isLowerPredicate(cmp->getPredicate()) ^ (cmp->getOperand(0) == SkipAsr(2));
          found = true;
        }
        if (found) {
          source0 = max->getOperand(1);
          source1 = max->getOperand(2);
          isUnsigned = IsUnsignedCmp(cmp->getPredicate());
        }
      }
    }
  }
  return found;
}

bool isMax(llvm::Value *max, llvm::Value *&source0, llvm::Value *&source1) {
  bool isMin, isUnsigned;
  llvm::Value *maxSource0;
  llvm::Value *maxSource1;
  if (isMinOrMax(max, maxSource0, maxSource1, isMin, isUnsigned)) {
    if (!isMin) {
      source0 = maxSource0;
      source1 = maxSource1;
      return true;
    }
  }
  return false;
}

bool isMin(llvm::Value *min, llvm::Value *&source0, llvm::Value *&source1) {
  bool isMin, isUnsigned;
  llvm::Value *maxSource0;
  llvm::Value *maxSource1;
  if (isMinOrMax(min, maxSource0, maxSource1, isMin, isUnsigned)) {
    if (isMin) {
      source0 = maxSource0;
      source1 = maxSource1;
      return true;
    }
  }
  return false;
}

bool isOne(llvm::Value *zero) {
  if (llvm::ConstantFP *FCst = llvm::dyn_cast<llvm::ConstantFP>(zero)) {
    if (FCst->isExactlyValue(1.f)) {
      return true;
    }
  }
  if (llvm::ConstantInt *ICst = llvm::dyn_cast<llvm::ConstantInt>(zero)) {
    if (ICst->isOne()) {
      return true;
    }
  }
  return false;
}

bool isSat(llvm::Instruction *sat, llvm::Value *&source, bool &isUnsigned) {
  bool found = false;
  llvm::Value *sources[2] = {0};
  bool floatMatch = sat->getType()->isFloatingPointTy();
  GenIntrinsicInst *intrin = dyn_cast<GenIntrinsicInst>(sat);
  if (intrin && (intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_fsat ||
                 intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_usat ||
                 intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_isat)) {
    source = intrin->getOperand(0);
    found = true;
    isUnsigned = intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_usat;
  } else if (floatMatch && isMax(sat, sources[0], sources[1])) {
    for (int i = 0; i < 2; i++) {
      if (IsZero(sources[i])) {
        llvm::Value *maxSources[2] = {0};
        if (isMin(sources[1 - i], maxSources[0], maxSources[1])) {
          for (int j = 0; j < 2; j++) {
            if (isOne(maxSources[j])) {
              found = true;
              source = maxSources[1 - j];
              isUnsigned = false;
              break;
            }
          }
        }
        break;
      }
    }
  } else if (floatMatch && isMin(sat, sources[0], sources[1])) {
    for (int i = 0; i < 2; i++) {
      if (isOne(sources[i])) {
        llvm::Value *maxSources[2] = {0};
        if (isMax(sources[1 - i], maxSources[0], maxSources[1])) {
          for (int j = 0; j < 2; j++) {
            if (IsZero(maxSources[j])) {
              found = true;
              source = maxSources[1 - j];
              isUnsigned = false;
              break;
            }
          }
        }
        break;
      }
    }
  }
  return found;
}

bool isCandidateForConstantPool(llvm::Value *val) {
  auto ci = dyn_cast<ConstantInt>(val);
  bool isBigQW = ci && !ci->getValue().isNullValue() && !ci->getValue().isSignedIntN(32);
  bool isDF = val->getType()->isDoubleTy();
  return (isBigQW || isDF);
}

uint CodeGenPatternMatch::GetBlockId(llvm::BasicBlock *block) {
  auto it = m_blockMap.find(block);
  IGC_ASSERT(it != m_blockMap.end());

  uint blockID = it->second->id;
  return blockID;
}

} // namespace IGC