/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2023 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "GraphColor.h"
#include "BuildIR.h"
#include "DebugInfo.h"
#include "FlagSpillCleanup.h"
#include "FlowGraph.h"
#include "LinearScanRA.h"
#include "LocalRA.h"
#include "Optimizer.h"
#include "PointsToAnalysis.h"
#include "RADebug.h"
#include "RPE.h"
#include "Rematerialization.h"
#include "SCCAnalysis.h"
#include "SpillCleanup.h"
#include "SpillCode.h"
#include "SplitAlignedScalars.h"
#include "Timer.h"
#include <algorithm>
#include <cmath> // sqrt
#include <fstream>
#include <iostream>
#include <list>
#include <sstream>
#include <optional>
#include "common/LLVMWarningsPush.hpp"
#include <llvm/ADT/SmallString.h>
#include <llvm/ADT/StringRef.h>
#include "common/LLVMWarningsPop.hpp"
using namespace vISA;
#define GRAPH_COLOR_MEM_SIZE (16 * 1024)
#define SCRATCH_MSG_LIMIT (128 * 1024)
#define SCRATCH_COMPRESS_THRESHOLD (12 * 1024)
const RAVarInfo GlobalRA::defaultValues;
const char GlobalRA::StackCallStr[] = "StackCall";
static const unsigned IN_LOOP_REFERENCE_COUNT_FACTOR = 4;
#define BANK_CONFLICT_HEURISTIC_INST 0.04
#define BANK_CONFLICT_HEURISTIC_REF_COUNT 0.25
#define BANK_CONFLICT_HEURISTIC_LOOP_ITERATION 5
#define BANK_CONFLICT_SEND_INST_CYCLE \
  60 // Some sends take 200 cycles, some 400; we choose the smaller value
#define BANK_CONFLICT_SIMD8_OVERHEAD_CYCLE 1
#define BANK_CONFLICT_SIMD16_OVERHEAD_CYCLE 2
#define INTERNAL_CONFLICT_RATIO_HEURISTIC 0.25
#define NOMASK_BYTE 0x80
Interference::Interference(const LivenessAnalysis *l, GlobalRA &g)
: gra(g), kernel(g.kernel), lrs(gra.incRA.getLRs()),
builder(*g.kernel.fg.builder), maxId(l->getNumSelectedVar()),
rowSize(maxId / BITS_DWORD + 1),
splitStartId(l->getNumSplitStartID()), splitNum(l->getNumSplitVar()),
liveAnalysis(l), aug(*this, *l, g), incRA(g.incRA),
sparseIntf(g.intfStorage.sparseIntf), sparseMatrix(g.intfStorage.sparseMatrix) {
denseMatrixLimit = builder.getuint32Option(vISA_DenseMatrixLimit);
incRA.registerNextIter((G4_RegFileKind)l->getSelectedRF(), l, this);
}
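// Comparator for the augmentation priority queue: std::priority_queue pops
// the "largest" element per the comparator, so returning A.end > B.end makes
// the queue pop the interval with the smallest end lexical id first (a
// min-heap on interval end).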
criticalCmpForEndInterval::criticalCmpForEndInterval(GlobalRA &g) : gra(g) {}
bool criticalCmpForEndInterval::operator()(const QueueEntry &A, const QueueEntry &B) const {
return A.interval.end->getLexicalId() > B.interval.end->getLexicalId();
}
AugmentPriorityQueue::AugmentPriorityQueue(criticalCmpForEndInterval cmp)
: std::priority_queue<QueueEntry, std::vector<QueueEntry>,
criticalCmpForEndInterval>(cmp) {}
inline bool Interference::varSplitCheckBeforeIntf(unsigned v1,
unsigned v2) const {
const LiveRange *l1 = lrs[v1];
const LiveRange *l2 = lrs[v2];
if (!l1->getIsPartialDcl() && !l2->getIsPartialDcl()) {
return false;
}
// Don't do interference for two split declares
if (l1->getIsPartialDcl() && l2->getIsPartialDcl()) {
return true;
}
unsigned p1 = v1;
unsigned p2 = v2;
// Don't do interference for child and parent declares
if (l1->getIsPartialDcl()) {
p1 = l1->getParentLRID();
}
if (l2->getIsPartialDcl()) {
p2 = l2->getParentLRID();
}
if (p1 == p2) {
return true;
}
return false;
}
BankConflict BankConflictPass::setupBankAccordingToSiblingOperand(
BankConflict assignedBank, unsigned offset, bool oneGRFBank) {
BankConflict tgtBank;
vISA_ASSERT(assignedBank != BANK_CONFLICT_NONE,
"sibling bank is not assigned");
// Set according to sibling
tgtBank = (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN ||
assignedBank == BANK_CONFLICT_FIRST_HALF_ODD)
? (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_SECOND_HALF_EVEN)
: (assignedBank == BANK_CONFLICT_SECOND_HALF_EVEN
? BANK_CONFLICT_FIRST_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN);
// Adjust according to the offset
if (oneGRFBank) {
if (offset % 2) {
if (tgtBank == BANK_CONFLICT_SECOND_HALF_EVEN ||
tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN) {
tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_FIRST_HALF_ODD
: BANK_CONFLICT_SECOND_HALF_ODD;
} else {
tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_ODD)
? BANK_CONFLICT_FIRST_HALF_EVEN
: BANK_CONFLICT_SECOND_HALF_EVEN;
}
}
} else {
if (offset % 4 >= 2) {
if (tgtBank == BANK_CONFLICT_SECOND_HALF_EVEN ||
tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN) {
tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_FIRST_HALF_ODD
: BANK_CONFLICT_SECOND_HALF_ODD;
} else {
tgtBank = (tgtBank == BANK_CONFLICT_FIRST_HALF_ODD)
? BANK_CONFLICT_FIRST_HALF_EVEN
: BANK_CONFLICT_SECOND_HALF_EVEN;
}
}
}
return tgtBank;
}
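// Order the three source indices for bank assignment: src1 and src2 are
// ordered by reference count (the more referenced one first); src0 is always
// considered last.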
void refNumBasedSort(const unsigned *refNum, unsigned *index) {
if (refNum[2] > refNum[1]) {
index[0] = 2;
index[1] = 1;
} else {
index[0] = 1;
index[1] = 2;
}
index[2] = 0;
return;
}
bool BankConflictPass::hasInternalConflict3Srcs(BankConflict *srcBC) {
if (((srcBC[0] == BANK_CONFLICT_SECOND_HALF_EVEN ||
srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
(srcBC[1] == BANK_CONFLICT_SECOND_HALF_EVEN ||
srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
(srcBC[2] == BANK_CONFLICT_SECOND_HALF_EVEN ||
srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)) ||
((srcBC[0] == BANK_CONFLICT_SECOND_HALF_ODD ||
srcBC[0] == BANK_CONFLICT_FIRST_HALF_ODD) &&
(srcBC[1] == BANK_CONFLICT_SECOND_HALF_ODD ||
srcBC[1] == BANK_CONFLICT_FIRST_HALF_ODD) &&
(srcBC[2] == BANK_CONFLICT_SECOND_HALF_ODD ||
srcBC[2] == BANK_CONFLICT_FIRST_HALF_ODD))) {
return true;
}
if ((srcBC[0] < BANK_CONFLICT_SECOND_HALF_EVEN &&
srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN &&
srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN) ||
(srcBC[0] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
srcBC[1] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
srcBC[2] >= BANK_CONFLICT_SECOND_HALF_EVEN)) {
return true;
}
return false;
}
void BankConflictPass::setupEvenOddBankConflictsForDecls(
G4_Declare *dcl_1, G4_Declare *dcl_2, unsigned offset1, unsigned offset2,
BankConflict &srcBC1, BankConflict &srcBC2) {
vISA_ASSERT(srcBC1 == BANK_CONFLICT_NONE, "Wrong Bank initial value");
vISA_ASSERT(srcBC2 == BANK_CONFLICT_NONE, "Wrong Bank initial value");
unsigned refNum1 = gra.getNumRefs(dcl_1);
unsigned refNum2 = gra.getNumRefs(dcl_2);
BankConflict bank1 = BANK_CONFLICT_NONE;
BankConflict bank2 = BANK_CONFLICT_NONE;
bank1 = (refNum1 >= refNum2) ? BANK_CONFLICT_FIRST_HALF_EVEN
: BANK_CONFLICT_SECOND_HALF_ODD;
bank2 = (bank1 == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
srcBC1 = bank1;
srcBC2 = bank2;
// Adjust only for the single bank allocation
if ((offset1 + offset2) % 2) {
if (refNum1 >= refNum2) {
bank2 = (bank2 == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
} else {
bank1 = (bank1 == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
}
gra.setBankConflict(dcl_1, bank1);
gra.setBankConflict(dcl_2, bank2);
return;
}
//
// inst opcode is G4_mad. This function sets up a simple state machine to
// prevent conflicts between src1 and src2 of a mad inst. The GRF file is
// divided into banks as follows:
//   bank-block A = 0, 2, 4, 6, ..., 62
//   bank-block B = 1, 3, 5, 7, ..., 63
//   bank-block C = 64, 66, 68, ..., 126
//   bank-block D = 65, 67, 69, ..., 127
//
// For ternary ops, if src1 and src2 go to the same bank there will be an
// access collision. Unary and binary ops have no collision, no matter which
// registers they use, because the second and third src operands are read in
// the same clock cycle, which is different from the cycle in which src0 is
// read. This is true up to pre-SKL.
//
// Bank conflict heuristics:
// 1. Try to balance the used registers between the two banks for the
//    potentially conflicting registers.
// 2. The reference count is used to decide which register is assigned first.
// 3. When a conflict is detected, the bank can be updated according to the
//    reference count.
//
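// Illustrative example: for "mad dst, src0, src1, src2", if src1 and src2
// both land in even GRFs of the same half (e.g. both in bank-block A), their
// reads collide; the pass therefore biases src1 and src2 toward opposite
// even/odd bank-blocks.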
void BankConflictPass::setupBankConflictsOneGRFOld(G4_INST *inst,
int &bank1RegNum,
int &bank2RegNum,
float GRFRatio,
unsigned &internalConflict) {
BankConflict srcBC[3];
unsigned regNum[3];
unsigned refNum[3];
unsigned offset[3];
G4_Declare *dcls[3];
G4_Declare *opndDcls[3];
int bank_num = 0;
for (int i = 0; i < 3; i++) {
dcls[i] = nullptr;
opndDcls[i] = nullptr;
G4_Operand *src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion() || src->isAccReg()) {
// bank conflict not possible
return;
}
dcls[i] = GetTopDclFromRegRegion(src);
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
regNum[i] = dcls[i]->getNumRows();
refNum[i] = gra.getNumRefs(dcls[i]);
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
gra.kernel.numEltPerGRF<Type_UB>();
srcBC[i] = gra.getBankConflict(dcls[i]);
if (src->getBase()->asRegVar()->isPhyRegAssigned()) {
unsigned reg =
src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
if ((reg + offset[i]) < SECOND_HALF_BANK_START_GRF) {
srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_FIRST_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
} else {
srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_SECOND_HALF_EVEN;
}
if (reg < SECOND_HALF_BANK_START_GRF) {
bank1RegNum += regNum[i];
} else {
bank2RegNum += regNum[i];
}
gra.setBankConflict(dcls[i], srcBC[i]);
} else if (srcBC[i] != BANK_CONFLICT_NONE) {
if (offset[i] % 2) {
// Get operand's bank from declare's bank
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN ||
srcBC[i] == BANK_CONFLICT_FIRST_HALF_ODD) {
srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_FIRST_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
} else {
srcBC[i] = (srcBC[i] == BANK_CONFLICT_SECOND_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_SECOND_HALF_EVEN;
}
}
}
if (i > 0) {
bank_num += srcBC[i];
}
}
// In case src1 and src2 share the same declare, i.e. use the same register
if (bank_num == 0 && dcls[1] == dcls[2]) {
BankConflict bank1 = ((bank1RegNum * GRFRatio) > bank2RegNum)
? BANK_CONFLICT_SECOND_HALF_EVEN
: BANK_CONFLICT_FIRST_HALF_EVEN;
gra.setBankConflict(dcls[1], bank1);
srcBC[1] = bank1;
srcBC[2] = bank1;
bank_num += bank1 * 2;
if (bank1 < BANK_CONFLICT_SECOND_HALF_EVEN) {
bank1RegNum += regNum[1];
} else {
bank2RegNum += regNum[1];
}
}
// No bank assigned to src1 and src2:
// assign the two declares to different bundles/banks.
if (bank_num == 0) {
BankConflict bank1 = BANK_CONFLICT_NONE;
BankConflict bank2 = BANK_CONFLICT_NONE;
bool bank1First = false;
if (GRFRatio == 1.0) {
// For global RA: Try to reduce the size of bank 2
if ((float)refNum[1] / regNum[1] >= (float)refNum[2] / regNum[2]) {
bank1 = BANK_CONFLICT_SECOND_HALF_EVEN;
bank2 = BANK_CONFLICT_FIRST_HALF_ODD;
bank1First = true;
} else {
bank2 = BANK_CONFLICT_SECOND_HALF_EVEN;
bank1 = BANK_CONFLICT_FIRST_HALF_ODD;
}
} else {
// For local RA: Try to balance two banks
if (refNum[1] >= refNum[2]) {
bank1 = ((bank1RegNum * GRFRatio) > bank2RegNum)
? BANK_CONFLICT_SECOND_HALF_EVEN
: BANK_CONFLICT_FIRST_HALF_EVEN;
bank2 = (bank1 == BANK_CONFLICT_SECOND_HALF_EVEN)
? BANK_CONFLICT_FIRST_HALF_ODD
: BANK_CONFLICT_SECOND_HALF_ODD;
bank1First = true;
} else {
bank2 = (bank1RegNum * GRFRatio) > bank2RegNum
? BANK_CONFLICT_SECOND_HALF_EVEN
: BANK_CONFLICT_FIRST_HALF_EVEN;
bank1 = (bank2 == BANK_CONFLICT_SECOND_HALF_EVEN)
? BANK_CONFLICT_FIRST_HALF_ODD
: BANK_CONFLICT_SECOND_HALF_ODD;
}
}
// Adjust only for the single bank allocation
if ((offset[1] + offset[2]) % 2) {
if (bank1First) {
bank2 = (bank2 == BANK_CONFLICT_FIRST_HALF_ODD)
? BANK_CONFLICT_FIRST_HALF_EVEN
: BANK_CONFLICT_SECOND_HALF_EVEN;
} else {
bank1 = (bank1 == BANK_CONFLICT_SECOND_HALF_ODD)
? BANK_CONFLICT_SECOND_HALF_EVEN
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
}
if (bank1 >= BANK_CONFLICT_SECOND_HALF_EVEN) {
bank2RegNum += regNum[1];
bank1RegNum += regNum[2];
} else {
bank1RegNum += regNum[1];
bank2RegNum += regNum[2];
}
gra.setBankConflict(dcls[1], bank1);
gra.setBankConflict(dcls[2], bank2);
} else {
if (srcBC[1] == BANK_CONFLICT_NONE || srcBC[2] == BANK_CONFLICT_NONE) {
// One source operand has already been assigned a bank
if (srcBC[2] == BANK_CONFLICT_NONE) {
srcBC[2] =
setupBankAccordingToSiblingOperand(srcBC[1], offset[2], true);
gra.setBankConflict(dcls[2], srcBC[2]);
if (srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN)
bank1RegNum += regNum[2];
else
bank2RegNum += regNum[2];
} else {
srcBC[1] =
setupBankAccordingToSiblingOperand(srcBC[2], offset[1], true);
gra.setBankConflict(dcls[1], srcBC[1]);
if (srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN)
bank1RegNum += regNum[1];
else
bank2RegNum += regNum[1];
}
} else if (dcls[1] != dcls[2]) {
if (((srcBC[1] == BANK_CONFLICT_SECOND_HALF_EVEN ||
srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) &&
(srcBC[2] == BANK_CONFLICT_SECOND_HALF_EVEN ||
srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)) ||
((srcBC[1] == BANK_CONFLICT_SECOND_HALF_ODD ||
srcBC[1] == BANK_CONFLICT_FIRST_HALF_ODD) &&
(srcBC[2] == BANK_CONFLICT_SECOND_HALF_ODD ||
srcBC[2] == BANK_CONFLICT_FIRST_HALF_ODD))) {
internalConflict++;
}
if ((srcBC[1] < BANK_CONFLICT_SECOND_HALF_EVEN &&
srcBC[2] < BANK_CONFLICT_SECOND_HALF_EVEN) ||
(srcBC[1] >= BANK_CONFLICT_SECOND_HALF_EVEN &&
srcBC[2] >= BANK_CONFLICT_SECOND_HALF_EVEN)) {
internalConflict++;
}
}
}
}
void BankConflictPass::getBanks(G4_INST *inst, BankConflict *srcBC,
G4_Declare **dcls, G4_Declare **opndDcls,
unsigned *offset) {
for (int i = 0; i < 3; i++) {
dcls[i] = nullptr;
opndDcls[i] = nullptr;
srcBC[i] = BANK_CONFLICT_NONE;
G4_Operand *src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion() || src->isAccReg()) {
return;
}
dcls[i] = GetTopDclFromRegRegion(src);
if (!dcls[i]) {
continue;
}
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
gra.kernel.numEltPerGRF<Type_UB>();
srcBC[i] = gra.getBankConflict(dcls[i]);
if (src->getBase()->asRegVar()->isPhyRegAssigned()) {
unsigned reg =
src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
} else if (srcBC[i] != BANK_CONFLICT_NONE) {
if (offset[i] % 2) {
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
} else {
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
}
}
}
}
return;
}
void BankConflictPass::getPrevBanks(G4_INST *inst, BankConflict *srcBC,
G4_Declare **dcls, G4_Declare **opndDcls,
unsigned *offset) {
// We only care about ALU instructions, which have at most 3 sources.
int execSize[3];
for (int i = 1; i < 3; i++) {
dcls[i] = nullptr;
opndDcls[i] = nullptr;
srcBC[i] = BANK_CONFLICT_NONE;
G4_Operand *src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion()) {
return;
}
dcls[i] = GetTopDclFromRegRegion(src);
if (dcls[i]->getRegFile() != G4_GRF) {
return;
}
execSize[i] = src->getLinearizedEnd() - src->getLinearizedStart() + 1;
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
gra.kernel.numEltPerGRF<Type_UB>();
srcBC[i] = gra.getBankConflict(dcls[i]);
if (src->getBase()->asRegVar()->isPhyRegAssigned()) {
unsigned reg =
src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
} else if (srcBC[i] != BANK_CONFLICT_NONE) {
if (offset[i] % 2) {
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
} else {
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
}
}
}
if (execSize[i] > 32) {
srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
}
return;
}
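// Bias src0 of a 3-src instruction away from the bank shared by src1 and
// src2 of the previous instruction (when they share one). This is only used
// on platforms with an early GRF read of src0; see the hasEarlyGRFRead()
// checks in the callers.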
void BankConflictPass::setupBankForSrc0(G4_INST *inst, G4_INST *prevInst) {
BankConflict srcBC[3];
G4_Declare *dcls[3];
G4_Declare *opndDcls[3];
unsigned offset[3];
BankConflict prevSrcBC[3];
G4_Declare *prevDcls[3];
G4_Declare *prevOpndDcls[3];
unsigned prevOffset[3];
if (prevInst->isSend() || prevInst->isMath()) {
return;
}
getBanks(inst, srcBC, dcls, opndDcls, offset);
getPrevBanks(prevInst, prevSrcBC, prevDcls, prevOpndDcls, prevOffset);
if (dcls[0] != nullptr && srcBC[0] == BANK_CONFLICT_NONE &&
prevSrcBC[1] != BANK_CONFLICT_NONE &&
prevSrcBC[2] != BANK_CONFLICT_NONE) {
if (prevSrcBC[1] == prevSrcBC[2]) {
if (prevSrcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN) {
srcBC[0] = offset[0] % 2 ? BANK_CONFLICT_FIRST_HALF_EVEN
: BANK_CONFLICT_SECOND_HALF_ODD;
} else {
srcBC[0] = offset[0] % 2 ? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
gra.setBankConflict(dcls[0], srcBC[0]);
}
}
return;
}
void BankConflictPass::setupBankConflictsforTwoGRFs(G4_INST *inst) {
BankConflict srcBC[3];
unsigned refNum[3];
unsigned offset[3];
G4_Declare *dcls[3];
G4_Declare *opndDcls[3];
int bank_num = 0;
int execSize[3];
for (int i = 0; i < 3; i++) {
dcls[i] = nullptr;
opndDcls[i] = nullptr;
execSize[i] = 0;
G4_Operand *src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion() || src->isAccReg()) {
// bank conflict not possible
return;
}
execSize[i] = src->getLinearizedEnd() - src->getLinearizedStart() + 1;
dcls[i] = GetTopDclFromRegRegion(src);
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
refNum[i] = gra.getNumRefs(dcls[i]);
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
gra.kernel.numEltPerGRF<Type_UB>();
srcBC[i] = gra.getBankConflict(dcls[i]);
if (src->getBase()->asRegVar()->isPhyRegAssigned()) {
unsigned reg =
src->getBase()->asRegVar()->getPhyReg()->asGreg()->getRegNum();
srcBC[i] = ((reg + offset[i]) % 2) ? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
gra.setBankConflict(dcls[i], srcBC[i]);
} else if (srcBC[i] != BANK_CONFLICT_NONE) {
if (offset[i] % 2) {
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
} else {
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
}
}
}
if (i != 0) {
bank_num += srcBC[i];
}
}
int simd8SrcNum = 0;
for (int i = 0; i < 3; i++) {
if (execSize[i] <= 32) {
simd8SrcNum++;
}
}
// In case src0, src1 and src2 all use the same declare, i.e. the same register
if ((dcls[0] == dcls[1]) && (dcls[1] == dcls[2])) {
return;
}
// No bank assigned to the src operands:
// assign the two declares to different bundles/banks.
if (simd8SrcNum <= 1) // All simd16, do even align
{
for (int i = 0; i < 3; i++) {
if (execSize[i] > 32) {
srcBC[i] = offset[i] % 2 ? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
gra.setBankConflict(dcls[i], srcBC[i]);
}
}
} else if (bank_num == 0) {
unsigned index[3];
refNumBasedSort(refNum, index);
if (dcls[index[0]] != dcls[index[1]]) {
setupEvenOddBankConflictsForDecls(dcls[index[0]], dcls[index[1]],
offset[index[0]], offset[index[1]],
srcBC[index[0]], srcBC[index[1]]);
}
} else {
if (srcBC[1] != BANK_CONFLICT_NONE) {
srcBC[2] = (srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
if (offset[2] % 2) {
srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
gra.setBankConflict(dcls[2], srcBC[2]);
} else {
srcBC[1] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
if (offset[1] % 2) {
srcBC[1] = (srcBC[1] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
gra.setBankConflict(dcls[1], srcBC[1]);
}
}
}
bool BankConflictPass::isOddOffset(unsigned offset) const {
if (gra.kernel.fg.builder->oneGRFBankDivision()) {
return (offset % 2);
} else {
return ((offset % 4) / 2);
}
}
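// Example: with two-GRF bank division, (offset % 4) / 2 maps GRF offsets
// 0 and 1 (mod 4) to the "even" bundle and offsets 2 and 3 to the "odd"
// bundle.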
void BankConflictPass::setupBankConflictsforDPAS(G4_INST *inst) {
BankConflict srcBC[3];
unsigned refNum[3];
unsigned offset[3];
G4_Declare *dcls[3];
G4_Declare *opndDcls[3];
int bank_num = 0;
if (!inst->isDpas()) {
return;
}
for (int i = 0; i < 3; i += 1) {
opndDcls[i] = nullptr;
G4_Operand *src = inst->getSrc(i);
dcls[i] = GetTopDclFromRegRegion(src);
if (dcls[i]) {
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
refNum[i] = gra.getNumRefs(dcls[i]);
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
gra.kernel.numEltPerGRF<Type_UB>();
srcBC[i] = gra.getBankConflict(dcls[i]);
if (srcBC[i] != BANK_CONFLICT_NONE) {
if (isOddOffset(offset[i])) {
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
} else {
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
}
}
if (i != 1) {
bank_num++;
}
}
}
}
if (dcls[0] && dcls[1]) {
gra.addBundleConflictDcl(dcls[0], dcls[1], offset[0] - offset[1]);
gra.addBundleConflictDcl(dcls[1], dcls[0], offset[1] - offset[0]);
}
if (dcls[1] && dcls[2]) {
gra.addBundleConflictDcl(dcls[2], dcls[1], offset[2] - offset[1]);
gra.addBundleConflictDcl(dcls[1], dcls[2], offset[1] - offset[2]);
}
// In case src0 or src2 is null, or they use the same declare, i.e. the same register
if (dcls[0] == dcls[2] || !dcls[0] || !dcls[2]) {
return;
}
if (bank_num == 0) {
srcBC[0] = refNum[0] > refNum[2] ? BANK_CONFLICT_FIRST_HALF_EVEN
: BANK_CONFLICT_SECOND_HALF_ODD;
srcBC[2] = refNum[0] > refNum[2] ? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
if (isOddOffset(offset[0])) {
srcBC[0] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
if (isOddOffset(offset[2])) {
srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
gra.setBankConflict(dcls[0], srcBC[0]);
gra.setBankConflict(dcls[2], srcBC[2]);
} else if (bank_num == 1) {
if (srcBC[0] != BANK_CONFLICT_NONE) {
srcBC[2] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
if (isOddOffset(offset[2])) {
srcBC[2] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
gra.setBankConflict(dcls[2], srcBC[2]);
} else {
srcBC[0] = (srcBC[2] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
if (offset[0] % 2) {
srcBC[0] = (srcBC[0] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
gra.setBankConflict(dcls[0], srcBC[0]);
}
}
#ifdef DEBUG_VERBOSE_ON
for (int i = 0; i < 3; i += 2) {
if (opndDcls[i]) {
printf("%s, ", opndDcls[i]->getName());
if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_FIRST_HALF_EVEN) {
printf("%s\n", "EVEN");
} else if (gra.getBankConflict(dcls[i]) ==
BANK_CONFLICT_SECOND_HALF_ODD) {
printf("%s\n", "ODD");
} else {
printf("%s\n", "NONE");
}
}
}
#endif
return;
}
void BankConflictPass::setupBundleConflictsforTwoSrcsInst(G4_INST *inst) {
vISA_ASSERT(inst->getNumSrc() == 2, "Only two-source instructions are supported");
G4_Declare *dcls[2];
G4_Declare *opndDcls[2];
unsigned offset[2];
for (int i = 0; i < 2; i += 1) {
dcls[i] = nullptr;
opndDcls[i] = nullptr;
G4_Operand *src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion() || src->isAreg()) {
// bank conflict not possible
continue;
}
dcls[i] = GetTopDclFromRegRegion(src);
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
gra.kernel.numEltPerGRF<Type_UB>();
}
// Add potential bundle conflicts
if (dcls[0] && dcls[1]) {
gra.addBundleConflictDcl(dcls[0], dcls[1], offset[0] - offset[1]);
gra.addBundleConflictDcl(dcls[1], dcls[0], offset[1] - offset[0]);
}
return;
}
void BankConflictPass::setupBankConflictsforMad(G4_INST *inst) {
BankConflict srcBC[3];
unsigned offset[3];
G4_Declare *dcls[3];
G4_Declare *opndDcls[3];
BankConflict assignedBank = BANK_CONFLICT_NONE; // Flip for next
for (int i = 0; i < 3; i += 1) {
dcls[i] = nullptr;
opndDcls[i] = nullptr;
G4_Operand *src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion() || src->isAreg()) {
// bank conflict not possible
continue;
}
dcls[i] = GetTopDclFromRegRegion(src);
opndDcls[i] = src->getBase()->asRegVar()->getDeclare();
offset[i] = (opndDcls[i]->getOffsetFromBase() + src->getLeftBound()) /
gra.kernel.numEltPerGRF<Type_UB>();
srcBC[i] = gra.getBankConflict(dcls[i]);
if (srcBC[i] != BANK_CONFLICT_NONE) {
if (isOddOffset(offset[i])) {
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
} else {
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
}
}
if (assignedBank != BANK_CONFLICT_SECOND_HALF_EVEN) {
if (assignedBank == BANK_CONFLICT_NONE) {
assignedBank = srcBC[i];
} else if (assignedBank != srcBC[i]) {
assignedBank =
BANK_CONFLICT_SECOND_HALF_EVEN; // BANK_CONFLICT_SECOND_HALF_EVEN
// is used to represent all banks
// are assigned
}
}
}
}
// Add potential bundle conflicts so that RA can handle them when option
// -enableBundleCR is set to 2 or 3
if (gra.kernel.getuInt32Option(vISA_enableBundleCR) & 2) {
if (dcls[0] && dcls[1]) {
gra.addBundleConflictDcl(dcls[0], dcls[1], offset[0] - offset[1]);
gra.addBundleConflictDcl(dcls[1], dcls[0], offset[1] - offset[0]);
}
if (dcls[1] && dcls[2]) {
gra.addBundleConflictDcl(dcls[2], dcls[1], offset[2] - offset[1]);
gra.addBundleConflictDcl(dcls[1], dcls[2], offset[1] - offset[2]);
}
}
for (int k = 0; k < 2; k++) {
for (int i = 2; i != -1; i--) {
if (!dcls[i]) {
continue;
}
LocalLiveRange *lr = gra.getLocalLR(dcls[i]);
if (!lr || (k == 0 && !lr->isLiveRangeLocal())) {
continue;
}
if (k == 1 && lr->isLiveRangeLocal()) {
continue;
}
if (assignedBank == BANK_CONFLICT_SECOND_HALF_EVEN) {
continue;
}
srcBC[i] = gra.getBankConflict(dcls[i]);
if (srcBC[i] != BANK_CONFLICT_NONE) {
if (isOddOffset(offset[i])) {
if (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN) {
srcBC[i] = BANK_CONFLICT_SECOND_HALF_ODD;
} else {
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
}
}
if (assignedBank == BANK_CONFLICT_NONE) {
assignedBank = srcBC[i];
} else if (srcBC[i] != assignedBank) {
assignedBank = BANK_CONFLICT_SECOND_HALF_EVEN;
}
continue;
}
if (assignedBank == BANK_CONFLICT_NONE) {
srcBC[i] = BANK_CONFLICT_FIRST_HALF_EVEN;
assignedBank = srcBC[i];
if (isOddOffset(offset[i])) {
srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
gra.setBankConflict(dcls[i], srcBC[i]);
} else {
srcBC[i] = (assignedBank == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
if (isOddOffset(offset[i])) {
srcBC[i] = (srcBC[i] == BANK_CONFLICT_FIRST_HALF_EVEN)
? BANK_CONFLICT_SECOND_HALF_ODD
: BANK_CONFLICT_FIRST_HALF_EVEN;
}
gra.setBankConflict(dcls[i], srcBC[i]);
assignedBank = BANK_CONFLICT_SECOND_HALF_EVEN;
}
}
}
#ifdef DEBUG_VERBOSE_ON
printf("$%d:\n", inst->getVISAId());
for (int i = 0; i < 3; i++) {
if (dcls[i]) {
printf("%s, ", dcls[i]->getName());
if (gra.getBankConflict(dcls[i]) == BANK_CONFLICT_FIRST_HALF_EVEN) {
printf("%s\n", "EVEN");
} else if (gra.getBankConflict(dcls[i]) ==
BANK_CONFLICT_SECOND_HALF_ODD) {
printf("%s\n", "ODD");
} else {
printf("%s\n", "NONE");
}
}
}
printf("\n");
#endif
return;
}
void BankConflictPass::setupBankConflictsForBB(G4_BB *bb,
unsigned &threeSourceInstNum,
unsigned &sendInstNum,
unsigned numRegLRA,
unsigned &internalConflict) {
int bank1RegNum = 0;
int bank2RegNum = 0;
float GRFRatio = 0;
G4_INST *prevInst = nullptr;
if (numRegLRA) {
GRFRatio = ((float)(numRegLRA - SECOND_HALF_BANK_START_GRF)) /
SECOND_HALF_BANK_START_GRF;
}
for (auto i = bb->rbegin(), rend = bb->rend(); i != rend; i++) {
G4_INST *inst = (*i);
if (inst->getNumSrc() == 3 && !inst->isSend()) {
threeSourceInstNum++;
setupBankConflictsOneGRFOld(inst, bank1RegNum, bank2RegNum, GRFRatio,
internalConflict);
}
if (inst->isSend() && !inst->isEOT()) {
// Why does only a data port read cause an issue?
if (inst->getMsgDesc()->isRead()) {
sendInstNum++;
}
}
}
if ((float)threeSourceInstNum / bb->size() > 0.1) {
if (!gra.kernel.fg.builder->lowHighBundle() &&
gra.kernel.fg.builder->hasEarlyGRFRead()) {
for (G4_INST *inst : *bb) {
if (prevInst && inst->getNumSrc() == 3 && !inst->isSend()) {
setupBankForSrc0(inst, prevInst);
}
prevInst = inst;
}
}
}
}
void BankConflictPass::setupBankConflictsForBBTGL(G4_BB *bb,
unsigned &threeSourceInstNum,
unsigned &sendInstNum,
unsigned numRegLRA,
unsigned &internalConflict) {
G4_INST *prevInst = nullptr;
for (auto i = bb->rbegin(), rend = bb->rend(); i != rend; i++) {
G4_INST *inst = (*i);
if (inst->isSend() || inst->isCFInst() || inst->isLabel() ||
inst->isOptBarrier()) {
if (inst->isSend() && !inst->isEOT()) {
// Why does only a data port read cause an issue?
if (inst->getMsgDesc()->isRead()) {
sendInstNum++;
}
}
continue;
}
if (inst->getNumSrc() >= 3) {
threeSourceInstNum++;
if (inst->isDpas()) {
threeSourceInstNum += 8;
hasDpasInst = true;
setupBankConflictsforDPAS(inst);
} else {
setupBankConflictsforMad(inst);
}
} else if (!forGlobal &&
inst->getNumSrc() == 2) {
if (gra.forceBCR) {
threeSourceInstNum++;
setupBankConflictsforMad(inst);
}
if (gra.twoSrcBundleBCR) {
threeSourceInstNum++;
setupBundleConflictsforTwoSrcsInst(inst);
}
}
}
if ((float)threeSourceInstNum / bb->size() > 0.1) {
if (!gra.kernel.fg.builder->lowHighBundle() &&
gra.kernel.fg.builder->hasEarlyGRFRead()) {
for (G4_INST *inst : *bb) {
if (prevInst && inst->getNumSrc() == 3 && !inst->isSend()) {
setupBankForSrc0(inst, prevInst);
}
prevInst = inst;
}
}
}
}
// Used for sorting BBs according to loop nest level and BB size.
bool compareBBLoopLevel(G4_BB *bb1, G4_BB *bb2) {
if (bb1->getNestLevel() > bb2->getNestLevel()) {
return true;
} else if (bb1->getNestLevel() == bb2->getNestLevel()) {
return bb1->size() > bb2->size();
}
return false;
}
/*
 * Output:
 *   threeSourceCandidate - set if there are enough three-source instructions.
 *   return value - whether to do bank conflict reduction in round-robin RA.
 */
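// Illustrative numbers: with BANK_CONFLICT_HEURISTIC_INST = 0.04, a kernel
// whose loop-weighted three-source instructions make up less than 4% of the
// weighted instruction count skips bank conflict reduction entirely.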
bool BankConflictPass::setupBankConflictsForKernel(bool doLocalRR,
bool &threeSourceCandidate,
unsigned numRegLRA,
bool &highInternalConflict) {
unsigned threeSourceInstNumInKernel = 0;
unsigned internalConflict = 0;
unsigned instNumInKernel = 0;
unsigned sendInstNumInKernel = 0;
std::vector<G4_BB *> orderedBBs(gra.kernel.fg.cbegin(), gra.kernel.fg.cend());
std::sort(orderedBBs.begin(), orderedBBs.end(), compareBBLoopLevel);
for (auto bb : orderedBBs) {
unsigned instNum = 0;
unsigned sendInstNum = 0;
unsigned threeSourceInstNum = 0;
unsigned conflicts = 0;
unsigned loopNestLevel = 0;
if (gra.kernel.fg.builder->lowHighBundle()) {
setupBankConflictsForBB(bb, threeSourceInstNum, sendInstNum, numRegLRA,
conflicts);
} else {
setupBankConflictsForBBTGL(bb, threeSourceInstNum, sendInstNum, numRegLRA,
conflicts);
}
loopNestLevel = bb->getNestLevel() + 1;
if (threeSourceInstNum) {
instNum = (uint32_t)bb->size() * loopNestLevel *
BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
threeSourceInstNum = threeSourceInstNum * loopNestLevel *
BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
sendInstNum =
sendInstNum * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
conflicts =
conflicts * loopNestLevel * BANK_CONFLICT_HEURISTIC_LOOP_ITERATION;
internalConflict += conflicts;
threeSourceInstNumInKernel += threeSourceInstNum;
instNumInKernel += instNum;
sendInstNumInKernel += sendInstNum;
}
}
if (!threeSourceInstNumInKernel ||
(float)threeSourceInstNumInKernel / instNumInKernel <
BANK_CONFLICT_HEURISTIC_INST) {
return false;
}
highInternalConflict =
((float)internalConflict / threeSourceInstNumInKernel) >
INTERNAL_CONFLICT_RATIO_HEURISTIC;
// Bank conflict reduction is done only when there are enough three-source
// instructions.
threeSourceCandidate = true;
if (doLocalRR && sendInstNumInKernel) {
if (!hasDpasInst && (sendInstNumInKernel > threeSourceInstNumInKernel)) {
return false;
}
}
return true;
}
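// Return true if dcl's augmentation mask is NonDefault, or if dcl has a
// recorded def mask and every byte of it equals NOMASK_BYTE (i.e. all of its
// defs were written under NoMask).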
bool GlobalRA::areAllDefsNoMask(G4_Declare *dcl) {
bool retval = true;
auto &maskUsed = getMask(dcl);
if (maskUsed.size() > 0 &&
getAugmentationMask(dcl) != AugmentationMasks::NonDefault) {
auto byteSize = dcl->getByteSize();
for (unsigned i = 0; i < byteSize; i++) {
if (maskUsed[i] != NOMASK_BYTE) {
retval = false;
break;
}
}
} else {
if (getAugmentationMask(dcl) == AugmentationMasks::NonDefault)
retval = true;
else
retval = false;
}
return retval;
}
BankAlign GlobalRA::getBankAlign(const G4_Declare *dcl) const {
const IR_Builder *builder = kernel.fg.builder;
switch (getBankConflict(dcl)) {
case BANK_CONFLICT_FIRST_HALF_EVEN:
case BANK_CONFLICT_SECOND_HALF_EVEN:
return builder->oneGRFBankDivision() ? BankAlign::Even
: BankAlign::Even2GRF;
case BANK_CONFLICT_FIRST_HALF_ODD:
case BANK_CONFLICT_SECOND_HALF_ODD:
return builder->oneGRFBankDivision() ? BankAlign::Odd : BankAlign::Odd2GRF;
default:
return BankAlign::Either;
}
}
void GlobalRA::emitFGWithLiveness(const LivenessAnalysis &liveAnalysis) const {
VISA_DEBUG_VERBOSE({
for (G4_BB *bb : kernel.fg) {
std::cout << "\n"
<< "-------------------------------------------------------"
"----------";
std::cout << "\nBB" << bb->getId() << ":";
std::cout << "\nPreds: ";
for (const G4_BB *pred : bb->Preds)
std::cout << "BB" << pred->getId() << ", ";
std::cout << "\nSuccs: ";
for (const G4_BB *succ : bb->Succs)
std::cout << "BB" << succ->getId() << ", ";
if (localRAEnable) {
if (auto summary = getBBLRASummary(bb)) {
std::cout << "\nLocal RA: ";
for (unsigned i = 0; i < kernel.getNumRegTotal(); i++) {
if (summary->isGRFBusy(i))
std::cout << "r" << i << ", ";
}
}
}
std::cout << "\nGen: ";
for (const G4_Declare *dcl : kernel.Declares) {
if (dcl->getAliasDeclare())
continue;
if (dcl->getRegVar()->isRegAllocPartaker()) {
if (liveAnalysis.use_gen[bb->getId()].test(
dcl->getRegVar()->getId())) {
std::cout << dcl->getName() << ", ";
}
}
}
std::cout << "\nKill: ";
for (const G4_Declare *dcl : kernel.Declares) {
if (dcl->getAliasDeclare())
continue;
if (dcl->getRegVar()->isRegAllocPartaker()) {
if (liveAnalysis.use_kill[bb->getId()].test(
dcl->getRegVar()->getId())) {
std::cout << dcl->getName() << ", ";
}
}
}
std::cout << "\nLive-in: ";
for (const G4_Declare *dcl : kernel.Declares) {
if (dcl->getAliasDeclare())
continue;
if (dcl->getRegVar()->isRegAllocPartaker()) {
if (liveAnalysis.isLiveAtEntry(bb, dcl->getRegVar()->getId())) {
std::cout << dcl->getName() << ", ";
}
}
}
std::cout << "\nLive-out: ";
for (const G4_Declare *dcl : kernel.Declares) {
if (dcl->getAliasDeclare())
continue;
if (dcl->getRegVar()->isRegAllocPartaker()) {
if (liveAnalysis.isLiveAtExit(bb, dcl->getRegVar()->getId())) {
std::cout << dcl->getName() << ", ";
}
}
}
std::cout << "\n";
bb->emit(COUT_ERROR);
}
});
}
void GlobalRA::reportSpillInfo(const LivenessAnalysis &liveness,
const GraphColor &coloring) const {
// Emit the interference graph of each spill candidate,
// and if a spill candidate is a local range, emit its
// start and end line numbers in the file.
const auto& lrs = coloring.getLiveRanges();
for (const vISA::LiveRange *slr : coloring.getSpilledLiveRanges()) {
if (slr->getRegKind() == G4_GRF) {
const G4_RegVar *spillVar = slr->getVar();
VISA_DEBUG_VERBOSE({
std::cout << "Spill candidate " << spillVar->getName() << " intf:";
std::cout << "\t(" << spillVar->getDeclare()->getTotalElems()
<< "):" << TypeSymbol(spillVar->getDeclare()->getElemType())
<< "\n";
});
if (getLocalLR(spillVar->getDeclare())) {
if (getLocalLR(spillVar->getDeclare())->isLiveRangeLocal()) {
[[maybe_unused]] int start, end;
unsigned dummy;
start = getLocalLR(spillVar->getDeclare())
->getFirstRef(dummy)
->getLineNo();
end = getLocalLR(spillVar->getDeclare())
->getLastRef(dummy)
->getLineNo();
VISA_DEBUG_VERBOSE(std::cout
<< "(Liverange is local starting at line #"
<< start << " and ending at line #" << end << ")"
<< "\n");
}
}
const Interference *intf = coloring.getIntf();
unsigned spillVarId = slr->getVar()->getId();
for (int i = 0; i < (int)liveness.getNumSelectedVar(); i++) {
if (intf->interfereBetween(spillVarId, i)) {
const G4_RegVar *intfRangeVar = lrs[i]->getVar();
(void)intfRangeVar;
VISA_DEBUG_VERBOSE(
std::cout << "\t" << intfRangeVar->getName() << "("
<< intfRangeVar->getDeclare()->getTotalElems() << "):"
<< TypeSymbol(
intfRangeVar->getDeclare()->getElemType()));
if (!lrs[i]->getPhyReg()) {
VISA_DEBUG_VERBOSE(std::cout << " --- spilled");
}
VISA_DEBUG_VERBOSE(std::cout << ",\n");
}
}
VISA_DEBUG_VERBOSE(std::cout << "\n\n");
}
}
}
LiveRange::LiveRange(G4_RegVar *v, GlobalRA &g)
: var(v), dcl(v->getDeclare()), regKind(dcl->getRegFile()), gra(g),
numRegNeeded(dcl->getNumRegNeeded()) {
isCandidate = true;
}
void LiveRange::initializeForbidden() {
auto rf = gra.incRA.getSelectedRF();
if (LivenessAnalysis::livenessClass(rf, G4_ADDRESS)) {
setForbidden(forbiddenKind::FBD_ADDR);
} else if (LivenessAnalysis::livenessClass(rf, G4_FLAG)) {
setForbidden(forbiddenKind::FBD_FLAG);
} else if (LivenessAnalysis::livenessClass(rf, G4_SCALAR)) {
setForbidden(forbiddenKind::FBD_SCALAR);
} else {
setForbidden(forbiddenKind::FBD_RESERVEDGRF);
}
bool hasStackCall =
gra.kernel.fg.getHasStackCalls() || gra.kernel.fg.getIsStackCallFunc();
setCallerSaveBias(hasStackCall);
if (getRegKind() == G4_GRF) {
if (gra.kernel.fg.isPseudoVCADcl(dcl)) {
setForbidden(forbiddenKind::FBD_CALLERSAVE);
} else if (gra.kernel.fg.isPseudoVCEDcl(dcl)) {
setForbidden(forbiddenKind::FBD_CALLEESAVE);
} else if (dcl == gra.getOldFPDcl()) {
setForbidden(forbiddenKind::FBD_CALLERSAVE);
}
}
}
void LiveRange::initialize() {
if (gra.kernel.fg.isPseudoDcl(dcl)) {
setIsPseudoNode();
}
if (dcl->getIsPartialDcl()) {
if (G4_Declare *parentDcl = gra.getSplittedDeclare(dcl)) {
setParentLRID(parentDcl->getRegVar()->getId());
setIsPartialDcl();
}
}
if (dcl->getIsSplittedDcl()) {
setIsSplittedDcl(true);
}
setBC(gra.getBankConflict(dcl));
initializeForbidden();
}
LiveRange *LiveRange::createNewLiveRange(G4_Declare *dcl, GlobalRA &gra) {
auto &IncRAMem = gra.incRA.mem;
G4_RegVar *var = dcl->getRegVar();
vISA_ASSERT(!dcl->getAliasDeclare(),
"error: attempt to create LiveRange for non-root dcl");
auto *lr = new (IncRAMem) LiveRange(var, gra);
lr->initialize();
return lr;
}
void LiveRange::checkForInfiniteSpillCost(
G4_BB *bb, std::list<G4_INST *>::reverse_iterator &it) {
// The G4_INST at *it defines this live range object (the this pointer).
// If the next instruction after the iterator uses the same live range, it
// may be a potential infinite-spill-cost candidate.
// To confirm, the following requirements must be fulfilled:
// a. this live range is not a global
// b. this live range is defined/used in these 2 instructions only
//
// The idea is that for ranges marked with infinite spill cost, coloring
// will attempt to put them on top of the stack so they have a higher chance
// of getting a color. If a range that should have infinite spill cost is not
// marked as such, the only downside is extra compile time spent inserting
// spill code and then punting out when the spilled code causes even more
// spills.
//
// The assumption is that the current live range is a current register
// allocation candidate.
//
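// Illustrative pattern (back-to-back def-use of a block-local range):
//   mov (8) TMP(0,0)<1>:d  V1(0,0)<1;1,0>:d    <-- only def of TMP
//   add (8) V2(0,0)<1>:d   TMP(0,0)<1;1,0>:d  V3(0,0)<1;1,0>:d  <-- only use
// A spill of TMP would be reloaded by the very next instruction, so TMP is
// marked as having infinite spill cost.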
G4_INST *curInst = (*it);
// Skip the check if curInst is a pseudoKill
// Otherwise, it may invalidate a previously marked infinite
// spill cost candidate, e.g.,
// pseudo_kill (1) P1(0,0)[1]:uw [Align1]
// mov (1) P1(0,0)[1]:uw TV1(8,0)[0;1,0]:uw [Align1, NoMask]
// (+P1.0) sel (16) V65(0,0)[1]:f TV0(0,0)[0;1,0]:f 0:f [Align1, H1]
if (curInst->isPseudoKill()) {
return;
}
// Check whether dst variable is a global
if (gra.isBlockLocal(this->getDcl()) == false) {
isCandidate = false;
isInfiniteCost = false;
return;
}
G4_DstRegRegion *dst = curInst->getDst();
// If cur instruction dst is indirect write then return
if (dst && dst->getRegAccess() == IndirGRF &&
dst->getBase()->asRegVar()->getId() == this->getVar()->getId()) {
return;
}
// isCandidate is set to true only for the first definition ever seen.
// If more than one def is found, this gets set to false.
const std::list<G4_INST *>::reverse_iterator rbegin = bb->rbegin();
if (this->isCandidate == true && it != rbegin) {
G4_INST *nextInst = NULL;
if (this->getRefCount() != 2 || (this->getRegKind() == G4_GRF &&
this->getDcl()->getAddressed() == true)) {
// If a liverange has > 2 refs then it
// cannot be a candidate.
// Also an address taken GRF is not a candidate.
// This represents an early exit.
isCandidate = false;
isInfiniteCost = false;
return;
}
// Skip all pseudo kills
std::list<G4_INST *>::reverse_iterator next = it;
while (true) {
if (next == rbegin) {
isCandidate = isInfiniteCost = false;
return;
}
--next;
// If this is not a pseudo-kill instruction, we have found the desired
// next instruction; otherwise, continue scanning.
nextInst = *next;
if (!(nextInst->isPseudoKill()))
break;
}
// Check whether this liverange is used in nextInst
for (unsigned i = 0, numSrc = nextInst->getNumSrc(); i < numSrc; i++) {
G4_Operand *src = nextInst->getSrc(i);
if (src && src->isSrcRegRegion() &&
src->getBase()->isRegAllocPartaker()) {
// src can be Direct/Indirect
G4_SrcRegRegion *srcRgn = src->asSrcRegRegion();
if (srcRgn->getRegAccess() == Direct && srcRgn->getBase()->isRegVar() &&
srcRgn->getBase()->asRegVar()->getId() == this->getVar()->getId()) {
// Def-use found back-to-back
isInfiniteCost = true;
// Identify no more candidates
isCandidate = false;
} else if (this->getRegKind() == G4_ADDRESS &&
srcRgn->getRegAccess() == IndirGRF &&
srcRgn->getBase()->isRegVar() &&
srcRgn->getBase()->asRegVar()->getId() ==
this->getVar()->getId()) {
// Def-use found back-to-back
isInfiniteCost = true;
// Identify no more candidates
isCandidate = false;
}
}
}
G4_DstRegRegion *nextDst = nextInst->getDst();
if (isCandidate == true && this->getRegKind() == G4_ADDRESS && nextDst &&
nextDst->getRegAccess() == IndirGRF && nextDst->getBase()->isRegVar() &&
nextDst->getBase()->asRegVar()->isRegAllocPartaker() &&
nextDst->getBase()->asRegVar()->getId() == this->getVar()->getId()) {
// Pattern found:
// A0=
// r[A0]=
isInfiniteCost = true;
// Identify no more candidates
isCandidate = false;
}
if (isCandidate == true && this->getRegKind() == G4_FLAG &&
nextInst->getPredicate() && nextInst->getPredicate()->getBase() &&
nextInst->getPredicate()->getBase()->isRegVar() &&
nextInst->getPredicate()->getBase()->asRegVar()->isRegAllocPartaker() &&
nextInst->getPredicate()->getBase()->asRegVar()->getId() ==
this->getVar()->getId()) {
// Pattern found:
// P0 = or cmp.P0 = <-- P0 defined
// (P0) ... <-- P0 used as predicate
isInfiniteCost = true;
// Identify no more candidates
isCandidate = false;
}
VISA_DEBUG_VERBOSE({
if (isInfiniteCost == true) {
std::cout
<< "Marking " << this->getDcl()->getName()
<< " as having infinite spill cost due to back-to-back def-use"
<< "\n";
}
});
// Once a def is seen, stop looking for more defs
isCandidate = false;
} else {
VISA_DEBUG_VERBOSE({
if (isInfiniteCost == true) {
std::cout << "Unmarking " << this->getDcl()->getName()
<< " as having infinite spill cost"
<< "\n";
}
});
isCandidate = false;
isInfiniteCost = false;
}
}
//
// return true, if live ranges v1 and v2 interfere
//
bool Interference::interfereBetween(unsigned v1, unsigned v2) const {
if (v1 > v2) {
std::swap(v1, v2);
}
if (useDenseMatrix()) {
unsigned col = v2 / BITS_DWORD;
return matrix[v1 * rowSize + col] & (1 << (v2 % BITS_DWORD));
} else {
auto &set1 = sparseMatrix[v1];
return set1.test(v2);
}
}
//
// init live vector with all live ranges that are live at the exit
// also set the next seq use of any live range that is live across to be INT_MAX
// to indicate that this live range does not have exclusive sequential uses and
// hence is not a candidate for being marked with an infinite spill cost.
//
void Interference::buildInterferenceAtBBExit(const G4_BB *bb,
SparseBitVector &live) {
// live must be empty at this point
live = liveAnalysis->use_out[bb->getId()];
live &= liveAnalysis->def_out[bb->getId()];
}
//
// Filter out partial or splitted declares in batch interference.
//
inline void Interference::filterSplitDclares(unsigned startIdx, unsigned endIdx,
unsigned n, unsigned col,
unsigned &elt, bool is_partial) {
if (is_partial) // Don't add interference with the parent
{
unsigned rowSplited = n / BITS_DWORD;
if (rowSplited == col) {
elt &= ~(1 << (n % BITS_DWORD));
}
}
// If current is a splitted dcl, don't add interference with any of its child
// nodes. If current is a partial dcl, don't add interference with any other
// child nodes.
if (col >= startIdx / BITS_DWORD && col < (endIdx / BITS_DWORD + 1)) {
unsigned selt = 0;
unsigned start_id = col * BITS_DWORD > startIdx ? 0 : startIdx % BITS_DWORD;
unsigned end_id =
(col + 1) * BITS_DWORD > endIdx ? endIdx % BITS_DWORD : BITS_DWORD;
for (unsigned i = start_id; i < end_id; i++) {
selt |= 1 << i;
}
elt &= ~selt;
}
return;
}
//
// Set interference for all live ranges that are currently live.
// For partial declares, the following rules are applied:
// a. the current partial declare does not interfere with any other partial
//    declare
// b. the current parent declare does not interfere with its child declares
//    (can a child declare interfere with its parent declare?)
// c. the current partial declare does not interfere with hybrid declares
//    added by local RA; the reason is simple: these declares are already
//    assigned registers.
//
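// Illustrative example: if V10 is a splitted dcl with children V10_0..V10_3,
// V10 is not marked as interfering with V10_0..V10_3, and a partial child
// such as V10_0 is not marked as interfering with its parent V10 or with any
// other partial dcl; all of them still interfere with unrelated live ranges.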
void Interference::buildInterferenceWithLive(const SparseBitVector &live,
unsigned i) {
// Set interference between the variable with index "i" and the variables set
// in "live". j iterates over the set bit indexes in "live".
for (unsigned j : live) {
if (!varSplitCheckBeforeIntf(i, j)) {
if (j < i) {
safeSetInterference(j, i);
} else if (j > i) {
safeSetInterference(i, j);
}
}
}
const LiveRange *lr = lrs[i];
bool is_partial = lr->getIsPartialDcl();
bool is_splitted = lr->getIsSplittedDcl();
unsigned n = 0;
unsigned start_idx = 0; // The variable index of the first child declare; the
// child variables' indexes are contiguous.
unsigned end_idx = 0; // The variable index of the last child declare
if (is_splitted) // If current is a splitted dcl, don't add interference with
// any of its child nodes.
{
start_idx = lr->getDcl()->getSplitVarStartID();
end_idx = start_idx + gra.getSplitVarNum(lr->getDcl());
}
if (is_partial) // If current is a partial dcl, don't add interference with
// any other partial dcls or with its parent dcl.
{
// n is the variable ID of the splitted(parent) declare
n = gra.getSplittedDeclare(lr->getDcl())->getRegVar()->getId();
start_idx = splitStartId;
end_idx = splitStartId + splitNum;
}
if (is_partial) { // Don't add interference with the parent
if (i < n) {
safeClearInterference(i, n);
} else {
safeClearInterference(n, i);
}
}
for (unsigned j = start_idx; j < end_idx; j++) { // Don't add interference with the children
if (j < i) {
safeClearInterference(j, i);
} else {
safeClearInterference(i, j);
}
}
}
void Interference::buildInterferenceWithSubDcl(unsigned lr_id, G4_Operand *opnd,
SparseBitVector &live, bool setLive,
bool setIntf) {
const G4_Declare *dcl = lrs[lr_id]->getDcl();
for (const G4_Declare *subDcl : gra.getSubDclList(dcl)) {
unsigned leftBound = gra.getSubOffset(subDcl);
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
if (!(opnd->getRightBound() < leftBound ||
rightBound < opnd->getLeftBound())) {
int subID = subDcl->getRegVar()->getId();
if (setIntf) {
buildInterferenceWithLive(live, subID);
}
if (setLive) {
live.set(subID);
}
}
}
return;
}
void Interference::buildInterferenceWithAllSubDcl(unsigned v1, unsigned v2) {
const G4_Declare *d1 = lrs[v1]->getDcl();
const G4_Declare *d2 = lrs[v2]->getDcl();
if (d1->getIsSplittedDcl() && !d2->getIsPartialDcl()) {
for (const G4_Declare *subDcl : gra.getSubDclList(d1)) {
int subID = subDcl->getRegVar()->getId();
checkAndSetIntf(v2, subID);
}
}
if (d2->getIsSplittedDcl() && !d1->getIsPartialDcl()) {
for (const G4_Declare *subDcl : gra.getSubDclList(d2)) {
int subID = subDcl->getRegVar()->getId();
checkAndSetIntf(v1, subID);
}
}
return;
}
//
// Bias the live ranges in "live" to be assigned callee-save registers, as
// they are live through a stack call. Exclude file-scope variables, as they
// are always saved/restored before/after the call and are better assigned to
// the caller-save space.
//
void Interference::addCalleeSaveBias(const SparseBitVector &live) {
for (unsigned i = 0; i < maxId; i++) {
if (live.test(i)) {
lrs[i]->setCallerSaveBias(false);
lrs[i]->setCalleeSaveBias(true);
}
}
}
void Interference::buildInterferenceAmongLiveOuts() {
// Mark interference between dcls marked as Output.
//
// Interference computation marks interference for a
// variable only when definition for that variable is
// seen, not otherwise.
//
// This method is useful when definition of such
// "Output" variables are emitted to program post RA.
//
// It is safe to mark interference between all "Output"
// dcls even when their definition is present in the program.
// First gather all Output dcls in a vector to avoid an O(N^2)
// lookup. Number of OutputDcls should be small.
std::vector<G4_Declare *> OutputDcls;
for (auto dcl : kernel.Declares) {
if (!dcl->getRegVar()->isRegAllocPartaker() || !dcl->isOutput())
continue;
OutputDcls.push_back(dcl);
}
for (auto dcl1 : OutputDcls) {
// dcl1 is an RA partaker and is marked as Output
for (auto dcl2 : OutputDcls) {
if (dcl1 == dcl2)
continue;
checkAndSetIntf(dcl1->getRegVar()->getId(), dcl2->getRegVar()->getId());
}
}
}
void Interference::buildInterferenceAmongLiveIns() {
//
// Build interference between all live-ins. If all live-ins are only
// read, then their interference is skipped in an earlier phase.
// For example, args and globals are both live-in, and both may have only
// uses in the function and no def.
//
const G4_BB *entryBB = kernel.fg.getEntryBB();
for (auto it = liveAnalysis->globalVars.begin();
it != liveAnalysis->globalVars.end(); ++it) {
auto i = (*it);
if (liveAnalysis->isLiveAtEntry(entryBB, i)) {
// Marking references cannot guarantee all the variables are local;
// update here
if (lrs[i]->getDcl()->getIsSplittedDcl()) {
lrs[i]->getDcl()->setIsSplittedDcl(false);
lrs[i]->setIsSplittedDcl(false);
}
auto nextIt = it;
for (auto nit = ++nextIt; nit != liveAnalysis->globalVars.end(); ++nit) {
auto j = (*nit);
if (liveAnalysis->isLiveAtEntry(entryBB, j)) {
if (lrs[i]->getDcl()->getRegFile() == G4_INPUT &&
lrs[i]->getVar()->getPhyReg() != NULL &&
lrs[j]->getDcl()->getRegFile() == G4_INPUT &&
lrs[j]->getVar()->getPhyReg() != NULL) {
continue;
} else {
if (!varSplitCheckBeforeIntf(i, j)) {
checkAndSetIntf(i, j);
}
}
}
}
}
}
}
void Interference::markInterferenceForSend(G4_BB *bb, G4_INST *inst,
G4_DstRegRegion *dst) {
bool isDstRegAllocPartaker = false;
bool isDstLocallyAssigned = false;
unsigned dstId = 0;
int dstPreg = 0, dstNumRows = 0;
if (dst->getBase()->isRegVar()) {
if (dst->getBase()->isRegAllocPartaker()) {
G4_DstRegRegion *dstRgn = dst;
isDstRegAllocPartaker = true;
dstId = ((G4_RegVar *)dstRgn->getBase())->getId();
} else if (gra.useLocalRA) {
LocalLiveRange *localLR = NULL;
G4_Declare *topdcl = GetTopDclFromRegRegion(dst);
if (topdcl)
localLR = gra.getLocalLR(topdcl);
if (localLR && localLR->getAssigned()) {
int sreg;
G4_VarBase *preg = localLR->getPhyReg(sreg);
vISA_ASSERT(preg->isGreg(), "Register in dst was not GRF");
isDstLocallyAssigned = true;
dstPreg = preg->asGreg()->getRegNum();
dstNumRows = localLR->getTopDcl()->getNumRows();
}
}
if (isDstRegAllocPartaker || isDstLocallyAssigned) {
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
G4_Operand *src = inst->getSrc(j);
if (src && src->isSrcRegRegion() &&
src->asSrcRegRegion()->getBase()->isRegVar()) {
if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker()) {
unsigned srcId =
src->asSrcRegRegion()->getBase()->asRegVar()->getId();
if (isDstRegAllocPartaker) {
if (!varSplitCheckBeforeIntf(dstId, srcId)) {
checkAndSetIntf(dstId, srcId);
buildInterferenceWithAllSubDcl(dstId, srcId);
}
} else {
for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum; j++) {
int k = getGRFDclForHRA(j)->getRegVar()->getId();
if (!varSplitCheckBeforeIntf(k, srcId)) {
checkAndSetIntf(k, srcId);
buildInterferenceWithAllSubDcl(k, srcId);
}
}
}
} else if (gra.useLocalRA && isDstRegAllocPartaker) {
LocalLiveRange *localLR = nullptr;
const G4_Declare *topdcl = GetTopDclFromRegRegion(src);
if (topdcl)
localLR = gra.getLocalLR(topdcl);
if (localLR && localLR->getAssigned()) {
int sreg;
G4_VarBase *preg = localLR->getPhyReg(sreg);
int numrows = localLR->getTopDcl()->getNumRows();
vISA_ASSERT(preg->isGreg(), "Register in src was not GRF");
int reg = preg->asGreg()->getRegNum();
for (int j = reg, sum = reg + numrows; j < sum; j++) {
int k = getGRFDclForHRA(j)->getRegVar()->getId();
if (!varSplitCheckBeforeIntf(dstId, k)) {
checkAndSetIntf(dstId, k);
buildInterferenceWithAllSubDcl(dstId, k);
}
}
}
}
}
}
}
}
}
void Interference::setOutOfBoundForbidden(G4_Operand *opnd) {
G4_Declare *dcl = opnd->getBaseRegVarRootDeclare();
vISA_ASSERT(dcl, "NULL declare");
int dclEndGRF = (dcl->getByteSize() - 1) / builder.numEltPerGRF<Type_UB>();
int opndEndGRF = opnd->getLinearizedEnd() / builder.numEltPerGRF<Type_UB>();
unsigned lrId = ((G4_RegVar *)opnd->getBase())->getId();
LiveRange *lr = lrs[lrId];
if (lr && (opndEndGRF > dclEndGRF)) {
vISA_ASSERT((opndEndGRF - dclEndGRF) == 1,
"More register reservation required for svm gather");
lr->setForbidden(forbiddenKind::FBD_LASTGRF);
}
}
void Interference::setForbiddenGRFNumForSVMScatter(G4_INST *inst) {
G4_DstRegRegion *dst = inst->getDst();
if (dst && dst->getBase()->isRegVar()) {
if (dst->getBase()->isRegAllocPartaker()) {
setOutOfBoundForbidden(dst);
}
}
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
G4_Operand *src = inst->getSrc(j);
if (src && src->isSrcRegRegion() &&
src->asSrcRegRegion()->getBase()->isRegVar()) {
if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker()) {
setOutOfBoundForbidden(src);
}
}
}
return;
}
void Interference::markInterferenceToAvoidDstSrcOverlap(G4_BB *bb,
G4_INST *inst) {
bool isDstRegAllocPartaker = false;
bool isDstLocallyAssigned = false;
unsigned dstId = 0;
int dstPreg = 0, dstNumRows = 0;
bool dstOpndNumRows = false;
G4_DstRegRegion *dst = inst->getDst();
if (dst->getBase()->isRegVar() &&
(dst->getTopDcl()->getRegFile() == G4_GRF)) {
G4_Declare *dstDcl = dst->getTopDcl();
int dstOffset = dst->getLeftBound() / kernel.numEltPerGRF<Type_UB>();
bool isDstEvenAlign = gra.isEvenAligned(dstDcl);
if (dst->getBase()->isRegAllocPartaker()) {
isDstRegAllocPartaker = true;
dstId = ((G4_RegVar *)dst->getBase())->getId();
dstOpndNumRows = dst->getSubRegOff() * dst->getTypeSize() +
dst->getLinearizedEnd() - dst->getLinearizedStart() +
1 >
kernel.numEltPerGRF<Type_UB>();
} else if (gra.useLocalRA) {
LocalLiveRange *localLR = NULL;
G4_Declare *topdcl = GetTopDclFromRegRegion(dst);
if (topdcl)
localLR = gra.getLocalLR(topdcl);
if (localLR && localLR->getAssigned()) {
int sreg;
G4_VarBase *preg = localLR->getPhyReg(sreg);
vISA_ASSERT(preg->isGreg(), "Register in dst was not GRF");
isDstLocallyAssigned = true;
dstPreg = preg->asGreg()->getRegNum();
dstNumRows = localLR->getTopDcl()->getNumRows();
dstOpndNumRows = dst->getSubRegOff() * dst->getTypeSize() +
dst->getLinearizedEnd() - dst->getLinearizedStart() + 1 >
kernel.numEltPerGRF<Type_UB>();
isDstEvenAlign = (dstPreg % 2 == 0);
}
}
if (isDstRegAllocPartaker || isDstLocallyAssigned) {
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
if (inst->isDpas() && j != 1)
continue;
G4_Operand *src = inst->getSrc(j);
if (src != NULL && src->isSrcRegRegion() &&
src->asSrcRegRegion()->getBase()->isRegVar()) {
G4_SrcRegRegion *srcRgn = src->asSrcRegRegion();
G4_Declare *srcDcl = src->getTopDcl();
if (srcRgn->getRegAccess() == Direct &&
(src->getTopDcl()->getRegFile() == G4_GRF ||
src->getTopDcl()->getRegFile() == G4_INPUT)) {
int srcOffset =
src->getLeftBound() / kernel.numEltPerGRF<Type_UB>();
bool srcOpndNumRows =
srcRgn->getSubRegOff() * srcRgn->getTypeSize() +
srcRgn->getLinearizedEnd() -
srcRgn->getLinearizedStart() + 1 >
kernel.numEltPerGRF<Type_UB>();
int srcReg = 0;
bool isSrcEvenAlign = gra.isEvenAligned(srcDcl);
if (!src->asSrcRegRegion()->getBase()->isRegAllocPartaker() &&
gra.useLocalRA) {
int sreg;
LocalLiveRange *localLR = NULL;
G4_Declare *topdcl = GetTopDclFromRegRegion(src);
if (topdcl)
localLR = gra.getLocalLR(topdcl);
if (localLR && localLR->getAssigned()) {
G4_VarBase *preg = localLR->getPhyReg(sreg);
vISA_ASSERT(preg->isGreg(), "Register in src was not GRF");
srcReg = preg->asGreg()->getRegNum();
isSrcEvenAlign = (srcReg % 2 == 0);
}
}
if (srcDcl->getRegFile() == G4_INPUT &&
srcDcl->getRegVar()->getPhyReg() != NULL &&
srcDcl->getRegVar()->getPhyReg()->isGreg()) {
srcReg = srcDcl->getRegVar()->getPhyReg()->asGreg()->getRegNum();
isSrcEvenAlign = (srcReg % 2 == 0);
}
if (dstOpndNumRows || srcOpndNumRows) {
if (!(isDstEvenAlign && isSrcEvenAlign &&
srcOffset % 2 == dstOffset % 2 && dstOpndNumRows &&
srcOpndNumRows)) {
if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker()) {
unsigned srcId =
src->asSrcRegRegion()->getBase()->asRegVar()->getId();
#ifdef DEBUG_VERBOSE_ON
printf("Src%d ", j);
inst->dump();
#endif
if (isDstRegAllocPartaker) {
if (!varSplitCheckBeforeIntf(dstId, srcId)) {
checkAndSetIntf(dstId, srcId);
buildInterferenceWithAllSubDcl(dstId, srcId);
}
} else {
for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum;
j++) {
int k = getGRFDclForHRA(j)->getRegVar()->getId();
if (!varSplitCheckBeforeIntf(k, srcId)) {
checkAndSetIntf(k, srcId);
buildInterferenceWithAllSubDcl(k, srcId);
}
}
}
} else if (gra.useLocalRA &&
isDstRegAllocPartaker) {
LocalLiveRange *localLR = NULL;
G4_Declare *topdcl = GetTopDclFromRegRegion(src);
if (topdcl)
localLR = gra.getLocalLR(topdcl);
if (localLR && localLR->getAssigned()) {
int reg, sreg, numrows;
G4_VarBase *preg = localLR->getPhyReg(sreg);
numrows = localLR->getTopDcl()->getNumRows();
vISA_ASSERT(preg->isGreg(), "Register in src was not GRF");
reg = preg->asGreg()->getRegNum();
#ifdef DEBUG_VERBOSE_ON
printf("Src%d ", j);
inst->dump();
#endif
for (int j = reg, sum = reg + numrows; j < sum; j++) {
int k = getGRFDclForHRA(j)->getRegVar()->getId();
if (!varSplitCheckBeforeIntf(dstId, k)) {
checkAndSetIntf(dstId, k);
buildInterferenceWithAllSubDcl(dstId, k);
}
}
}
}
}
}
} else if (srcRgn->getRegAccess() == IndirGRF) {
// make every var in points-to set live
const REGVAR_VECTOR &pointsToSet =
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(
srcRgn, bb);
for (auto &pt : pointsToSet) {
if (pt.var->isRegAllocPartaker()) {
unsigned srcId = pt.var->getId();
if (isDstRegAllocPartaker) {
if (!varSplitCheckBeforeIntf(dstId, srcId)) {
checkAndSetIntf(dstId, srcId);
buildInterferenceWithAllSubDcl(dstId, srcId);
}
} else {
for (int j = dstPreg, sum = dstPreg + dstNumRows; j < sum;
j++) {
int k = getGRFDclForHRA(j)->getRegVar()->getId();
if (!varSplitCheckBeforeIntf(k, srcId)) {
checkAndSetIntf(k, srcId);
buildInterferenceWithAllSubDcl(k, srcId);
}
}
}
}
}
}
}
}
}
}
}
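// Weight references by loop nesting depth so spill-cost heuristics favor
// keeping loop-resident values in registers. Illustrative example: a
// reference at nest level 2 counts as 4^2 = 16 references (the factor is
// IN_LOOP_REFERENCE_COUNT_FACTOR and the exponent is capped at 8).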
uint32_t GlobalRA::getRefCount(int loopNestLevel) {
if (loopNestLevel == 0) {
return 1;
}
return (uint32_t)std::pow(IN_LOOP_REFERENCE_COUNT_FACTOR,
std::min(loopNestLevel, 8));
}
// handle return value interference for fcall
void Interference::buildInterferenceForFcall(
G4_BB *bb, SparseBitVector &live, G4_INST *inst,
std::list<G4_INST *>::reverse_iterator i, const G4_VarBase *regVar) {
vISA_ASSERT(inst->opcode() == G4_pseudo_fcall, "expect fcall inst");
if (regVar->isRegAllocPartaker()) {
unsigned id = static_cast<const G4_RegVar *>(regVar)->getId();
buildInterferenceWithLive(live, id);
updateLiveness(live, id, false);
}
}
bool GlobalRA::canIncreaseGRF(unsigned spillSize, bool infCostSpilled) {
// If we estimate insufficient # GRFs early on, we may end up
// spilling an infinite spill cost variable. As a last-ditch effort,
// we bump up # GRFs and retry compilation. If we estimate GRF
// config well, then we should never see infCostSpilled == true.
// Conditions to increase #GRFs assuming first RA iteration did not succeed:
// - Variable with inf spill cost, or
// - #GRFs selected and next larger one has same number of threads, or
// - Spill size is above threshold
if ((infCostSpilled || kernel.grfMode.hasLargerGRFSameThreads() ||
spillSize >= kernel.grfMode.getSpillThreshold()) &&
!didGRFIncrease) {
if (kernel.updateKernelToLargerGRF()) {
// GRF successfully increased
RA_TRACE(std::cout << "\t--new GRF size " << kernel.getNumRegTotal()
<< ". Re-run RA\n ");
didGRFIncrease = true;
return true;
}
}
return false;
}
void Interference::buildInterferenceForDst(
G4_BB *bb, SparseBitVector &live, G4_INST *inst,
std::list<G4_INST *>::reverse_iterator i, G4_DstRegRegion *dst) {
if (dst->getBase()->isRegAllocPartaker()) {
unsigned id = ((G4_RegVar *)dst->getBase())->getId();
//
// In following code,
// pseudo_kill V10
// mov (8) V10, V11
//
// V10 and V11 do not interfere and can be assigned
// same register.
//
// Following condition skips marking interference for
// pseudo_kill nodes.
//
if (!inst->isPseudoKill() && !inst->isLifeTimeEnd()) {
buildInterferenceWithLive(live, id);
if (lrs[id]->getIsSplittedDcl()) {
buildInterferenceWithSubDcl(id, (G4_Operand *)dst, live, false, true);
}
}
//
// if the write does not cover the whole dst region, we should continue to
// let the liveness propagate upwards
//
if (liveAnalysis->writeWholeRegion(bb, inst, dst) || inst->isPseudoKill()) {
updateLiveness(live, id, false);
if (lrs[id]->getIsSplittedDcl()) {
for (unsigned i = lrs[id]->getDcl()->getSplitVarStartID();
i < lrs[id]->getDcl()->getSplitVarStartID() +
gra.getSplitVarNum(lrs[id]->getDcl());
i++) {
live.reset(i); // kill all children; unused children generated due to
// splitting are killed as well.
}
}
}
} else if (dst->isIndirect() && liveAnalysis->livenessClass(G4_GRF)) {
//
// add interferences to the list of potential indirect destination accesses.
//
const REGVAR_VECTOR &pointsToSet =
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(dst, bb);
for (auto &pt : pointsToSet) {
if (pt.var->isRegAllocPartaker()) {
buildInterferenceWithLive(live, pt.var->getId());
}
}
}
}
void Interference::buildInterferenceWithinBB(G4_BB *bb, SparseBitVector &live) {
DebugInfoState state;
for (auto i = bb->rbegin(); i != bb->rend(); i++) {
G4_INST *inst = (*i);
G4_DstRegRegion *dst = inst->getDst();
if (dst) {
buildInterferenceForDst(bb, live, inst, i, dst);
}
if (inst->opcode() == G4_pseudo_fcall) {
if (liveAnalysis->livenessClass(G4_GRF)) {
auto fcall = kernel.fg.builder->getFcallInfo(bb->back());
G4_Declare *arg = kernel.fg.builder->getStackCallArg();
G4_Declare *ret = kernel.fg.builder->getStackCallRet();
vISA_ASSERT(fcall != std::nullopt, "fcall info not found");
uint16_t retSize = fcall->getRetSize();
uint16_t argSize = fcall->getArgSize();
if (ret && retSize > 0 && ret->getRegVar()) {
buildInterferenceForFcall(bb, live, inst, i, ret->getRegVar());
}
if (arg && argSize > 0 && arg->getRegVar()) {
auto id = arg->getRegVar()->getId();
updateLiveness(live, id, true);
}
} else if (liveAnalysis->livenessClass(G4_ADDRESS)) {
// assume callee will use A0
auto A0Dcl = kernel.fg.fcallToPseudoDclMap[inst->asCFInst()].A0;
buildInterferenceWithLive(live, A0Dcl->getRegVar()->getId());
} else if (liveAnalysis->livenessClass(G4_FLAG)) {
// assume callee will use both F0 and F1
auto flagDcl = kernel.fg.fcallToPseudoDclMap[inst->asCFInst()].Flag;
buildInterferenceWithLive(live, flagDcl->getRegVar()->getId());
}
}
if (inst->isSend() && inst->asSendInst()->isSVMScatterRW() &&
inst->getExecSize() < g4::SIMD8) {
setForbiddenGRFNumForSVMScatter(inst);
}
if ((inst->isSend() || inst->isFillIntrinsic()) && !dst->isNullReg() &&
kernel.fg.builder->WaDisableSendSrcDstOverlap()) {
markInterferenceForSend(bb, inst, dst);
} else if (kernel.fg.builder->avoidDstSrcOverlap() && dst &&
!dst->isNullReg()) {
markInterferenceToAvoidDstSrcOverlap(bb, inst);
}
if (inst->isSplitSend() && !inst->getSrc(1)->isNullReg()) {
G4_SrcRegRegion *src0 = inst->getSrc(0)->asSrcRegRegion();
G4_SrcRegRegion *src1 = inst->getSrc(1)->asSrcRegRegion();
if (src0->getBase()->isRegAllocPartaker() &&
src1->getBase()->isRegAllocPartaker()) {
// src0 and src1 of split send may not overlap. In normal cases this is
// handled automatically as we add interference edge when we reach
// src0/src1's def. If one source is an undefined variable (this can
// happen for URB write payload) and the other an input, however, we
// could miss the interference edge between the two. So we add it
// explicitly here
int src0Id = src0->getBase()->asRegVar()->getId();
int src1Id = src1->getBase()->asRegVar()->getId();
checkAndSetIntf(src0Id, src1Id);
buildInterferenceWithAllSubDcl(src0Id, src1Id);
}
}
// DPAS: As part of same instruction, src1 should not have overlap with dst.
// Src0 and src2 are okay to have overlap
if (inst->isDpas() && !inst->getSrc(1)->isNullReg()) {
G4_SrcRegRegion *src1 = inst->getSrc(1)->asSrcRegRegion();
if (dst->getBase()->isRegAllocPartaker() &&
src1->getBase()->isRegAllocPartaker()) {
int dstId = dst->getBase()->asRegVar()->getId();
int src1Id = src1->getBase()->asRegVar()->getId();
checkAndSetIntf(dstId, src1Id);
buildInterferenceWithAllSubDcl(dstId, src1Id);
}
}
//
// process each source operand
//
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
G4_Operand *src = inst->getSrc(j);
if (!src)
continue;
if (src->isSrcRegRegion()) {
G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
if (srcRegion->getBase()->isRegAllocPartaker()) {
unsigned id = ((G4_RegVar *)(srcRegion)->getBase())->getId();
if (!inst->isLifeTimeEnd()) {
updateLiveness(live, id, true);
if (lrs[id]->getIsSplittedDcl()) {
buildInterferenceWithSubDcl(id, src, live, true, false);
}
}
} else if (srcRegion->isIndirect() &&
liveAnalysis->livenessClass(G4_GRF)) {
// make every var in points-to set live
const REGVAR_VECTOR &pointsToSet =
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(
srcRegion, bb);
for (auto &pt : pointsToSet) {
if (pt.var->isRegAllocPartaker()) {
updateLiveness(live, pt.var->getId(), true);
}
}
}
}
}
//
// Process register-indirect destination uses of ARF.
//
if (dst) {
if (dst->getBase()->isRegAllocPartaker() &&
dst->getRegAccess() != Direct) {
live.set(dst->getBase()->asRegVar()->getId());
}
}
//
// Process condMod
//
G4_CondMod *mod = inst->getCondMod();
if (mod != NULL) {
G4_VarBase *flagReg = mod->getBase();
if (flagReg != NULL) {
unsigned id = flagReg->asRegVar()->getId();
if (flagReg->asRegVar()->isRegAllocPartaker()) {
buildInterferenceWithLive(live, id);
if (liveAnalysis->writeWholeRegion(bb, inst, flagReg)) {
updateLiveness(live, id, false);
}
}
} else {
vISA_ASSERT((inst->opcode() == G4_sel || inst->opcode() == G4_csel) &&
inst->getCondMod() != NULL,
"Invalid CondMod");
}
}
//
// Process predicate
//
G4_Predicate *predicate = inst->getPredicate();
if (predicate != NULL) {
G4_VarBase *flagReg = predicate->getBase();
unsigned id = flagReg->asRegVar()->getId();
if (flagReg->asRegVar()->isRegAllocPartaker()) {
live.set(id);
}
}
// Update debug info intervals based on live set
if (builder.getOption(vISA_GenerateDebugInfo)) {
updateDebugInfo(kernel, inst, *liveAnalysis, lrs, live, &state,
inst == bb->front());
}
}
}
void Interference::applyPartitionBias() {
// Any variable that interferes with a VCA dcl is live through an fcall.
// This function makes such variables callee save biased to avoid save/restore
// code around fcall. Save/restore may still be needed in case this is a
// stack call function (vs kernel), but a single save/restore sequence can
// free the callee save register throughout the function.
for (auto i : liveAnalysis->globalVars) {
if (kernel.fg.isPseudoVCADcl(lrs[i]->getDcl())) {
const auto &intfs = sparseIntf[i];
for (const auto edge : intfs) {
// no point adding bias to any variable already assigned
if (lrs[edge]->getPhyReg())
continue;
lrs[edge]->setCalleeSaveBias(true);
lrs[edge]->setCallerSaveBias(false);
}
}
}
}
// Any LiveRange property discovered during interference computation must be
// set here, because with incremental RA we may not run interference
// computation for all BBs.
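// For example, reference counts, EOT/return-IP flags, and forbidden-register
// sets are recorded here for every BB, whereas buildInterferenceWithinBB may
// be skipped for BBs whose interference is unchanged.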
void Interference::setupLRs(G4_BB *bb) {
unsigned refCount = GlobalRA::getRefCount(
kernel.getOption(vISA_ConsiderLoopInfoInRA) ? bb->getNestLevel() : 0);
bool incSpillCostAddrTaken = kernel.getOption(vISA_IncSpillCostAllAddrTaken);
for (auto i = bb->rbegin(); i != bb->rend(); i++) {
G4_INST *inst = (*i);
auto dst = inst->getDst();
if (dst) {
if (dst->getBase()->isRegAllocPartaker()) {
unsigned id = ((G4_RegVar *)dst->getBase())->getId();
if (!inst->isPseudoKill() && !inst->isLifeTimeEnd()) {
lrs[id]->setRefCount(lrs[id]->getRefCount() +
refCount); // update reference count
}
lrs[id]->checkForInfiniteSpillCost(bb, i);
} else if (dst->isIndirect() && liveAnalysis->livenessClass(G4_GRF)) {
const REGVAR_VECTOR &pointsToSet =
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(dst,
bb);
for (auto &pt : pointsToSet) {
if (!pt.var->isRegAllocPartaker() || !incSpillCostAddrTaken)
continue;
lrs[pt.var->getId()]->setRefCount(
lrs[pt.var->getId()]->getRefCount() + refCount);
}
}
}
if (inst->opcode() == G4_pseudo_fcall &&
liveAnalysis->livenessClass(G4_GRF)) {
auto fcall = kernel.fg.builder->getFcallInfo(bb->back());
G4_Declare *ret = kernel.fg.builder->getStackCallRet();
vISA_ASSERT(fcall != std::nullopt, "fcall info not found");
uint16_t retSize = fcall->getRetSize();
if (ret && retSize > 0 && ret->getRegVar() &&
ret->getRegVar()->isRegAllocPartaker()) {
unsigned id = static_cast<const G4_RegVar *>(ret->getRegVar())->getId();
lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount);
}
}
bool isSend =
inst->isSend() || inst->isFillIntrinsic() || inst->isSpillIntrinsic();
if (isSend && !dst->isNullReg()) {
// r127 must not be used for return address when there is a src and dest
// overlap in a send instruction. This applies to split-send as well.
if (kernel.fg.builder->needsToReserveR127() &&
liveAnalysis->livenessClass(G4_GRF) &&
dst->getBase()->isRegAllocPartaker() &&
!dst->getBase()->asRegVar()->isPhyRegAssigned()) {
int dstId = dst->getBase()->asRegVar()->getId();
lrs[dstId]->setForbidden(forbiddenKind::FBD_LASTGRF);
}
}
//
// process each source operand
//
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
G4_Operand *src = inst->getSrc(j);
if (!src || !src->isSrcRegRegion())
continue;
G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
if (srcRegion->getBase()->isRegAllocPartaker()) {
unsigned id = ((G4_RegVar *)(srcRegion)->getBase())->getId();
lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount);
if (inst->isEOT() && liveAnalysis->livenessClass(G4_GRF)) {
// mark the liveRange as the EOT source
lrs[id]->setEOTSrc();
if (builder.hasEOTGRFBinding()) {
lrs[id]->setForbidden(forbiddenKind::FBD_EOT);
}
}
if (inst->isReturn()) {
lrs[id]->setRetIp();
}
} else if (srcRegion->isIndirect() &&
liveAnalysis->livenessClass(G4_GRF)) {
// make every var in points-to set live
const REGVAR_VECTOR &pointsToSet =
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(
srcRegion, bb);
for (auto &pt : pointsToSet) {
if (!pt.var->isRegAllocPartaker() || !incSpillCostAddrTaken)
continue;
lrs[pt.var->getId()]->setRefCount(
lrs[pt.var->getId()]->getRefCount() + refCount);
}
}
}
//
// Process condMod
//
if (auto mod = inst->getCondMod()) {
G4_VarBase *flagReg = mod->getBase();
if (flagReg) {
unsigned id = flagReg->asRegVar()->getId();
if (flagReg->asRegVar()->isRegAllocPartaker()) {
lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount);
lrs[id]->checkForInfiniteSpillCost(bb, i);
}
} else {
vISA_ASSERT((inst->opcode() == G4_sel || inst->opcode() == G4_csel) &&
inst->getCondMod() != NULL,
"Invalid CondMod");
}
}
//
// Process predicate
//
if (auto predicate = inst->getPredicate()) {
G4_VarBase *flagReg = predicate->getBase();
unsigned id = flagReg->asRegVar()->getId();
if (flagReg->asRegVar()->isRegAllocPartaker()) {
lrs[id]->setRefCount(lrs[id]->getRefCount() + refCount);
}
}
}
}
void Interference::computeInterference() {
startTimer(TimerID::INTERFERENCE);
for (auto bb : kernel.fg) {
// Initialize LR properties like ref count and forbidden here.
// This method is invoked for all BBs even with incremental RA.
setupLRs(bb);
}
//
// create bool vector, live, to track live ranges that are currently live
//
SparseBitVector live;
buildInterferenceAmongLiveOuts();
for (G4_BB *bb : kernel.fg) {
if (!incRA.intfNeededForBB(bb)) {
continue;
}
//
// mark all live ranges dead
//
live.clear();
//
// start with all live ranges that are live at the exit of BB
//
buildInterferenceAtBBExit(bb, live);
//
// traverse inst in the reverse order
//
buildInterferenceWithinBB(bb, live);
}
buildInterferenceAmongLiveIns();
//
// Build interference with physical registers assigned by local RA
//
if (gra.useLocalRA) {
for (auto curBB : kernel.fg) {
buildInterferenceWithLocalRA(curBB);
}
}
RA_TRACE({
RPE rpe(gra, liveAnalysis);
rpe.run();
std::cout << "\t--max RP: " << rpe.getMaxRP() << "\n";
});
if ((builder.getOption(vISA_RATrace) ||
builder.getOption(vISA_DumpPerfStatsVerbose)) &&
builder.getJitInfo()->statsVerbose.RAIterNum == 1) {
getNormIntfNum();
}
// Augment interference graph to accommodate non-default masks
aug.augmentIntfGraph();
generateSparseIntfGraph();
countNeighbors();
if (IncrementalRA::isEnabled(kernel)) {
// Incremental interference was computed for current iteration.
// Now prepare for incremental temps in next iteration.
gra.incRA.clearCandidates();
}
// apply callee save bias after augmentation as interference graph is
// up-to-date.
if (kernel.fg.getHasStackCalls()) {
applyPartitionBias();
}
stopTimer(TimerID::INTERFERENCE);
}
void Interference::getNormIntfNum() {
unsigned numVars = liveAnalysis->getNumSelectedVar();
uint32_t numEdges = 0;
if (useDenseMatrix()) {
// Iterate over intf graph matrix
for (unsigned row = 0; row < numVars; row++) {
unsigned rowOffset = row * rowSize;
unsigned colStart = (row + 1) / BITS_DWORD;
for (unsigned j = colStart; j < rowSize; j++) {
unsigned intfBlk = getInterferenceBlk(rowOffset + j);
if (intfBlk == 0) {
continue;
}
for (unsigned k = 0; k < BITS_DWORD; k++) {
if (!(intfBlk & (1 << k))) {
continue;
}
unsigned v2 = (j * BITS_DWORD) + k;
if (v2 != row) {
numEdges++;
}
}
}
}
} else {
for (uint32_t v1 = 0; v1 < maxId; ++v1) {
auto &intfSet = sparseMatrix[v1];
numEdges += intfSet.count();
}
}
builder.getJitInfo()->statsVerbose.normIntfNum = numEdges;
RA_TRACE(std::cout << "\t--normal edge #: " << numEdges << "\n");
}
#define SPARSE_INTF_VEC_SIZE 64
void Interference::generateSparseIntfGraph() {
// Generate the sparse interference graph (per-variable adjacency lists) from
// either the dense bit matrix or the sparse matrix.
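// Each edge (v1, v2) is added to both endpoints' adjacency lists, so the
// neighbors of any variable can be walked directly.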
unsigned numVars = liveAnalysis->getNumSelectedVar();
sparseIntf.resize(numVars);
for (unsigned row = 0; row < numVars; row++) {
sparseIntf[row].reserve(SPARSE_INTF_VEC_SIZE);
}
if (useDenseMatrix()) {
// Iterate over intf graph matrix
for (unsigned row = 0; row < numVars; row++) {
unsigned rowOffset = row * rowSize;
unsigned colStart = (row + 1) / BITS_DWORD;
for (unsigned j = colStart; j < rowSize; j++) {
unsigned intfBlk = getInterferenceBlk(rowOffset + j);
if (intfBlk != 0) {
for (unsigned k = 0; k < BITS_DWORD; k++) {
if (intfBlk & (1 << k)) {
unsigned v2 = (j * BITS_DWORD) + k;
if (v2 != row) {
sparseIntf[v2].emplace_back(row);
sparseIntf[row].emplace_back(v2);
}
}
}
}
}
}
} else {
for (uint32_t v1 = 0; v1 < maxId; ++v1) {
auto &intfSet = sparseMatrix[v1];
for (uint32_t v2 : intfSet) {
sparseIntf[v1].emplace_back(v2);
sparseIntf[v2].emplace_back(v1);
}
}
}
}
void Interference::countNeighbors() {
if (!builder.getOption(vISA_RATrace) &&
!builder.getOption(vISA_DumpPerfStatsVerbose))
return;
uint32_t numNeighbor = 0;
uint32_t maxNeighbor = 0;
[[maybe_unused]] uint32_t maxIndex = 0;
uint32_t numEdges = 0;
for (int i = 0, numVar = (int)sparseIntf.size(); i < numVar; ++i) {
if (lrs[i]->getPhyReg() == nullptr) {
auto &intf = sparseIntf[i];
numNeighbor += (uint32_t)intf.size();
// Track the single variable with the most neighbors rather than the
// running total.
if ((uint32_t)intf.size() > maxNeighbor) {
maxNeighbor = (uint32_t)intf.size();
maxIndex = i;
}
}
numEdges += (uint32_t)sparseIntf[i].size();
}
float avgNeighbor = ((float)numNeighbor) / sparseIntf.size();
if (builder.getJitInfo()->statsVerbose.RAIterNum == 1) {
builder.getJitInfo()->statsVerbose.avgNeighbors = avgNeighbor;
builder.getJitInfo()->statsVerbose.maxNeighbors = maxNeighbor;
builder.getJitInfo()->statsVerbose.augIntfNum =
(numEdges / 2) - builder.getJitInfo()->statsVerbose.normIntfNum;
}
RA_TRACE({
std::cout << "\t--avg # neighbors: " << std::setprecision(6) << avgNeighbor
<< "\n";
std::cout << "\t--max # neighbors: " << maxNeighbor << " ("
<< lrs[maxIndex]->getDcl()->getName() << ")\n";
if (builder.getJitInfo()->statsVerbose.RAIterNum == 1) {
std::cout << "\t--aug edge #: "
<< builder.getJitInfo()->statsVerbose.augIntfNum << "\n";
}
});
}
// This function can be invoked before local RA or after augmentation.
// It updates sub-reg alignment only for non-NoMask vars and leaves others
// unchanged, i.e., their value stays as set by HW conformity or an earlier
// phase.
void GlobalRA::updateSubRegAlignment(G4_SubReg_Align subAlign) {
// Update alignment of all GRF declares to sub-align
for (auto dcl : kernel.Declares) {
if (dcl->getRegFile() & G4_GRF && !dcl->getIsPartialDcl()) {
G4_Declare *topdcl = dcl->getRootDeclare();
if (!areAllDefsNoMask(topdcl) &&
getAugmentationMask(topdcl) != AugmentationMasks::NonDefault) {
dcl->setSubRegAlign(subAlign);
setSubRegAlign(dcl, subAlign);
}
}
}
}
int GlobalRA::getAlignFromAugBucket(G4_Declare *dcl) {
if (GlobalRA::useGenericAugAlign(builder.getPlatformGeneration())) {
// Return 0 if no special alignment is needed
// Return 2 if even alignment is needed
// Return 4 if quad alignment is needed
// Even alignment is needed if, for the given SIMD size and elem type,
// a complete def spans between 1 and 2 GRFs.
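// Illustrative example (assuming a 64-byte GRF): a SIMD32 def with :d
// elements spans 32 * 4 = 128 bytes = 2 GRFs and gets even alignment,
// while a SIMD16 :w def spans 32 bytes (half a GRF) and needs none.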
auto kernelSimdSizeToUse = kernel.getSimdSizeWithSlicing();
G4_Declare *topdcl = dcl->getRootDeclare();
auto topdclAugMask = getAugmentationMask(topdcl);
if (!areAllDefsNoMask(topdcl) && !topdcl->getIsPartialDcl() &&
topdclAugMask != AugmentationMasks::NonDefault) {
auto elemSizeToUse = topdcl->getElemSize();
if (elemSizeToUse < 4 && topdclAugMask == AugmentationMasks::Default32Bit)
// :uw with hstride 2 can also be Default32Bit and hence needs even
// alignment
elemSizeToUse = 4;
else if (elemSizeToUse < 8 &&
topdclAugMask == AugmentationMasks::Default64Bit)
elemSizeToUse = 8;
auto totalByteSize = elemSizeToUse * kernelSimdSizeToUse;
auto bucketSpans2GRFs = [&]() {
return totalByteSize > (unsigned)kernel.numEltPerGRF<Type_UB>() &&
totalByteSize <= (unsigned)(2 * kernel.numEltPerGRF<Type_UB>());
};
if (!(!builder.canReadR0() && dcl == kernel.fg.builder->getBuiltinR0())) {
if (use4GRFAlign) {
if (topdclAugMask == AugmentationMasks::Default16Bit ||
topdclAugMask == AugmentationMasks::Default32Bit) {
if (bucketSpans2GRFs())
return 2;
} else if (topdclAugMask == AugmentationMasks::Default64Bit) {
if (bucketSpans2GRFs())
// :df SIMD16
return 2;
// :df SIMD32
return 4;
} else if (topdclAugMask == AugmentationMasks::Undetermined) {
// Local RA will take this path as augmentation buckets are set
// to Undetermined. Although this is conservative, hybrid RA
// will run augmentation and compute buckets to fill in "holes".
// For example, mov (32|M0) V10<2>:f should use 4GRF alignment as
// it's a Default64Bit variable, although its elem size is :f.
return 4;
}
} else {
// Even align if the def spans between 1 and 2 GRFs; defs larger than
// 2 GRFs get no extra alignment here.
if (bucketSpans2GRFs())
return 2;
}
}
}
} else {
if (dcl->getRegFile() & G4_GRF) {
G4_Declare *topdcl = dcl->getRootDeclare();
auto topdclAugMask = getAugmentationMask(topdcl);
if (!areAllDefsNoMask(topdcl) && !topdcl->getIsPartialDcl() &&
topdclAugMask != AugmentationMasks::NonDefault &&
topdclAugMask != AugmentationMasks::Default64Bit) {
if ((topdcl->getElemSize() >= 4 ||
topdclAugMask == AugmentationMasks::Default32Bit) &&
topdcl->getByteSize() >= kernel.numEltPerGRF<Type_UB>() &&
!(!builder.canReadR0() &&
dcl == kernel.fg.builder->getBuiltinR0())) {
return 2;
}
}
}
}
return 0;
}
void GlobalRA::augAlign() {
// Update alignment of all GRF declares based on
// augmentation bucket and platform.
for (auto dcl : kernel.Declares) {
if (dcl->getRegFile() & G4_GRF) {
unsigned int align = getAlignFromAugBucket(dcl);
if (align == 4) {
if (incRA.isEnabled() && !isQuadAligned(dcl)) {
incRA.evenAlignUpdate(dcl);
}
forceQuadAlign(dcl);
} else if (align == 2) {
if (incRA.isEnabled() && !isEvenAligned(dcl)) {
incRA.evenAlignUpdate(dcl);
}
setEvenAligned(dcl, true);
}
}
}
}
void GlobalRA::getBankAlignment(LiveRange *lr, BankAlign &align) {
G4_Declare *dcl = lr->getDcl();
if (kernel.getSimdSize() < g4::SIMD16) {
return;
}
if (dcl->getRegFile() & G4_GRF) {
G4_Declare *topdcl = dcl->getRootDeclare();
auto topdclBC = getBankConflict(topdcl);
if (topdclBC != BANK_CONFLICT_NONE) {
if (topdcl->getElemSize() >= 4 && topdcl->getNumRows() > 1 &&
!(!builder.canReadR0() && dcl == kernel.fg.builder->getBuiltinR0())) {
if (topdclBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
topdclBC == BANK_CONFLICT_SECOND_HALF_ODD) {
align = BankAlign::Odd;
}
}
}
}
}
// Compute homeFunc for dcl. Following rules are used:
// 1. A variable that's defined or used in a single function has
// that function as its home function.
// 2. A variable that's defined or used across functions (e.g.,
// args, retval) has its home function set to nullptr.
// 3. homeFunc is set only on root G4_Declare.
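// Illustrative example (F1 is a hypothetical subroutine): a temp defined
// and used only inside F1 gets F1 as its home function, whereas a value
// written in the caller and read in the callee gets nullptr.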
FuncInfo *Augmentation::computeHomeFunc(G4_Declare *dcl) {
vISA_ASSERT(!dcl->getAliasDeclare(), "root dcl expected");
// If there are no subroutines then all dcls have kernel as home function
if (!hasSubroutines)
return kernel.fg.kernelInfo;
if (hasUniqueFuncHome(dcl))
return getUniqueFuncHome(dcl);
FuncInfo *homeFunction = nullptr;
// Live-ins to kernel are modeled as being implicitly defined in kernel.
if (dcl->isInput())
homeFunction = kernel.fg.kernelInfo;
auto *defs = refs.getDefs(dcl);
if (defs) {
for (auto &def : *defs) {
auto *bb = std::get<1>(def);
auto *curDefFunc = bb->getFuncInfo();
if (!homeFunction) {
homeFunction = curDefFunc;
continue;
} else if (homeFunction != curDefFunc) {
return nullptr;
}
}
}
auto *uses = refs.getUses(dcl);
if (uses) {
for (auto &use : *uses) {
auto *bb = std::get<1>(use);
auto *curUseFunc = bb->getFuncInfo();
if (!homeFunction) {
homeFunction = curUseFunc;
continue;
} else if (homeFunction != curUseFunc) {
return nullptr;
}
}
}
return homeFunction;
}
void Augmentation::populateFuncMaps() {
vISA_ASSERT(kernel.fg.getBBList().back()->size() > 0, "last BB empty");
instToFunc.resize(kernel.fg.getBBList().back()->back()->getLexicalId() + 1);
for (auto &func : kernel.fg.sortedFuncTable) {
for (auto &bb : func->getBBList()) {
for (auto *inst : *bb) {
instToFunc[inst->getLexicalId()] = func;
}
}
}
}
void Augmentation::populateHomeFunc() {
// Assume last G4_Declare has max declId
homeFunc.resize(kernel.Declares.back()->getDeclId() + 1);
for (auto dcl : kernel.Declares) {
if (dcl->getAliasDeclare())
dcl = dcl->getRootDeclare();
auto *func = computeHomeFunc(dcl);
vISA_ASSERT(!hasUniqueFuncHome(dcl) || getUniqueFuncHome(dcl) == func,
"different home func set");
homeFunc[dcl->getDeclId()] = func;
}
}
Augmentation::Augmentation(Interference &i, const LivenessAnalysis &l,
GlobalRA &g)
: kernel(g.kernel), intf(i), gra(g), liveAnalysis(l), lrs(g.incRA.getLRs()),
fcallRetMap(g.fcallRetMap),
refs(g.kernel, false, false, true, &g.pointsToAnalysis),
hasSubroutines(kernel.fg.sortedFuncTable.size() > 0 &&
g.kernel.getOption(vISA_NewAugmentation)) {
useGenericAugAlign =
GlobalRA::useGenericAugAlign(kernel.getPlatformGeneration());
}
// For scatter reads, channels are not laid out the same way as a block read.
// Update the emask according to the vISA definition.
bool Augmentation::updateDstMaskForGather(G4_INST *inst,
std::vector<unsigned char> &mask) {
G4_InstSend *sendInst = reinterpret_cast<G4_InstSend *>(inst);
G4_SendDesc *msgDesc = sendInst->getMsgDesc();
if (msgDesc->isRaw()) {
return updateDstMaskForGatherRaw(
inst, mask, reinterpret_cast<const G4_SendDescRaw *>(msgDesc));
}
vISA_ASSERT_UNREACHABLE("unexpected descriptor");
return false;
}
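// Helper used by the LSC load path: each lane's vector element occupies
// dataSizeBytes consecutive mask bytes, and all lanes' copies of the same
// vector element are laid out together, i.e., byte
// (j * execSize + i) * blockSize + k belongs to lane i, element j.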
static void updateMaskSIMT(unsigned char curEMBit, unsigned char execSize,
std::vector<unsigned char> &mask,
unsigned dataSizeBytes, unsigned vecElems) {
unsigned blockSize = dataSizeBytes;
unsigned blockNum = vecElems;
for (unsigned i = 0; i < execSize; i++) {
for (unsigned j = 0; j < blockNum; j++) {
for (unsigned k = 0; k < blockSize; k++) {
mask[(j * execSize + i) * blockSize + k] = curEMBit;
}
}
if (curEMBit != NOMASK_BYTE) {
curEMBit++;
vISA_ASSERT(curEMBit <= 32, "Illegal mask channel");
}
}
}
bool Augmentation::updateDstMaskForGatherRaw(G4_INST *inst,
std::vector<unsigned char> &mask,
const G4_SendDescRaw *msgDesc) {
unsigned char execSize = inst->getExecSize();
const G4_DstRegRegion *dst = inst->getDst();
unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
unsigned short elemSize = dst->getElemSize();
if (inst->isWriteEnableInst() ||
kernel.fg.builder->hasGatherReadSuppressionWARA()) {
curEMBit = NOMASK_BYTE;
}
SFID funcID = msgDesc->getFuncId();
switch (funcID) {
case SFID::RTHW:
// Mark RT send dst to be NonDefault, even when it doesn't have WriteEnable
if (kernel.getPlatform() >= Xe2) {
for (auto &elem : mask)
elem = NOMASK_BYTE;
return true;
}
break;
case SFID::DP_DC1:
switch (msgDesc->getHdcMessageType()) {
case DC1_A64_SCATTERED_READ: // a64 scattered read: svm_gather
{
unsigned blockNum = msgDesc->getElemsPerAddr();
unsigned blockSize = msgDesc->getElemSize();
for (unsigned i = 0; i < execSize; i++) {
for (unsigned j = 0; j < blockNum; j++) {
for (unsigned k = 0; k < blockSize; k++) {
mask[(j * execSize + i) * blockSize + k] = curEMBit;
}
}
if (curEMBit != NOMASK_BYTE) {
curEMBit++;
vISA_ASSERT(curEMBit <= 32, "Illegal mask channel");
}
}
return true;
} break;
case DC1_A64_UNTYPED_SURFACE_READ: // SVM gather 4
case DC1_UNTYPED_SURFACE_READ: // VISA gather 4
case DC1_TYPED_SURFACE_READ: // Gather 4 typed
{
unsigned channelNum = msgDesc->getEnabledChannelNum();
if (channelNum == 0) {
return false;
}
if (elemSize < 4) {
elemSize = 4;
}
for (unsigned i = 0; i < channelNum; i++) {
for (unsigned j = 0; j < execSize; j++) {
for (unsigned k = 0; k < elemSize; k++) {
mask[(i * execSize + j) * elemSize + k] = curEMBit;
}
if (curEMBit != NOMASK_BYTE) {
curEMBit++;
vISA_ASSERT(curEMBit <= 32, "Illegal mask channel");
}
}
if (curEMBit != NOMASK_BYTE) {
curEMBit = (unsigned char)inst->getMaskOffset();
}
}
return true;
} break;
default:
return false;
}
break;
case SFID::DP_DC2:
switch (msgDesc->getHdcMessageType()) {
case DC2_UNTYPED_SURFACE_READ: // gather 4 scaled
case DC2_A64_UNTYPED_SURFACE_READ: // SVM gather 4 scaled
{
unsigned channelNum = msgDesc->getEnabledChannelNum();
if (channelNum == 0) {
return false;
}
if (elemSize < 4) {
elemSize = 4;
}
for (unsigned i = 0; i < channelNum; i++) {
for (unsigned j = 0; j < execSize; j++) {
for (unsigned k = 0; k < elemSize; k++) {
mask[(i * execSize + j) * elemSize + k] = curEMBit;
}
if (curEMBit != NOMASK_BYTE) {
curEMBit++;
vISA_ASSERT(curEMBit <= 32, "Illegal mask channel");
}
}
if (curEMBit != NOMASK_BYTE) {
curEMBit = (unsigned char)inst->getMaskOffset();
}
}
return true;
}
case DC2_BYTE_SCATTERED_READ: // scaled byte scattered read: gather_scaled,
// handled as block read write
default:
return false;
}
break;
case SFID::DP_DC0:
switch (msgDesc->getHdcMessageType()) {
case DC_DWORD_SCATTERED_READ: // dword scattered read: gather(dword),
// handled as block read write
case DC_BYTE_SCATTERED_READ: // byte scattered read: gather(byte), handled
// as block read write
default:
return false;
}
break;
case SFID::SAMPLER: {
unsigned respLength = msgDesc->ResponseLength();
if (respLength * kernel.numEltPerGRF<Type_UB>() !=
dst->getTopDcl()->getByteSize() &&
msgDesc->isFence()) {
// since send dst size is not exactly equal to ResponseLength encoded in
// the descriptor, conservatively treat the send as being non-default
auto sz = dst->getTopDcl()->getByteSize();
for (unsigned int i = 0; i != sz; ++i)
mask[i] = NOMASK_BYTE;
return true;
}
elemSize = msgDesc->is16BitReturn() ? 2 : 4;
unsigned warpNum =
respLength * kernel.numEltPerGRF<Type_UB>() / (execSize * elemSize);
if (inst->isWriteEnableInst()) {
curEMBit = NOMASK_BYTE;
}
for (unsigned i = 0; i < warpNum; i++) {
for (unsigned j = 0; j < execSize; j++) {
for (unsigned k = 0; k < elemSize; k++) {
mask[(i * execSize + j) * elemSize + k] = curEMBit;
}
if (curEMBit != NOMASK_BYTE) {
curEMBit++;
vISA_ASSERT(curEMBit <= 32, "Illegal mask channel");
}
}
if (curEMBit != NOMASK_BYTE) {
curEMBit = (unsigned char)inst->getMaskOffset();
}
}
return true;
}
break;
case SFID::UGM:
case SFID::UGML:
case SFID::SLM: {
uint32_t desc = msgDesc->getDesc();
uint32_t op = (desc & 0x3F); // [5:0]
uint32_t dszEncd = (desc >> 9) & 0x7; // [11:9]
bool isTranspose = ((desc >> 15) & 0x1) != 0; // [15]
if (op == LSC_LOAD && !isTranspose) { // transpose not supported yet
int dataSzReg = 0;
switch (dszEncd) { // dat size [11:9]
case 0:
dataSzReg = 1;
break; // d8
case 1:
dataSzReg = 2;
break; // d16
case 3:
dataSzReg = 8;
break; // d64
default:
dataSzReg = 4;
break; // d32, d8u32, d16u32, d16u32h
}
int vecSz = 0;
int vecSzEncd = (desc >> 12) & 0x7; // [14:12]
if (vecSzEncd <= 3) {
vecSz = vecSzEncd + 1; // V1, V2, V3, V4
} else {
vecSz = 4 << (vecSzEncd - 3); // V8, V16, V32, V64
}
updateMaskSIMT(curEMBit, execSize, mask, (unsigned)dataSzReg,
(unsigned)vecSz);
return true;
}
}
default:
return false;
}
return false;
}
// Value stored at each byte in mask determines which bits
// of EM enable that byte for writing. When checkCmodOnly
// is set, dst is ignored and only the mask for cmod is set. For
// flag declares, mask is at bit granularity rather than byte.
// This function updates the mask field in the declaration of the
// corresponding variable - dst or cmod.
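// Illustrative example: for a SIMD8 mov writing a :d dst at mask offset M0,
// mask bytes 0-3 record channel 0, bytes 4-7 channel 1, ..., bytes 28-31
// channel 7; a NoMask def instead ORs NOMASK_BYTE (0x80) into every byte
// it writes.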
void Augmentation::updateDstMask(G4_INST *inst, bool checkCmodOnly) {
G4_DstRegRegion *dst = inst->getDst();
G4_CondMod *cmod = inst->getCondMod();
if ((checkCmodOnly == false && dst && dst->getBase() &&
dst->getBase()->isRegVar()) ||
(checkCmodOnly == true && cmod != NULL && cmod->getBase() != NULL)) {
int dclOffset = 0;
G4_Declare *topdcl = NULL;
if (checkCmodOnly == false) {
topdcl = dst->getBase()->asRegVar()->getDeclare();
} else {
topdcl = cmod->asCondMod()->getTopDcl();
}
while (topdcl->getAliasDeclare() != nullptr) {
dclOffset += topdcl->getAliasOffset();
topdcl = topdcl->getAliasDeclare();
}
auto &mask = const_cast<std::vector<unsigned char> &>(gra.getMask(topdcl));
unsigned size = topdcl->getByteSize();
if (checkCmodOnly == true || dst->isFlag()) {
size *= BITS_PER_BYTE;
}
if (mask.size() == 0) {
mask.resize(size);
}
vISA_ASSERT(mask.size() > 0, "Valid mask not found for dcl %s",
topdcl->getName());
unsigned short hstride, elemSize;
short row, subReg;
unsigned startByte;
if (checkCmodOnly == false) {
hstride = dst->getHorzStride();
row = dst->getRegOff();
subReg = dst->getSubRegOff();
elemSize = dst->getElemSize();
if (inst->isSend() && !inst->isEOT()) {
if (updateDstMaskForGather(inst, mask)) {
return;
}
}
if (dst->isFlag()) {
elemSize = 1;
}
startByte = (row * kernel.getGRFSize()) + (subReg * elemSize);
if (dst->isFlag()) {
startByte = (row * 32) + (subReg * 8);
}
} else {
hstride = 1;
row = 0;
elemSize = 1;
startByte = cmod->asCondMod()->getLeftBound();
}
unsigned rb = 0xffffffff;
if (checkCmodOnly == true) {
rb = cmod->asCondMod()->getRightBound();
} else {
rb = dst->getRightBound();
}
unsigned char curEMBit = (unsigned char)inst->getMaskOffset();
if (inst->isWriteEnableInst()) {
curEMBit = NOMASK_BYTE;
}
for (unsigned i = dclOffset + startByte; i <= rb;
i += (hstride * elemSize)) {
for (int j = 0; j < elemSize; j++) {
vISA_ASSERT(i + j < size,
"updateDstMask writing past end of mask array size: %d",
size);
mask[i + j] |= curEMBit;
}
if (curEMBit != NOMASK_BYTE) {
curEMBit++;
}
}
}
}
unsigned Augmentation::getByteSizeFromMask(AugmentationMasks type) {
if (type == AugmentationMasks::Default16Bit) {
return 2;
} else if (type == AugmentationMasks::Default32Bit) {
return 4;
} else if (type == AugmentationMasks::Default64Bit) {
return 8;
}
vISA_ASSERT_UNREACHABLE("Unexpected type of mask");
return 0;
}
bool Augmentation::isDefaultMaskDcl(G4_Declare *dcl, unsigned simdSize,
AugmentationMasks type) {
// A default mask is one where dst's hstride is 1 and the elem size matches
// the queried mask type (2, 4, or 8 bytes).
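// Illustrative example: a SIMD16 dcl checked against Default32Bit is
// default when its mask bytes read 0,0,0,0,1,1,1,1,...,15,15,15,15 and the
// pattern restarts every simdSize * byteSize (= 64) bytes.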
bool isDefault = false;
auto &mask = gra.getMask(dcl);
unsigned byteSize = getByteSizeFromMask(type);
// Treat simd32 as simd16 when the program is split into 2 simd16.
// When a simd32 program is not split into 2 simd16 but some sends
// are broken into 2 simd16, treat those simd16 sends as non-default.
if (simdSize == 32 && kernel.getChannelSlicing()) {
simdSize = 16;
}
if (mask.size() > 0) {
G4_Declare *topdcl = dcl->getRootDeclare();
bool isFlagDcl = (topdcl->getRegFile() == G4_FLAG);
unsigned size = topdcl->getByteSize();
unsigned char curEMBit = 0;
bool found = true;
unsigned wrapAround = simdSize * byteSize;
if (isFlagDcl == true) {
size *= BITS_PER_BYTE;
wrapAround = 16;
}
for (unsigned i = 0; i < size; i += 1) {
if (isFlagDcl == true) {
curEMBit++;
} else {
if (byteSize && i % byteSize == 0) {
curEMBit++;
}
}
if (i % wrapAround == 0) {
// Wrap around based on simd size
// For SIMD8 wrap around each row,
// for SIMD16 wrap around every other row
curEMBit = 0;
}
if (mask[i] != curEMBit &&
// For flags, we set bytesize = 2 although
// the kernel is SIMD8. This means higher 8
// bits of mask will be set to 0 since those
// bits are never defined. Such masks need
// not be considered non-default.
!(isFlagDcl == true && mask[i] == 0)) {
found = false;
break;
}
}
if (found == true) {
isDefault = true;
}
}
return isDefault;
}
bool Augmentation::isDefaultMaskSubDeclare(unsigned char *mask, unsigned lb,
unsigned rb, G4_Declare *dcl,
unsigned simdSize) {
bool isDefault = false;
// Treat simd32 as simd16 as the instruction is always split into 2 simd16.
if (simdSize == 32) {
simdSize = 16;
}
if (mask != NULL) {
unsigned size = dcl->getByteSize();
unsigned char curEMBit = 0;
bool found = true;
unsigned wrapAround = simdSize * 4;
unsigned leftBound = gra.getSubOffset(dcl);
unsigned rightBound = leftBound + size - 1;
vISA_ASSERT(rightBound <= rb, "Wrong sub declare right bound!");
for (unsigned i = lb; i < rightBound + 1; i += 1) {
if ((i - lb) % 4 == 0) {
curEMBit++;
}
if ((i - lb) % wrapAround == 0) {
curEMBit = 0;
}
if (i >= leftBound) {
if (mask[i] != curEMBit) {
found = false;
break;
}
}
}
if (found == true) {
isDefault = true;
}
}
return isDefault;
}
bool Augmentation::verifyMaskIfInit(G4_Declare *dcl, AugmentationMasks mask) {
// Return true if dcl mask is either undetermined or same as mask
auto m = gra.getAugmentationMask(dcl);
if (m == mask || m == AugmentationMasks::Undetermined) {
return true;
}
return false;
}
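// Handles the strided ("pattern 2") default write, typically hstride = 2:
// each channel effectively covers opndByteSize * hstride bytes. Illustrative
// example: mov (16|M0) V10<2>:w behaves like a 32-bit-per-channel default,
// so a 2-byte dst maps to Default32Bit and a 4-byte dst to Default64Bit.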
bool Augmentation::checkGRFPattern2(G4_Declare *dcl, G4_DstRegRegion *dst,
unsigned maskOff, unsigned lb, unsigned rb,
unsigned execSize) {
auto opndByteSize = dst->getTypeSize();
unsigned modWith = opndByteSize * kernel.getSimdSize();
if (lb % modWith - (maskOff * opndByteSize * dst->getHorzStride()) <=
opndByteSize) {
if ((lb +
(execSize * opndByteSize * dst->getHorzStride() -
dst->getHorzStride()) -
rb) < opndByteSize) {
if (opndByteSize == 2 &&
verifyMaskIfInit(dcl, AugmentationMasks::Default32Bit)) {
gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
return true;
} else if (opndByteSize == 4 &&
verifyMaskIfInit(dcl, AugmentationMasks::Default64Bit)) {
gra.setAugmentationMask(dcl, AugmentationMasks::Default64Bit);
return true;
} else {
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
return true;
}
}
}
return false;
}
// Returns true if dcl's augmentation mask could be determined from this
// pattern (default or non-default), false otherwise.
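// Illustrative example: mov (16|M0) V10<1>:d writes bytes 0..63 of V10
// contiguously (lb aligned with the EM offset and rb == lb + 16 * 4 - 1),
// which matches the hstride-1 pattern and yields Default32Bit.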
bool Augmentation::checkGRFPattern1(G4_Declare *dcl, G4_DstRegRegion *dst,
unsigned maskOff, unsigned lb, unsigned rb,
unsigned execSize) {
auto opndByteSize = dst->getTypeSize();
unsigned modWith = opndByteSize * kernel.getSimdSize();
if (dst->getHorzStride() == 1) {
if ((lb % modWith == (maskOff * opndByteSize) &&
rb == (lb + (execSize * opndByteSize) - 1))) {
// This will be taken only when hstride = 1
if (opndByteSize == 2 &&
verifyMaskIfInit(dcl, AugmentationMasks::Default16Bit)) {
gra.setAugmentationMask(dcl, AugmentationMasks::Default16Bit);
return true;
} else if (opndByteSize == 4 &&
verifyMaskIfInit(dcl, AugmentationMasks::Default32Bit)) {
gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
return true;
} else if (opndByteSize == 8 &&
verifyMaskIfInit(dcl, AugmentationMasks::Default64Bit)) {
gra.setAugmentationMask(dcl, AugmentationMasks::Default64Bit);
return true;
} else {
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
return true;
}
}
}
return false;
}
void Augmentation::markNonDefaultDstRgn(G4_INST *inst, G4_Operand *opnd) {
if (inst->isPseudoKill()) {
return;
}
G4_DstRegRegion *dst = nullptr;
G4_CondMod *condMod = nullptr;
if (opnd->isDstRegRegion()) {
dst = opnd->asDstRegRegion();
} else if (opnd->isCondMod()) {
condMod = opnd->asCondMod();
} else {
vISA_ASSERT(false, "Don't know how to handle this type of operand");
}
// Handle condMod
if (condMod && condMod->getBase()) {
G4_Declare *dcl = condMod->getTopDcl();
dcl = dcl->getRootDeclare();
if (inst->isWriteEnableInst() ||
opnd->getLeftBound() != inst->getMaskOffset()) {
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
return;
}
if (verifyMaskIfInit(dcl, AugmentationMasks::DefaultPredicateMask)) {
gra.setAugmentationMask(dcl, AugmentationMasks::DefaultPredicateMask);
}
return;
}
// Handle dst
if (dst && (inst->isCall() || inst->isCallerSave())) {
const G4_Declare *dcl = dst->getBase()->asRegVar()->getDeclare();
if (dcl && liveAnalysis.livenessClass(dcl->getRegFile())) {
gra.setAugmentationMask(dcl->getRootDeclare(),
AugmentationMasks::NonDefault);
}
return;
}
bool isFlagRA = liveAnalysis.livenessClass(G4_FLAG);
if (dst && dst->getBase() && dst->getBase()->isRegVar()) {
G4_Declare *dcl = dst->getBase()->asRegVar()->getDeclare();
if (!liveAnalysis.livenessClass(dcl->getRegFile())) {
return;
}
unsigned offTopDcl = 0;
while (dcl->getAliasDeclare()) {
offTopDcl += dcl->getAliasOffset();
dcl = dcl->getAliasDeclare();
}
// A NoMask instruction's dst is always non-default
if (inst->isWriteEnableInst()) {
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
return;
}
if (gra.getAugmentationMask(dcl) == AugmentationMasks::NonDefault)
return;
unsigned maskOff = inst->getMaskOffset();
unsigned lb = dst->getLeftBound() + offTopDcl;
unsigned rb = dst->getRightBound() + offTopDcl;
unsigned execSize = inst->getExecSize();
if (dcl->getAddressed()) {
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
return;
}
if (!isFlagRA) {
// Treat send as a special case because updating the mask for scatter
// has some special checks.
if (inst->isSend()) {
if (gra.getAugmentationMask(dcl) == AugmentationMasks::NonDefault) {
return;
}
updateDstMask(inst, false);
if (isDefaultMaskDcl(dcl, kernel.getSimdSize(),
AugmentationMasks::Default16Bit)) {
gra.setAugmentationMask(dcl, AugmentationMasks::Default16Bit);
} else if (isDefaultMaskDcl(dcl, kernel.getSimdSize(),
AugmentationMasks::Default32Bit)) {
gra.setAugmentationMask(dcl, AugmentationMasks::Default32Bit);
} else if (isDefaultMaskDcl(dcl, kernel.getSimdSize(),
AugmentationMasks::Default64Bit)) {
bool useNonDefault = false;
// TODO: Why?
useNonDefault |=
(kernel.getSimdSize() >= g4::SIMD16 && dcl->getTotalElems() > 8);
useNonDefault |=
(kernel.getSimdSize() == g4::SIMD8 && dcl->getTotalElems() > 4);
gra.setAugmentationMask(dcl, useNonDefault
? AugmentationMasks::NonDefault
: AugmentationMasks::Default64Bit);
} else {
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
return;
}
} else {
bool found = false;
// default one
found |= checkGRFPattern1(dcl, dst, maskOff, lb, rb, execSize);
if (!found ||
gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined) {
// hstride = 2 case
found |= checkGRFPattern2(dcl, dst, maskOff, lb, rb, execSize);
}
if (!found ||
gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined) {
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
}
}
} else {
// Handle flag register as destination here
if (!(lb == maskOff && rb == (lb + execSize - 1))) {
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
return;
}
if (verifyMaskIfInit(dcl, AugmentationMasks::DefaultPredicateMask)) {
gra.setAugmentationMask(dcl, AugmentationMasks::DefaultPredicateMask);
}
}
}
}
// Returns true if any inst found using non-default mask.
// This function sets up lexical id of all instructions.
bool Augmentation::markNonDefaultMaskDef() {
// Iterate dcls list and mark obvious ones as non-default.
// An obvious non-default is a 1-element, i.e., uniform dcl.
for (auto dcl : kernel.Declares) {
auto dclRegFile = dcl->getRegFile();
if (!liveAnalysis.livenessClass(dclRegFile))
continue;
if (dclRegFile == G4_GRF || dclRegFile == G4_INPUT ||
dclRegFile == G4_ADDRESS) {
if (dcl->getTotalElems() < 8 || dclRegFile == G4_INPUT) {
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
}
} else if (dclRegFile == G4_FLAG) {
// Flags are processed when processing instructions
}
}
unsigned id = 0;
bool isFlagRA = liveAnalysis.livenessClass(G4_FLAG);
for (auto bb : kernel.fg) {
for (auto inst : *bb) {
inst->setLexicalId(id++);
G4_DstRegRegion *dst = inst->getDst();
if (dst) {
markNonDefaultDstRgn(inst, dst);
}
if (isFlagRA && inst->getCondMod()) {
markNonDefaultDstRgn(inst, inst->getCondMod());
}
}
}
// Update whether each dcl is default/not
AugmentationMasks prevAugMask = AugmentationMasks::Undetermined;
bool nonDefaultMaskDefFound = false;
for (auto dcl : kernel.Declares) {
if (liveAnalysis.livenessClass(dcl->getRegFile())) {
if (gra.getAugmentationMask(dcl) == AugmentationMasks::Undetermined) {
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
nonDefaultMaskDefFound = true;
}
if (kernel.getOption(vISA_forceBCR) &&
gra.getBankConflict(dcl) != BANK_CONFLICT_NONE) {
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
nonDefaultMaskDefFound = true;
}
if (!nonDefaultMaskDefFound &&
gra.getAugmentationMask(dcl) != prevAugMask &&
prevAugMask != AugmentationMasks::Undetermined) {
nonDefaultMaskDefFound = true;
}
prevAugMask = gra.getAugmentationMask(dcl);
}
bool checkLRAAlign = false;
if (liveAnalysis.livenessClass(G4_GRF)) {
if (GlobalRA::useGenericAugAlign(kernel.getPlatformGeneration()) &&
gra.getAlignFromAugBucket(dcl) > 0)
checkLRAAlign = true;
else if (gra.getAugmentationMask(dcl) ==
AugmentationMasks::Default32Bit &&
kernel.getSimdSize() > kernel.numEltPerGRF<Type_UD>())
checkLRAAlign = true;
}
if (checkLRAAlign) {
auto dclLR = gra.getLocalLR(dcl);
if (dclLR) {
int s;
auto phyReg = dclLR->getPhyReg(s);
unsigned int maxAlign = 2;
if (gra.use4GRFAlign && gra.getAugmentationMask(dcl) == AugmentationMasks::Default64Bit) {
maxAlign = 4;
}
if (phyReg && phyReg->asGreg()->getRegNum() % maxAlign != 0) {
// If LRA assignment is not aligned as expected then
// mark it as non-default. GRA candidates cannot fully
// overlap with such ranges. Partial overlap is illegal.
// TODO: There's a bug here. This branch should execute only if
// dclLR->getAssigned() == true. If this is false, then
// dclLR->getPhyReg() is invalid. Once this is fixed, we can
// re-enable following assert.
//
//vISA_ASSERT(!gra.use4GRFAlign,
// "expecting LRA allocation to be aligned");
gra.setAugmentationMask(dcl, AugmentationMasks::NonDefault);
nonDefaultMaskDefFound = true;
}
}
}
}
return nonDefaultMaskDefFound;
}
void Augmentation::updateStartIntervalForSubDcl(G4_Declare *dcl,
G4_INST *curInst,
G4_Operand *opnd) {
for (const G4_Declare *subDcl : gra.getSubDclList(dcl)) {
unsigned leftBound = gra.getSubOffset(subDcl);
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
if (!(opnd->getRightBound() < leftBound ||
rightBound < opnd->getLeftBound())) {
auto subDclStartInterval = gra.getLastStartInterval(subDcl);
if (subDclStartInterval == NULL ||
(subDclStartInterval->getLexicalId() > curInst->getLexicalId())) {
gra.setLastStartInterval(subDcl, curInst);
}
auto subDclEndIntrval = gra.getLastEndInterval(subDcl);
if (subDclEndIntrval == NULL ||
(subDclEndIntrval->getLexicalId() < curInst->getLexicalId())) {
gra.setLastEndInterval(subDcl, curInst);
}
}
}
return;
}
void Augmentation::updateEndIntervalForSubDcl(G4_Declare *dcl, G4_INST *curInst,
G4_Operand *opnd) {
for (const G4_Declare *subDcl : gra.getSubDclList(dcl)) {
unsigned leftBound = gra.getSubOffset(subDcl);
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
if (!(opnd->getRightBound() < leftBound ||
rightBound < opnd->getLeftBound())) {
auto subDclEndInterval = gra.getLastEndInterval(subDcl);
if (subDclEndInterval == NULL ||
(subDclEndInterval->getLexicalId() < curInst->getLexicalId())) {
gra.setLastEndInterval(subDcl, curInst);
}
auto subDclStartInterval = gra.getLastStartInterval(subDcl);
if (subDclStartInterval == NULL ||
(subDclStartInterval->getLexicalId() > curInst->getLexicalId())) {
gra.setLastStartInterval(subDcl, curInst);
}
}
}
return;
}
void Augmentation::updateStartInterval(const G4_Declare *dcl,
G4_INST *curInst) {
auto dclStartInterval = gra.getLastStartInterval(dcl);
if (dclStartInterval && hasSubroutines) {
// If the dcl is being extended into a different subroutine than before,
// we create a new interval for the new subroutine. This way
// we ensure an interval never straddles subroutines.
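// Illustrative example (F1/F2 are hypothetical subroutines): if V10's
// current interval was started in F1 and curInst lies in F2, a fresh
// interval is pushed for V10 so the F1 interval stays within F1.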
auto *funcCurInst = instToFunc[curInst->getLexicalId()];
auto *funcStartInterval = instToFunc[dclStartInterval->getLexicalId()];
if (funcCurInst != funcStartInterval) {
gra.pushBackNewInterval(dcl);
dclStartInterval = nullptr;
}
}
if (!dclStartInterval ||
(dclStartInterval->getLexicalId() > curInst->getLexicalId())) {
gra.setLastStartInterval(dcl, curInst);
}
auto dclEndInterval = gra.getLastEndInterval(dcl);
if (!dclEndInterval ||
(dclEndInterval->getLexicalId() < curInst->getLexicalId())) {
gra.setLastEndInterval(dcl, curInst);
}
}
void Augmentation::updateEndInterval(const G4_Declare *dcl, G4_INST *curInst) {
auto dclEndInterval = gra.getLastEndInterval(dcl);
if (dclEndInterval && hasSubroutines) {
auto *funcCurInst = instToFunc[curInst->getLexicalId()];
auto *funcEndInterval = instToFunc[dclEndInterval->getLexicalId()];
if (funcCurInst != funcEndInterval) {
gra.pushBackNewInterval(dcl);
dclEndInterval = nullptr;
}
}
if (!dclEndInterval ||
(dclEndInterval->getLexicalId() < curInst->getLexicalId())) {
gra.setLastEndInterval(dcl, curInst);
}
auto dclStartInterval = gra.getLastStartInterval(dcl);
if (!dclStartInterval ||
(dclStartInterval->getLexicalId() > curInst->getLexicalId())) {
gra.setLastStartInterval(dcl, curInst);
}
}
void Augmentation::updateStartIntervalForLocal(G4_Declare *dcl,
G4_INST *curInst,
G4_Operand *opnd) {
updateStartInterval(dcl, curInst);
if (dcl->getIsSplittedDcl()) {
updateStartIntervalForSubDcl(dcl, curInst, opnd);
}
}
void Augmentation::updateEndIntervalForLocal(G4_Declare *dcl, G4_INST *curInst,
G4_Operand *opnd) {
updateEndInterval(dcl, curInst);
if (dcl->getIsSplittedDcl()) {
updateEndIntervalForSubDcl(dcl, curInst, opnd);
}
}
void GlobalRA::printLiveIntervals() {
for (const G4_Declare *dcl : kernel.Declares) {
if (getLastStartInterval(dcl) != nullptr || getLastEndInterval(dcl) != nullptr) {
DEBUG_VERBOSE(dcl->getName() << " [");
if (getLastStartInterval(dcl) != nullptr) {
DEBUG_VERBOSE(getLastStartInterval(dcl)->getLexicalId());
} else {
DEBUG_VERBOSE("*");
}
DEBUG_VERBOSE(", ");
if (getLastEndInterval(dcl) != nullptr) {
DEBUG_VERBOSE(getLastEndInterval(dcl)->getLexicalId());
} else {
DEBUG_VERBOSE("*");
}
DEBUG_VERBOSE("] "
<< "\n");
}
}
}
bool Augmentation::isUnknownArgOrRetval(G4_Declare *dcl) const {
if (!argsRetVal.count(dcl))
return false;
return isUnknownArg(dcl) || isUnknownRetVal(dcl);
}
bool Augmentation::isUnknownRetVal(G4_Declare *dcl) const {
return isRetvalType<RetValType::Unknown>(dcl);
}
bool Augmentation::isRegularRetVal(G4_Declare *dcl) const {
return isRetvalType<RetValType::Regular>(dcl);
}
bool Augmentation::isUnknownArg(G4_Declare *dcl) const {
return isArgType<ArgType::Unknown>(dcl);
}
bool Augmentation::isDefBeforeEachCallArg(G4_Declare *dcl) const {
return isArgType<ArgType::DefBeforeEachCall>(dcl);
}
bool Augmentation::isLiveThroughArg(G4_Declare *dcl) const {
return isArgType<ArgType::LiveThrough>(dcl);
}
void Augmentation::buildUnknownArgRetval() {
// Arg and retval of Unknown type could have inaccurate
// SIMD liveness. So we treat these conservatively by
// extending their liveness throughout the functions such
// variables are referenced in. These variables may be
// live through subroutines that don't reference them,
// but that should be captured either by SIMT liveness
// or by SIMD liveness when we mark mayDef of callee
// with overlapping intervals at call sites.
if (!hasSubroutines)
return;
std::unordered_map<FuncInfo *, std::pair<G4_INST *, G4_INST *>> funcStartEnd;
for (auto *func : kernel.fg.sortedFuncTable) {
vISA_ASSERT(!func->getInitBB()->empty(), "expecting non-empty init bb");
vISA_ASSERT(!func->getExitBB()->empty(), "expecting non-empty exit bb");
auto *start = func->getInitBB()->front();
auto *end = func->getExitBB()->back();
funcStartEnd[func] = std::make_pair(start, end);
}
// We have a dcl of unknown arg/retval type and a list of subroutines
// the dcl is referenced in, directly or indirectly. We create live-intervals
// for dcl spanning each subroutine it's referenced in. Since live-intervals
// cannot straddle subroutines, we create 1 entry per subroutine.
auto attachIntervals = [&](G4_Declare *dcl,
std::unordered_set<FuncInfo *> &subroutines) {
vISA_ASSERT(gra.getNumIntervals(dcl) == 0,
"found > 0 intervals for unknown");
for (auto &startEnd : funcStartEnd) {
auto *func = startEnd.first;
if (!subroutines.count(func))
continue;
gra.pushBackNewInterval(dcl);
gra.setLastStartInterval(dcl, startEnd.second.first);
gra.setLastEndInterval(dcl, startEnd.second.second);
}
};
if (!liveAnalysis.livenessClass(G4_GRF)) {
for (auto &var : nonGRFRefs) {
if (var.second.size() < 2)
continue;
// Non-GRF variables that are referenced in multiple subroutines
// are considered to be of unknown type.
auto dcl = var.first;
gra.clearIntervals(dcl);
attachIntervals(dcl, var.second);
}
} else {
// Now gather all GRF Unknown arg, retval
for (const auto &info : argsRetVal) {
auto dcl = info.first;
if (isUnknownArgOrRetval(dcl)) {
if (!unknownArgRetvalRefs.count(dcl))
continue;
auto &allRefs = unknownArgRetvalRefs.at(dcl);
gra.clearIntervals(dcl);
attachIntervals(dcl, allRefs);
}
}
}
// Verify that no interval straddles function boundaries
if (gra.verifyAugmentation) {
[[maybe_unused]] auto getFunc = [&](G4_INST *inst) {
unsigned int lexId = inst->getLexicalId();
int funcId = 0;
for (auto &func : funcStartEnd) {
if (lexId >= func.second.first->getLexicalId() &&
lexId <= func.second.second->getLexicalId())
return funcId;
funcId++;
}
return funcId;
};
for (G4_Declare *dcl : kernel.Declares) {
auto &allIntervals = gra.getAllIntervals(dcl);
for (auto &interval : allIntervals) {
[[maybe_unused]] auto start = interval.start;
[[maybe_unused]] auto end = interval.end;
vISA_ASSERT(getFunc(start) == getFunc(end),
"interval straddles functions");
}
}
}
}
bool Augmentation::hasUniqueFuncHome(G4_Declare *dcl) const {
// Home functions are computed before computing arg/retval.
// By definition, arg/retval have no home function.
if (argsRetVal.count(dcl))
return false;
auto *homeFunction = homeFunc[dcl->getDeclId()];
return homeFunction != nullptr;
}
FuncInfo* Augmentation::getUniqueFuncHome(G4_Declare* dcl) const {
vISA_ASSERT(hasUniqueFuncHome(dcl), "expecting unique home func");
return homeFunc[dcl->getDeclId()];
}
void Augmentation::startIntervalForLiveIn(FuncInfo *funcInfo, G4_BB *bb) {
// Start live-in intervals
auto liveInBB = liveAnalysis.getLiveAtEntry(bb) & liveAnalysis.globalVars;
for (auto i : liveInBB) {
G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
if (isUnknownArgOrRetval(dcl))
continue;
if (hasUniqueFuncHome(dcl) && getUniqueFuncHome(dcl) != funcInfo)
continue;
vISA_ASSERT(bb->size() > 0, "empty instlist");
vISA_ASSERT(funcInfo == kernel.fg.kernelInfo ||
argsPerSub.count(funcInfo) > 0 ||
!liveAnalysis.livenessClass(G4_GRF),
"didnt find callee entry");
updateStartInterval(dcl, bb->front());
}
}
void Augmentation::handleCallSite(G4_BB *curBB, unsigned int &funcCnt) {
const char *name =
kernel.fg.builder->getNameString(32, "SCALL_%d", funcCnt++);
G4_Declare *scallDcl =
kernel.fg.builder->createDeclare(name, G4_GRF, 1, 1, Type_UD);
gra.addVarToRA(scallDcl);
auto *inst = curBB->back();
vISA_ASSERT(inst->isCall(), "expecting call instruction");
updateStartInterval(scallDcl, inst);
updateEndInterval(scallDcl, inst);
std::pair<G4_INST *, G4_BB *> callInfo(inst, curBB);
callDclMap.emplace(scallDcl, callInfo);
if (liveAnalysis.livenessClass(G4_GRF)) {
auto *retLocDcl = inst->getDst()->getTopDcl();
// RET__loc dcl starts and ends at call site.
// If a function has multiple call sites to the same
// callee then there would be as many trivial
// live-intervals for the corresponding RET__loc dcl.
// RET__loc dcl in entryBB is identified as LiveThrough
// rather than DefBeforeEachCall. LiveThrough variable's
// interference is fully handled by SIMT. So we don't
// need to create the short interval for RET__loc at
// call site.
if (isDefBeforeEachCallArg(retLocDcl))
gra.pushBackNewInterval(retLocDcl);
auto *callee = curBB->getCalleeInfo();
vISA_ASSERT(argsPerSub.count(callee) > 0, "didnt find entry for sub");
auto &args = argsPerSub.at(callee);
// Terminate any arg with type DefBeforeEachCall
for (auto *arg : args) {
if (!isDefBeforeEachCallArg(arg))
continue;
updateEndInterval(arg, inst);
}
vISA_ASSERT(retValPerSub.count(callee) > 0, "didnt find callee entry");
// Start regular retval live-interval at call
const auto &retvals = retValPerSub[callee];
for (auto *retvalDcl : retvals) {
if (isRegularRetVal(retvalDcl)) {
gra.pushBackNewInterval(retvalDcl);
updateStartInterval(retvalDcl, curBB->back());
}
}
}
}
void Augmentation::handleDstOpnd(FuncInfo *funcInfo, G4_BB *curBB,
G4_INST *inst) {
G4_DstRegRegion *dst = inst->getDst();
if (dst && dst->getRegAccess() == Direct && dst->getBase()) {
// Destination
G4_Declare *defdcl = GetTopDclFromRegRegion(dst);
if (dst->getBase()->isRegAllocPartaker()) {
if (defdcl) {
if (!liveAnalysis.livenessClass(G4_GRF))
nonGRFRefs[defdcl].insert(funcInfo);
if (isDefBeforeEachCallArg(defdcl)) {
vISA_ASSERT(!defdcl->getIsSplittedDcl(),
"not expecting to see split on arg");
// Check if previous interval ended at an earlier call.
// If not continue it, otherwise start new one.
auto *prevEnd = gra.getLastEndInterval(defdcl);
if (prevEnd && prevEnd->isCall())
gra.pushBackNewInterval(defdcl);
} else if (isUnknownArgOrRetval(defdcl)) {
unknownArgRetvalRefs[defdcl].insert(funcInfo);
}
if (gra.getLocalLR(defdcl)) {
updateStartIntervalForLocal(defdcl, inst, dst);
} else {
updateStartInterval(defdcl, inst);
}
}
} else if (liveAnalysis.livenessClass(G4_GRF)) {
LocalLiveRange *defdclLR;
// Handle ranges allocated by local RA
if (defdcl && (defdclLR = gra.getLocalLR(defdcl)) &&
defdclLR->getAssigned() == true && !defdclLR->isEOT()) {
vISA_ASSERT(!hasSubroutines || argsRetVal.count(defdcl) > 0,
"didnt expect arg to be allocated by LRA");
updateStartInterval(defdcl, inst);
}
}
} else if (liveAnalysis.livenessClass(G4_ADDRESS) && dst &&
dst->getRegAccess() == IndirGRF && dst->getBase() &&
dst->getBase()->isRegVar()) {
// Destination is indirect
G4_Declare *defdcl = dst->getBaseRegVarRootDeclare();
nonGRFRefs[defdcl].insert(funcInfo);
updateEndInterval(defdcl, inst);
} else if (liveAnalysis.livenessClass(G4_GRF) && dst && dst->isIndirect()) {
const REGVAR_VECTOR &pointsToSet =
liveAnalysis.getPointsToAnalysis().getAllInPointsToOrIndrUse(dst,
curBB);
for (const auto& pointsToVar : pointsToSet) {
if (pointsToVar.var->isRegAllocPartaker()) {
updateStartInterval(pointsToVar.var->getDeclare()->getRootDeclare(),
inst);
auto dcl = pointsToVar.var->getDeclare()->getRootDeclare();
if (isUnknownArgOrRetval(dcl))
unknownArgRetvalRefs[dcl].insert(funcInfo);
}
}
}
}
void Augmentation::handleCondMod(FuncInfo* funcInfo, G4_INST *inst) {
if (liveAnalysis.livenessClass(G4_FLAG)) {
G4_CondMod *cmod = inst->getCondMod();
if (cmod && cmod->getBase()) {
// Conditional modifier
G4_Declare *dcl = cmod->getBaseRegVarRootDeclare();
nonGRFRefs[dcl].insert(funcInfo);
updateStartInterval(dcl, inst);
}
}
}
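// Update live-intervals for a source operand of inst. A direct source
// extends its declare's interval to inst; an indirect source extends the
// address variable's interval (address liveness class) or the intervals of
// every variable in its points-to set (GRF liveness class). References to
// unknown args/retvals are also recorded per function.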
void Augmentation::handleSrcOpnd(FuncInfo *funcInfo, G4_BB *curBB,
G4_Operand *src) {
G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
auto *inst = srcRegion->getInst();
if (srcRegion->getRegAccess() == Direct && srcRegion->getBase()) {
G4_Declare *usedcl = GetTopDclFromRegRegion(src);
if (isUnknownArg(usedcl) || isUnknownRetVal(usedcl))
unknownArgRetvalRefs[usedcl].insert(funcInfo);
if (srcRegion->getBase()->isRegAllocPartaker()) {
if (!liveAnalysis.livenessClass(G4_GRF))
nonGRFRefs[usedcl].insert(funcInfo);
if (gra.getLocalLR(usedcl)) {
updateEndIntervalForLocal(usedcl, inst, src);
} else {
updateEndInterval(usedcl, inst);
}
} else if (liveAnalysis.livenessClass(G4_GRF)) {
LocalLiveRange *usedclLR = nullptr;
if (usedcl && (usedclLR = gra.getLocalLR(usedcl)) &&
usedclLR->getAssigned() == true && !usedclLR->isEOT()) {
updateEndInterval(usedcl, inst);
}
}
} else if (liveAnalysis.livenessClass(G4_GRF) && srcRegion->isIndirect()) {
const REGVAR_VECTOR &pointsToSet =
liveAnalysis.getPointsToAnalysis().getAllInPointsToOrIndrUse(srcRegion,
curBB);
for (const auto& pointsToVar : pointsToSet) {
if (pointsToVar.var->isRegAllocPartaker()) {
updateEndInterval(pointsToVar.var->getDeclare()->getRootDeclare(),
inst);
auto dcl = pointsToVar.var->getDeclare()->getRootDeclare();
if (isUnknownArgOrRetval(dcl))
unknownArgRetvalRefs[dcl].insert(funcInfo);
}
}
} else if (liveAnalysis.livenessClass(G4_ADDRESS) &&
srcRegion->getRegAccess() == IndirGRF && srcRegion->getBase() &&
srcRegion->getBase()->isRegVar()) {
G4_Declare *usedcl = src->getBaseRegVarRootDeclare();
nonGRFRefs[usedcl].insert(funcInfo);
updateEndInterval(usedcl, inst);
}
}
void Augmentation::handlePred(FuncInfo* funcInfo, G4_INST *inst) {
if (liveAnalysis.livenessClass(G4_FLAG)) {
G4_Predicate *pred = inst->getPredicate();
if (pred) {
// Predicate
G4_Declare *dcl = pred->getBaseRegVarRootDeclare();
nonGRFRefs[dcl].insert(funcInfo);
updateEndInterval(dcl, inst);
}
}
}
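// Close live-intervals at the last instruction of bb for global variables
// that are live-out of bb and homed in funcInfo (or homeless). Unknown
// args/retvals are skipped, and for a BB ending in a call, RET__loc and the
// callee's DefBeforeEachCall args are excluded since handleCallSite()
// models them at the call itself.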
void Augmentation::endIntervalForLiveOut(FuncInfo* funcInfo, G4_BB *bb) {
auto liveOutBB = liveAnalysis.getLiveAtExit(bb) & liveAnalysis.globalVars;
if (bb->isEndWithCall() && liveAnalysis.livenessClass(G4_GRF)) {
// reset bit for RET__loc as we handle it specially later to
// create point intervals at call site.
auto retLocVarId = bb->back()->getDst()->getTopDcl()->getRegVar()->getId();
liveOutBB.reset(retLocVarId);
// Default subroutine argument has to start at definition and
// end at call site. A caller may have multiple call sites for
// a callee. We want to create multiple live-intervals, one
// per call site. Creation of a live-interval per call site
// is handled in handleCallSite() already, so we skip extending
// them here.
auto *callee = bb->getCalleeInfo();
if (callee && argsPerSub.count(callee)) {
auto &argsForCallee = argsPerSub.at(callee);
for (auto *arg : argsForCallee) {
if (isDefBeforeEachCallArg(arg))
liveOutBB.reset(arg->getRegVar()->getId());
}
}
}
// Extend live-out interval to BB
for (auto i : liveOutBB) {
G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
if (isUnknownArgOrRetval(dcl))
continue;
if (hasUniqueFuncHome(dcl) && getUniqueFuncHome(dcl) != funcInfo)
continue;
vISA_ASSERT(bb->size() > 0, "empty instlist");
updateEndInterval(dcl, bb->back());
}
}
// Handle live-range extension for non-reducible CFG
void Augmentation::handleNonReducibleExtension(FuncInfo *funcInfo) {
// use SCC instead
// FIXME: does augmentation work in the presence of subroutine? neither
// SCCAnalysis nor findNaturalLoops considers the call graph
SCCAnalysis SCCFinder(kernel.fg);
SCCFinder.run();
for (auto iter = SCCFinder.SCC_begin(), iterEnd = SCCFinder.SCC_end();
iter != iterEnd; ++iter) {
auto &&anSCC = *iter;
std::unordered_set<G4_BB *> SCCSucc; // any successor BB of the SCC
G4_BB *headBB = anSCC.getEarliestBB();
if (hasSubroutines && headBB->getFuncInfo() != funcInfo)
continue;
for (auto BI = anSCC.body_begin(), BIEnd = anSCC.body_end(); BI != BIEnd;
++BI) {
G4_BB *bb = *BI;
for (auto succ : bb->Succs) {
if (!anSCC.isMember(succ)) {
SCCSucc.insert(succ);
}
}
}
for (auto exitBB : SCCSucc) {
extendVarLiveness(funcInfo, exitBB, headBB->front());
}
}
}
std::unordered_set<G4_BB *>
Augmentation::getAllJIPTargetBBs(FuncInfo *funcInfo) {
// Any BB that has join as first non-label instruction is a JIP target.
std::unordered_set<G4_BB *> JIPTargetBBs;
for (auto *BB : funcInfo->getBBList()) {
if (BB->empty())
continue;
auto InstIt = BB->begin();
if ((*InstIt)->isLabel())
++InstIt;
if (InstIt != BB->end() && (*InstIt)->opcode() == G4_join)
JIPTargetBBs.insert(BB);
}
return JIPTargetBBs;
}
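// Collect goto/jmpi edges that branch backwards (the successor's first
// instruction has a smaller lexical id than the branch) but are not natural
// loop back-edges.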
std::vector<std::pair<G4_BB *, G4_BB *>>
Augmentation::getNonLoopBackEdges(FuncInfo *funcInfo) {
auto &LoopBackEdges = kernel.fg.getAllNaturalLoops();
std::vector<std::pair<G4_BB *, G4_BB *>> NonLoopBackEdges;
for (auto *BB : funcInfo->getBBList()) {
if (BB->empty())
continue;
if (BB->back()->opcode() != G4_jmpi && BB->back()->opcode() != G4_goto)
continue;
auto LastInstLexId = BB->back()->getLexicalId();
for (auto *Succ : BB->Succs) {
vISA_ASSERT(!Succ->empty(), "expecting non-empty succ BB");
auto SuccInstLexId = Succ->front()->getLexicalId();
// Forward edge
if (SuccInstLexId > LastInstLexId)
continue;
// Check if this is a loop edge
auto Edge = std::pair(BB, Succ);
if (LoopBackEdges.find(Edge) == LoopBackEdges.end())
NonLoopBackEdges.push_back(Edge);
}
}
return NonLoopBackEdges;
}
void Augmentation::handleNonLoopBackEdges(FuncInfo *funcInfo) {
// up:
// (W) P5 =
// ...
// goto Later
// ...
// <other BBs>
// join down
//
// BB1:
// P21 = ...
// (P38) goto down
//
// otherBB:
// ...
// (W) jmpi (M1, 1) up
//
// down:
// = P21
//
// Later:
//
  // In the above snippet, the following path may be taken:
// BB1, otherBB, up, down, Later
//
// P21 is defined in BB1. If P5 uses same register
// then it can clobber P21 before it gets used in
// down. So P5 and P21 intervals must overlap.
//
  // If we have a non-loop backedge within an interval and
  // there's an incoming JIP edge within that interval,
  // then we should extend the interval up to the
  // backedge destination. In the above snippet, that means
  // extending P21 to "up" so that it overlaps with P5.
auto AllJIPTargetBBs = getAllJIPTargetBBs(funcInfo);
// Return true if there's any JIP incoming edge within interval
auto hasIncomingJIPEdge = [&AllJIPTargetBBs](const vISA::Interval &Interval) {
for (auto *JIPTargetBB : AllJIPTargetBBs) {
vISA::Interval Temp(JIPTargetBB->front(), JIPTargetBB->front());
if (Interval.intervalsOverlap(Temp))
return true;
}
return false;
};
auto NonLoopBackEdges = getNonLoopBackEdges(funcInfo);
if (NonLoopBackEdges.empty()) {
VISA_DEBUG_VERBOSE({ std::cout << "No non-loop backedges found\n"; });
return;
}
auto getNonLoopBackEdgesInInterval =
[&NonLoopBackEdges](const vISA::Interval &Interval) {
std::vector<std::pair<G4_BB *, G4_BB *>> NonLoopBackEdgesInInterval;
for (auto &NonLoopBackEdge : NonLoopBackEdges) {
vISA::Interval Temp(NonLoopBackEdge.first->back(),
NonLoopBackEdge.first->back());
if (Interval.intervalsOverlap(Temp))
NonLoopBackEdgesInInterval.push_back(NonLoopBackEdge);
}
return NonLoopBackEdgesInInterval;
};
for (G4_Declare *Dcl : kernel.Declares) {
auto &All = gra.getAllIntervals(Dcl);
    // We shouldn't need to consider special variables like args and retval,
    // because such variables are not defined and used in the same function.
if (All.size() != 1)
continue;
auto &Interval = All[0];
bool Change = false;
// Handle transitive backwards branches
// TODO: Handle forward branch from interval that later jump backwards
// and cause JIP edge to be taken in the middle of the interval.
do {
Change = false;
auto Start = Interval.start;
if (hasSubroutines && instToFunc[Start->getLexicalId()] != funcInfo)
continue;
if (!hasIncomingJIPEdge(Interval))
continue;
std::vector<std::pair<G4_BB *, G4_BB *>> NonLoopBackEdges =
getNonLoopBackEdgesInInterval(Interval);
for (auto &NonLoopBackEdge : NonLoopBackEdges) {
if (NonLoopBackEdge.second) {
vISA_ASSERT(NonLoopBackEdge.second->size() > 0,
"expecting backedge target to be non-empty");
auto StartLexId = Interval.start->getLexicalId();
if (StartLexId > NonLoopBackEdge.second->front()->getLexicalId()) {
VISA_DEBUG_VERBOSE({
std::cout << "Updating start interval for " << Dcl->getName()
<< " from " << StartLexId << " to "
<< NonLoopBackEdge.second->front()->getLexicalId()
<< " - ";
NonLoopBackEdge.second->front()->dump();
});
auto OldInterval = Interval;
updateStartInterval(Dcl, NonLoopBackEdge.second->front());
Change = (OldInterval != Interval);
}
}
}
} while (Change);
}
}
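// For each natural loop in funcInfo, pull the start of variables that are
// live into a loop exit back to the loop head, and push the end of globals
// that are live around the back-edge out to the last instruction of the
// latch, so they behave as loop-carried values.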
void Augmentation::handleLoopExtension(FuncInfo *funcInfo) {
// process each natural loop
for (auto &iter : kernel.fg.getAllNaturalLoops()) {
auto &backEdge = iter.first;
// Check whether loop is in current function
if (hasSubroutines &&
funcInfo != backEdge.first->getFuncInfo())
continue;
G4_INST *startInst = (backEdge.second)->front();
const std::set<G4_BB *> &loopBody = iter.second;
for (auto block : loopBody) {
// FIXME: this may process a BB multiple times
for (auto succBB : block->Succs) {
// A subroutine call BB's successor is callee's INIT BB.
// Loop data structure doesn't include callee BB. So
// succBB not part of loop may still be INIT BB of callee.
// Such an INIT BB shouldn't be treated as a loop exit
// for live-range extension. If we don't check for INIT BB
// we end up extending RET__loc range to loop header
// which isn't correct.
if (loopBody.find(succBB) == loopBody.end() &&
(succBB->getBBType() & G4_BB_INIT_TYPE) == 0) {
G4_BB *exitBB = succBB;
unsigned latchBBId = (backEdge.first)->getId();
unsigned exitBBId = succBB->getId();
if (exitBBId < latchBBId && succBB->Succs.size() == 1) {
exitBB = succBB->Succs.front();
}
VISA_DEBUG_VERBOSE({
std::cout << "==> Extend live-in for BB" << exitBB->getId() << "\n";
exitBB->emit(std::cout);
});
extendVarLiveness(funcInfo, exitBB, startInst);
}
}
}
G4_BB *startBB = backEdge.second;
G4_BB *endBB = backEdge.first;
auto liveInStartBB = liveAnalysis.getLiveAtEntry(startBB);
auto liveOutEndBB = liveAnalysis.getLiveAtExit(endBB);
auto globalsLiveInAndLiveOut =
liveInStartBB & liveOutEndBB & liveAnalysis.globalVars;
for (auto i : globalsLiveInAndLiveOut) {
auto *dcl = lrs[i]->getDcl()->getRootDeclare();
// If dcl has non-nullptr home function then extend liveness only
// in same function.
if (hasUniqueFuncHome(dcl) && getUniqueFuncHome(dcl) != funcInfo)
continue;
      // Record the old end before extending so the debug message below can
      // report an actual extension; capturing it after updateEndInterval()
      // would always compare equal values.
      [[maybe_unused]] unsigned oldEnd =
          gra.getLastEndInterval(dcl)
              ? gra.getLastEndInterval(dcl)->getLexicalId()
              : 0;
      updateEndInterval(dcl, endBB->back());
      VISA_DEBUG_VERBOSE({
        if (oldEnd < gra.getLastEndInterval(dcl)->getLexicalId()) {
          std::cout << "Extending " << dcl->getName() << " from old end "
                    << oldEnd << " to "
                    << gra.getLastEndInterval(dcl)->getLexicalId()
                    << " due to back-edge"
                    << "\n";
        }
      });
}
}
}
// Extend all variables that are live at bb entry to the given inst
void Augmentation::extendVarLiveness(FuncInfo *funcInfo, G4_BB *bb,
G4_INST *inst) {
auto liveAtEntryBB =
liveAnalysis.getLiveAtEntry(bb) & liveAnalysis.globalVars;
for (auto i : liveAtEntryBB) {
G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
// If dcl has non-nullptr home function then extend liveness only
// in same function.
if (hasUniqueFuncHome(dcl) && getUniqueFuncHome(dcl) != funcInfo)
continue;
if (!kernel.fg.isPseudoDcl(dcl)) {
// Extend ith live-interval
      // Record the old start before extending so the debug message below can
      // report an actual extension; capturing it after updateStartInterval()
      // would always compare equal values.
      [[maybe_unused]] unsigned oldStart =
          gra.getLastStartInterval(dcl)
              ? gra.getLastStartInterval(dcl)->getLexicalId()
              : 0;
      updateStartInterval(dcl, inst);
      VISA_DEBUG_VERBOSE({
        if (oldStart > gra.getLastStartInterval(dcl)->getLexicalId()) {
          std::cout << "Extending " << dcl->getName() << " from old start "
                    << oldStart << " to "
                    << gra.getLastStartInterval(dcl)->getLexicalId()
                    << " due to back-edge"
                    << "\n";
        }
      });
}
}
}
// Build live-intervals for given subroutine and store them per subroutine.
// Arg/Retval are specially treated. We construct live-intervals with holes
// for such special variables to avoid unnecessary overlaps.
void Augmentation::buildLiveIntervals(FuncInfo* funcInfo) {
unsigned funcCnt = 0;
for (G4_BB *curBB : funcInfo->getBBList()) {
if (!curBB->empty()) {
startIntervalForLiveIn(funcInfo, curBB);
endIntervalForLiveOut(funcInfo, curBB);
}
for (G4_INST *inst : *curBB) {
if (inst->isPseudoKill() == true)
continue;
if (inst->isCall()) {
handleCallSite(curBB, funcCnt);
continue;
}
handleDstOpnd(funcInfo, curBB, inst);
handleCondMod(funcInfo, inst);
for (unsigned i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++) {
G4_Operand *src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion()) {
continue;
}
handleSrcOpnd(funcInfo, curBB, src);
}
handlePred(funcInfo, inst);
}
}
handleNonLoopBackEdges(funcInfo);
// A variable may be defined in each divergent loop iteration and used
// outside the loop. SIMT liveness can detect the variable as KILL and
// this makes the variable non-loop carried. However, channel enable
  // behavior may differ across loop iterations. So a channel may be defined
  // in an earlier iteration and that channel could be disabled till the end
  // of the loop, while getting re-enabled outside the loop. This means we
// need to preserve value of the variable in each loop iteration and
// treat the variable as loop carried. Following is pseudo-code:
//
// loop_header:
// (W) V1 =
// = V1
// V2:d = {Q1}
// (P) goto loop_header
//
// outside_loop:
// = V2
//
  // In the above case, V2 should be treated as loop carried, as it's defined
  // using Q1 EM and belongs to the Default32Bit bucket. It cannot share
  // storage with V1 because V1 uses (W) and that could destroy the value of
  // V2 computed in an earlier iteration.
if (!kernel.fg.isReducible()) {
handleNonReducibleExtension(funcInfo);
} else {
handleLoopExtension(funcInfo);
}
#ifdef DEBUG_VERBOSE_ON
// Print calculated live-ranges
gra.printLiveIntervals();
#endif
}
// FIXME: Used by old augmentation only where no holes are modeled.
void Augmentation::buildLiveIntervals() {
// Treat variables live-in to program first
G4_BB *entryBB = kernel.fg.getEntryBB();
// Live-in variables have their start interval start with
// first instruction of entry BB
for (auto i : liveAnalysis.globalVars) {
if (liveAnalysis.isLiveAtEntry(entryBB, i)) {
const G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
updateStartInterval(dcl, entryBB->front());
}
}
unsigned funcCnt = 0;
for (G4_BB *curBB : kernel.fg) {
for (G4_INST *inst : *curBB) {
if (inst->isPseudoKill() == true) {
continue;
}
G4_DstRegRegion *dst = inst->getDst();
if (inst->isCall()) {
const char *name =
kernel.fg.builder->getNameString(32, "SCALL_%d", funcCnt++);
G4_Declare *scallDcl =
kernel.fg.builder->createDeclare(name, G4_GRF, 1, 1, Type_UD);
gra.addVarToRA(scallDcl);
updateStartInterval(scallDcl, inst);
updateEndInterval(scallDcl, inst);
std::pair<G4_INST *, G4_BB *> callInfo(inst, curBB);
callDclMap.emplace(scallDcl, callInfo);
continue;
}
if (dst && dst->getRegAccess() == Direct && dst->getBase()) {
// Destination
G4_Declare *defdcl = GetTopDclFromRegRegion(dst);
if (dst->getBase()->isRegAllocPartaker()) {
if (defdcl) {
if (gra.getLocalLR(defdcl)) {
updateStartIntervalForLocal(defdcl, inst, dst);
} else {
updateStartInterval(defdcl, inst);
}
}
} else if (liveAnalysis.livenessClass(G4_GRF)) {
LocalLiveRange *defdclLR;
// Handle ranges allocated by local RA
if (defdcl && (defdclLR = gra.getLocalLR(defdcl)) &&
defdclLR->getAssigned() == true && !defdclLR->isEOT()) {
updateStartInterval(defdcl, inst);
}
}
} else if (liveAnalysis.livenessClass(G4_ADDRESS) && dst &&
dst->getRegAccess() == IndirGRF && dst->getBase() &&
dst->getBase()->isRegVar()) {
// Destination is indirect
G4_Declare *defdcl = dst->getBaseRegVarRootDeclare();
updateEndInterval(defdcl, inst);
} else if (liveAnalysis.livenessClass(G4_GRF) && dst &&
dst->isIndirect()) {
const REGVAR_VECTOR &pointsToSet =
liveAnalysis.getPointsToAnalysis().getAllInPointsToOrIndrUse(dst,
curBB);
for (const auto &pointsToVar : pointsToSet) {
if (pointsToVar.var->isRegAllocPartaker()) {
updateStartInterval(pointsToVar.var->getDeclare()->getRootDeclare(),
inst);
}
}
}
if (liveAnalysis.livenessClass(G4_FLAG)) {
G4_CondMod *cmod = inst->getCondMod();
if (cmod != nullptr && cmod->getBase() != nullptr) {
// Conditional modifier
G4_Declare *dcl = cmod->getBaseRegVarRootDeclare();
updateStartInterval(dcl, inst);
}
}
for (unsigned i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++) {
G4_Operand *src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion()) {
continue;
}
G4_SrcRegRegion *srcRegion = src->asSrcRegRegion();
if (srcRegion->getRegAccess() == Direct && srcRegion->getBase()) {
G4_Declare *usedcl = GetTopDclFromRegRegion(src);
if (srcRegion->getBase()->isRegAllocPartaker()) {
if (gra.getLocalLR(usedcl)) {
updateEndIntervalForLocal(usedcl, inst, src);
} else {
updateEndInterval(usedcl, inst);
}
} else if (liveAnalysis.livenessClass(G4_GRF)) {
LocalLiveRange *usedclLR = nullptr;
if (usedcl && (usedclLR = gra.getLocalLR(usedcl)) &&
usedclLR->getAssigned() == true && !usedclLR->isEOT()) {
updateEndInterval(usedcl, inst);
}
}
} else if (liveAnalysis.livenessClass(G4_GRF) &&
srcRegion->isIndirect()) {
const REGVAR_VECTOR &pointsToSet =
liveAnalysis.getPointsToAnalysis().getAllInPointsToOrIndrUse(
srcRegion, curBB);
for (const auto &pointsToVar : pointsToSet) {
if (pointsToVar.var->isRegAllocPartaker()) {
updateEndInterval(pointsToVar.var->getDeclare()->getRootDeclare(),
inst);
}
}
} else if (liveAnalysis.livenessClass(G4_ADDRESS) &&
srcRegion->getRegAccess() == IndirGRF &&
srcRegion->getBase() && srcRegion->getBase()->isRegVar()) {
G4_Declare *usedcl = src->getBaseRegVarRootDeclare();
updateEndInterval(usedcl, inst);
}
}
if (liveAnalysis.livenessClass(G4_FLAG)) {
G4_Predicate *pred = inst->getPredicate();
if (pred != NULL) {
// Predicate
G4_Declare *dcl = pred->getBaseRegVarRootDeclare();
updateEndInterval(dcl, inst);
}
}
}
}
// extend all variables that are live at bb entry to the given inst
// ToDo: this seems very slow when # variable is large, should look for sparse
// implementation
auto extendVarLiveness = [this](G4_BB *bb, G4_INST *inst) {
for (auto i : liveAnalysis.globalVars) {
if (liveAnalysis.isLiveAtEntry(bb, i) == true &&
!kernel.fg.isPseudoDcl(lrs[i]->getDcl())) {
// Extend ith live-interval
G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
        // Record the old start before extending so the debug message below
        // can report an actual extension; capturing it after
        // updateStartInterval() would always compare equal values.
        [[maybe_unused]] unsigned oldStart =
            gra.getStartInterval(dcl)
                ? gra.getStartInterval(dcl)->getLexicalId()
                : 0;
        updateStartInterval(dcl, inst);
        VISA_DEBUG_VERBOSE({
          if (oldStart > gra.getStartInterval(dcl)->getLexicalId()) {
            std::cout << "Extending " << dcl->getName() << " from old start "
                      << oldStart << " to "
                      << gra.getStartInterval(dcl)->getLexicalId()
                      << " due to back-edge"
                      << "\n";
          }
        });
}
}
};
if (!kernel.fg.isReducible()) {
// use SCC instead
// FIXME: does augmentation work in the presence of subroutine? neither
// SCCAnalysis nor findNaturalLoops considers the call graph
SCCAnalysis SCCFinder(kernel.fg);
SCCFinder.run();
for (auto iter = SCCFinder.SCC_begin(), iterEnd = SCCFinder.SCC_end();
iter != iterEnd; ++iter) {
auto &&anSCC = *iter;
std::unordered_set<G4_BB *> SCCSucc; // any successor BB of the SCC
G4_BB *headBB = anSCC.getEarliestBB();
for (auto BI = anSCC.body_begin(), BIEnd = anSCC.body_end(); BI != BIEnd;
++BI) {
G4_BB *bb = *BI;
for (auto succ : bb->Succs) {
if (!anSCC.isMember(succ)) {
SCCSucc.insert(succ);
}
}
}
for (auto exitBB : SCCSucc) {
extendVarLiveness(exitBB, headBB->front());
}
}
} else {
// process each natural loop
for (auto &&iter : kernel.fg.getAllNaturalLoops()) {
auto &&backEdge = iter.first;
G4_INST *startInst = (backEdge.second)->front();
const std::set<G4_BB *> &loopBody = iter.second;
for (auto block : loopBody) {
// FIXME: this may process a BB multiple times
for (auto succBB : block->Succs) {
// A subroutine call BB's successor is callee's INIT BB.
// Loop data structure doesn't include callee BB. So
// succBB not part of loop may still be INIT BB of callee.
// Such an INIT BB shouldn't be treated as a loop exit
// for live-range extension. If we don't check for INIT BB
// we end up extending RET__loc range to loop header
// which isn't correct.
if (loopBody.find(succBB) == loopBody.end() &&
(succBB->getBBType() & G4_BB_INIT_TYPE) == 0) {
G4_BB *exitBB = succBB;
unsigned latchBBId = (backEdge.first)->getId();
unsigned exitBBId = succBB->getId();
if (exitBBId < latchBBId && succBB->Succs.size() == 1) {
exitBB = succBB->Succs.front();
}
VISA_DEBUG_VERBOSE({
std::cout << "==> Extend live-in for BB" << exitBB->getId()
<< "\n";
exitBB->emit(std::cout);
});
extendVarLiveness(exitBB, startInst);
}
}
}
G4_BB *startBB = backEdge.second;
G4_BB *EndBB = backEdge.first;
for (auto i : liveAnalysis.globalVars) {
if (liveAnalysis.isLiveAtEntry(startBB, i) == true &&
liveAnalysis.isLiveAtExit(EndBB, i) == true) {
const G4_Declare *dcl = lrs[i]->getDcl()->getRootDeclare();
unsigned oldEnd = gra.getEndInterval(dcl)->getLexicalId();
(void)oldEnd;
updateEndInterval(dcl, EndBB->back());
VISA_DEBUG_VERBOSE({
if (oldEnd < gra.getEndInterval(dcl)->getLexicalId()) {
std::cout << "Extending " << dcl->getName() << " from old end "
<< oldEnd << " to "
<< gra.getEndInterval(dcl)->getLexicalId()
<< " due to back-edge"
<< "\n";
}
});
}
}
}
}
#ifdef DEBUG_VERBOSE_ON
// Print calculated live-ranges
gra.printLiveIntervals();
#endif
}
Augmentation::~Augmentation() {
// Clear out calculated information so that subsequent RA
// iterations don't have stale information
for (DECLARE_LIST_ITER dcl_it = kernel.Declares.begin(),
end = kernel.Declares.end();
dcl_it != end; dcl_it++) {
gra.clearIntervals(*dcl_it);
gra.setMask(*dcl_it, {});
gra.setAugmentationMask(*dcl_it, AugmentationMasks::Undetermined);
}
}
class compareInterval {
public:
GlobalRA &gra;
compareInterval(GlobalRA &g) : gra(g) {}
// Used to store live-intervals in stable sorted order. Sorting is
// done first on start lexical id, so live-ranges are stored in
// ascending order of start. For stable order, we use secondary
// check on dcl id.
bool operator()(const QueueEntry &s1, const QueueEntry &s2) {
auto s1Start = gra.getIntervalStart(s1.interval)->getLexicalId();
auto s2Start = gra.getIntervalStart(s2.interval)->getLexicalId();
if (s1Start == s2Start)
return s1.dcl->getDeclId() < s2.dcl->getDeclId();
return s1Start < s2Start;
}
};
void Augmentation::sortLiveIntervals() {
// Sort all intervals in kernel based on their starting point in
// ascending order and return them in sortedIntervals vector
  // This is actually more efficient (at least according to vTune) than an
  // O(N) bucket sort, since it avoids most of the malloc/free overhead from
  // vector.resize().
for (G4_Declare *dcl : kernel.Declares) {
auto &all = gra.getAllIntervals(dcl);
for (auto &interval : all) {
if (gra.getIntervalEnd(interval))
sortedIntervals.push_back(QueueEntry(dcl, interval));
}
}
std::sort(sortedIntervals.begin(), sortedIntervals.end(),
compareInterval(gra));
VISA_DEBUG_VERBOSE({
std::cout << "Live-intervals in sorted order:\n";
for (auto &entry : sortedIntervals) {
      auto *dcl = entry.dcl;
      const auto &interval = entry.interval;
std::cout << dcl->getName() << " - "
<< "(" << gra.getIntervalStart(interval)->getLexicalId() << ", "
<< gra.getIntervalEnd(interval)->getLexicalId() << "]"
<< "\n";
}
});
if (kernel.getOption(vISA_VerifyAugmentation)) {
dumpSortedIntervals();
}
}
unsigned Augmentation::getEnd(const G4_Declare *dcl) const {
return gra.getLastEndInterval(dcl)->getLexicalId();
}
// Mark interference between dcls. Either one of dcls may have
// register assigned by local RA so handle those cases too.
// Re-entrant function.
void Augmentation::handleSIMDIntf(G4_Declare *firstDcl, G4_Declare *secondDcl,
bool isCall) {
auto markIntfWithLRAAssignment = [](const G4_Declare *firstDcl,
const G4_Declare *lraAssigned,
Interference &intf) {
unsigned numRows = lraAssigned->getNumRows();
const G4_VarBase *preg = lraAssigned->getRegVar()->getPhyReg();
vISA_ASSERT(preg->isGreg(),
"Expecting a physical register during building interference "
"among incompatible masks");
unsigned start = preg->asGreg()->getRegNum();
for (unsigned i = start; i < (start + numRows); i++) {
auto GRFDcl = intf.getGRFDclForHRA(i);
intf.checkAndSetIntf(firstDcl->getRegVar()->getId(),
GRFDcl->getRegVar()->getId());
VISA_DEBUG_VERBOSE(std::cout << "Marking interference between "
<< firstDcl->getName() << " and "
<< GRFDcl->getName() << "\n");
}
};
auto firstRegVar = firstDcl->getRegVar();
auto secondRegVar = secondDcl->getRegVar();
if (firstDcl->getRegFile() == G4_INPUT && firstRegVar->getPhyReg() &&
secondDcl->getRegFile() == G4_INPUT && secondRegVar->getPhyReg()) {
return;
}
bool isFirstDcl = true;
bool isPseudoVCADcl = kernel.fg.isPseudoVCADcl(firstDcl);
  if (!isPseudoVCADcl) {
isPseudoVCADcl = kernel.fg.isPseudoVCADcl(secondDcl);
isFirstDcl = false;
}
if (isPseudoVCADcl) {
// Mark intf for following pattern:
// V33 =
// ...
// if
// = V33
// fcall
// ...
// else
// = V33
// endif
//
// V33 will interfere with VCA_SAVE pseudo node.
// It also needs to interfere with retval to
// ensure V33 and retval don't get same allocation.
// Note that if V33 is actually live after fcall
// then graph coloring will do this for us. In this
// case however we need to rely on augmentation.
auto retIter =
isFirstDcl ? fcallRetMap.find(firstDcl) : fcallRetMap.find(secondDcl);
if (retIter != fcallRetMap.end()) {
G4_Declare *retVar = retIter->second;
LocalLiveRange *otherDclLR;
G4_Declare *otherDcl = isFirstDcl ? secondDcl : firstDcl;
if (otherDcl->getRegVar()->isRegAllocPartaker())
intf.checkAndSetIntf(otherDcl->getRegVar()->getId(),
retVar->getRegVar()->getId());
else if ((otherDclLR = gra.getLocalLR(otherDcl)) &&
otherDclLR->getAssigned() && !otherDclLR->isEOT()) {
markIntfWithLRAAssignment(retVar, otherDcl, intf);
}
}
}
if (firstRegVar->isRegAllocPartaker() &&
secondRegVar->isRegAllocPartaker()) {
if (!intf.varSplitCheckBeforeIntf(firstRegVar->getId(),
secondRegVar->getId())) {
intf.checkAndSetIntf(firstRegVar->getId(),
secondRegVar->getId());
if (isCall) {
intf.buildInterferenceWithAllSubDcl(firstRegVar->getId(),
secondRegVar->getId());
}
VISA_DEBUG_VERBOSE(std::cout << "Marking interference between "
<< firstDcl->getName() << " and "
<< secondDcl->getName() << "\n");
}
} else if (liveAnalysis.livenessClass(G4_GRF)) {
LocalLiveRange *secondDclLR = nullptr, *firstDclLR = nullptr;
if (firstRegVar->isRegAllocPartaker() &&
(secondDclLR = gra.getLocalLR(secondDcl)) &&
secondDclLR->getAssigned() && !secondDclLR->isEOT()) {
// secondDcl was assigned by local RA and it uses
markIntfWithLRAAssignment(firstDcl, secondDcl, intf);
} else if (secondRegVar->isRegAllocPartaker() &&
(firstDclLR = gra.getLocalLR(firstDcl)) &&
firstDclLR->getAssigned() && !firstDclLR->isEOT()) {
// Call self with reversed parameters instead of re-implementing
// above code
handleSIMDIntf(secondDcl, firstDcl, isCall);
}
}
}
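// Return true if the first 'size' mask bytes of dcl are all NOMASK_BYTE,
// i.e. the tracked bytes are written only under NoMask.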
bool Augmentation::isNoMask(const G4_Declare *dcl, unsigned size) const {
auto &mask = gra.getMask(dcl);
bool result = false;
if (mask.size() > 0) {
result = true;
for (unsigned i = 0; i < size; i++) {
if (mask[i] != NOMASK_BYTE) {
result = false;
}
}
}
return result;
}
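// Return true if the first 'size' mask entries of dcl are 0, 1, 2, ...,
// i.e. each tracked element is defined by the execution channel with the
// same index (used for flag compatibility checks).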
bool Augmentation::isConsecutiveBits(const G4_Declare *dcl,
unsigned size) const {
auto &mask = gra.getMask(dcl);
bool result = false;
if (mask.size() > 0) {
result = true;
for (unsigned i = 0; i < size; i++) {
if (mask[i] != i) {
result = false;
}
}
}
return result;
}
bool Augmentation::isCompatible(const G4_Declare *testDcl,
const G4_Declare *biggerDcl) const {
bool compatible = false;
unsigned testSize = testDcl->getRegVar()->isFlag()
? testDcl->getNumberFlagElements()
: testDcl->getByteSize();
unsigned biggerSize = biggerDcl->getRegVar()->isFlag()
? biggerDcl->getNumberFlagElements()
: biggerDcl->getByteSize();
unsigned size = (testSize < biggerSize ? testSize : biggerSize);
// Masks are compatible when:
// i. Both decls have exactly 1 EM bit defining each byte
// (This means a dcl with Q1 in one inst and Q2 in another
// instruction writing same subregisters is not a candidate
// for next step).
// ii. Bytes at common indices are enabled by same EM bit
// (This means NoMask dcl is compatible with NoMask dcl and
// not with any other dcl).
// UPDATE: (ii) above is now altered such that NoMask dcls
// that overlap are considered to be incompatible. This is to
// handle removal of JIP edges (then->else edge).
auto &testMask = gra.getMask(testDcl);
auto &biggerMask = gra.getMask(biggerDcl);
if (testMask.size() > 0 && biggerMask.size() > 0) {
// Lets pattern match
if (testDcl->getRegFile() == G4_FLAG) {
if (isConsecutiveBits(testDcl, size) &&
isConsecutiveBits(biggerDcl, size)) {
compatible = true;
}
} else {
// Add another pattern to check here
}
}
return compatible;
}
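// Remove entries from both active queues whose interval ends at or before
// startIdx; such intervals cannot overlap with any interval starting later.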
void Augmentation::expireIntervals(unsigned startIdx) {
// Expire elements from both lists
while (defaultMaskQueue.size() > 0) {
if (defaultMaskQueue.top().interval.end->getLexicalId() <=
startIdx) {
VISA_DEBUG_VERBOSE(std::cout << "Expiring "
                                    << defaultMaskQueue.top().dcl->getName()
<< "\n");
defaultMaskQueue.pop();
} else {
break;
}
}
while (nonDefaultMaskQueue.size() > 0) {
if (nonDefaultMaskQueue.top().interval.end->getLexicalId() <=
startIdx) {
VISA_DEBUG_VERBOSE(std::cout << "Expiring "
                                    << nonDefaultMaskQueue.top().dcl->getName()
<< "\n");
nonDefaultMaskQueue.pop();
} else {
break;
}
}
}
// Return true if edge between dcl1 and dcl2 is strong.
bool Interference::isStrongEdgeBetween(const G4_Declare *dcl1,
const G4_Declare *dcl2) const {
auto dcl1RegVar = dcl1->getRegVar();
auto dcl2RegVar = dcl2->getRegVar();
auto dcl1RAPartaker = dcl1RegVar->isRegAllocPartaker();
auto dcl2RAPartaker = dcl2RegVar->isRegAllocPartaker();
if (dcl1RAPartaker && dcl2RAPartaker) {
if (interfereBetween(dcl1RegVar->getId(), dcl2RegVar->getId())) {
return true;
} else {
return false;
}
}
if (dcl1RAPartaker) {
auto dcl2NumRows = dcl2->getNumRows();
auto startPhyReg = dcl2RegVar->getPhyReg()->asGreg()->getRegNum();
auto dcl2LR = gra.getLocalLR(dcl2);
if (dcl2LR && dcl2LR->getAssigned()) {
bool allEdgesStrong = true;
for (unsigned i = startPhyReg; i < (startPhyReg + dcl2NumRows); i++) {
const G4_Declare *lraPreg = getGRFDclForHRA(i);
allEdgesStrong &= interfereBetween(lraPreg->getRegVar()->getId(),
dcl1RegVar->getId());
}
if (allEdgesStrong)
return true;
}
} else {
return isStrongEdgeBetween(dcl2, dcl1);
}
return false;
}
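// Return true when two live-ranges with the given augmentation masks should
// get a weak edge (tracked in compatibleSparseIntf) instead of a regular
// interference edge. Never needed with 4-GRF alignment; with generic
// augmentation alignment it is needed when the mask's SIMD footprint
// exceeds 2 GRFs; otherwise only when both masks are Default64Bit.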
bool Augmentation::weakEdgeNeeded(AugmentationMasks defaultDclMask,
AugmentationMasks newDclMask) {
if (gra.use4GRFAlign)
return false;
if (useGenericAugAlign) {
// Weak edge needed in case #GRF exceeds 2
if (newDclMask == AugmentationMasks::Default64Bit)
return (TypeSize(Type_Q) * kernel.getSimdSizeWithSlicing()) >
(unsigned)(2 * kernel.numEltPerGRF<Type_UB>());
if (newDclMask == AugmentationMasks::Default32Bit) {
// Even align up to 2 GRFs size variable, use weak edges beyond
return (TypeSize(Type_D) * kernel.getSimdSizeWithSlicing()) >
(unsigned)(2 * kernel.numEltPerGRF<Type_UB>());
}
} else {
return (defaultDclMask == AugmentationMasks::Default64Bit &&
newDclMask == AugmentationMasks::Default64Bit);
}
return false;
}
// This method is invoked when building SIMD intf and current variable
// is the artificial variable created to model call. Live-intervals in
// default set and non-default set are ones that overlap with call site
// at end of callBB. The idea here is to mark every such active interval
// with mask associated with func. Later, we'll mark interference with
// each live-interval bit set here and maydef of func.
void Augmentation::addSIMDIntfDclForCallSite(
G4_BB *callBB, const std::vector<bool> &globalVars) {
FuncInfo *func = callBB->getCalleeInfo();
auto isLiveThroughFunc = [&](unsigned int id) {
if (liveAnalysis.isLiveAtExit(callBB, id)) {
auto retBB = func->getExitBB();
if (liveAnalysis.isLiveAtExit(retBB, id))
return true;
}
return false;
};
auto &overlapDeclares = overlapDclsWithFunc[func];
for (auto &defaultEntry : defaultMaskQueue) {
auto defaultDcl = defaultEntry.dcl;
auto id = defaultDcl->getRegVar()->getId();
if (!isLiveThroughFunc(id) && globalVars[id])
overlapDeclares.first.insert(id);
}
for (auto &nonDefaultEntry : nonDefaultMaskQueue) {
auto nonDefaultDcl = nonDefaultEntry.dcl;
auto id = nonDefaultDcl->getRegVar()->getId();
if (!isLiveThroughFunc(id) && globalVars[id])
overlapDeclares.second.insert(id);
}
}
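// For the declare tied to a call's destination (return location), record
// which global default and non-default mask variables are active at the
// call site. Interference with these sets is added later by
// buildInteferenceForRetDeclares().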
void Augmentation::addSIMDIntfForRetDclares(
G4_Declare *newDcl, const std::vector<bool> &globalVars) {
auto dclIt = retDeclares.find(newDcl);
MaskDeclares *mask = nullptr;
if (dclIt == retDeclares.end()) {
MaskDeclares newMask;
retDeclares[newDcl] = std::move(newMask);
mask = &retDeclares[newDcl];
} else {
mask = &dclIt->second;
}
for (auto& defaultSeg : defaultMaskQueue) {
auto defaultDcl = defaultSeg.dcl;
auto id = defaultDcl->getRegVar()->getId();
if (globalVars[id])
mask->first.insert(id);
}
for (auto& nonDefaultSeg : nonDefaultMaskQueue) {
auto nonDefaultDcl = nonDefaultSeg.dcl;
auto id = nonDefaultDcl->getRegVar()->getId();
if (globalVars[id])
mask->second.insert(id);
}
}
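// Classify a subroutine return value: Regular when it is not address-taken,
// all of its defs are inside func, and every use is in the BB immediately
// following a call site of func; Unknown otherwise.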
Augmentation::RetValType Augmentation::computeRetValType(FuncInfo *func,
G4_Declare *retVal) {
if (retVal->getAddressed())
return Augmentation::RetValType::Unknown;
const auto *defs = refs.getDefs(retVal);
if (defs) {
// All defs must be in func only
for (const auto &def : *defs) {
auto *bb = std::get<1>(def);
if (!func->contains(bb))
return Augmentation::RetValType::Unknown;
}
}
// All uses must be in BB immediately following call site
const auto *uses = refs.getUses(retVal);
if (uses) {
for (const auto &use : *uses) {
auto *bb = std::get<1>(use);
auto *pred = bb->getPhysicalPred();
if (pred->isSpecialEmptyBB())
pred = pred->getPhysicalPred();
if (!pred->isEndWithCall() || pred->getCalleeInfo() != func)
return Augmentation::RetValType::Unknown;
}
}
return Augmentation::RetValType::Regular;
}
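// Classify a subroutine argument:
// - LiveThrough: no defs at all, or all defs in the kernel entry BB.
// - DefBeforeEachCall: every def is a full (or pseudo-kill) write in a BB
//   that calls func, every call site of func has such a def, the arg is not
//   live into those BBs, and all uses are inside func.
// - Unknown: everything else (e.g. address-taken).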
Augmentation::ArgType Augmentation::computeArgType(FuncInfo *func,
G4_Declare *arg) {
if (arg->getAddressed())
return Augmentation::ArgType::Unknown;
// Trivial case where argument is input to kernel and no defs of
// the variable exist in the program.
const auto *defs = refs.getDefs(arg);
if (!defs || defs->size() == 0)
return Augmentation::ArgType::LiveThrough;
// Check if all defs of arg are in kernel entry BB
bool allDefsInEntryBB = true;
for (const auto &def : *defs) {
auto *bb = std::get<1>(def);
if (kernel.fg.getEntryBB() != bb) {
allDefsInEntryBB = false;
break;
}
}
if (allDefsInEntryBB)
return Augmentation::ArgType::LiveThrough;
// Check if use of subroutine arg exists in a BB that doesn't belong
// to the subroutine.
const auto *uses = refs.getUses(arg);
if (uses) {
for (const auto &use : *uses) {
auto bb = std::get<1>(use);
if (!func->contains(bb))
return Augmentation::ArgType::Unknown;
}
}
// Check if all defs are in same BB as call site
std::unordered_set<G4_BB *> funcCallSitesMatched;
for (auto bb : kernel.fg.getBBList()) {
if (!bb->isEndWithCall() || bb->getCalleeInfo() != func)
continue;
funcCallSitesMatched.insert(bb);
}
bool killFound = false;
for (const auto &def : *defs) {
auto *bb = std::get<1>(def);
if (bb->isEndWithCall() && bb->getCalleeInfo() == func) {
auto *inst = std::get<0>(def);
if (liveAnalysis.isLiveAtEntry(bb, arg->getRegVar()->getId())) {
return Augmentation::ArgType::Unknown;
}
if (inst->isPseudoKill() ||
liveAnalysis.writeWholeRegion(bb, inst, inst->getDst())) {
funcCallSitesMatched.erase(bb);
killFound = true;
}
continue;
}
return Augmentation::ArgType::Unknown;
}
if (!killFound || funcCallSitesMatched.size() > 0)
return Augmentation::ArgType::Unknown;
return Augmentation::ArgType::DefBeforeEachCall;
}
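// Collect the return values of func from liveness analysis into
// retValPerSub and classify each with computeRetValType(). A retval that is
// shared by more than one subroutine is demoted to Unknown.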
void Augmentation::discoverRetVal(FuncInfo *func) {
if (!liveAnalysis.livenessClass(G4_GRF))
return;
vISA_ASSERT(retValPerSub.count(func) == 0, "already saw sub");
retValPerSub[func] = {};
if (func == kernel.fg.kernelInfo)
return;
SparseBitVector subRetVal = liveAnalysis.retVal.at(func);
for (auto i : subRetVal) {
auto *dcl = lrs[i]->getDcl();
auto &retValInfo = argsRetVal[dcl];
retValInfo.subroutines.insert(func);
if (retValInfo.retValType != RetValType::Unknown)
retValInfo.retValType = computeRetValType(func, dcl);
vISA_ASSERT(retValInfo.retValType != RetValType::Init,
"expecting non-init retval type");
retValPerSub[func].insert(dcl);
if (retValInfo.subroutines.size() > 1)
retValInfo.retValType = RetValType::Unknown;
vISA_ASSERT(!hasUniqueFuncHome(dcl),
"retval cannot have non-nullptr home function");
}
if (kernel.getOption(vISA_VerifyAugmentation)) {
dumpRetVal(subRetVal);
}
}
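// Collect the arguments of func into argsPerSub and classify each with
// computeArgType(). For the kernel entry the argument set is the
// intersection of use_in and def_in of the entry BB. An arg reaching more
// than one subroutine cannot be DefBeforeEachCall and is demoted to Unknown.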
void Augmentation::discoverArgs(FuncInfo *func) {
if (!liveAnalysis.livenessClass(G4_GRF))
return;
vISA_ASSERT(argsPerSub.count(func) == 0, "already saw sub");
argsPerSub[func] = {};
SparseBitVector subArgs;
if (func == kernel.fg.kernelInfo)
subArgs = liveAnalysis.use_in[kernel.fg.getEntryBB()->getId()] &
liveAnalysis.def_in[kernel.fg.getEntryBB()->getId()];
else
subArgs = liveAnalysis.args.at(func);
for (auto i : subArgs) {
auto *dcl = lrs[i]->getDcl();
auto &argInfo = argsRetVal[dcl];
argInfo.subroutines.insert(func);
if (argInfo.argType != ArgType::Unknown)
argInfo.argType = computeArgType(func, dcl);
vISA_ASSERT(argInfo.argType != ArgType::Init,
"expecting non-init arg type");
argsPerSub[func].insert(dcl);
// Same arg cannot be shared between 2 subroutines
if (argInfo.subroutines.size() > 1 &&
argInfo.argType == ArgType::DefBeforeEachCall)
argInfo.argType = ArgType::Unknown;
vISA_ASSERT(
argInfo.argType != ArgType::DefBeforeEachCall ||
!hasUniqueFuncHome(dcl),
"def before each call arg cannot have non-nullptr home function");
}
if (kernel.getOption(vISA_VerifyAugmentation)) {
func->dump(std::cout);
dumpArgs(subArgs);
}
}
void Augmentation::dumpSortedIntervals() {
if (kernel.getOption(vISA_DumpProgramWithLexicalId)) {
for (auto bb : kernel.fg.getBBList()) {
for (auto inst : *bb) {
std::cout << inst->getLexicalId() << ":\t";
inst->print(std::cout);
}
}
}
std::cout << "Started dumping sorted intervals:\n";
std::unordered_map<G4_Declare *, std::vector<Interval>> intervalsPerVar;
for (auto &entry : sortedIntervals) {
intervalsPerVar[entry.dcl].push_back(entry.interval);
}
for (auto &entry : sortedIntervals) {
auto &interval = entry.interval;
auto *dcl = entry.dcl;
std::cout << dcl->getName();
if (isUnknownArg(dcl))
std::cout << " (Unknown arg)";
else if (isUnknownRetVal(dcl))
std::cout << " (Unknown retval)";
else if (isDefBeforeEachCallArg(dcl))
std::cout << " (DefBeforeEachCallArg)";
else if (isLiveThroughArg(dcl))
std::cout << " (LiveThroughArg)";
else if (isRegularRetVal(dcl))
std::cout << " (RegularRetVal)";
if (dcl->getDeclId() >= homeFunc.size()) {
std::cout << " @ (new var)";
    } else {
auto *homeFunction = homeFunc[dcl->getDeclId()];
if (!homeFunction)
std::cout << " @ (global)";
else
std::cout << " @ (func " << (int)homeFunction->getId() << ")";
}
std::cout << " - (" << gra.getIntervalStart(interval)->getLexicalId()
<< ", " << gra.getIntervalEnd(interval)->getLexicalId() << "]";
if (intervalsPerVar[dcl].size() > 1) {
auto &allIntervals = intervalsPerVar[dcl];
std::cout << " other intervals: ";
for (auto &otherInterval : allIntervals) {
if (otherInterval == interval)
continue;
std::cout << "(" << gra.getIntervalStart(otherInterval)->getLexicalId()
<< ", " << gra.getIntervalEnd(otherInterval)->getLexicalId()
<< "] ";
}
}
std::cout << "\n";
}
std::cout << "Ended dumping sorted intervals:\n";
}
void Augmentation::dumpRetVal(SparseBitVector &subRetVal) {
auto getRetValType = [](RetValType retValType) {
if (retValType == Augmentation::RetValType::Init)
return "Init";
else if (retValType == Augmentation::RetValType::Regular)
return "Regular";
else if (retValType == Augmentation::RetValType::Unknown)
return "Unknown";
return "???";
};
for (auto i : subRetVal) {
printf("Retval = %s (%d) - %s\n",
gra.incRA.getLRs()[i]->getDcl()->getName(), i,
getRetValType(argsRetVal[lrs[i]->getDcl()].retValType));
}
printf("\n\n");
}
void Augmentation::dumpArgs(SparseBitVector &subArgs) {
printf("\n");
printf("\n");
auto getArgType = [](ArgType argType) {
if (argType == Augmentation::ArgType::DefBeforeEachCall)
return "DefBeforeCall";
else if (argType == Augmentation::ArgType::Init)
return "Init";
else if (argType == Augmentation::ArgType::LiveThrough)
return "LiveThrough";
else if (argType == Augmentation::ArgType::Unknown)
return "Unknown";
return "???";
};
for (auto i : subArgs) {
printf("Arg = %s (%d) - %s\n", gra.incRA.getLRs()[i]->getDcl()->getName(),
i, getArgType(argsRetVal[lrs[i]->getDcl()].argType));
}
printf("\n");
}
//
// Mark interference between newDcl and other incompatible dcls in current
// active lists.
//
void Augmentation::buildSIMDIntfDcl(G4_Declare *newDcl) {
auto newDclAugMask = gra.getAugmentationMask(newDcl);
auto intfNeededForNewDcl =
(gra.incRA.isEnabled() && gra.incRA.hasAnyCandidates())
? gra.incRA.intfNeededForVar(newDcl)
: true;
auto id1 = newDcl->getRegVar()->getId();
auto newDclRAPartaker = newDcl->getRegVar()->isRegAllocPartaker();
auto intfNeeded = [&](G4_Declare *otherDcl) {
if (!intfNeededForNewDcl && !gra.incRA.intfNeededForVar(otherDcl)) {
return false;
}
auto otherRegVar = otherDcl->getRegVar();
if (newDclRAPartaker && otherRegVar->isRegAllocPartaker()) {
auto id2 = otherRegVar->getId();
if (intf.interfereBetween(id1, id2)) {
return false;
}
}
return true;
};
  if (newDclAugMask == AugmentationMasks::NonDefault) {
for (auto& defaultEntry : defaultMaskQueue) {
auto defaultDcl = defaultEntry.dcl;
if (!intfNeeded(defaultDcl))
continue;
handleSIMDIntf(defaultDcl, newDcl, false);
}
} else {
for (auto &defaultEntry : defaultMaskQueue) {
auto defaultDcl = defaultEntry.dcl;
auto defaultDclAugMask = gra.getAugmentationMask(defaultDcl);
if (defaultDclAugMask != newDclAugMask) {
if (!intfNeeded(defaultDcl))
continue;
handleSIMDIntf(defaultDcl, newDcl, false);
} else {
if (liveAnalysis.livenessClass(G4_GRF) &&
// Populate compatible sparse intf data structure
// only for weak edges.
weakEdgeNeeded(defaultDclAugMask, newDclAugMask)) {
if (defaultDcl->getRegVar()->isPhyRegAssigned() &&
newDcl->getRegVar()->isPhyRegAssigned()) {
continue;
}
if (intf.isStrongEdgeBetween(defaultDcl, newDcl)) {
// No need to add weak edge
continue;
}
// defaultDcl and newDcl are compatible live-ranges and can have weak
// edge in intf graph
intf.compatibleSparseIntf[defaultDcl].insert(newDcl);
intf.compatibleSparseIntf[newDcl].insert(defaultDcl);
}
}
}
}
// Mark interference among non-default mask variables
for (auto &nonDefaultEntry : nonDefaultMaskQueue) {
auto nonDefaultDcl = nonDefaultEntry.dcl;
auto isAugNeeded = [&]() {
if (newDclAugMask != AugmentationMasks::NonDefault)
return true;
// Skip augmentation check if both dcls are infinite spill cost tmp dcls
// generated by RA. Such dcls have their interference correctly computed
// by conventional interference computation. In case of address taken
// spill/fill dcls, applying augmentation on them causes unexpected
// interference edges.
//
// Unexpected intf shows up because we reuse dcl for address taken
// spill/fill across BBs. As per generated code, such address taken
// spill/fill dcl ranges are live only around the indirect operand. Also,
// these ranges are never live across BBs. As augmentation models
// live-intervals without holes, it ends up with unnecessary
// interferences. Here is such an example of unnecessary interference
// edge:
//
// BB1:
// A0 = &ADDR_SP_FL_1 + offset
// (W) Fill ADDR_SP_FL_1
// r[A0] = ...
      // (W) Spill ADDR_SP_FL_1
//
// BB2:
// (W) Fill FL_V10
// = FL_V10
//
// BB10:
// (W) Fill ADDR_SP_FL_1
// r[A0] = ...
      // (W) Spill ADDR_SP_FL_1
//
      // ADDR_SP_FL_1 and FL_V10 shouldn't interfere. Without the logic below,
      // they would interfere, making RA results worse.
auto regVar1 = nonDefaultDcl->getRegVar();
auto regVar2 = newDcl->getRegVar();
if (!((regVar1->isRegVarTmp() || regVar1->isRegVarTransient() ||
regVar1->isRegVarCoalesced()) &&
(regVar2->isRegVarTmp() || regVar2->isRegVarTransient() ||
regVar2->isRegVarCoalesced())))
return true;
// Both dcls are RA tmps. Ordinarily they're never live-out of any BB. If
// any of them is live across BBs then it's possible they don't interfere
// as per scalar liveness but they may interfere due to divergent CF.
// For example:
//
// if(cond)
// (W) V1 = ...
// else
// (W) V2 = ...
// = V2
// endif
//
// = V1
//
      // In the above example, V1 doesn't interfere with V2 as per scalar
      // liveness, but it should if the branch were divergent. For correctness
      // we need to mark V1 and V2 as interfering. Since they're never live
      // together as per scalar liveness, they need to be handled in
      // augmentation. This case shouldn't occur for RA tmps, as RA-generated
      // spill/fill tmps are transient and never live-out of any BB. Still
      // adding the check to be safe.
auto id1 = regVar1->getId();
auto id2 = regVar2->getId();
for (auto bb : kernel.fg.getBBList()) {
if (liveAnalysis.isLiveAtExit(bb, id1) ||
liveAnalysis.isLiveAtExit(bb, id2))
return true;
}
// Conventional intf construction correctly handles the scenario when V1
// and V2 are referenced in single (same) BB.
return false;
};
if (!isAugNeeded())
continue;
if (!intfNeeded(nonDefaultDcl))
continue;
// Non-default masks are different so mark interference.
// SIMD interference for call sites is handled separately.
handleSIMDIntf(nonDefaultDcl, newDcl, false);
}
}
//
// Mark interference between newDcl and other incompatible dcls in current
// active lists. If newDcl was created for a subroutine call, do this for all
// variables in the function summary.
//
void Augmentation::storeOverlapWithCallRet(G4_Declare *newDcl,
const std::vector<bool>& globalVars) {
vISA_ASSERT(callDclMap.count(newDcl) > 0, "expecting newDcl in map");
auto& callDclData = callDclMap[newDcl];
if (liveAnalysis.livenessClass(G4_GRF)) // For return value
{
G4_INST *callInst = callDclData.first;
auto* varDcl = callInst->getDst()->getTopDcl();
addSIMDIntfForRetDclares(varDcl, globalVars);
}
auto *callBB = callDclData.second;
addSIMDIntfDclForCallSite(callBB, globalVars);
}
//
// Perform linear scan and mark interference between conflicting dcls with
// incompatible masks.
//
void Augmentation::buildInterferenceIncompatibleMask() {
  // Collect global vars in a bit vector for quick lookup
std::vector<bool> globalVars(liveAnalysis.getNumSelectedVar(), false);
if (!kernel.fg.funcInfoTable.empty()) {
for (auto bit : liveAnalysis.globalVars)
globalVars[bit] = true;
}
// Create 2 active lists - 1 for holding active live-intervals
// with non-default mask and other for default mask
for (auto &interval : sortedIntervals) {
auto *newDcl = interval.dcl;
unsigned startIdx = interval.interval.start->getLexicalId();
VISA_DEBUG_VERBOSE(std::cout << "New idx " << startIdx << "\n");
expireIntervals(startIdx);
if (callDclMap.count(newDcl) > 0) {
storeOverlapWithCallRet(newDcl, globalVars);
} else {
buildSIMDIntfDcl(newDcl);
}
// Add newDcl to correct list
if (gra.getHasNonDefaultMaskDef(newDcl) || newDcl->getAddressed() == true) {
nonDefaultMaskQueue.push(interval);
VISA_DEBUG_VERBOSE(std::cout << "Adding " << newDcl->getName()
<< " to non-default list\n");
} else {
defaultMaskQueue.push(interval);
VISA_DEBUG_VERBOSE(std::cout << "Adding " << newDcl->getName()
<< " to default list\n");
}
}
for (auto func : kernel.fg.funcInfoTable) {
buildInteferenceForCallsite(func);
}
buildInteferenceForRetDeclares();
}
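// Add augmentation interference between every declare in dcls and the
// default/non-default mask declares recorded in mask, applying the same
// compatibility and weak-edge rules as buildSIMDIntfDcl().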
void Augmentation::buildInteferenceForCallSiteOrRetDeclare(std::vector<G4_Declare*>& dcls,
MaskDeclares *mask) {
for (auto newDcl : dcls) {
auto newDclAugMask = gra.getAugmentationMask(newDcl);
auto intfNeededForNewDcl =
(gra.incRA.isEnabled() && gra.incRA.hasAnyCandidates())
? gra.incRA.intfNeededForVar(newDcl)
: true;
auto id1 = newDcl->getRegVar()->getId();
auto newDclRAPartaker = newDcl->getRegVar()->isRegAllocPartaker();
auto intfNeeded = [&](G4_Declare *otherDcl) {
if (!intfNeededForNewDcl && !gra.incRA.intfNeededForVar(otherDcl)) {
return false;
}
auto otherRegVar = otherDcl->getRegVar();
if (newDclRAPartaker && otherRegVar->isRegAllocPartaker()) {
auto id2 = otherRegVar->getId();
if (intf.interfereBetween(id1, id2)) {
return false;
}
}
return true;
};
if (newDclAugMask == AugmentationMasks::NonDefault) {
for (auto i : mask->first) {
G4_Declare *defaultDcl = lrs[i]->getDcl();
if (!intfNeeded(defaultDcl))
continue;
handleSIMDIntf(defaultDcl, newDcl, true);
}
} else {
for (auto i : mask->first) {
G4_Declare *defaultDcl = lrs[i]->getDcl();
if (gra.getAugmentationMask(defaultDcl) != newDclAugMask) {
if (!intfNeeded(defaultDcl))
continue;
handleSIMDIntf(defaultDcl, newDcl, true);
} else {
if (liveAnalysis.livenessClass(G4_GRF) &&
// Populate compatible sparse intf data structure
// only for weak edges.
weakEdgeNeeded(gra.getAugmentationMask(defaultDcl),
newDclAugMask)) {
if (defaultDcl->getRegVar()->isPhyRegAssigned() &&
newDcl->getRegVar()->isPhyRegAssigned()) {
continue;
}
if (intf.isStrongEdgeBetween(defaultDcl, newDcl)) {
// No need to add weak edge
continue;
}
// defaultDcl and newDcl are compatible live-ranges and can have
// weak edge in intf graph
intf.compatibleSparseIntf[defaultDcl].insert(newDcl);
intf.compatibleSparseIntf[newDcl].insert(defaultDcl);
}
}
}
}
for (auto i : mask->second) {
if (!intfNeeded(lrs[i]->getDcl()))
continue;
// Mark interference among non-default mask variables
G4_Declare *nonDefaultDcl = lrs[i]->getDcl();
// Non-default masks are different so mark interference
handleSIMDIntf(nonDefaultDcl, newDcl, true);
}
}
}
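// Convert a sparse bit vector of live-range ids into the corresponding
// vector of declares.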
std::vector<G4_Declare *> SBitToVector(SparseBitVector *sparseBitVector,
const LiveRangeVec &lrs) {
std::vector<G4_Declare *> retVector;
for (auto bit : *sparseBitVector) {
auto *varDcl = lrs[bit]->getDcl();
retVector.push_back(varDcl);
}
return retVector;
}
// This method is invoked once per subroutine func.
void Augmentation::buildInteferenceForCallsite(FuncInfo *func) {
auto maydefConst = liveAnalysis.subroutineMaydef.find(func);
if (maydefConst != liveAnalysis.subroutineMaydef.end()) {
auto *maydef = const_cast<SparseBitVector *>(&maydefConst->second);
std::vector<G4_Declare *> maydefDcls(SBitToVector(maydef, lrs));
buildInteferenceForCallSiteOrRetDeclare(maydefDcls, &overlapDclsWithFunc[func]);
}
if (gra.useLocalRA) {
std::vector<G4_Declare *> lraDcls;
for (uint32_t j = 0; j < kernel.getNumRegTotal(); j++) {
if (localSummaryOfCallee[func].isGRFBusy(j)) {
G4_Declare *varDcl = gra.getGRFDclForHRA(j);
lraDcls.push_back(varDcl);
}
}
buildInteferenceForCallSiteOrRetDeclare(lraDcls, &overlapDclsWithFunc[func]);
}
}
void Augmentation::buildInteferenceForRetDeclares() {
for (auto &retDclIt : retDeclares) {
std::vector<G4_Declare *> retDclVec({retDclIt.first});
buildInteferenceForCallSiteOrRetDeclare(retDclVec, &retDclIt.second);
}
}
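// For each subroutine, union the local RA summaries of its own BBs with the
// summaries already computed for its callees so that localSummaryOfCallee
// records every GRF local RA may use while the subroutine (or anything it
// calls) executes.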
void Augmentation::buildSummaryForCallees() {
int totalGRFNum = kernel.getNumRegTotal();
for (auto func : kernel.fg.sortedFuncTable) {
unsigned fid = func->getId();
if (fid == UINT_MAX) {
// entry kernel
continue;
}
PhyRegSummary funcSummary(kernel.fg.builder, totalGRFNum);
for (auto &&bb : func->getBBList()) {
if (auto summary = gra.getBBLRASummary(bb)) {
for (int i = 0; i < totalGRFNum; i++) {
if (summary->isGRFBusy(i)) {
funcSummary.setGRFBusy(i);
}
}
}
}
for (auto &&callee : func->getCallees()) {
PhyRegSummary *summary = &localSummaryOfCallee[callee];
if (summary) {
for (int i = 0; i < totalGRFNum; i++) {
if (summary->isGRFBusy(i)) {
funcSummary.setGRFBusy(i);
}
}
}
}
localSummaryOfCallee[func] = std::move(funcSummary);
}
}
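// Top-level augmentation driver: build live-intervals (with holes when
// vISA_NewAugmentation is set), sort them, and run a linear scan that adds
// interference between intervals with incompatible emasks. Runs only for
// VISA_3D targets with non-address liveness classes and more than 2 BBs
// (unless vISA_DumpRegChart is set). Also feeds the sorted intervals to
// debug info, reg chart, spill analysis and the augmentation verifier when
// those options are enabled.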
void Augmentation::augmentIntfGraph() {
if (!(kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
!liveAnalysis.livenessClass(G4_ADDRESS) && kernel.fg.size() > 2)) {
if (!kernel.getOption(vISA_DumpRegChart)) {
return;
}
}
if (gra.useLocalRA) {
buildSummaryForCallees();
}
bool augWithHoles = kernel.getOption(vISA_NewAugmentation);
// First check whether any definitions exist with incompatible mask
bool nonDefaultMaskDef = markNonDefaultMaskDef();
if (nonDefaultMaskDef == true) {
if (augWithHoles) {
if (kernel.fg.getNumFuncs() > 0)
populateFuncMaps();
populateHomeFunc();
// Atleast one definition with non-default mask was found so
// perform steps to augment intf graph with such defs
// Discover and store subroutine arguments
if (hasSubroutines) {
for (auto &subroutine : kernel.fg.sortedFuncTable) {
discoverArgs(subroutine);
discoverRetVal(subroutine);
// Now build live-intervals per subroutine. This function will
// calculate live-intervals and assign start/end inst for
// respective declares.
buildLiveIntervals(subroutine);
}
} else {
buildLiveIntervals(kernel.fg.kernelInfo);
}
// Create live-intervals for Unknown arg and retval
buildUnknownArgRetval();
} else {
buildLiveIntervals();
}
// Sort live-intervals based on their start
sortLiveIntervals();
if (kernel.getOption(vISA_DumpLiveRanges)) {
dumpLiveRanges(gra, sortedIntervals);
}
if (kernel.getOption(vISA_DumpRegChart)) {
gra.regChart = std::make_unique<RegChartDump>(gra);
gra.regChart->recordLiveIntervals(sortedIntervals);
}
if (gra.verifyAugmentation) {
gra.verifyAugmentation->loadAugData(
sortedIntervals, lrs, callDclMap,
intf.liveAnalysis->getNumSelectedVar(), &intf, gra);
}
if (kernel.getOption(vISA_SpillAnalysis)) {
if (gra.spillAnalysis.get())
gra.spillAnalysis->LoadAugIntervals(sortedIntervals, gra);
}
if (kernel.fg.builder->getOption(vISA_GenerateDebugInfo)) {
// Following is done to prevent passing GlobalRA to debug info function
// for clear interface.
std::vector<std::tuple<G4_Declare *, G4_INST *, G4_INST *>> dclIntervals;
dclIntervals.reserve(sortedIntervals.size());
for (auto &interval : sortedIntervals) {
auto dcl = interval.dcl;
dclIntervals.push_back(std::make_tuple(dcl, interval.interval.start,
interval.interval.end));
}
updateDebugInfo(kernel, std::move(dclIntervals));
}
// Perform linear scan to augment graph
buildInterferenceIncompatibleMask();
if (liveAnalysis.livenessClass(G4_GRF)) {
if ((GlobalRA::useGenericAugAlign(kernel.getPlatformGeneration()) &&
kernel.getSimdSize() >= kernel.numEltPerGRF<Type_UD>()) ||
(!GlobalRA::useGenericAugAlign(kernel.getPlatformGeneration()) &&
kernel.getSimdSize() > kernel.numEltPerGRF<Type_UD>())) {
// Set alignment of all GRF candidates
// to 2GRF except for NoMask variables
VISA_DEBUG_VERBOSE(std::cout
<< "Kernel size is SIMD" << kernel.getSimdSize()
<< " so updating all GRFs to aug align"
<< "\n");
gra.augAlign();
}
gra.updateSubRegAlignment(kernel.getGRFAlign());
}
}
}
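// Add interference between graph coloring candidates and GRFs assigned by
// local RA within bb. The BB is walked bottom-up; 'cur' tracks which GRFs
// are currently available (bit set) as opposed to busy, and 'live' tracks
// the candidates live at each point.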
void Interference::buildInterferenceWithLocalRA(G4_BB *bb) {
auto LRASummary = gra.getBBLRASummary(bb);
if (LRASummary == nullptr) {
return;
}
BitSet cur(kernel.getNumRegTotal(), true);
SparseBitVector live;
std::vector<int> curUpdate;
buildInterferenceAtBBExit(bb, live);
VISA_DEBUG_VERBOSE(std::cout << "BB" << bb->getId() << "\n");
for (INST_LIST_RITER rit = bb->rbegin(), rend = bb->rend(); rit != rend;
rit++) {
bool update = false;
G4_INST *inst = (*rit);
curUpdate.clear();
VISA_DEBUG_VERBOSE({
inst->emit(std::cout);
std::cout << "\n";
});
// Any physical registers defined will be marked available if
// current inst is first def or if complete region is written
G4_DstRegRegion *dst = inst->getDst();
if (dst && dst->getBase()->isRegVar()) {
LocalLiveRange *localLR = NULL;
G4_Declare *topdcl = GetTopDclFromRegRegion(dst);
unsigned t;
if (topdcl)
localLR = gra.getLocalLR(topdcl);
if (localLR && localLR->getAssigned() && !localLR->isEOT()) {
int reg, sreg, numrows;
G4_VarBase *preg = localLR->getPhyReg(sreg);
numrows = localLR->getTopDcl()->getNumRows();
vISA_ASSERT(preg->isGreg(), "Register in dst was not GRF");
reg = preg->asGreg()->getRegNum();
// Check whether the dst physical register is busy/available.
// If it is available and we still see a def, that means there was no
// corresponding use. In such cases mark the physical register as
// busy, so interference building can take place correctly.
for (int j = reg, sum = reg + numrows; j < sum; j++) {
int k = getGRFDclForHRA(j)->getRegVar()->getId();
if (cur.isSet(j) == true) {
buildInterferenceWithLive(live, k);
VISA_DEBUG_VERBOSE(
std::cout << "Found no use for r" << j
<< ".0 so marking it as interfering with live set"
<< "\n");
}
}
if ((localLR->getFirstRef(t) == inst) ||
liveAnalysis->writeWholeRegion(bb, inst, dst)) {
// The last row may be only partially used by the current dcl,
// so we still need to pessimistically keep the last row marked
// as busy, because some other live src opnd may still be using
// the remaining part of that GRF.
if (localLR->getSizeInWords() % kernel.numEltPerGRF<Type_UW>() != 0)
numrows--;
for (int j = reg, sum = reg + numrows; j < sum; j++) {
cur.set(j, true);
VISA_DEBUG_VERBOSE(std::cout << "Setting r" << j << ".0 available"
<< "\n");
}
// Build interference only for point ranges, which ideally shouldn't
// exist. These are ranges that have a def, but no use.
if (localLR->getFirstRef(t) == localLR->getLastRef(t)) {
for (int j = reg; j < reg + localLR->getTopDcl()->getNumRows();
j++) {
int k = getGRFDclForHRA(j)->getRegVar()->getId();
buildInterferenceWithLive(live, k);
}
}
}
} else if (dst->getBase()->isRegAllocPartaker()) {
// Global range
// In bottom-up order, if the live-range has not started then
// a use was not seen for this def. We still need to ensure this
// variable interferes with all other live vars.
bool isPointRange = !live.test(dst->getBase()->asRegVar()->getId());
if (isPointRange) {
// Mark interference with all busy physical registers
for (unsigned i = 0; i < kernel.getNumRegTotal(); i++) {
if (cur.isSet(i) == false) {
int k = getGRFDclForHRA(i)->getRegVar()->getId();
checkAndSetIntf(dst->getBase()->asRegVar()->getId(), k);
}
}
}
if (liveAnalysis->writeWholeRegion(bb, inst, dst) ||
inst->isPseudoKill()) {
// Whole write or first def found so mark this operand as not live for
// earlier instructions
auto id = dst->getBase()->asRegVar()->getId();
updateLiveness(live, id, false);
}
} else if (dst->isIndirect() && liveAnalysis->livenessClass(G4_GRF)) {
// make every var in points-to set live
const REGVAR_VECTOR &pointsToSet =
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(dst,
bb);
for (auto &pt : pointsToSet) {
if (pt.var->isRegAllocPartaker()) {
updateLiveness(live, pt.var->getId(), true);
}
}
}
}
// Any physical registers used by src opnds will be busy before the current
// inst
for (int i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++) {
G4_Operand *src = inst->getSrc(i);
if (src && src->isSrcRegRegion() &&
src->asSrcRegRegion()->getBase()->isRegVar()) {
LocalLiveRange *localLR = NULL;
G4_Declare *topdcl = GetTopDclFromRegRegion(src);
if (topdcl)
localLR = gra.getLocalLR(topdcl);
if (localLR && localLR->getAssigned() && !localLR->isEOT()) {
int sreg;
G4_VarBase *preg = localLR->getPhyReg(sreg);
int numrows = localLR->getTopDcl()->getNumRows();
vISA_ASSERT(preg->isGreg(), "Register in src was not GRF");
int reg = preg->asGreg()->getRegNum();
for (int j = reg, sum = reg + numrows; j < sum; j++) {
int k = getGRFDclForHRA(j)->getRegVar()->getId();
if (cur.isSet(j) == true) {
// G4_RegVar with id k was marked free, but becomes
// busy at this instruction. For incremental updates
// push this to a vector and use it while updating
// interference graph incrementally.
curUpdate.push_back(k);
}
cur.set(j, false);
VISA_DEBUG_VERBOSE(std::cout << "Setting r" << j << ".0 busy\n");
}
} else if (src->asSrcRegRegion()->getBase()->isRegAllocPartaker()) {
if (live.test(
src->asSrcRegRegion()->getBase()->asRegVar()->getId()) ==
false)
update = true;
// Mark operand as live from this inst upwards
auto id = src->asSrcRegRegion()->getBase()->asRegVar()->getId();
updateLiveness(live, id, true);
} else if (src->asSrcRegRegion()->isIndirect() &&
liveAnalysis->livenessClass(G4_GRF)) {
// make every var in points-to set live
const REGVAR_VECTOR &pointsToSet =
liveAnalysis->getPointsToAnalysis().getAllInPointsToOrIndrUse(
src->asSrcRegRegion(), bb);
for (auto &pt : pointsToSet) {
if (pt.var->isRegAllocPartaker()) {
if (live.test(pt.var->getId()) == false)
update = true;
updateLiveness(live, pt.var->getId(), true);
}
}
}
}
}
if (update == true) {
// Mark interference with all live
for (unsigned i = 0; i < kernel.getNumRegTotal(); i++) {
if (cur.isSet(i) == false) {
int k = getGRFDclForHRA(i)->getRegVar()->getId();
buildInterferenceWithLive(live, k);
}
}
} else {
if (curUpdate.size() > 0) {
// Perform incremental update. This code is executed when:
// 1) the live set is unchanged, i.e. no new global range was started in
//    this inst
// 2) the cur set has changed, i.e. an earlier free GRF has become busy
// Any newly busy GRFs have to be marked as interfering with the
// currently live ranges. There is no need to iterate over all
// busy GRFs; only those GRFs that became busy in this iteration
// need to be considered for incremental updates.
for (int k : curUpdate) {
buildInterferenceWithLive(live, k);
}
}
}
}
for (unsigned i = 0; i < maxId; i++) {
bool isAddrSensitive = liveAnalysis->isAddressSensitive(i);
// If a range is Address taken AND (live-in or live-out or killed)
// mark it to interfere with all physical registers used by local RA
// FIXME: need to check if this is actually needed
if (isAddrSensitive) {
bool assigned = (lrs[i]->getVar()->getPhyReg() != NULL);
if (!assigned) {
bool isLiveIn = liveAnalysis->isLiveAtEntry(bb, i);
bool isLiveOut = liveAnalysis->isLiveAtExit(bb, i);
bool isKilled = liveAnalysis->use_kill[bb->getId()].test(i);
if (isLiveIn || isLiveOut || isKilled) {
// Make it interfere with all physical registers used in the BB
for (uint32_t j = 0, numReg = kernel.getNumRegTotal(); j < numReg;
j++) {
if (LRASummary->isGRFBusy(j)) {
int k = getGRFDclForHRA(j)->getRegVar()->getId();
checkAndSetIntf(i, k);
}
}
}
}
}
}
}
GraphColor::GraphColor(LivenessAnalysis &live, bool hybrid, bool forceSpill_)
: gra(live.gra), totalGRFRegCount(gra.kernel.getNumRegTotal()),
numVar(live.getNumSelectedVar()), intf(&live, gra), regPool(gra.regPool),
builder(gra.builder), lrs(live.gra.incRA.getLRs()), isHybrid(hybrid),
forceSpill(forceSpill_), GCMem(GRAPH_COLOR_MEM_SIZE), kernel(gra.kernel),
liveAnalysis(live)
{
spAddrRegSig.resize(builder.getNumAddrRegisters(), 0);
m_options = builder.getOptions();
}
//
// lrs[i] gives the live range whose id is i
//
void GraphColor::createLiveRanges() {
lrs.resize(numVar);
for (auto dcl : gra.kernel.Declares) {
G4_RegVar *var = dcl->getRegVar();
// Do not include alias vars in live range creation
if (!var->isRegAllocPartaker() || dcl->getAliasDeclare() != NULL) {
continue;
}
lrs[var->getId()] = LiveRange::createNewLiveRange(dcl, gra);
}
}
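//
// Compute the degree of every GRF live range by summing the edge weights to
// all interfering neighbors (including weak-edge neighbors). Degrees are also
// accumulated per bank half so that the even/odd bank split can be sized when
// allocating from banks.
//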
template<bool Support4GRFAlign>
void GraphColor::computeDegreeForGRF() {
for (unsigned i = 0; i < numVar; i++) {
unsigned degree = 0;
if (!(lrs[i]->getIsPseudoNode()) && !(lrs[i]->getIsPartialDcl())) {
const std::vector<unsigned> &intfs = intf.getSparseIntfForVar(i);
unsigned bankDegree = 0;
auto lraBC = lrs[i]->getBC();
bool isOdd = (lraBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
lraBC == BANK_CONFLICT_SECOND_HALF_ODD);
auto computeDegree = [&](LiveRange *lr1) {
if (!lr1->getIsPartialDcl()) {
unsigned edgeDegree = edgeWeightGRF<Support4GRFAlign>(lrs[i], lr1);
degree += edgeDegree;
auto lrsitBC = lr1->getBC();
bool isOddBC = (lrsitBC == BANK_CONFLICT_SECOND_HALF_EVEN ||
lrsitBC == BANK_CONFLICT_SECOND_HALF_ODD);
if ((isOdd && isOddBC) || (!isOdd && !isOddBC)) {
bankDegree += edgeDegree;
}
}
};
for (auto it : intfs) {
computeDegree(lrs[it]);
}
// consider weak edges in degree computation
auto *weakEdges = intf.getCompatibleSparseIntf(lrs[i]->getDcl());
if (weakEdges) {
vISA_ASSERT(!gra.use4GRFAlign, "not expecting weak edges");
for (auto weakNeighbor : *weakEdges) {
if (!weakNeighbor->getRegVar()->isRegAllocPartaker())
continue;
computeDegree(lrs[weakNeighbor->getRegVar()->getId()]);
}
}
if (isOdd) {
oddTotalDegree += bankDegree; // std::max(bankDegree, oddMaxDegree);
oddTotalRegNum += lrs[i]->getNumRegNeeded();
oddMaxRegNum = std::max(oddMaxRegNum, lrs[i]->getNumRegNeeded());
} else {
evenTotalDegree += bankDegree; // std::max(bankDegree, evenMaxDegree);
evenTotalRegNum += lrs[i]->getNumRegNeeded();
evenMaxRegNum = std::max(evenMaxRegNum, lrs[i]->getNumRegNeeded());
}
}
lrs[i]->setDegree(degree);
}
if (kernel.getOption(vISA_SpillAnalysis)) {
for (unsigned int i = 0; i != numVar; ++i) {
auto dcl = lrs[i]->getDcl();
auto degree = lrs[i]->getDegree();
gra.spillAnalysis->LoadDegree(dcl, degree);
}
}
}
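//
// Compute the degree of every ARF (address/flag) live range by summing the
// edge weights to all interfering neighbors.
//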
void GraphColor::computeDegreeForARF() {
for (unsigned i = 0; i < numVar; i++) {
unsigned degree = 0;
if (!(lrs[i]->getIsPseudoNode())) {
const std::vector<unsigned> &intfs = intf.getSparseIntfForVar(i);
for (auto it : intfs) {
degree += edgeWeightARF(lrs[i], lrs[it]);
}
}
lrs[i]->setDegree(degree);
}
}
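//
// Compute a spill cost for every live range: pseudo nodes and split local
// ranges get minimal cost, must-color candidates (tiny spill/fill ranges,
// do-not-spill dcls, etc.) get the maximum cost so they are picked first for
// coloring, and regular ranges get a heuristic cost derived from reference
// count, size and degree. Address-sensitive ranges are then bumped above all
// normal ranges.
//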
void GraphColor::computeSpillCosts(bool useSplitLLRHeuristic, const RPE *rpe) {
LiveRangeVec addressSensitiveVars;
float maxNormalCost = 0.0f;
VarReferences directRefs(kernel, true, false);
std::unordered_map<G4_Declare *, std::list<std::pair<G4_INST *, G4_BB *>>>
indirectRefs;
// when reg pressure is not very high in iter0, use spill cost function
// that favors allocating large variables
bool useNewSpillCost =
(builder.getOption(vISA_NewSpillCostFunctionISPC) ||
builder.getOption(vISA_NewSpillCostFunction)) &&
rpe &&
!(gra.getIterNo() == 0 &&
(float)rpe->getMaxRP() < (float)kernel.getNumRegTotal() * 0.80f);
RA_TRACE({
if (useNewSpillCost)
std::cout << "\t--using new spill cost function\n";
});
if (useNewSpillCost && liveAnalysis.livenessClass(G4_GRF)) {
// gather all instructions with indirect operands
// for ref count computation once.
for (auto bb : kernel.fg.getBBList()) {
for (auto inst : *bb) {
auto dst = inst->getDst();
if (dst && dst->isIndirect()) {
auto pointsTo = liveAnalysis.getPointsToAnalysis().getAllInPointsTo(
dst->getBase()
->asRegVar()
->getDeclare()
->getRootDeclare()
->getRegVar());
if (pointsTo) {
for (auto &pointee : *pointsTo)
indirectRefs[pointee.var->getDeclare()->getRootDeclare()]
.push_back(std::make_pair(inst, bb));
}
continue;
}
for (unsigned int i = 0; i != inst->getNumSrc(); ++i) {
auto src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion() ||
!src->asSrcRegRegion()->isIndirect()) {
continue;
}
auto pointsTo = liveAnalysis.getPointsToAnalysis().getAllInPointsTo(
src->asSrcRegRegion()
->getBase()
->asRegVar()
->getDeclare()
->getRootDeclare()
->getRegVar());
if (pointsTo) {
for (auto &pointee : *pointsTo)
indirectRefs[pointee.var->getDeclare()->getRootDeclare()]
.push_back(std::make_pair(inst, bb));
}
continue;
}
}
}
}
auto getWeightedRefCount = [&](G4_Declare *dcl, unsigned int useWt = 1,
unsigned int defWt = 1) {
auto defs = directRefs.getDefs(dcl);
auto uses = directRefs.getUses(dcl);
auto &loops = kernel.fg.getLoops();
unsigned int refCount = 0;
const unsigned int assumeLoopIter = 10;
if (defs) {
for (auto &def : *defs) {
auto *bb = std::get<1>(def);
auto *innerMostLoop = loops.getInnerMostLoop(bb);
if (innerMostLoop) {
auto nestingLevel = innerMostLoop->getNestingLevel();
refCount += (unsigned int)std::pow(assumeLoopIter, nestingLevel);
} else
refCount += defWt;
}
}
if (uses) {
for (auto &use : *uses) {
auto *bb = std::get<1>(use);
auto *innerMostLoop = loops.getInnerMostLoop(bb);
if (innerMostLoop) {
auto nestingLevel = innerMostLoop->getNestingLevel();
refCount += (unsigned int)std::pow(assumeLoopIter, nestingLevel);
} else
refCount += useWt;
}
}
if (dcl->getAddressed()) {
auto indirectRefsIt = indirectRefs.find(dcl);
if (indirectRefsIt != indirectRefs.end()) {
auto &dclIndirRefs = (*indirectRefsIt).second;
for (auto &item : dclIndirRefs) {
auto bb = item.second;
auto *innerMostLoop = loops.getInnerMostLoop(bb);
if (innerMostLoop) {
auto nestingLevel = innerMostLoop->getNestingLevel();
refCount += (unsigned int)std::pow(assumeLoopIter, nestingLevel);
} else
refCount += useWt;
}
}
}
return refCount == 0 ? 1 : refCount;
};
std::unordered_map<const G4_Declare *, std::vector<G4_Declare *>>
addrTakenMap;
std::unordered_map<G4_Declare *, std::vector<const G4_Declare *>>
revAddrTakenMap;
bool addrMapsComputed = false;
auto incSpillCostCandidate = [&](LiveRange *lr) {
if (kernel.getOption(vISA_IncSpillCostAllAddrTaken))
return true;
if (!addrMapsComputed) {
const_cast<PointsToAnalysis &>(liveAnalysis.getPointsToAnalysis())
.getPointsToMap(addrTakenMap);
const_cast<PointsToAnalysis &>(liveAnalysis.getPointsToAnalysis())
.getRevPointsToMap(revAddrTakenMap);
addrMapsComputed = true;
}
// This condition is a safety measure and isn't expected to be true.
auto it = revAddrTakenMap.find(lr->getDcl());
if (it == revAddrTakenMap.end())
return true;
for (auto &addrVar : (*it).second) {
if (addrTakenMap.count(addrVar) > 1)
return true;
}
return false;
};
for (unsigned i = 0; i < numVar; i++) {
G4_Declare *dcl = lrs[i]->getDcl();
if (dcl->getIsPartialDcl()) {
continue;
}
//
// The spill cost of pseudo nodes inserted to aid generation of save/restore
// code must be the minimum so that such nodes go to the bottom of the color
// stack.
//
if (builder.kernel.fg.isPseudoDcl(dcl)) {
if (builder.kernel.fg.isPseudoVCADcl(dcl)) {
lrs[i]->setSpillCost(MINSPILLCOST + 1);
} else {
lrs[i]->setSpillCost(MINSPILLCOST);
}
}
auto dclLR = gra.getLocalLR(dcl);
if (dclLR != NULL && dclLR->getSplit()) {
lrs[i]->setSpillCost(MINSPILLCOST + 2);
}
//
// Give the tiny spill/fill ranges an infinite spill cost, so that they are
// picked first for coloring.
// Also ARF live ranges with exclusively sequential references within the
// code are assigned an infinite spill cost, as spilling them will not lower
// the register pressure in the region where they are referenced. This does
// not necessarily hold for GRF live ranges, as these are potentially large
// in size, but the portions accessed by each sequential use are limited to 2
// registers for general instructions and 8 registers for SEND instructions.
//
else if (gra.isAddrFlagSpillDcl(dcl) || lrs[i]->isRetIp() ||
lrs[i]->getIsInfiniteSpillCost() == true ||
((lrs[i]->getVar()->isRegVarTransient() == true ||
lrs[i]->getVar()->isRegVarTmp() == true) &&
lrs[i]->getVar()->isSpilled() == false) ||
dcl == gra.getOldFPDcl() ||
(!builder.canReadR0() && dcl == builder.getBuiltinR0())) {
lrs[i]->setSpillCost(MAXSPILLCOST);
} else if (dcl->isDoNotSpill()) {
lrs[i]->setSpillCost(MAXSPILLCOST);
}
//
// Calculate spill costs of regular nodes.
//
else {
float spillCost = 0.0f;
// NOTE: Add 1 to degree to avoid divide-by-0, as a live range may have no
// neighbors
if (builder.kernel.getInt32KernelAttr(Attributes::ATTR_Target) ==
VISA_3D) {
if (useSplitLLRHeuristic) {
spillCost = 1.0f * lrs[i]->getRefCount() / (lrs[i]->getDegree() + 1);
} else {
vASSERT(lrs[i]->getDcl()->getTotalElems() > 0);
if (!liveAnalysis.livenessClass(G4_GRF) || !useNewSpillCost) {
// address or flag variables
unsigned short numRows = lrs[i]->getDcl()->getNumRows();
spillCost = 1.0f * lrs[i]->getRefCount() * lrs[i]->getRefCount() *
lrs[i]->getDcl()->getByteSize() *
(float)sqrt(lrs[i]->getDcl()->getByteSize()) /
((float)sqrt(lrs[i]->getDegree() + 1) *
(float)(sqrt(sqrt(numRows))));
} else {
// GRF variables
auto refCount = getWeightedRefCount(lrs[i]->getDcl());
spillCost = 1.0f * refCount * refCount * refCount /
((float)(lrs[i]->getDegree() + 1) *
(float)(lrs[i]->getDegree() + 1));
}
}
} else {
if (!useNewSpillCost) {
spillCost = liveAnalysis.livenessClass(G4_GRF)
? lrs[i]->getDegree()
: 1.0f * lrs[i]->getRefCount() *
lrs[i]->getRefCount() /
(lrs[i]->getDegree() + 1);
} else {
auto refCount = getWeightedRefCount(lrs[i]->getDcl());
spillCost = 1.0f * refCount * refCount * refCount /
((float)(lrs[i]->getDegree() + 1) *
(float)(lrs[i]->getDegree() + 1));
}
}
lrs[i]->setSpillCost(spillCost);
// Track address sensitive live range.
if (liveAnalysis.isAddressSensitive(i) && incSpillCostCandidate(lrs[i])) {
addressSensitiveVars.push_back(lrs[i]);
} else {
// Set the spill cost of all other normal live ranges, and
// track the max normal cost.
if (maxNormalCost < spillCost) {
maxNormalCost = spillCost;
}
}
}
}
//
// Set the spill cost of address sensitive live ranges above all the
// normal live ranges, so that they get colored before all the normal
// live ranges.
//
for (LiveRange *lr : addressSensitiveVars) {
if (lr->getSpillCost() != MAXSPILLCOST) {
lr->setSpillCost(maxNormalCost + lr->getSpillCost());
}
}
}
//
// subtract lr's neighbors that are still in work list
//
void GraphColor::relaxNeighborDegreeGRF(LiveRange *lr) {
if (lr->getIsPseudoNode() || lr->getIsPartialDcl())
return;
unsigned lr_id = lr->getVar()->getId();
unsigned lr2_nreg = lr->getNumRegNeeded();
const std::vector<unsigned> &intfs = intf.getSparseIntfForVar(lr_id);
if (gra.use4GRFAlign) {
unsigned int lr2AugAlign = gra.getAugAlign(lr->getDcl());
for (auto it : intfs) {
LiveRange *lr1 = lrs[it];
if (lr1->getActive() && !lr1->getIsPseudoNode() &&
!(lr1->getIsPartialDcl())) {
unsigned lr1_nreg = lr1->getNumRegNeeded();
unsigned int lr1AugAlign = gra.getAugAlign(lr1->getDcl());
auto w =
edgeWeightWith4GRF(lr1AugAlign, lr2AugAlign, lr1_nreg, lr2_nreg);
relax(lr1, w);
}
}
return;
}
// Handle case where 4GRF align is unsupported
bool lr2EvenAlign = gra.isEvenAligned(lr->getDcl());
for (auto it : intfs) {
LiveRange *lr1 = lrs[it];
if (lr1->getActive() && !lr1->getIsPseudoNode() &&
!(lr1->getIsPartialDcl())) {
unsigned lr1_nreg = lr1->getNumRegNeeded();
unsigned w = 0;
bool lr1EvenAlign = gra.isEvenAligned(lr1->getDcl());
w = edgeWeightGRF(lr1EvenAlign, lr2EvenAlign, lr1_nreg, lr2_nreg);
relax(lr1, w);
}
}
// Weak edges are supported only when 4GRF align is unsupported
auto *weakEdges = intf.getCompatibleSparseIntf(lr->getDcl());
if (weakEdges) {
for (auto weakNeighbor : *weakEdges) {
if (!weakNeighbor->getRegVar()->isRegAllocPartaker())
continue;
auto lr1 = lrs[weakNeighbor->getRegVar()->getId()];
if (lr1->getActive() && !lr1->getIsPseudoNode() &&
!(lr1->getIsPartialDcl())) {
unsigned lr1_nreg = lr1->getNumRegNeeded();
bool lr1EvenAlign = gra.isEvenAligned(lr1->getDcl());
auto w = edgeWeightGRF(lr1EvenAlign, lr2EvenAlign, lr1_nreg, lr2_nreg);
relax(lr1, w);
}
}
}
}
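//
// Subtract lr's edge weight from each active ARF neighbor still in the work
// list; a neighbor that becomes unconstrained is moved to the unconstrained
// worklist.
//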
void GraphColor::relaxNeighborDegreeARF(LiveRange *lr) {
if (!(lr->getIsPseudoNode())) {
unsigned lr_id = lr->getVar()->getId();
const std::vector<unsigned> &intfs = intf.getSparseIntfForVar(lr_id);
for (auto it : intfs) {
LiveRange *lrs_it = lrs[it];
if (lrs_it->getActive() && !lrs_it->getIsPseudoNode()) {
unsigned w = edgeWeightARF(lrs_it, lr);
VISA_DEBUG_VERBOSE({
std::cout << "\t relax ";
lrs_it->dump();
std::cout << " degree(" << lrs_it->getDegree() << ") - " << w << "\n";
});
lrs_it->subtractDegree(w);
unsigned availColor = numColor;
if (lrs_it->getDegree() + lrs_it->getNumRegNeeded() <= availColor) {
unconstrainedWorklist.push_back(lrs_it);
lrs_it->setActive(false);
}
}
}
}
}
static bool compareSpillCost(LiveRange *lr1, LiveRange *lr2) {
return lr1->getSpillCost() < lr2->getSpillCost() ||
(lr1->getSpillCost() == lr2->getSpillCost() &&
lr1->getVar()->getId() < lr2->getVar()->getId());
}
//
// All nodes in the work list are constrained (their degree > max color);
// find one constrained node and move it to the order list.
//
void GraphColor::removeConstrained() {
if (!constrainedWorklist.empty()) {
LiveRange *lr = constrainedWorklist.front();
constrainedWorklist.pop_front();
if (lr->getActive()) {
VISA_DEBUG_VERBOSE({
std::cout << ".... Remove Constrained ";
lr->dump();
std::cout << "\n";
});
if (liveAnalysis.livenessClass(G4_GRF)) {
relaxNeighborDegreeGRF(lr);
} else {
relaxNeighborDegreeARF(lr);
}
colorOrder.push_back(lr);
lr->setActive(false);
}
}
}
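//
// Determine the order in which live ranges are colored: sort the unassigned
// ranges by spill cost, partition them into constrained and unconstrained
// worklists based on degree vs. available colors, then repeatedly simplify
// the graph, appending removed nodes to colorOrder.
//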
void GraphColor::determineColorOrdering() {
numColor = 0;
if (liveAnalysis.livenessClass(G4_GRF))
numColor = totalGRFRegCount - reserveSpillGRFCount;
else if (liveAnalysis.livenessClass(G4_ADDRESS))
numColor = builder.getNumAddrRegisters();
else if (liveAnalysis.livenessClass(G4_FLAG))
numColor = builder.getNumFlagRegisters();
unsigned numUnassignedVar = liveAnalysis.getNumUnassignedVar();
//
// create an array for sorting live ranges
//
LiveRangeVec sorted;
sorted.reserve(numUnassignedVar);
unsigned j = 0;
for (unsigned i = 0; i < numVar; i++) {
if (lrs[i]->getPhyReg() == nullptr && !lrs[i]->getIsPartialDcl()) {
sorted.push_back(lrs[i]);
j++;
}
}
if (gra.incRA.isEnabledWithVerification(kernel)) {
gra.incRA.computeLeftOverUnassigned(sorted, liveAnalysis);
}
vISA_ASSERT(j == numUnassignedVar, ERROR_GRAPHCOLOR);
//
// sort the live range array
//
std::sort(sorted.begin(), sorted.end(), compareSpillCost);
// This will not change the order unless SPGSS is turned on
builder.getFreqInfoManager().sortBasedOnFreq(sorted);
for (unsigned i = 0; i < numUnassignedVar; i++) {
LiveRange *lr = sorted[i];
unsigned availColor = numColor;
availColor = numColor - lr->getNumForbidden();
if (lr->getDegree() + lr->getNumRegNeeded() <= availColor) {
unconstrainedWorklist.push_back(lr);
lr->setActive(false);
if (lr->getRegKind() == G4_GRF) {
// Mark current lr as unconstrained, which means RR algorithm can always
// be applied to the variable.
lr->setUnconstrained(true);
}
} else {
constrainedWorklist.push_back(lr);
lr->setActive(true);
}
}
VISA_DEBUG_VERBOSE({
std::cout << "\nSPILL COST\n";
for (unsigned i = 0; i < numUnassignedVar; i++) {
sorted[i]->dump();
std::cout << "\t spillCost=" << sorted[i]->getSpillCost();
std::cout << "\t degree=" << sorted[i]->getDegree();
std::cout << "\t refCnt=" << sorted[i]->getRefCount();
std::cout << "\t size=" << sorted[i]->getDcl()->getByteSize();
std::cout << "\t active=" << sorted[i]->getActive();
std::cout << "\n";
}
std::cout << "\n";
});
while (!constrainedWorklist.empty() || !unconstrainedWorklist.empty()) {
while (!unconstrainedWorklist.empty()) {
LiveRange *lr = unconstrainedWorklist.front();
unconstrainedWorklist.pop_front();
VISA_DEBUG_VERBOSE({
std::cout << ".... Remove Unconstrained ";
lr->dump();
std::cout << "\n";
});
if (liveAnalysis.livenessClass(G4_GRF)) {
relaxNeighborDegreeGRF(lr);
} else {
relaxNeighborDegreeARF(lr);
}
colorOrder.push_back(lr);
}
removeConstrained();
}
}
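//
// Mark the physical register(s) already assigned to lr (or to its parent for
// a partial dcl) as busy in this PhyRegUsage, so they are excluded when
// coloring the live range currently being processed.
//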
void PhyRegUsage::updateRegUsage(LiveRange *lr) {
G4_Declare *dcl = lr->getDcl();
G4_VarBase *pr;
if (lr->getIsPartialDcl()) {
pr = lrs[lr->getParentLRID()]->getPhyReg();
} else {
pr = lr->getPhyReg();
}
if (!pr) {
return;
}
if (pr->isGreg()) {
if (dcl->getIsPartialDcl()) {
// Assumptions:
// 1. the offset of the sub declare must be G4_WSIZE aligned
// 2. the size of the sub declare must be G4_WSIZE aligned
markBusyForDclSplit(G4_GRF, ((G4_Greg *)pr)->getRegNum(),
(lrs[lr->getParentLRID()]->getPhyRegOff() *
TypeSize(dcl->getElemType()) +
gra.getSubOffset(dcl)) /
G4_WSIZE,
dcl->getByteSize() / G4_WSIZE, dcl->getNumRows());
} else {
markBusyGRF(
((G4_Greg *)pr)->getRegNum(),
PhyRegUsage::offsetAllocUnit(lr->getPhyRegOff(), dcl->getElemType()),
dcl->getWordSize(), lr->getNumRegNeeded(), dcl->isPreDefinedVar());
}
} else if (pr->isFlag()) {
auto flagWordOffset = lr->getPhyReg()->asAreg()->getFlagNum() * 2;
markBusyFlag(
0,
PhyRegUsage::offsetAllocUnit(flagWordOffset + lr->getPhyRegOff(),
dcl->getElemType()),
PhyRegUsage::numAllocUnit(dcl->getNumElems(), dcl->getElemType()),
dcl->getNumRows());
} else if (pr->isA0()) {
markBusyAddress(
0, PhyRegUsage::offsetAllocUnit(lr->getPhyRegOff(), dcl->getElemType()),
PhyRegUsage::numAllocUnit(dcl->getNumElems(), dcl->getElemType()),
dcl->getNumRows());
}
else if (pr->isS0()) {
markBusyScalar(
0, PhyRegUsage::offsetAllocUnit(lr->getPhyRegOff(), dcl->getElemType()),
PhyRegUsage::numAllocUnit(dcl->getNumElems(), dcl->getElemType()),
dcl->getNumRows());
}
else {
vISA_ASSERT(false, ERROR_GRAPHCOLOR); // un-handled reg type
}
}
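//
// Walk colorOrder and try to assign a physical register to each live range
// with the given heuristic. Returns false to abort the pass when round-robin
// (or bank-conflict first-fit without vISA_forceBCR) fails on a GRF/flag
// candidate; otherwise unassignable candidates are recorded in spilledLRs.
//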
bool GraphColor::assignColors(ColorHeuristic colorHeuristicGRF,
bool doBankConflict, bool highInternalConflict,
bool doBundleConflict) {
RA_TRACE(std::cout << "\t--"
<< (colorHeuristicGRF == ROUND_ROBIN ? "round-robin"
: "first-fit")
<< (doBankConflict ? " BCR" : "") << " graph coloring\n");
unsigned bank1_end = 0;
unsigned bank2_end = totalGRFRegCount - 1;
unsigned bank1_start = 0;
unsigned bank2_start = totalGRFRegCount - 1;
unsigned totalGRFNum = kernel.getNumRegTotal();
bool oneGRFBankDivision = gra.kernel.fg.builder->oneGRFBankDivision();
bool allocFromBanks =
liveAnalysis.livenessClass(G4_GRF) && builder.lowHighBundle() &&
!builder.getOptions()->getuInt32Option(vISA_ReservedGRFNum) &&
doBankConflict &&
((oneGRFBankDivision && gra.kernel.getSimdSize() >= g4::SIMD16) ||
(!oneGRFBankDivision && highInternalConflict));
if (allocFromBanks && (colorHeuristicGRF == ROUND_ROBIN)) {
bank1_end = (unsigned)((totalGRFRegCount - 1) *
(((float)evenTotalDegree / evenTotalRegNum) /
(((float)evenTotalDegree / evenTotalRegNum) +
((float)oddTotalDegree / oddTotalRegNum))));
if (bank1_end < evenMaxRegNum ||
totalGRFRegCount - bank1_end < oddMaxRegNum ||
bank1_end == totalGRFRegCount - 1 || bank1_end == 0) {
// FIXME: How can we early return without assigning???
return false;
}
bank2_end = bank1_end + 1;
}
G4_RegFileKind rFile = G4_GRF;
if (liveAnalysis.livenessClass(G4_FLAG))
rFile = G4_FLAG;
else if (liveAnalysis.livenessClass(G4_ADDRESS))
rFile = G4_ADDRESS;
else if (liveAnalysis.livenessClass(G4_SCALAR))
rFile = G4_SCALAR;
FreePhyRegs FPR(kernel);
unsigned maxGRFCanBeUsed = totalGRFRegCount;
// FIXME: the bank configs should be computed in PhyRegAllocationState instead
// of passed in, but the strange early return from above prevents this.
PhyRegAllocationState parms(gra, lrs, rFile, maxGRFCanBeUsed, bank1_start,
bank1_end, bank2_start, bank2_end, doBankConflict,
doBundleConflict);
bool noIndirForceSpills = builder.getOption(vISA_NoIndirectForceSpills);
// Returns true when a valid assignment is found or when lr is added to the
// spilled set. Adding to the spill set happens only if the heuristic is not
// round_robin (RR may not spill). Parameter returnFalseOnFail is set when the
// function is required to return false on assignment failure. When parameter
// spillAllowed is set to true, this function adds lr to the spilled set. If
// spillAllowed is false, the lr is not added to the spill set. This logic is
// useful to try re-allocation of a child/parent dcl when split is enabled.
// ignoreChildrenIntf is set to true when all children are assigned to
// consecutive ranges and we want to get a fully coalesceable assignment for
// the parent. In such circumstances, we don't want to account for
// interference between parent/child since doing so cannot result in a
// coalesceable assignment.
auto assignColor = [&](LiveRange *lr) {
auto lrVar = lr->getVar();
//
// assign register to live ranges
//
if (lr->getPhyReg() == NULL && !lrVar->isSpilled() &&
!lr->getIsPartialDcl()) // no assigned register yet and not spilled
{
unsigned lr_id = lrVar->getId();
//
// compute what registers are already assigned
//
PhyRegUsage regUsage(parms, FPR);
const std::vector<unsigned> &intfs = intf.getSparseIntfForVar(lr_id);
auto weakEdgeSet =
intf.getCompatibleSparseIntf(lrVar->getDeclare()->getRootDeclare());
for (auto it : intfs) {
LiveRange *lrTemp = lrs[it];
if (lrTemp->getPhyReg() != nullptr || lrTemp->getIsPartialDcl()) {
if (lrTemp->getIsSplittedDcl()) {
// Only interfere with children declares
continue;
}
regUsage.updateRegUsage(lrTemp);
}
}
if (weakEdgeSet) {
regUsage.runOverlapTest(true);
for (auto weakDcl : *weakEdgeSet) {
auto regVar = weakDcl->getRootDeclare()->getRegVar();
unsigned pvar = 0, numRegs = 0;
if (regVar->isPhyRegAssigned()) {
// This branch will be taken for dcls assigned
// regs by LRA.
pvar = regVar->getPhyReg()->asGreg()->getRegNum();
numRegs = weakDcl->getNumRows();
} else {
// For dcls not assigned regs by LRA, lookup temp
// registers assigned to LiveRange instances.
auto id = regVar->getId();
auto lr = lrs[id];
auto phyReg = lr->getPhyReg();
if (phyReg) {
pvar = phyReg->asGreg()->getRegNum();
numRegs = weakDcl->getNumRows();
}
}
// For now it is assumed only 8-byte types will appear
// here. If other sized types also appear, then the
// augmentation mask also needs to be sent in the
// weak edge data structure below.
for (unsigned r = pvar; r < (pvar + numRegs); r++) {
auto use = regUsage.getWeakEdgeUse(r);
if (use == 0 || use == (r - pvar + 1)) {
regUsage.setWeakEdgeUse(r, r - pvar + 1);
} else {
// Indicates two neighbors use a physical
// register with different overlap.
regUsage.setWeakEdgeUse(r, 0xff);
}
}
}
}
ColorHeuristic heuristic = colorHeuristicGRF;
bool failed_alloc = false;
G4_Declare *dcl = lrVar->getDeclare();
if (!(noIndirForceSpills && liveAnalysis.isAddressSensitive(lr_id)) &&
forceSpill &&
(dcl->getRegFile() == G4_GRF || dcl->getRegFile() == G4_FLAG) &&
lr->getRefCount() != 0 && lr->getSpillCost() != MAXSPILLCOST) {
failed_alloc = true;
}
if ((dcl->getNumRows() > totalGRFNum) ||
(dcl->isForceSpilled() && (lr->getSpillCost() != MAXSPILLCOST))) {
// we sure as hell won't get an assignment
failed_alloc = true;
}
if (kernel.getOption(vISA_GCRRInFF)) {
if (lr->getRegKind() != G4_GRF) {
// Non-GRF assignment, keep the single FF or RR algorithm
if (heuristic == FIRST_FIT) {
parms.setStartGRF(0);
}
} else if (heuristic == FIRST_FIT && !lr->getIsUnconstrained()) {
// GRF assignment: the start GRF is always 0 if the first-fit algorithm
// is used and the variable is constrained
parms.setStartGRF(0);
}
}
if (!failed_alloc) {
// When evenAlignNeeded is true, it is binding for correctness
bool evenAlignNeeded = gra.isEvenAligned(lrVar->getDeclare());
bool quadAlignNeeded = gra.isQuadAligned(lrVar->getDeclare());
BankAlign align = BankAlign::Either;
if (quadAlignNeeded)
align = BankAlign::QuadGRF;
else if (evenAlignNeeded)
align = BankAlign::Even;
if (allocFromBanks) {
vISA_ASSERT(align != BankAlign::QuadGRF, "unexpected value");
if (!isHybrid && oneGRFBankDivision &&
(!evenAlignNeeded ||
builder.getPlatformGeneration() == PlatformGen::GEN9)) {
gra.getBankAlignment(lr, align);
}
failed_alloc |= !regUsage.assignGRFRegsFromBanks(
lr, align, lr->getForbidden(), heuristic, oneGRFBankDivision);
} else {
failed_alloc |= !regUsage.assignRegs(
highInternalConflict, lr, lr->getForbidden(), align,
gra.getSubRegAlign(lrVar->getDeclare()), heuristic,
lr->getSpillCost());
}
}
//
// assign unused color
//
if (failed_alloc) {
//
// for GRF register assignment, if we are performing round-robin (1st
// pass) then abort on spill
//
if ((heuristic == ROUND_ROBIN ||
(doBankConflict && !kernel.getOption(vISA_forceBCR))) &&
(lr->getRegKind() == G4_GRF || lr->getRegKind() == G4_FLAG)) {
return false;
} else if (kernel.fg.isPseudoDcl(dcl)) {
// these pseudo dcls are not (and cannot be) spilled, but instead
// save/restore code will be inserted in stack call prolog/epilog
} else {
// for first-fit register assignment track spilled live ranges
spilledLRs.push_back(lr);
lr->setSpilled(true);
}
}
}
VISA_DEBUG_VERBOSE({
lr->dump();
std::cout << "\n";
});
return true;
};
// colorOrder is in reverse order (unconstrained at front)
for (auto iter = colorOrder.rbegin(), iterEnd = colorOrder.rend();
iter != iterEnd; ++iter) {
auto lr = (*iter);
// in case child/parent was already spilled earlier, don't recolor
if (lr->isSpilled())
continue;
bool ret = assignColor(lr);
// early exit
if (!ret)
return false;
}
if (failSafeIter) {
// As per spec, EOT has to be allocated to r112+.
// When fail safe iteration is run, upper GRFs are
// reserved. It's possible that the # of reserved
// GRFs is too large and the r112+ allocation restriction
// on EOT cannot be fulfilled (e.g., r116-r127 are reserved and
// the EOT src operand size is 8 GRFs). This causes the EOT var
// to spill and then the spill range faces the same
// restriction. The fix here is to check whether
// reserved GRF restriction can be eased for EOT.
auto hasSpilledNeighbor = [&](unsigned int id) {
for (const auto *spillLR : spilledLRs) {
if (id != spillLR->getVar()->getId() &&
getIntf()->interfereBetween(id, spillLR->getVar()->getId()))
return true;
}
return false;
};
if (gra.useHybridRAwithSpill) {
// This local analysis is skipped in favor of
// compile time in global RA loop, so run it here
// when needed.
gra.markGraphBlockLocalVars();
}
for (auto lrIt = spilledLRs.begin(); lrIt != spilledLRs.end(); ++lrIt) {
auto lr = (*lrIt);
bool needsEOTGRF = lr->getEOTSrc() && builder.hasEOTGRFBinding();
if (needsEOTGRF && gra.isBlockLocal(lr->getDcl()) &&
(totalGRFRegCount - reserveSpillGRFCount + lr->getNumRegNeeded()) <=
kernel.getNumRegTotal() &&
!hasSpilledNeighbor(lr->getVar()->getId())) {
// Following conditions true:
// 1. EOT range spilled that needs r112-r127 assignment,
// 2. Variable is local to a BB,
// 3. Reserved GRF start + # EOT GRFs fits within total GRFs,
// 4. Has no spilled neighbor
//
// This makes it safe to directly assign a reserved GRF to this
// variable than spill it.
lr->setPhyReg(builder.phyregpool.getGreg(kernel.getNumRegTotal() -
lr->getNumRegNeeded()),
0);
spilledLRs.erase(lrIt);
break;
}
}
}
// record RA type
if (liveAnalysis.livenessClass(G4_GRF)) {
if (colorHeuristicGRF == ROUND_ROBIN) {
kernel.setRAType(doBankConflict ? RA_Type::GRAPH_COLORING_RR_BC_RA
: RA_Type::GRAPH_COLORING_RR_RA);
} else {
kernel.setRAType(doBankConflict ? RA_Type::GRAPH_COLORING_FF_BC_RA
: RA_Type::GRAPH_COLORING_FF_RA);
}
}
#ifdef _DEBUG
// Verify that spilledLRs has no duplicate
for (auto item : spilledLRs) {
unsigned count = 0;
for (auto checkItem : spilledLRs) {
if (checkItem == item) {
vISA_ASSERT(count == 0, "Duplicate entry found in spilledLRs");
count++;
}
}
}
// Verify that none of spilledLRs have an allocation
for (auto lr : spilledLRs) {
vISA_ASSERT(lr->getPhyReg() == nullptr,
"Spilled LR contains valid allocation");
}
// Verify that all spilled LRs are synced
for (auto lr : spilledLRs) {
vISA_ASSERT(lr->isSpilled(),
"LR not marked as spilled, but inserted in spilledLRs list");
}
// Verify if all LRs have either an allocation or are spilled
for (auto lr : colorOrder) {
if (!kernel.fg.isPseudoDcl(lr->getDcl())) {
vISA_ASSERT(lr->isSpilled() || lr->getPhyReg() ||
lr->getDcl()->isSpilled(),
"Range without allocation and not spilled");
}
}
#endif
return true;
}
template <class REGION_TYPE>
unsigned GlobalRA::getRegionDisp(REGION_TYPE *region, const IR_Builder &irb) {
unsigned rowOffset = irb.numEltPerGRF<Type_UB>() * region->getRegOff();
unsigned columnOffset = region->getSubRegOff() * region->getElemSize();
return rowOffset + columnOffset;
}
void GlobalRA::addEUFusionCallWAInst(G4_INST *inst) {
if (EUFusionCallWANeeded())
EUFusionCallWAInsts.insert(inst);
}
void GlobalRA::addEUFusionNoMaskWAInst(G4_BB *BB, G4_INST *Inst) {
if (EUFusionNoMaskWANeeded() && (BB->getBBType() & G4_BB_NM_WA_TYPE) != 0) {
EUFusionNoMaskWAInsts.insert(Inst);
Inst->setNeedPostRA(true);
}
}
void GlobalRA::removeEUFusionNoMaskWAInst(G4_INST *Inst) {
if (EUFusionNoMaskWANeeded()) {
if (EUFusionNoMaskWAInsts.erase(Inst) > 0) {
Inst->setNeedPostRA(false);
}
}
}
unsigned GlobalRA::getRegionByteSize(G4_DstRegRegion *region,
unsigned execSize) {
unsigned size =
region->getHorzStride() * region->getElemSize() * (execSize - 1) +
region->getElemSize();
return size;
}
#define OWORD_BYTE_SIZE 16
template <class REGION_TYPE>
bool GlobalRA::isUnalignedRegion(REGION_TYPE *region, unsigned execSize) {
unsigned regionDisp = getRegionDisp(region, builder);
unsigned regionByteSize = getRegionByteSize(region, execSize);
if (regionDisp % kernel.numEltPerGRF<Type_UB>() == 0 &&
regionByteSize % kernel.numEltPerGRF<Type_UB>() == 0) {
return regionByteSize / kernel.numEltPerGRF<Type_UB>() != 1 &&
regionByteSize / kernel.numEltPerGRF<Type_UB>() != 2 &&
regionByteSize / kernel.numEltPerGRF<Type_UB>() != 4;
}
return true;
}
bool GlobalRA::shouldPreloadDst(G4_INST *instContext, G4_BB *curBB) {
// Check for partial and unaligned regions and add pre-load code, if
// necessary.
auto spilledRangeRegion = instContext->getDst();
uint8_t execSize = instContext->getExecSize();
if (isPartialRegion(spilledRangeRegion, execSize) ||
isUnalignedRegion(spilledRangeRegion, execSize) ||
instContext->isPartialWriteForSpill(!curBB->isAllLaneActive(),
useLscForNonStackCallSpillFill)) {
return true;
}
// No pre-load for whole and aligned region writes
else {
return false;
}
}
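// Returns true if the declare participates in liveness analysis: non-alias
// GRF-file declares, excluding pre-assigned non-GRF inputs and zero-size
// declares.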
bool GlobalRA::livenessCandidate(const G4_Declare *decl) const {
if (decl->getAliasDeclare()) {
return false;
}
if ((G4_GRF & decl->getRegFile())) {
if ((decl->getRegFile() & G4_INPUT) &&
decl->getRegVar()->isPhyRegAssigned() && !decl->getRegVar()->isGreg()) {
return false;
}
if (decl->getByteSize() == 0) {
// regrettably, this can happen for arg/retval pre-defined variable
return false;
}
return true;
} else {
return false;
}
}
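// Scan every instruction and compute the worst-case number of GRFs needed to
// hold spill/fill temporaries for a single instruction, tracked separately
// for directly accessed operands (spillRegSize) and indirectly accessed ones
// (indrSpillRegSize).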
void GlobalRA::determineSpillRegSize(unsigned &spillRegSize,
unsigned &indrSpillRegSize) {
// Iterate over all BBs
for (auto curBB : kernel.fg) {
// Iterate over all insts
for (INST_LIST_ITER inst_it = curBB->begin(), iend = curBB->end();
inst_it != iend; ++inst_it) {
unsigned currentSpillRegSize = 0;
unsigned currentIndrSpillRegSize = 0;
G4_INST *curInst = (*inst_it);
if (curInst->isPseudoKill() || curInst->isLifeTimeEnd() ||
curInst->opcode() == G4_pseudo_fcall ||
curInst->opcode() == G4_pseudo_fret) {
continue;
}
if (curInst->isSend()) {
G4_SendDesc *msgDesc = curInst->getMsgDesc();
unsigned dstSpillRegSize = 0;
dstSpillRegSize = msgDesc->getDstLenRegs();
unsigned src0FillRegSize = 0;
src0FillRegSize = msgDesc->getSrc0LenRegs();
unsigned src1FillRegSize = 0;
if (curInst->isSplitSend()) {
src1FillRegSize = msgDesc->getSrc1LenRegs();
}
if (!kernel.fg.builder->useSends()) {
dstSpillRegSize++;
}
currentSpillRegSize =
dstSpillRegSize + src0FillRegSize + src1FillRegSize;
} else if (curInst->isDpas()) {
unsigned dstSpillRegSize = 0;
G4_DstRegRegion *dst = curInst->getDst();
if (dst && dst->getBase()->isRegVar()) {
dstSpillRegSize =
dst->getBase()->asRegVar()->getDeclare()->getNumRows();
}
unsigned srcFillRegSize = 0;
for (int i = 0, srcNum = curInst->getNumSrc(); i < srcNum; i++) {
G4_Operand *src = curInst->getSrc(i);
if (src && src->isSrcRegRegion() &&
src->asSrcRegRegion()->getBase()->isRegVar()) {
if (src->asSrcRegRegion()
->getBase()
->asRegVar()
->getDeclare()
->getRegFile() == G4_GRF) {
unsigned srcSize =
src->getBase()->asRegVar()->getDeclare()->getNumRows();
// FIXME: currently we only use the max src size.
// To save spill registers, it would be better if the space were
// determined by checking whether the variable is really spilled or
// not.
srcFillRegSize += srcSize;
}
}
}
currentSpillRegSize = srcFillRegSize + dstSpillRegSize;
} else {
ORG_REGVAR_VECTOR indrVars;
unsigned dstSpillRegSize = 0;
unsigned indrDstSpillRegSize = 0;
if (G4_Inst_Table[curInst->opcode()].n_dst == 1) {
G4_DstRegRegion *dst = curInst->getDst();
if (dst && dst->getBase()->isRegVar()) {
if (dst->getBase()->asRegVar()->getDeclare()->getRegFile() ==
G4_GRF) {
if (dst->isCrossGRFDst(builder)) {
dstSpillRegSize = 2;
} else {
dstSpillRegSize = 1;
}
if (shouldPreloadDst(curInst, curBB)) {
dstSpillRegSize *= 3;
} else {
dstSpillRegSize *= 2;
}
if (!kernel.fg.builder->useSends()) {
dstSpillRegSize++;
}
} else if (dst->getRegAccess() == IndirGRF) {
auto pointsToSet =
pointsToAnalysis.getAllInPointsTo(dst->getBase()->asRegVar());
if (pointsToSet != nullptr) {
for (const auto& pt : *pointsToSet) {
if (pt.var->isRegAllocPartaker() ||
((useFastRA || useHybridRAwithSpill) &&
livenessCandidate(pt.var->getDeclare()))) {
indrVars.push_back(pt.var);
indrDstSpillRegSize += pt.var->getDeclare()->getNumRows();
}
}
}
}
}
}
unsigned srcFillRegSize = 0;
unsigned indirSrcFillRegSize = 0;
// Scan srcs
for (int i = 0, srcNum = curInst->getNumSrc(); i < srcNum; i++) {
G4_Operand *src = curInst->getSrc(i);
if (src && src->isSrcRegRegion() &&
src->asSrcRegRegion()->getBase()->isRegVar()) {
if (src->asSrcRegRegion()
->getBase()
->asRegVar()
->getDeclare()
->getRegFile() == G4_GRF) {
if (src->asSrcRegRegion()->crossGRF(builder)) {
srcFillRegSize += 2;
} else {
srcFillRegSize += 1;
}
} else if (src->asSrcRegRegion()->getRegAccess() == IndirGRF) {
auto pointsToSet = pointsToAnalysis.getAllInPointsTo(
src->asSrcRegRegion()->getBase()->asRegVar());
if (pointsToSet != nullptr) {
for (const auto& pt : *pointsToSet) {
if (pt.var->isRegAllocPartaker() ||
((useFastRA || useHybridRAwithSpill) &&
livenessCandidate(pt.var->getDeclare()))) {
if (std::find(indrVars.begin(), indrVars.end(), pt.var) ==
indrVars.end()) {
indrVars.push_back(pt.var);
indirSrcFillRegSize += pt.var->getDeclare()->getNumRows();
}
}
}
}
}
}
}
if (builder.avoidDstSrcOverlap()) {
currentSpillRegSize = srcFillRegSize + dstSpillRegSize;
} else {
currentSpillRegSize = srcFillRegSize > dstSpillRegSize
? srcFillRegSize
: dstSpillRegSize;
}
currentIndrSpillRegSize = indrDstSpillRegSize + indirSrcFillRegSize;
}
spillRegSize = std::max(spillRegSize, currentSpillRegSize);
indrSpillRegSize = std::max(indrSpillRegSize, currentIndrSpillRegSize);
}
}
}
void GraphColor::gatherScatterForbiddenWA() {
if (!liveAnalysis.livenessClass(G4_GRF))
return;
// VISA spec supports gather.1 and scatter.1 instructions.
// But they're not natively supported across platforms. When
// lowering gather.1 (scatter.1) on unsupported platforms, we
// use rsp len (msg len) = 2 while actual dst (payload) may be
// smaller in size. This could cause a problem if dst (payload)
// gets assigned to r127 as rsp len (msg len) = 2 could make
// it access beyond the last GRF. For example,
//
// VISA:
//.decl Rsp v_type=G type=q num_elts=1
//.decl Addr v_type=G type=q num_elts=1
// svm_gather.8.1 (M1, 1) Addr Rsp
//
// asm:
// send.dc1 (1|M0) r127 r4 null:0 exMSD MSD // wr:2+0, rd:2; a64
// qword gathering read
//
// This asm instruction is illegal as Rsp (size = 8 bytes) was assigned r127
// but send response length = 2.
//
// We fix such cases by looking them up and marking upper GRFs
// as forbidden for allocation.
for (auto bb : kernel.fg.getBBList()) {
for (auto inst : *bb) {
if (!inst->isSend() || inst->getExecSize().value >= 8)
continue;
// dstLen is actual # of GRFs written based on rb, lb
// src0Len is actual # of GRFs read based on rb, lb
// src1Len is actual # of GRFs read based on rb, lb
unsigned int dstLen = 0, src0Len = 0, src1Len = 0;
auto sendDst = inst->getDst();
auto sendHdr = inst->getSrc(0);
auto sendPayload = inst->getSrc(1);
auto getLenInGRF = [&](G4_Operand *opnd) {
unsigned int sz = 0;
if (opnd && !opnd->isNullReg() && opnd->getTopDcl())
sz = (opnd->getRightBound() - opnd->getLeftBound() +
kernel.getGRFSize() - 1) /
kernel.getGRFSize();
return sz;
};
dstLen = getLenInGRF(sendDst);
src0Len = getLenInGRF(sendHdr);
src1Len = getLenInGRF(sendPayload);
auto sendRspLen = inst->asSendInst()->getMsgDesc()->getDstLenRegs();
auto headerLen = inst->asSendInst()->getMsgDesc()->getSrc0LenRegs();
auto payloadLen = inst->asSendInst()->getMsgDesc()->getSrc1LenRegs();
// For gather.[1|2|4] (scatter.[1|2|4]) the difference between the actual
// dst (src0/src1) size and rspLen (msg len/ext msg len) should not exceed
// 1 GRF.
auto markForbiddenForDcl = [&](unsigned int opndLen, G4_Declare *dcl,
unsigned int lenInSend) {
if (opndLen > 0 && dcl && dcl->getRegVar() &&
dcl->getRegVar()->isRegAllocPartaker()) {
if (lenInSend == (opndLen + 1)) {
lrs[dcl->getRegVar()->getId()]->setForbidden(
forbiddenKind::FBD_LASTGRF);
} else if (lenInSend > opndLen) {
vISA_ASSERT(false,
"mismatch between len in send and that of operand");
}
}
};
markForbiddenForDcl(dstLen, sendDst->getTopDcl(), sendRspLen);
markForbiddenForDcl(src0Len, sendHdr->getTopDcl(), headerLen);
markForbiddenForDcl(src1Len, sendPayload->getTopDcl(), payloadLen);
}
}
}
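//
// Top-level coloring driver: create live ranges, build the interference
// graph, compute degrees and spill costs, determine the coloring order, and
// assign registers (round-robin first for GRFs where enabled, falling back
// to first-fit). Returns true only if no spill code is required.
//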
bool GraphColor::regAlloc(bool doBankConflictReduction,
bool highInternalConflict, const RPE *rpe) {
bool useSplitLLRHeuristic = false;
// FIXME: This whole bundle thing is a mess, the flag is an int but we
// treat it as a bool when passing to assignColors, and it's not clear if it
// works for non-DPAS instructions.
unsigned doBundleConflictReduction = kernel.getuInt32Option(vISA_enableBundleCR);
RA_TRACE(std::cout << "\t--# variables: " << liveAnalysis.getNumSelectedVar()
<< "\n");
// Copy over alignment for vars inserted by RA
gra.copyMissingAlignment();
//
// create an array of live ranges.
//
if (!IncrementalRA::isEnabled(kernel) || lrs.size() == 0) {
// Create vector of live ranges if we're not using
// incremental RA or if this is 1st iteration.
// With incremental RA, live-ranges are created right when
// new temp var is created in RA.
createLiveRanges();
}
//
// set the pre-assigned registers
//
for (unsigned i = 0; i < numVar; i++) {
if (lrs[i]->getVar()->getPhyReg()) {
lrs[i]->setPhyReg(lrs[i]->getVar()->getPhyReg(),
lrs[i]->getVar()->getPhyRegOff());
}
G4_Declare *dcl = lrs[i]->getDcl();
if (!useSplitLLRHeuristic) {
auto dclLR = gra.getLocalLR(dcl);
if (dclLR != nullptr && dclLR->getSplit()) {
useSplitLLRHeuristic = true;
}
}
}
//
// compute interference matrix
//
intf.init();
intf.computeInterference();
builder.getFreqInfoManager().initForRegAlloc(&liveAnalysis);
// If option is true, try to get extra interference info from file
if (liveAnalysis.livenessClass(G4_GRF) &&
kernel.getOption(vISA_AddExtraIntfInfo)) {
getExtraInterferenceInfo();
}
TIME_SCOPE(COLORING);
//
// compute degree and spill costs for each live range
//
if (liveAnalysis.livenessClass(G4_GRF)) {
if (gra.use4GRFAlign)
computeDegreeForGRF<true>();
else
computeDegreeForGRF<false>();
} else {
computeDegreeForARF();
}
computeSpillCosts(useSplitLLRHeuristic, rpe);
builder.getFreqInfoManager().computeFreqSpillCosts(gra, useSplitLLRHeuristic, rpe);
if (kernel.getOption(vISA_DumpRAIntfGraph))
intf.dumpInterference();
//
// determine coloring order
//
determineColorOrdering();
//
// Set up the sub-reg alignment from declare information
// FIXME: Why is this called after degrees are computed? Wouldn't the
// alignment affect degree computation?
//
for (unsigned i = 0; i < numVar; i++) {
G4_Declare *dcl = lrs[i]->getDcl();
if (gra.getSubRegAlign(dcl) == Any && !dcl->getIsPartialDcl()) {
//
// multi-row, subreg alignment = 16 words
//
if (dcl->getNumRows() > 1) {
gra.setSubRegAlign(lrs[i]->getVar()->getDeclare(),
kernel.getGRFAlign());
}
//
// single-row
//
else if (gra.getSubRegAlign(lrs[i]->getVar()->getDeclare()) == Any) {
//
// set up Odd word or Even word sub reg alignment
//
unsigned nbytes = dcl->getNumElems() * TypeSize(dcl->getElemType());
unsigned nwords = nbytes / G4_WSIZE + nbytes % G4_WSIZE;
if (nwords >= 2 && lrs[i]->getRegKind() == G4_GRF) {
gra.setSubRegAlign(lrs[i]->getVar()->getDeclare(), Even_Word);
}
}
}
}
gatherScatterForbiddenWA();
//
// assign registers for GRFs, GRFs are first attempted to be assigned using
// round-robin and if it fails then we retry using a first-fit heuristic.
//
if (liveAnalysis.livenessClass(G4_GRF)) {
bool hasStackCall =
kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc();
bool willSpill =
((gra.useFastRA || gra.useHybridRAwithSpill) &&
(!hasStackCall ||
builder.getOption(vISA_PartitionWithFastHybridRA))) ||
(kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
rpe->getMaxRP() >= kernel.getNumRegTotal() + 24);
if (willSpill) {
// Go straight to first_fit to save compile time since we are definitely
// spilling. We do this for 3D only since with indirect/subroutine the RP
// estimate can be very unreliable.
// FIXME: due to factors like local split and scalar variables that are
// not accurately modeled in RP estimate, RA may succeed even when RP is >
// total #GRF. We should investigate these cases and fix RPE
assignColors(FIRST_FIT);
// assert(requireSpillCode() && "inaccurate GRF pressure estimate");
return !requireSpillCode();
}
if (kernel.getOption(vISA_RoundRobin) && !hasStackCall) {
if (assignColors(ROUND_ROBIN, doBankConflictReduction,
highInternalConflict,
doBundleConflictReduction) == false) {
resetTemporaryRegisterAssignments();
bool success = assignColors(FIRST_FIT, doBankConflictReduction,
highInternalConflict, doBundleConflictReduction);
if (!success && doBankConflictReduction && isHybrid) {
return false;
}
if (!kernel.getOption(vISA_forceBCR)) {
if (!success && doBankConflictReduction) {
resetTemporaryRegisterAssignments();
assignColors(FIRST_FIT);
}
}
}
} else {
bool success = assignColors(FIRST_FIT, true, highInternalConflict,
doBundleConflictReduction);
if (!success) {
resetTemporaryRegisterAssignments();
assignColors(FIRST_FIT);
}
}
} else if (liveAnalysis.livenessClass(G4_FLAG)) {
if (kernel.getOption(vISA_RoundRobin)) {
if (assignColors(ROUND_ROBIN) == false) {
resetTemporaryRegisterAssignments();
assignColors(FIRST_FIT);
}
} else {
assignColors(FIRST_FIT);
}
} else {
// assign registers for ARFs using a first-fit heuristic
assignColors(FIRST_FIT, false, false);
}
return (requireSpillCode() == false);
}
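// Commit the registers chosen during coloring to the underlying G4_RegVars,
// asserting that any pre-existing assignment matches the coloring result.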
void GraphColor::confirmRegisterAssignments() {
for (unsigned i = 0; i < numVar; i++) {
if (lrs[i]->getPhyReg()) {
if (lrs[i]->getVar()->getPhyReg()) {
vISA_ASSERT((lrs[i]->getVar()->getPhyReg() == lrs[i]->getPhyReg()),
ERROR_GRAPHCOLOR);
} else {
lrs[i]->getVar()->setPhyReg(lrs[i]->getPhyReg(),
lrs[i]->getPhyRegOff());
}
}
}
}
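// Undo assignments made during the current coloring attempt (only variables
// whose G4_RegVar has no pre-assigned register) and clear the spilled set so
// another heuristic can be tried.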
void GraphColor::resetTemporaryRegisterAssignments() {
for (unsigned i = 0; i < numVar; i++) {
if (lrs[i]->getVar()->getPhyReg() == NULL) {
lrs[i]->resetPhyReg();
lrs[i]->setSpilled(false);
}
}
spilledLRs.clear();
}
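// Remove address-register fill instructions that reload a value already held
// in the target A0 sub-registers. spAddrRegSig tracks, per address
// sub-register, the spill-location id it currently holds.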
void GraphColor::cleanupRedundantARFFillCode() {
for (G4_BB *bb : builder.kernel.fg) {
clearSpillAddrLocSignature();
for (std::list<G4_INST *>::iterator i = bb->begin(); i != bb->end();) {
G4_INST *inst = (*i);
//
// process writes to spill storage (GRF) of addr regs
//
G4_DstRegRegion *dst = inst->getDst();
if (dst && dst->getBase() && dst->getBase()->isRegVar() &&
(kernel.fg.isPseudoA0Dcl(dst->getBase()->asRegVar()->getDeclare()) ||
inst->isPseudoKill())) {
i++;
continue;
}
if (dst != NULL && dst->getRegAccess() == Direct) {
if (dst->getBase()->isRegVar() &&
dst->getBase()->asRegVar()->isRegVarAddrSpillLoc()) {
pruneActiveSpillAddrLocs(dst, inst->getExecSize(),
inst->getExecType());
}
//
// process writes to (allocated) addr regs
//
else if (dst->getBase()->isRegAllocPartaker()) {
G4_RegVar *addrReg = dst->getBase()->asRegVar();
if (gra.isAddrFlagSpillDcl(addrReg->getDeclare())) {
G4_SrcRegRegion *srcRgn = inst->getSrc(0)->asSrcRegRegion();
if (redundantAddrFill(dst, srcRgn, inst->getExecSize())) {
std::list<G4_INST *>::iterator j = i++;
bb->erase(j);
continue;
} else {
updateActiveSpillAddrLocs(dst, srcRgn, inst->getExecSize());
}
} else {
pruneActiveSpillAddrLocs(dst, inst->getExecSize(),
inst->getExecType());
}
}
}
i++;
}
}
}
void GraphColor::pruneActiveSpillAddrLocs(G4_DstRegRegion *dstRegion,
unsigned exec_size,
G4_Type exec_type) {
if (dstRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc()) {
vISA_ASSERT(((exec_type == Type_UW || exec_type == Type_W) &&
exec_size <= builder.getNumAddrRegisters()) ||
(exec_size == 1),
"Unexpected ADDR spill loc update format!");
vISA_ASSERT(dstRegion->getRegAccess() == Direct,
"Unexpected ADDR spill loc");
G4_RegVarAddrSpillLoc *spillLocReg =
static_cast<G4_RegVarAddrSpillLoc *>(dstRegion->getBase());
unsigned startId = spillLocReg->getLocId() + dstRegion->getSubRegOff();
unsigned endId = startId + exec_size * dstRegion->getHorzStride();
for (unsigned i = 0, horzStride = dstRegion->getHorzStride();
i < builder.getNumAddrRegisters(); i += horzStride) {
if (spAddrRegSig[i] >= startId && spAddrRegSig[i] < endId) {
spAddrRegSig[i] = 0;
}
}
} else if (dstRegion->getBase()->asRegVar()->isPhyRegAssigned()) {
G4_RegVar *addrReg = dstRegion->getBase()->asRegVar();
vISA_ASSERT(addrReg->getPhyReg()->isA0(),
"Unknown error in ADDR reg spill code cleanup!");
unsigned startId = addrReg->getPhyRegOff();
unsigned endId = startId + exec_size * dstRegion->getHorzStride();
vISA_ASSERT(endId <= builder.getNumAddrRegisters(),
"Unknown error in ADDR reg spill code cleanup!");
for (unsigned i = startId; i < endId; i += dstRegion->getHorzStride()) {
spAddrRegSig[i] = 0;
}
} else {
vISA_ASSERT(false, "Unknown error in ADDR reg spill code cleanup!");
}
}
void GraphColor::updateActiveSpillAddrLocs(G4_DstRegRegion *tmpDstRegion,
G4_SrcRegRegion *srcRegion,
unsigned exec_size) {
vISA_ASSERT(
gra.isAddrFlagSpillDcl(tmpDstRegion->getBase()->asRegVar()->getDeclare()),
"Unknown error in ADDR reg spill code cleanup!");
G4_RegVar *addrReg = tmpDstRegion->getBase()->asRegVar();
vISA_ASSERT(addrReg->getPhyReg()->isA0(),
"Unknown error in ADDR reg spill code cleanup!");
unsigned startAddrId = addrReg->getPhyRegOff();
unsigned endAddrId = startAddrId + exec_size * tmpDstRegion->getHorzStride();
vISA_ASSERT(endAddrId <= builder.getNumAddrRegisters(),
"Unknown error in ADDR reg spill code cleanup!");
vISA_ASSERT(srcRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc(),
"Unknown error in ADDR reg spill code cleanup!");
G4_RegVarAddrSpillLoc *spillLocReg =
static_cast<G4_RegVarAddrSpillLoc *>(srcRegion->getBase());
unsigned startLocId = spillLocReg->getLocId() + srcRegion->getSubRegOff();
for (unsigned i = startAddrId, j = startLocId; i < endAddrId;
i += tmpDstRegion->getHorzStride(),
j += srcRegion->getRegion()->horzStride) {
spAddrRegSig[i] = j;
}
}
bool GraphColor::redundantAddrFill(G4_DstRegRegion *tmpDstRegion,
G4_SrcRegRegion *srcRegion,
unsigned exec_size) {
bool match = true;
vISA_ASSERT(
gra.isAddrFlagSpillDcl(tmpDstRegion->getBase()->asRegVar()->getDeclare()),
"Unknown error in ADDR reg spill code cleanup!");
G4_RegVar *addrReg = tmpDstRegion->getBase()->asRegVar();
vISA_ASSERT(addrReg->getPhyReg()->isA0(),
"Unknown error in ADDR reg spill code cleanup!");
unsigned startAddrId = addrReg->getPhyRegOff();
unsigned endAddrId = startAddrId + exec_size * tmpDstRegion->getHorzStride();
vISA_ASSERT(endAddrId <= builder.getNumAddrRegisters(),
"Unknown error in ADDR reg spill code cleanup!");
vISA_ASSERT(srcRegion->getBase()->asRegVar()->isRegVarAddrSpillLoc(),
"Unknown error in ADDR reg spill code cleanup!");
G4_RegVarAddrSpillLoc *spillLocReg =
static_cast<G4_RegVarAddrSpillLoc *>(srcRegion->getBase());
unsigned startLocId = spillLocReg->getLocId() + srcRegion->getSubRegOff();
for (unsigned i = startAddrId, j = startLocId; i < endAddrId;
i += tmpDstRegion->getHorzStride(),
j += srcRegion->getRegion()->horzStride) {
if (spAddrRegSig[i] != j) {
match = false;
break;
}
}
return match;
}
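// Encode an oword block size (1/2/4/8/16) into the block-size field value of
// an oword block read/write message descriptor.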
unsigned GlobalRA::sendBlockSizeCode(unsigned owordSize) {
unsigned code;
switch (owordSize) {
case 1:
code = 0;
break;
case 2:
code = 2;
break;
case 4:
code = 3;
break;
case 8:
code = 4;
break;
case 16:
code = 5;
break;
default:
vISA_ASSERT_UNREACHABLE(ERROR_REGALLOC);
code = 0;
}
return code;
}
#define STATELESS_SURFACE_INDEX 0xFF
#define HEADER_PRESENT 0x80000
#define SEND_OWORD_READ_TYPE 0
#define SEND_OWORD_WRITE_TYPE 8
#define SEND_MSG_TYPE_BIT_OFFSET 14
#define SEND_RSP_LENGTH_BIT_OFFSET 20
#define SEND_MSG_LENGTH_BIT_OFFSET 25
#define SEND_DESC_DATA_SIZE_BIT_OFFSET 8
G4_Imm *GlobalRA::createMsgDesc(unsigned owordSize, bool writeType,
bool isSplitSend) {
// If isSplitSend = true then messageLength = 1 and extMesLength =
// (owordSize/2) GRFs
unsigned message = STATELESS_SURFACE_INDEX;
message |= HEADER_PRESENT;
if (writeType) {
unsigned messageType = SEND_OWORD_WRITE_TYPE;
message |= messageType << SEND_MSG_TYPE_BIT_OFFSET;
unsigned messageLength = 1;
if (!isSplitSend) {
messageLength += owordToGRFSize(
ROUND(owordSize, kernel.numEltPerGRF<Type_UB>() / OWORD_BYTE_SIZE),
builder);
}
message |= messageLength << SEND_MSG_LENGTH_BIT_OFFSET;
} else {
unsigned messageType = SEND_OWORD_READ_TYPE;
message |= messageType << SEND_MSG_TYPE_BIT_OFFSET;
unsigned responseLength = owordToGRFSize(
ROUND(owordSize, kernel.numEltPerGRF<Type_UB>() / OWORD_BYTE_SIZE),
builder);
message |= responseLength << SEND_RSP_LENGTH_BIT_OFFSET;
unsigned messageLength = 1;
message |= messageLength << SEND_MSG_LENGTH_BIT_OFFSET;
}
unsigned writeOwordSize = sendBlockSizeCode(owordSize);
message |= writeOwordSize << SEND_DESC_DATA_SIZE_BIT_OFFSET;
return builder.createImm(message, Type_UD);
}
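// Emit stack-call prolog code in the entry BB: set up the r126.0 address used
// for LSC immediate-offset spill/fill when enabled, store the frame
// descriptor for stack-call functions, and (for stack-call ABI versions
// before v3) copy r0 into the scratch register.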
void GlobalRA::stackCallProlog() {
G4_BB *entryBB = builder.kernel.fg.getEntryBB();
// Used for creating inst to initialize address for immediate offset usage.
auto AddrComputeInst = [this](G4_Declare *srcDcl) {
auto addSrc0 = builder.createSrc(srcDcl->getRegVar(), 0, 0,
builder.getRegionScalar(), Type_UD);
auto immSrc1 = builder.createImm(SPILL_FILL_IMMOFF_MAX, Type_UD);
auto addInst = builder.createBinOp(
G4_add, g4::SIMD1,
builder.createDstRegRegion(builder.kernel.fg.scratchRegDcl, 1), addSrc0,
immSrc1, InstOpt_WriteEnable, false);
return addInst;
};
// Initialize address for immediate offset usage for spill/fill messages
// except for frame descriptor save message.
// This covers the common cases, which use %be_fp as the address.
{
// Turn off immediate offset if frame size is 0 or exceeds the threshold
if ((kernel.fg.frameSizeInOWord == 0) ||
(kernel.fg.frameSizeInOWord * 16 > SPILL_FILL_IMMOFF_MAX * 2))
canUseLscImmediateOffsetSpillFill = false;
if (canUseLscImmediateOffsetSpillFill) {
// copy (%be_fp + 0x10000) to r126.0 for immediate offset usage in
// stackcall spill/fill
// add(1) r126.0 %be_fp 0x10000
auto insertIt = std::find(entryBB->begin(), entryBB->end(),
builder.kernel.getBEFPSetupInst());
vISA_ASSERT(insertIt != entryBB->end(), "Can't find BE_FP setup inst");
entryBB->insertBefore(++insertIt, AddrComputeInst(builder.getBEFP()));
// Each stack function has its own r126.0, so r126.0 must be recomputed after
// a function call since the callee has changed its value.
// See below example:
// Foo()
// mov r125.3 r125.2
// add r126.0 r125.3 0x10000
// add r125.2 r125.2 frameSizeFoo
// spill [r126.0 offset1-0x10000]
// Bar()
// mov r125.3 r125.2
// add r126.0 r125.3 0x10000
// add r125.2 r125.2 frameSizeBar
// spill [r126.0 offset2-0x10000]
// ...
// add r126.0 r125.3 0x10000
// spill [r126.0 offset3-0x10000]
// After Bar() return, we should re-compute r126.0
for (auto bb : kernel.fg) {
if (bb->isEndWithFCall()) {
G4_BB *succ = bb->Succs.front();
insertIt =
std::find_if(succ->begin(), succ->end(),
[](G4_INST *inst) { return inst->isLabel(); });
vISA_ASSERT(insertIt != succ->end(), "Can't find label");
succ->insertBefore(++insertIt, AddrComputeInst(builder.getBEFP()));
}
}
}
}
// Emit frame descriptor
if (kernel.fg.getIsStackCallFunc()) {
if (canSkipFDE())
return;
auto payload = builder.createHardwiredDeclare(
8, Type_UD, kernel.stackCall.getFPSPGRF(), 0);
payload->setName(builder.getNameString(24, "FrameDescriptorGRF"));
auto payloadSrc =
builder.createSrcRegRegion(payload, builder.getRegionStride1());
const unsigned execSize = 8;
G4_DstRegRegion *postDst = builder.createNullDst(Type_UD);
G4_INST *store = nullptr;
if (builder.supportsLSC()) {
auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
store = builder.createSpill(
postDst, headerOpnd, payloadSrc, G4_ExecSize(execSize), 1, 0,
builder.getBESP(), InstOpt_WriteEnable, false);
} else {
store =
builder.createSpill(postDst, payloadSrc, G4_ExecSize(execSize), 1, 0,
builder.getBESP(), InstOpt_WriteEnable, false);
}
builder.setFDSpillInst(store);
auto iter = std::find_if(entryBB->begin(), entryBB->end(),
[](G4_INST *inst) { return !inst->isLabel(); });
iter = entryBB->insertBefore(iter, store);
if (EUFusionCallWANeeded()) {
auto oldSaveInst = builder.getPartFDSaveInst();
builder.setPartFDSaveInst(store);
entryBB->remove(oldSaveInst);
}
addEUFusionCallWAInst(store);
// Initialize address for immediate offset usage for frame descriptor store
// message. This is a special case as it uses %be_sp as address.
{
if (canUseLscImmediateOffsetSpillFill) {
// copy (%be_sp + 0x10000) to r126.0 for immediate offset usage
// for frame descriptor save instruction
// add(1) r126.0<1>:ud %be_sp<1;0,1>:ud 0x10000:ud
entryBB->insertBefore(iter, AddrComputeInst(builder.getBESP()));
}
}
return;
}
// Make r126 a copy of r0 only up to VISA ABI v2
if (kernel.stackCall.getVersion() >= StackCallABI::StackCallABIVersion::VER_3)
return;
// mov (8) r126.0<1>:ud r0.0<1;1,1>:ud
auto dstRgn = builder.createDstRegRegion(builder.kernel.fg.scratchRegDcl, 1);
auto srcRgn = builder.createSrcRegRegion(builder.getBuiltinR0(),
builder.getRegionStride1());
G4_INST *mov = builder.createMov(G4_ExecSize(kernel.numEltPerGRF<Type_UD>()),
dstRgn, srcRgn, InstOpt_WriteEnable, false);
auto iter = std::find_if(entryBB->begin(), entryBB->end(),
[](G4_INST *inst) { return !inst->isLabel(); });
entryBB->insertBefore(iter, mov);
}
//
// Generate the save code for GRFs startReg to startReg + owordToGRFSize(owordSize).
//
void GlobalRA::saveRegs(unsigned startReg, unsigned owordSize,
G4_Declare *scratchRegDcl, G4_Declare *framePtr,
unsigned frameOwordOffset, G4_BB *bb,
INST_LIST_ITER insertIt,
std::unordered_set<G4_INST *> &group) {
vISA_ASSERT(builder.getPlatform() >= GENX_SKL,
"stack call only supported on SKL+");
if ((useLscForSpillFill && owordSize == 16) || owordSize == 8 ||
owordSize == 4 || owordSize == 2) {
// add (1) r126.2<1>:ud r125.3<0;1,0>:ud 0x2:ud
// sends (8) null<1>:ud r126.0 r1.0 ...
G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
unsigned messageLength = GlobalRA::owordToGRFSize(owordSize, builder);
G4_Declare *msgDcl =
builder.createTempVar(messageLength * builder.getGenxDataportIOSize(),
Type_UD, builder.getGRFAlign(), StackCallStr);
msgDcl->getRegVar()->setPhyReg(regPool.getGreg(startReg), 0);
auto sendSrc2 = builder.createSrc(msgDcl->getRegVar(), 0, 0,
builder.getRegionStride1(), Type_UD);
G4_DstRegRegion *dst =
builder.createNullDst((execSize > 8) ? Type_UW : Type_UD);
G4_INST *spillIntrinsic = nullptr;
if (builder.supportsLSC()) {
auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
spillIntrinsic = builder.createSpill(
dst, headerOpnd, sendSrc2, execSize, messageLength,
frameOwordOffset / 2, framePtr, InstOpt_WriteEnable, false);
} else
spillIntrinsic = builder.createSpill(
dst, sendSrc2, execSize, messageLength, frameOwordOffset / 2,
framePtr, InstOpt_WriteEnable, false);
spillIntrinsic->inheritDIFrom(*insertIt);
bb->insertBefore(insertIt, spillIntrinsic);
group.insert(spillIntrinsic);
} else if ((useLscForSpillFill && owordSize > 16)) {
saveRegs(startReg, 16, scratchRegDcl, framePtr, frameOwordOffset, bb,
insertIt, group);
saveRegs(startReg + GlobalRA::owordToGRFSize(16, builder), owordSize - 16,
scratchRegDcl, framePtr, frameOwordOffset + 16, bb, insertIt,
group);
} else if (owordSize > 8) {
saveRegs(startReg, 8, scratchRegDcl, framePtr, frameOwordOffset, bb,
insertIt, group);
saveRegs(startReg + GlobalRA::owordToGRFSize(8, builder), owordSize - 8,
scratchRegDcl, framePtr, frameOwordOffset + 8, bb, insertIt,
group);
}
//
// Split into chunks of sizes 4 and remaining owords.
//
else if (owordSize > 4) {
saveRegs(startReg, 4, scratchRegDcl, framePtr, frameOwordOffset, bb,
insertIt, group);
saveRegs(startReg + GlobalRA::owordToGRFSize(4, builder), owordSize - 4,
scratchRegDcl, framePtr, frameOwordOffset + 4, bb, insertIt,
group);
}
//
// Split into chunks of sizes 2 and remaining owords.
//
else if (owordSize > 2) {
saveRegs(startReg, 2, scratchRegDcl, framePtr, frameOwordOffset, bb,
insertIt, group);
saveRegs(startReg + GlobalRA::owordToGRFSize(2, builder), owordSize - 2,
scratchRegDcl, framePtr, frameOwordOffset + 2, bb, insertIt,
group);
} else {
vISA_ASSERT(false, ERROR_REGALLOC);
}
}
//
// Generate the save code for the input saveRegs vector (one entry per GRF).
//
void GlobalRA::saveActiveRegs(std::vector<bool> &saveRegs, unsigned startReg,
unsigned frameOffset, G4_BB *bb,
INST_LIST_ITER insertIt,
std::unordered_set<G4_INST *> &group) {
G4_Declare *scratchRegDcl = builder.kernel.fg.scratchRegDcl;
G4_Declare *framePtr = builder.kernel.fg.framePtrDcl;
unsigned frameOwordPos = frameOffset;
unsigned startPos = 0;
while (startPos < saveRegs.size()) {
for (; startPos < saveRegs.size() && saveRegs[startPos] == false;
startPos++)
;
if (startPos < saveRegs.size() && saveRegs[startPos]) {
unsigned endPos = startPos + 1;
for (; endPos < saveRegs.size() && saveRegs[endPos] == true; endPos++)
;
unsigned owordSize =
(endPos - startPos) * GlobalRA::GRFSizeToOwords(1, builder);
owordSize = std::max(owordSize, GlobalRA::GRFSizeToOwords(1, builder));
this->saveRegs(startPos + startReg, owordSize, scratchRegDcl, framePtr,
frameOwordPos, bb, insertIt, group);
frameOwordPos += owordSize;
startPos = endPos;
}
}
}
G4_SrcRegRegion *GraphColor::getScratchSurface() const {
if (builder.hasScratchSurface()) {
return builder.createSrcRegRegion(builder.getBuiltinScratchSurface(),
builder.getRegionScalar());
}
return nullptr; // use stateless access
}
//
// Generate the restore code for GRFs startReg to startReg + owordToGRFSize(owordSize).
//
void GlobalRA::restoreRegs(unsigned startReg, unsigned owordSize,
G4_Declare *scratchRegDcl, G4_Declare *framePtr,
unsigned frameOwordOffset, G4_BB *bb,
INST_LIST_ITER insertIt,
std::unordered_set<G4_INST *> &group, bool caller) {
//
// Process chunks of size 8, 4, 2 and 1.
//
if ((useLscForSpillFill && owordSize == 16) || owordSize == 8 ||
owordSize == 4 || owordSize == 2) {
G4_ExecSize execSize = (owordSize > 2) ? g4::SIMD16 : g4::SIMD8;
unsigned responseLength = GlobalRA::owordToGRFSize(owordSize, builder);
G4_Declare *dstDcl =
builder.createTempVar(responseLength * builder.getGenxDataportIOSize(),
Type_UD, builder.getGRFAlign(), StackCallStr);
if (caller) {
kernel.callerRestoreDecls.push_back(dstDcl);
}
dstDcl->getRegVar()->setPhyReg(regPool.getGreg(startReg), 0);
G4_DstRegRegion *dstRgn = builder.createDst(
dstDcl->getRegVar(), 0, 0, 1, (execSize > 8) ? Type_UW : Type_UD);
G4_INST *fillIntrinsic = nullptr;
if (builder.supportsLSC()) {
auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
fillIntrinsic = builder.createFill(headerOpnd, dstRgn, execSize,
responseLength, frameOwordOffset / 2,
framePtr, InstOpt_WriteEnable, false);
} else
fillIntrinsic = builder.createFill(dstRgn, execSize, responseLength,
frameOwordOffset / 2, framePtr,
InstOpt_WriteEnable, false);
fillIntrinsic->inheritDIFrom(*insertIt);
bb->insertBefore(insertIt, fillIntrinsic);
group.insert(fillIntrinsic);
}
//
// Split into chunks of size 16 (LSC) or 8 and the remaining owords.
//
else if ((useLscForSpillFill && owordSize > 16)) {
restoreRegs(startReg, 16, scratchRegDcl, framePtr, frameOwordOffset, bb,
insertIt, group, caller);
restoreRegs(startReg + GlobalRA::owordToGRFSize(16, builder),
owordSize - 16, scratchRegDcl, framePtr, frameOwordOffset + 16,
bb, insertIt, group, caller);
} else if (owordSize > 8) {
restoreRegs(startReg, 8, scratchRegDcl, framePtr, frameOwordOffset, bb,
insertIt, group, caller);
restoreRegs(startReg + GlobalRA::owordToGRFSize(8, builder), owordSize - 8,
scratchRegDcl, framePtr, frameOwordOffset + 8, bb, insertIt,
group, caller);
}
//
// Split into chunks of sizes 4 and remaining owords.
//
else if (owordSize > 4) {
restoreRegs(startReg, 4, scratchRegDcl, framePtr, frameOwordOffset, bb,
insertIt, group, caller);
restoreRegs(startReg + GlobalRA::owordToGRFSize(4, builder), owordSize - 4,
scratchRegDcl, framePtr, frameOwordOffset + 4, bb, insertIt,
group, caller);
}
//
// Split into chunks of sizes 2 and remaining owords.
//
else if (owordSize > 2) {
restoreRegs(startReg, 2, scratchRegDcl, framePtr, frameOwordOffset, bb,
insertIt, group, caller);
restoreRegs(startReg + GlobalRA::owordToGRFSize(2, builder), owordSize - 2,
scratchRegDcl, framePtr, frameOwordOffset + 2, bb, insertIt,
group, caller);
} else {
vISA_ASSERT(false, ERROR_REGALLOC);
}
}
//
// Generate the restore code for the input restoreRegs vector (one entry per GRF).
//
void GlobalRA::restoreActiveRegs(std::vector<bool> &restoreRegs,
unsigned startReg, unsigned frameOffset,
G4_BB *bb, INST_LIST_ITER insertIt,
std::unordered_set<G4_INST *> &group,
bool caller) {
G4_Declare *scratchRegDcl = builder.kernel.fg.scratchRegDcl;
G4_Declare *framePtr = builder.kernel.fg.framePtrDcl;
unsigned frameOwordPos = frameOffset;
unsigned startPos = 0;
while (startPos < restoreRegs.size()) {
for (; startPos < restoreRegs.size() && restoreRegs[startPos] == false;
startPos++)
;
if (startPos < restoreRegs.size() && restoreRegs[startPos]) {
unsigned endPos = startPos + 1;
for (; endPos < restoreRegs.size() && restoreRegs[endPos] == true;
endPos++)
;
unsigned owordSize =
(endPos - startPos) * GlobalRA::GRFSizeToOwords(1, builder);
owordSize = std::max(owordSize, GlobalRA::GRFSizeToOwords(1, builder));
this->restoreRegs(startPos + startReg, owordSize, scratchRegDcl, framePtr,
frameOwordPos, bb, insertIt, group, caller);
frameOwordPos += owordSize;
startPos = endPos;
}
}
}
//
// Optimize the reg footprint so as to reduce the number of "send" instructions
// required for save/restore, at the cost of a little additional save/restore
// memory (if any). Since we are using oword read/write for save/restore, we
// can only read/write in units of 1, 2 or 4 regs per "send" instruction.
//
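// For example, a save mask of {1,0,1,0} is padded to {1,1,1,1} so the whole
// run can be covered by a single 4-GRF save instead of two 1-GRF saves.
//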
void GlobalRA::OptimizeActiveRegsFootprint(std::vector<bool> &saveRegs) {
unsigned startPos = 0;
while (startPos < saveRegs.size()) {
for (; startPos < saveRegs.size() && !saveRegs[startPos]; ++startPos)
;
if (startPos == saveRegs.size()) {
break;
}
if (startPos + 4 <= saveRegs.size()) {
if (saveRegs[startPos] & saveRegs[startPos + 2] &
!saveRegs[startPos + 3]) {
saveRegs[startPos + 1] = saveRegs[startPos + 3] = true;
} else if (saveRegs[startPos] & saveRegs[startPos + 3]) {
if (startPos + 4 < saveRegs.size()) {
if (!saveRegs[startPos + 4]) {
saveRegs[startPos + 1] = saveRegs[startPos + 2] = true;
}
} else {
saveRegs[startPos + 1] = saveRegs[startPos + 2] = true;
}
}
}
unsigned winBound =
std::min(static_cast<unsigned>(saveRegs.size()), startPos + 4);
for (; startPos < winBound && saveRegs[startPos]; ++startPos)
;
}
}
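// Same as the single-argument overload, but never pads over registers holding
// return values (retRegs).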
void GlobalRA::OptimizeActiveRegsFootprint(std::vector<bool> &saveRegs,
std::vector<bool> &retRegs) {
unsigned startPos = 0;
while (startPos < saveRegs.size()) {
for (; startPos < saveRegs.size() && !saveRegs[startPos]; ++startPos)
;
if (startPos == saveRegs.size()) {
break;
}
if (startPos + 4 <= saveRegs.size()) {
if (saveRegs[startPos] & saveRegs[startPos + 2]) {
if (!saveRegs[startPos + 1] & !retRegs[startPos + 1]) {
saveRegs[startPos + 1] = true;
}
if (!saveRegs[startPos + 3] & !retRegs[startPos + 3]) {
saveRegs[startPos + 3] = true;
}
} else if (saveRegs[startPos] & saveRegs[startPos + 3]) {
if (startPos + 4 < saveRegs.size()) {
if (!saveRegs[startPos + 4]) {
if (!saveRegs[startPos + 1] & !retRegs[startPos + 1]) {
saveRegs[startPos + 1] = true;
}
if (!saveRegs[startPos + 2] & !retRegs[startPos + 2]) {
saveRegs[startPos + 2] = true;
}
}
} else {
if (!saveRegs[startPos + 1] & !retRegs[startPos + 1]) {
saveRegs[startPos + 1] = true;
}
if (!saveRegs[startPos + 2] & !retRegs[startPos + 2]) {
saveRegs[startPos + 2] = true;
}
}
}
}
unsigned winBound =
std::min(static_cast<unsigned>(saveRegs.size()), startPos + 4);
for (; startPos < winBound && saveRegs[startPos]; ++startPos)
;
}
}
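// For each fcall site, compute the caller-save GRFs that are live across the
// call, i.e. allocated ranges that interfere with the call site's VCA pseudo
// node, keeping return-value registers (retRegsMap) separate from the rest.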
void GraphColor::getCallerSaveRegisters() {
unsigned callerSaveNumGRF = kernel.stackCall.getCallerSaveLastGRF() + 1;
for (BB_LIST_ITER it = builder.kernel.fg.begin();
it != builder.kernel.fg.end(); ++it) {
if ((*it)->isEndWithFCall()) {
//
// Determine the caller-save registers per call site.
//
gra.callerSaveRegsMap[(*it)].resize(callerSaveNumGRF, false);
gra.retRegsMap[(*it)].resize(callerSaveNumGRF, false);
unsigned callerSaveRegCount = 0;
G4_INST *callInst = (*it)->back();
unsigned pseudoVCAId =
builder.kernel.fg.fcallToPseudoDclMap[callInst->asCFInst()]
.VCA->getRegVar()
->getId();
vISA_ASSERT((*it)->Succs.size() == 1,
"fcall basic block cannot have more than 1 successor");
for (unsigned i = 0; i < numVar; i++) {
if (i != pseudoVCAId &&
kernel.fg.isPseudoVCEDcl(lrs[i]->getDcl()) != true &&
intf.interfereBetween(pseudoVCAId, i) == true) {
if (!builder.isPreDefArg(lrs[i]->getDcl())) {
// It is possible that we end up with unallocated spill variable
// when using new fail safe RA.
if (lrs[i]->getDcl()->isSpilled() &&
kernel.getOption(vISA_NewFailSafeRA))
continue;
// NOTE: Spilled live ranges should not be caller-save.
vISA_ASSERT(lrs[i]->getPhyReg()->isGreg(), ERROR_REGALLOC);
unsigned startReg = lrs[i]->getPhyReg()->asGreg()->getRegNum();
unsigned endReg = startReg + lrs[i]->getDcl()->getNumRows();
startReg =
(startReg < callerSaveNumGRF) ? startReg : callerSaveNumGRF;
startReg = (startReg > 0) ? startReg : 1;
endReg = (endReg < callerSaveNumGRF) ? endReg : callerSaveNumGRF;
endReg = (endReg > 0) ? endReg : 1;
for (unsigned j = startReg; j < endReg; j++) {
if (builder.isPreDefRet(lrs[i]->getDcl())) {
if (gra.retRegsMap[(*it)][j] == false) {
gra.retRegsMap[(*it)][j] = true;
}
} else {
if (gra.callerSaveRegsMap[(*it)][j] == false) {
gra.callerSaveRegsMap[(*it)][j] = true;
callerSaveRegCount++;
}
}
}
}
}
}
gra.callerSaveRegCountMap[(*it)] = callerSaveRegCount;
VISA_DEBUG_VERBOSE({
std::cout << "Caller save size: "
<< callerSaveRegCount * builder.getGRFSize()
<< " bytes for fcall at cisa id "
<< (*it)->back()->getVISAId() << "\n";
});
}
}
}
//
// Add caller save/restore code before/after each stack call.
//
void GlobalRA::addCallerSaveRestoreCode() {
uint32_t maxCallerSaveSize = 0;
for (G4_BB *bb : builder.kernel.fg) {
if (bb->isEndWithFCall()) {
//
// Determine the caller-save registers per call site.
//
G4_INST *callInst = bb->back();
G4_BB *afterFCallBB = bb->Succs.front();
OptimizeActiveRegsFootprint(callerSaveRegsMap[bb], retRegsMap[bb]);
unsigned callerSaveRegsWritten = 0;
for (bool csr : callerSaveRegsMap[bb])
callerSaveRegsWritten += (csr ? 1 : 0);
INST_LIST_ITER insertSaveIt = bb->end();
--insertSaveIt, --insertSaveIt;
while ((*insertSaveIt)->isPseudoKill()) {
--insertSaveIt;
}
vISA_ASSERT((*insertSaveIt)->isCallerSave(), ERROR_REGALLOC);
INST_LIST_ITER rmIt = insertSaveIt;
if (insertSaveIt == bb->begin()) {
insertSaveIt = bb->end();
}
if (insertSaveIt != bb->end()) {
++insertSaveIt;
} else {
insertSaveIt = bb->begin();
}
if (callerSaveRegCountMap[bb] > 0) {
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
builder.kernel.getKernelDebugInfo()->clearOldInstList();
builder.kernel.getKernelDebugInfo()->setOldInstList(bb);
}
saveActiveRegs(callerSaveRegsMap[bb], 0,
builder.kernel.fg.callerSaveAreaOffset, bb, insertSaveIt,
callerSaveInsts[callInst]);
// mark instructions for EU Fusion WA
for (auto save : callerSaveInsts[callInst])
addEUFusionCallWAInst(save);
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
auto deltaInstList =
builder.kernel.getKernelDebugInfo()->getDeltaInstructions(bb);
for (auto jt : deltaInstList) {
builder.kernel.getKernelDebugInfo()->addCallerSaveInst(bb, jt);
}
}
}
bb->erase(rmIt);
INST_LIST_ITER insertRestIt = afterFCallBB->begin();
for (; !(*insertRestIt)->isCallerRestore(); ++insertRestIt)
;
if (callerSaveRegCountMap[bb] > 0) {
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
builder.kernel.getKernelDebugInfo()->clearOldInstList();
builder.kernel.getKernelDebugInfo()->setOldInstList(afterFCallBB);
}
restoreActiveRegs(callerSaveRegsMap[bb], 0,
builder.kernel.fg.callerSaveAreaOffset, afterFCallBB,
insertRestIt, callerRestoreInsts[callInst], true);
// mark instructions for EU Fusion WA
for (auto restore : callerRestoreInsts[callInst])
addEUFusionCallWAInst(restore);
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
auto deltaInsts =
builder.kernel.getKernelDebugInfo()->getDeltaInstructions(
afterFCallBB);
for (auto jt : deltaInsts) {
builder.kernel.getKernelDebugInfo()->addCallerRestoreInst(bb, jt);
}
}
}
afterFCallBB->erase(insertRestIt);
maxCallerSaveSize = std::max(maxCallerSaveSize, callerSaveRegsWritten *
builder.getGRFSize());
}
}
auto byteOffset =
builder.kernel.fg.callerSaveAreaOffset * 16 + maxCallerSaveSize;
builder.kernel.fg.frameSizeInOWord = ROUND(byteOffset, 64) / 16;
builder.instList.clear();
}
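// Compute the callee-save GRFs written by this function, i.e. allocated ranges
// that interfere with the VCE pseudo node, clamped to the callee-save register
// range.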
void GraphColor::getCalleeSaveRegisters() {
unsigned callerSaveNumGRF = kernel.stackCall.getCallerSaveLastGRF() + 1;
unsigned numCalleeSaveRegs = kernel.stackCall.getNumCalleeSaveRegs();
// Determine the callee-save registers.
gra.calleeSaveRegs.resize(numCalleeSaveRegs, false);
gra.calleeSaveRegCount = 0;
unsigned pseudoVCEId = builder.kernel.fg.pseudoVCEDcl->getRegVar()->getId();
unsigned stackCallStartReg = kernel.stackCall.getStackCallStartReg();
for (unsigned i = 0; i < numVar; i++) {
if (pseudoVCEId != i && intf.interfereBetween(pseudoVCEId, i)) {
if (lrs[i]->getPhyReg()) {
vISA_ASSERT(lrs[i]->getPhyReg()->isGreg(), ERROR_REGALLOC);
unsigned startReg = lrs[i]->getPhyReg()->asGreg()->getRegNum();
unsigned endReg = startReg + lrs[i]->getDcl()->getNumRows();
startReg = (startReg >= callerSaveNumGRF) ? startReg : callerSaveNumGRF;
startReg =
(startReg < stackCallStartReg) ? startReg : stackCallStartReg;
endReg = (endReg >= callerSaveNumGRF) ? endReg : callerSaveNumGRF;
endReg = (endReg < stackCallStartReg) ? endReg : stackCallStartReg;
for (unsigned j = startReg; j < endReg; j++) {
if (gra.calleeSaveRegs[j - callerSaveNumGRF] == false) {
gra.calleeSaveRegs[j - callerSaveNumGRF] = true;
gra.calleeSaveRegCount++;
}
}
}
}
}
}
//
// Add callee save/restore code at stack call function entry/exit.
//
void GlobalRA::addCalleeSaveRestoreCode() {
unsigned callerSaveNumGRF = kernel.stackCall.getCallerSaveLastGRF() + 1;
OptimizeActiveRegsFootprint(calleeSaveRegs);
unsigned calleeSaveRegsWritten = 0;
for (bool b : calleeSaveRegs)
calleeSaveRegsWritten += (b ? 1 : 0);
INST_LIST_ITER insertSaveIt = builder.kernel.fg.getEntryBB()->end();
for (--insertSaveIt; !(*insertSaveIt)->isCalleeSave(); --insertSaveIt)
;
if (calleeSaveRegCount > 0) {
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
// Store old inst list so we can separate callee save
// instructions that get inserted.
builder.kernel.getKernelDebugInfo()->clearOldInstList();
builder.kernel.getKernelDebugInfo()->setOldInstList(
builder.kernel.fg.getEntryBB());
}
vISA_ASSERT(calleeSaveInsts.size() == 0,
"Unexpected size of callee save set");
saveActiveRegs(calleeSaveRegs, callerSaveNumGRF,
builder.kernel.fg.calleeSaveAreaOffset,
builder.kernel.fg.getEntryBB(), insertSaveIt,
calleeSaveInsts);
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
// Delta of oldInstList and current instList are all
// callee save instructions.
auto instList = builder.kernel.getKernelDebugInfo()->getDeltaInstructions(
builder.kernel.fg.getEntryBB());
for (auto inst : instList) {
builder.kernel.getKernelDebugInfo()->addCalleeSaveInst(inst);
}
}
}
builder.kernel.fg.getEntryBB()->erase(insertSaveIt);
INST_LIST_ITER insertRestIt = builder.kernel.fg.getUniqueReturnBlock()->end();
for (--insertRestIt; !(*insertRestIt)->isCalleeRestore(); --insertRestIt)
;
INST_LIST_ITER eraseIt = insertRestIt++;
if (calleeSaveRegCount > 0) {
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
// Store old inst list so we can separate callee save
// instructions that get inserted.
builder.kernel.getKernelDebugInfo()->clearOldInstList();
builder.kernel.getKernelDebugInfo()->setOldInstList(
builder.kernel.fg.getUniqueReturnBlock());
}
vISA_ASSERT(calleeRestoreInsts.size() == 0,
"Unexpected size of callee restore set");
restoreActiveRegs(calleeSaveRegs, callerSaveNumGRF,
builder.kernel.fg.calleeSaveAreaOffset,
builder.kernel.fg.getUniqueReturnBlock(), insertRestIt,
calleeRestoreInsts, false);
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
auto instList = builder.kernel.getKernelDebugInfo()->getDeltaInstructions(
builder.kernel.fg.getUniqueReturnBlock());
for (auto inst : instList) {
builder.kernel.getKernelDebugInfo()->addCalleeRestoreInst(inst);
}
}
}
builder.kernel.fg.getUniqueReturnBlock()->erase(eraseIt);
builder.instList.clear();
// mark instructions for EU Fusion WA
for (auto save : calleeSaveInsts)
addEUFusionCallWAInst(save);
for (auto restore : calleeRestoreInsts)
addEUFusionCallWAInst(restore);
// caller-save starts after callee-save and is 64-byte aligned
auto byteOffset = builder.kernel.fg.calleeSaveAreaOffset * 16 +
calleeSaveRegsWritten * builder.getGRFSize();
builder.kernel.fg.callerSaveAreaOffset = ROUND(byteOffset, 64) / 16;
VISA_DEBUG({
std::cout << "Callee save size: "
<< calleeSaveRegCount * builder.getGRFSize() << " bytes"
<< "\n";
});
}
//
// Add code to set up the stack frame in the kernel (genx main).
//
void GlobalRA::addGenxMainStackSetupCode() {
uint32_t fpInitVal =
(uint32_t)kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
// FIXME: a potential failure here is that frameSizeInOword is already the
// offset based on GlobalScratchOffset, which is the value of fpInitVal. So
// below we generate code to do SP = fpInitVal + frameSize, which does not
// make sense. It is correct now since when there's stack call, IGC will not
// use scratch, so fpInitVal will be 0.
unsigned frameSize = builder.kernel.fg.frameSizeInOWord;
uint16_t factor = 1;
if (useLscForSpillFill)
factor = 16;
G4_Declare *framePtr = builder.kernel.fg.framePtrDcl;
G4_Declare *stackPtr = builder.kernel.fg.stackPtrDcl;
auto entryBB = builder.kernel.fg.getEntryBB();
auto insertIt = std::find_if(entryBB->begin(), entryBB->end(),
[](G4_INST *inst) { return !inst->isLabel(); });
//
// FP = spillMemOffset
//
{
G4_DstRegRegion *dst =
builder.createDst(framePtr->getRegVar(), 0, 0, 1, Type_UD);
G4_Imm *src = builder.createImm(fpInitVal, Type_UD);
G4_INST *fpInst =
builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
insertIt = entryBB->insertBefore(insertIt, fpInst);
builder.kernel.setBEFPSetupInst(fpInst);
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
builder.kernel.getKernelDebugInfo()->setBEFPSetupInst(fpInst);
builder.kernel.getKernelDebugInfo()->setFrameSize(frameSize * 16);
}
}
//
// SP = FP + FrameSize (overflow-area offset + overflow-area size)
//
{
G4_DstRegRegion *dst =
builder.createDst(stackPtr->getRegVar(), 0, 0, 1, Type_UD);
G4_Imm *src = builder.createImm(fpInitVal + frameSize * factor, Type_UD);
G4_INST *spIncInst =
builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable, false);
builder.kernel.setBESPSetupInst(spIncInst);
entryBB->insertBefore(++insertIt, spIncInst);
}
VISA_DEBUG(std::cout << "Total frame size: " << frameSize * 16 << " bytes"
<< "\n");
}
//
// Add code to set up the stack frame in the callee.
//
void GlobalRA::addCalleeStackSetupCode() {
int frameSize = (int)builder.kernel.fg.frameSizeInOWord;
uint16_t factor = 1;
// convert framesize to bytes from oword for LSC
if (useLscForSpillFill)
factor = 16;
G4_Declare *framePtr = builder.kernel.fg.framePtrDcl;
G4_Declare *stackPtr = builder.kernel.fg.stackPtrDcl;
vISA_ASSERT(frameSize > 0, "frame size cannot be 0");
//
// BE_FP = BE_SP
// BE_SP += FrameSize
//
{
G4_DstRegRegion *dst =
builder.createDst(stackPtr->getRegVar(), 0, 0, 1, Type_UD);
G4_DstRegRegion *fp_dst =
builder.createDst(framePtr->getRegVar(), 0, 0, 1, Type_UD);
const RegionDesc *rDesc = builder.getRegionScalar();
G4_Operand *src0 =
builder.createSrc(stackPtr->getRegVar(), 0, 0, rDesc, Type_UD);
G4_Operand *sp_src =
builder.createSrc(stackPtr->getRegVar(), 0, 0, rDesc, Type_UD);
G4_Imm *src1 = builder.createImm(frameSize * factor, Type_UD);
auto createBEFP = builder.createMov(g4::SIMD1, fp_dst, sp_src,
InstOpt_WriteEnable, false);
createBEFP->addComment("vISA_FP = vISA_SP");
auto addInst = builder.createBinOp(G4_add, g4::SIMD1, dst, src0, src1,
InstOpt_WriteEnable, false);
addInst->addComment("vISA_SP += vISA_frameSize");
G4_BB *entryBB = builder.kernel.fg.getEntryBB();
auto insertIt =
std::find(entryBB->begin(), entryBB->end(), getSaveBE_FPInst());
vISA_ASSERT(insertIt != entryBB->end(), "Can't find BE_FP store inst");
builder.kernel.setBEFPSetupInst(createBEFP);
builder.kernel.setBESPSetupInst(addInst);
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
builder.kernel.getKernelDebugInfo()->setBEFPSetupInst(createBEFP);
builder.kernel.getKernelDebugInfo()->setFrameSize(frameSize * 16);
}
addEUFusionCallWAInst(createBEFP);
addEUFusionCallWAInst(addInst);
if (EUFusionCallWANeeded()) {
builder.kernel.getKernelDebugInfo()->setCallerBEFPSaveInst(createBEFP);
}
insertIt++;
entryBB->insertBefore(insertIt, createBEFP);
entryBB->insertBefore(insertIt, addInst);
}
// Stack is destroyed in function addStoreRestoreToReturn() where part FDE is
// restored before fret. This is an optimization as 1 SIMD4 instruction
// restores ret %ip, ret EM, caller's BE_FP, BE_SP.
builder.instList.clear();
VISA_DEBUG(std::cout << "\nTotal frame size: " << frameSize * 16
<< " bytes\n");
}
//
// Add A0 save/restore code for stack calls.
//
void GraphColor::addA0SaveRestoreCode() {
uint8_t numA0Elements = (uint8_t)builder.getNumAddrRegisters();
int count = 0;
for (auto bb : builder.kernel.fg) {
if (bb->isEndWithFCall()) {
G4_BB *succ = bb->Succs.front();
auto fcallInst = bb->back()->asCFInst();
G4_RegVar *assocPseudoA0 =
bb->getParent().fcallToPseudoDclMap[fcallInst].A0->getRegVar();
if (!assocPseudoA0->getPhyReg()) {
// Insert save/restore code because the pseudo node did not get an
// allocation
const char *name = builder.getNameString(20, "SA0_%d", count++);
G4_Declare *savedDcl =
builder.createDeclare(name, G4_GRF, numA0Elements, 1, Type_UW);
{
//
// (W) mov (16) TMP_GRF<1>:uw a0.0<16;16,1>:uw
//
G4_DstRegRegion *dst =
builder.createDst(savedDcl->getRegVar(), 0, 0, 1, Type_UW);
const RegionDesc *rDesc = builder.getRegionStride1();
G4_Operand *src =
builder.createSrc(regPool.getAddrReg(), 0, 0, rDesc, Type_UW);
G4_INST *saveInst = builder.createMov(
G4_ExecSize(numA0Elements), dst, src, InstOpt_WriteEnable, false);
INST_LIST_ITER insertIt = std::prev(bb->end());
bb->insertBefore(insertIt, saveInst);
gra.addEUFusionCallWAInst(saveInst);
}
{
//
// (W) mov (16) a0.0<1>:uw TMP_GRF<16;16,1>:uw
//
G4_DstRegRegion *dst =
builder.createDst(regPool.getAddrReg(), 0, 0, 1, Type_UW);
const RegionDesc *rDesc = builder.getRegionStride1();
G4_Operand *src =
builder.createSrc(savedDcl->getRegVar(), 0, 0, rDesc, Type_UW);
G4_INST *restoreInst = builder.createMov(
G4_ExecSize(numA0Elements), dst, src, InstOpt_WriteEnable, false);
auto insertIt =
std::find_if(succ->begin(), succ->end(),
[](G4_INST *inst) { return !inst->isLabel(); });
succ->insertBefore(insertIt, restoreInst);
gra.addEUFusionCallWAInst(restoreInst);
}
}
}
}
builder.instList.clear();
}
//
// Add Flag save/restore code for stack calls.
//
void GraphColor::addFlagSaveRestoreCode() {
int count = 0;
int num32BitFlags = builder.getNumFlagRegisters() / 2;
// each 32-bit flag gets a declare
// ToDo: should we use flag ARF directly here?
std::vector<G4_Declare *> tmpFlags;
for (int i = 0; i < num32BitFlags; ++i) {
G4_Declare *tmpFlag = builder.createTempFlag(2);
tmpFlag->getRegVar()->setPhyReg(regPool.getFlagAreg(i), 0);
tmpFlags.push_back(tmpFlag);
}
for (auto bb : builder.kernel.fg) {
if (bb->isEndWithFCall()) {
G4_BB *succ = bb->Succs.front();
auto fcallInst = bb->back()->asCFInst();
G4_RegVar *assocPseudoFlag =
bb->getParent().fcallToPseudoDclMap[fcallInst].Flag->getRegVar();
if (!assocPseudoFlag->getPhyReg()) {
// Insert save/restore code because the pseudo node did not get an
// allocation
const char *name = builder.getNameString(32, "SFLAG_%d", count++);
G4_Declare *savedDcl1 =
builder.createDeclare(name, G4_GRF, num32BitFlags, 1, Type_UD);
{
//
// (W) mov (1) TMP_GRF.0<1>:ud f0.0:ud
// (W) mov (1) TMP_GRF.1<1>:ud f1.0:ud
//
auto createFlagSaveInst = [&](int index) {
auto flagDcl = tmpFlags[index];
G4_DstRegRegion *dst =
builder.createDst(savedDcl1->getRegVar(), 0, index, 1, Type_UD);
G4_Operand *src = builder.createSrc(
flagDcl->getRegVar(), 0, 0, builder.getRegionScalar(), Type_UD);
return builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable,
false);
};
auto iter = std::prev(bb->end());
for (int i = 0; i < num32BitFlags; ++i) {
auto saveInst = createFlagSaveInst(i);
bb->insertBefore(iter, saveInst);
gra.addEUFusionCallWAInst(saveInst);
}
}
{
//
// mov (1) f0.0:ud TMP_GRF.0<0;1,0>:ud
// mov (1) f1.0:ud TMP_GRF.1<0;1,0>:ud
//
auto createRestoreFlagInst = [&](int index) {
auto flagDcl = tmpFlags[index];
G4_DstRegRegion *dst =
builder.createDst(flagDcl->getRegVar(), 0, 0, 1, Type_UD);
const RegionDesc *rDesc = builder.getRegionScalar();
G4_Operand *src = builder.createSrc(savedDcl1->getRegVar(), 0,
index, rDesc, Type_UD);
return builder.createMov(g4::SIMD1, dst, src, InstOpt_WriteEnable,
false);
};
auto insertIt =
std::find_if(succ->begin(), succ->end(),
[](G4_INST *inst) { return !inst->isLabel(); });
for (int i = 0; i < num32BitFlags; ++i) {
auto restoreInst = createRestoreFlagInst(i);
succ->insertBefore(insertIt, restoreInst);
gra.addEUFusionCallWAInst(restoreInst);
}
}
}
}
}
builder.instList.clear();
}
void GraphColor::getSaveRestoreRegister() {
if (!builder.getIsKernel()) {
getCalleeSaveRegisters();
}
getCallerSaveRegisters();
}
//
// Get the forbidden vector size
//
unsigned ForbiddenRegs::getForbiddenVectorSize(G4_RegFileKind regKind) const {
switch (regKind) {
case G4_GRF:
case G4_INPUT:
return builder.kernel.getNumRegTotal();
case G4_ADDRESS:
return builder.getNumAddrRegisters();
case G4_FLAG:
return builder.getNumFlagRegisters();
case G4_SCALAR:
return builder.kernel.getSRFInWords();
default:
vISA_ASSERT_UNREACHABLE("illegal reg file");
return 0;
}
}
//
// Get the forbidden vector of reserved GRFs.
// GRFs may be reserved for the user, the stack call ABI, and spill.
// This is the default RC for all GRF live ranges.
//
void ForbiddenRegs::generateReservedGRFForbidden(
unsigned reserveSpillSize) {
bool hasStackCall = builder.kernel.fg.getHasStackCalls() ||
builder.kernel.fg.getIsStackCallFunc();
uint32_t reservedGRFNum = builder.getuint32Option(vISA_ReservedGRFNum);
uint32_t reservedFromFrontGRFNum =
builder.getuint32Option(vISA_ReservedFromFrontGRFNum);
unsigned int stackCallRegSize =
hasStackCall ? builder.kernel.stackCall.numReservedABIGRF() : 0;
// r0 - Forbidden when platform is not 3d
// The last 1-3 GRFs may be reserved for stack call ABIs.
int index = static_cast<int>(forbiddenKind::FBD_RESERVEDGRF);
unsigned totalGRFNum = builder.kernel.getNumRegTotal();
forbiddenVec[index].resize(getForbiddenVectorSize(G4_GRF));
forbiddenVec[index].clear();
if (builder.kernel.getKernelType() != VISA_3D || !builder.canWriteR0() ||
reserveSpillSize > 0 || builder.kernel.getOption(vISA_PreserveR0InR0)) {
forbiddenVec[index].set(0, true);
}
if (builder.mustReserveR1()) {
// r1 is reserved for SIP kernel
forbiddenVec[index].set(1, true);
}
unsigned reservedRegSize = stackCallRegSize + reserveSpillSize;
for (unsigned int i = 0; i < reservedRegSize; i++) {
forbiddenVec[index].set(totalGRFNum - 1 - i, true);
}
unsigned largestNoneReservedReg = totalGRFNum - reservedRegSize - 1;
if (totalGRFNum - reservedRegSize >= totalGRFNum - 16) {
largestNoneReservedReg = totalGRFNum - 16 - 1;
}
if (totalGRFNum - reservedRegSize < reservedGRFNum) {
vISA_ASSERT(false, "After reservation, there are not enough registers!");
}
for (unsigned int i = 0; i < reservedGRFNum; i++) {
forbiddenVec[index].set(largestNoneReservedReg - i, true);
}
for (unsigned int i = 0; i < reservedFromFrontGRFNum; i++) {
forbiddenVec[index].set(i, true);
}
auto &fg = builder.kernel.fg;
if (fg.reserveSR) {
forbiddenVec[index].set(
fg.scratchRegDcl->getRegVar()->getPhyReg()->asGreg()->getRegNum(),
true);
}
}
// EOT messages may use only the last 16 registers
void ForbiddenRegs::generateEOTGRFForbidden() {
forbiddenVec[(size_t)forbiddenKind::FBD_EOT].resize(
getForbiddenVectorSize(G4_GRF));
forbiddenVec[(size_t)forbiddenKind::FBD_EOT].clear();
for (unsigned i = 0; i < builder.kernel.getNumRegTotal() - 16; i++) {
forbiddenVec[(size_t)forbiddenKind::FBD_EOT].set(i, true);
}
forbiddenVec[(size_t)forbiddenKind::FBD_EOT] |=
forbiddenVec[(size_t)forbiddenKind::FBD_RESERVEDGRF];
}
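// Mark the last GRF (in addition to the reserved GRFs) as forbidden.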
void ForbiddenRegs::generateLastGRFForbidden() {
forbiddenVec[(size_t)forbiddenKind::FBD_LASTGRF].resize(
getForbiddenVectorSize(G4_GRF));
forbiddenVec[(size_t)forbiddenKind::FBD_LASTGRF].clear();
forbiddenVec[(size_t)forbiddenKind::FBD_LASTGRF].set(
builder.kernel.getNumRegTotal() - 1, true);
forbiddenVec[(size_t)forbiddenKind::FBD_LASTGRF] |=
forbiddenVec[(size_t)forbiddenKind::FBD_RESERVEDGRF];
}
void ForbiddenRegs::generateEOTLastGRFForbidden() {
forbiddenVec[(size_t)forbiddenKind::FBD_EOTLASTGRF].resize(
getForbiddenVectorSize(G4_GRF));
forbiddenVec[(size_t)forbiddenKind::FBD_EOTLASTGRF].clear();
forbiddenVec[(size_t)forbiddenKind::FBD_EOTLASTGRF] |=
forbiddenVec[(size_t)forbiddenKind::FBD_EOT];
forbiddenVec[(size_t)forbiddenKind::FBD_EOTLASTGRF] |=
forbiddenVec[(size_t)forbiddenKind::FBD_LASTGRF];
}
//
// mark forbidden registers for caller-save pseudo var
//
void ForbiddenRegs::generateCallerSaveGRFForbidden() {
unsigned int startCalleeSave = builder.kernel.stackCall.calleeSaveStart();
unsigned int endCalleeSave =
startCalleeSave + builder.kernel.stackCall.getNumCalleeSaveRegs();
// r60-r124 are caller save regs for SKL
forbiddenVec[(size_t)forbiddenKind::FBD_CALLERSAVE].resize(
getForbiddenVectorSize(G4_GRF));
forbiddenVec[(size_t)forbiddenKind::FBD_CALLERSAVE].clear();
for (unsigned int i = startCalleeSave; i < endCalleeSave; i++) {
forbiddenVec[(size_t)forbiddenKind::FBD_CALLERSAVE].set(i, true);
}
forbiddenVec[(size_t)forbiddenKind::FBD_CALLERSAVE] |=
forbiddenVec[(size_t)forbiddenKind::FBD_RESERVEDGRF];
}
//
// mark forbidden registers for callee-save pseudo var
//
void ForbiddenRegs::generateCalleeSaveGRFForbidden() {
unsigned int numCallerSaveGRFs =
builder.kernel.stackCall.getCallerSaveLastGRF() + 1;
forbiddenVec[(size_t)forbiddenKind::FBD_CALLEESAVE].resize(
getForbiddenVectorSize(G4_GRF));
forbiddenVec[(size_t)forbiddenKind::FBD_CALLEESAVE].clear();
for (unsigned int i = 1; i < numCallerSaveGRFs; i++) {
forbiddenVec[(size_t)forbiddenKind::FBD_CALLEESAVE].set(i, true);
}
forbiddenVec[(size_t)forbiddenKind::FBD_CALLEESAVE] |=
forbiddenVec[(size_t)forbiddenKind::FBD_RESERVEDGRF];
}
//
// Add GRF caller/callee save/restore code for stack calls.
// localSpillAreaOwordSize specifies the starting offset of the
// caller/callee-save area in this frame. It is 64-byte aligned.
//
void GlobalRA::addSaveRestoreCode(unsigned localSpillAreaOwordSize) {
if (builder.getIsKernel()) {
builder.kernel.fg.callerSaveAreaOffset = localSpillAreaOwordSize;
} else {
builder.kernel.fg.calleeSaveAreaOffset = localSpillAreaOwordSize;
addCalleeSaveRestoreCode();
}
addCallerSaveRestoreCode();
if (builder.getIsKernel()) {
addGenxMainStackSetupCode();
} else {
addCalleeStackSetupCode();
}
stackCallProlog();
builder.instList.clear();
}
//
// If the graph has stack calls, then add the caller-save pseudo code
// immediately before and after the stack call. The pseudo code is either
// converted to actual save/restore code or is eliminated at the end of
// coloringRegAlloc().
//
void GlobalRA::addCallerSavePseudoCode() {
unsigned retID = 0;
for (G4_BB *bb : builder.kernel.fg) {
if (bb->isEndWithFCall()) {
// GRF caller save/restore
auto fcallInst = bb->back()->asCFInst();
G4_Declare *pseudoVCADcl =
bb->getParent().fcallToPseudoDclMap[fcallInst].VCA;
G4_DstRegRegion *dst =
builder.createDst(pseudoVCADcl->getRegVar(), 0, 0, 1, Type_UD);
G4_INST *saveInst = builder.createInternalIntrinsicInst(
nullptr, Intrinsic::CallerSave, g4::SIMD1, dst, nullptr, nullptr,
nullptr, InstOpt_WriteEnable);
saveInst->inheritDIFrom(fcallInst);
INST_LIST_ITER callBBIt = bb->end();
bb->insertBefore(--callBBIt, saveInst);
auto fcall = builder.getFcallInfo(bb->back());
vISA_ASSERT(fcall != std::nullopt, "fcall info not found");
uint16_t retSize = fcall->getRetSize();
if (retSize > 0) {
const char *name =
builder.getNameString(32, "FCALL_RETVAL_%d", retID++);
auto retDcl = builder.createHardwiredDeclare(
kernel.numEltPerGRF<Type_UD>() * retSize, Type_UD,
kernel.stackCall.retReg, 0);
retDcl->setName(name);
addVarToRA(retDcl);
fcallRetMap.emplace(pseudoVCADcl, retDcl);
}
vISA_ASSERT(bb->Succs.size() == 1,
"fcall basic block cannot have more than 1 successor node");
G4_BB *retBB = bb->Succs.front();
const RegionDesc *rd = builder.getRegionScalar();
G4_Operand *src =
builder.createSrc(pseudoVCADcl->getRegVar(), 0, 0, rd, Type_UD);
INST_LIST_ITER retBBIt = retBB->begin();
for (; retBBIt != retBB->end() && (*retBBIt)->isLabel(); ++retBBIt)
;
G4_INST *restoreInst = builder.createInternalIntrinsicInst(
nullptr, Intrinsic::CallerRestore, g4::SIMD1, nullptr, src, nullptr,
nullptr, InstOpt_WriteEnable);
restoreInst->inheritDIFrom(fcallInst);
retBB->insertBefore(retBBIt, restoreInst);
}
}
builder.instList.clear();
}
//
// If the graph has stack calls, then add the callee-save pseudo code at the
// entry/exit blocks of the function. The pseudo code is either converted to
// actual save/restore code or is eliminated at the end of coloringRegAlloc().
//
void GlobalRA::addCalleeSavePseudoCode() {
G4_Declare *pseudoVCEDcl = builder.kernel.fg.pseudoVCEDcl;
G4_DstRegRegion *dst =
builder.createDst(pseudoVCEDcl->getRegVar(), 0, 0, 1, Type_UD);
auto saveInst = builder.createInternalIntrinsicInst(
nullptr, Intrinsic::CalleeSave, g4::SIMD1, dst, nullptr, nullptr, nullptr,
InstOpt_WriteEnable);
INST_LIST_ITER insertIt = builder.kernel.fg.getEntryBB()->begin();
for (; insertIt != builder.kernel.fg.getEntryBB()->end() &&
(*insertIt)->isLabel();
++insertIt) { /* void */
};
builder.kernel.fg.getEntryBB()->insertBefore(insertIt, saveInst);
G4_BB *exitBB = builder.kernel.fg.getUniqueReturnBlock();
const RegionDesc *rDesc = builder.getRegionScalar();
G4_Operand *src =
builder.createSrc(pseudoVCEDcl->getRegVar(), 0, 0, rDesc, Type_UD);
G4_INST *restoreInst = builder.createInternalIntrinsicInst(
nullptr, Intrinsic::CalleeRestore, g4::SIMD1, nullptr, src, nullptr,
nullptr, InstOpt_WriteEnable);
INST_LIST_ITER exitBBIt = exitBB->end();
--exitBBIt;
vISA_ASSERT((*exitBBIt)->isFReturn(), ERROR_REGALLOC);
exitBB->insertBefore(exitBBIt, restoreInst);
builder.instList.clear();
}
void GlobalRA::storeCEInProlog() {
if (!kernel.getOption(vISA_storeCE))
return;
// If we have to store CE in the prolog, we emit:
// TmpReg (GRF_Aligned) = CE0.0
// Store TmpReg @ FP+Offset
//
// Where Offset = 1 GRF size in bytes
// Create new variable equal to GRF size so it's always GRF aligned.
// It's transitory so shouldn't impact register pressure. We want to
// write CE0.0 in 0th location of this variable so that it can be
// used as send payload.
auto TmpReg = builder.createDeclare(
"TmpCEReg", G4_GRF, builder.numEltPerGRF<Type_UD>(), 1, Type_UD);
auto *DstRgn = builder.createDstRegRegion(TmpReg, 1);
auto *CEReg = regPool.getMask0Reg();
auto *SrcOpnd = builder.createSrc(
CEReg, 0, 0, kernel.fg.builder->getRegionScalar(), Type_UD);
auto Mov = builder.createMov(g4::SIMD1, DstRgn, SrcOpnd,
G4_InstOption::InstOpt_WriteEnable, false);
auto nextPos = kernel.fg.getEntryBB()->insertBefore(
kernel.fg.getEntryBB()->getFirstInsertPos(), Mov);
auto payloadSrc =
builder.createSrcRegRegion(TmpReg, builder.getRegionStride1());
const unsigned execSize = 8;
G4_DstRegRegion *postDst = builder.createNullDst(Type_UD);
G4_INST *store = nullptr;
unsigned int HWOffset = builder.numEltPerGRF<Type_UB>() / getHWordByteSize();
vISA_ASSERT(kernel.stackCall.getFrameDescriptorByteSize() <=
builder.numEltPerGRF<Type_UB>(),
"ce0 overwrote FDE");
kernel.getKernelDebugInfo()->setCESaveOffset(HWOffset * getHWordByteSize());
if (builder.supportsLSC()) {
auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
store = builder.createSpill(postDst, headerOpnd, payloadSrc,
G4_ExecSize(execSize), 1, HWOffset,
builder.getBEFP(), InstOpt_WriteEnable, false);
} else {
store = builder.createSpill(postDst, payloadSrc, G4_ExecSize(execSize), 1,
HWOffset, builder.getBEFP(),
InstOpt_WriteEnable, false);
}
kernel.fg.getEntryBB()->insertAfter(nextPos, store);
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
builder.kernel.getKernelDebugInfo()->setSaveCEInst(store);
}
}
//
// Insert store r125.[0-4] at entry and restore before return.
// Dst of store will be a hardwired temp at upper end of caller save area.
// This method emits:
// (W) mov (4) CallerSaveRetIp_BE_FP<1>:ud r125.0<4;4,1>:ud <-- in prolog
// (W) mov (4) r125.0<1>:ud CallerSaveRetIp_BE_FP<4;4,1>:ud <-- in epilog
void GlobalRA::addStoreRestoreToReturn() {
unsigned int size = 4;
if (kernel.stackCall.getVersion() ==
StackCallABI::StackCallABIVersion::VER_3)
size = 8;
unsigned regNum = kernel.stackCall.getCallerSaveLastGRF();
unsigned subRegNum = kernel.numEltPerGRF<Type_UD>() - size;
oldFPDcl = builder.createHardwiredDeclare(size, Type_UD, regNum, subRegNum);
oldFPDcl->setName(builder.getNameString(24, "CallerSaveRetIp_BE_FP"));
G4_DstRegRegion *oldFPDst =
builder.createDst(oldFPDcl->getRegVar(), 0, 0, 1, Type_UD);
const RegionDesc *rd = builder.getRegionStride1();
G4_Operand *oldFPSrc =
builder.createSrc(oldFPDcl->getRegVar(), 0, 0, rd, Type_UD);
unsigned saveRestoreSubReg =
kernel.stackCall.getVersion() == StackCallABI::StackCallABIVersion::VER_3
? kernel.stackCall.subRegs.BE_FP
: kernel.stackCall.subRegs.Ret_IP;
auto saveRestoreDecl = builder.createHardwiredDeclare(
size, Type_UD, kernel.stackCall.getFPSPGRF(), saveRestoreSubReg);
addVarToRA(saveRestoreDecl);
saveRestoreDecl->setName(builder.getNameString(24, "SR_BEStack"));
G4_DstRegRegion *FPdst =
builder.createDst(saveRestoreDecl->getRegVar(), 0, 0, 1, Type_UD);
rd = builder.getRegionStride1();
G4_Operand *FPsrc =
builder.createSrc(saveRestoreDecl->getRegVar(), 0, 0, rd, Type_UD);
saveBE_FPInst = builder.createMov(size == 4 ? g4::SIMD4 : g4::SIMD8, oldFPDst,
FPsrc, InstOpt_WriteEnable, false);
saveBE_FPInst->addComment("save vISA SP/FP to temp");
builder.setPartFDSaveInst(saveBE_FPInst);
auto entryBB = builder.kernel.fg.getEntryBB();
auto insertIt = std::find_if(entryBB->begin(), entryBB->end(),
[](G4_INST *inst) { return !inst->isLabel(); });
entryBB->insertBefore(insertIt, saveBE_FPInst);
auto fretBB = builder.kernel.fg.getUniqueReturnBlock();
auto iter = std::prev(fretBB->end());
vISA_ASSERT((*iter)->isFReturn(), "fret BB must end with fret");
// The following 4 cases exist for combinations of the EU fusion WA and -skipFDE:
// 1. No WA needed, no -skipFDE: restore r127 from r59
// 2. No WA needed, -skipFDE: restore r127 from r59 and skip FDE store in
// leaf function
// 3. WA needed, no -skipFDE: restore r127 using load reading FDE
// 4. WA needed, -skipFDE: restore r127 from r59 in leaf function. In
// non-leaf functions, use a load to read the stored FDE.
if (!EUFusionCallWANeeded() || canSkipFDE()) {
restoreBE_FPInst =
builder.createMov(size == 4 ? g4::SIMD4 : g4::SIMD8, FPdst, oldFPSrc,
InstOpt_WriteEnable, false);
fretBB->insertBefore(iter, restoreBE_FPInst);
} else {
// emit frame descriptor
auto dstDcl =
builder.createHardwiredDeclare(8, Type_UD, kernel.stackCall.getFPSPGRF(), 0);
dstDcl->setName(builder.getNameString(24, "FrameDescriptorGRF"));
auto dstData = builder.createDstRegRegion(dstDcl, 1);
const unsigned execSize = 8;
G4_INST *load = nullptr;
if (builder.supportsLSC()) {
auto headerOpnd = getSpillFillHeader(*kernel.fg.builder, nullptr);
load =
builder.createFill(headerOpnd, dstData, G4_ExecSize(execSize), 1, 0,
builder.getBEFP(), InstOpt_WriteEnable, false);
} else {
load = builder.createFill(dstData, G4_ExecSize(execSize), 1, 0,
builder.getBEFP(), InstOpt_WriteEnable, false);
}
fretBB->insertBefore(iter, load);
addEUFusionCallWAInst(load);
restoreBE_FPInst = load;
}
restoreBE_FPInst->addComment("restore vISA SP/FP from temp");
if (builder.kernel.getOption(vISA_GenerateDebugInfo)) {
builder.kernel.getKernelDebugInfo()->setCallerBEFPRestoreInst(
restoreBE_FPInst);
builder.kernel.getKernelDebugInfo()->setCallerSPRestoreInst(
restoreBE_FPInst);
if (!EUFusionCallWANeeded())
builder.kernel.getKernelDebugInfo()->setCallerBEFPSaveInst(saveBE_FPInst);
}
}
void GlobalRA::updateDefSet(std::set<G4_Declare *> &defs,
G4_Declare *referencedDcl) {
// Get topmost dcl
while (referencedDcl->getAliasDeclare() != NULL) {
referencedDcl = referencedDcl->getAliasDeclare();
}
defs.insert(referencedDcl);
}
void GlobalRA::detectUndefinedUses(LivenessAnalysis &liveAnalysis,
G4_Kernel &kernel) {
// This function iterates over each inst and checks whether there is
// a reaching def for each src operand.
VISA_DEBUG_VERBOSE({
std::cout << "\n";
if (liveAnalysis.livenessClass(G4_FLAG)) {
std::cout << "=== Uses with reaching def - Flags ===\n";
} else if (liveAnalysis.livenessClass(G4_ADDRESS)) {
std::cout << "=== Uses with reaching def - Address ===\n";
} else {
std::cout << "=== Uses with reaching def - GRF ===\n";
}
if (useLocalRA) {
std::cout
<< "(Use -nolocalra switch for accurate results of uses without "
"reaching defs)\n";
}
});
for (G4_BB *bb : kernel.fg) {
std::set<G4_Declare *> defs;
std::set<G4_Declare *>::iterator defs_it;
G4_Declare *referencedDcl = nullptr;
for (G4_INST *inst : *bb) {
// Src/predicate opnds are uses
if (inst->getPredicate() && inst->getPredicate()->getBase() &&
inst->getPredicate()->getBase()->isRegVar() &&
inst->getPredicate()->getBase()->isRegAllocPartaker()) {
referencedDcl = inst->getPredicate()
->asPredicate()
->getBase()
->asRegVar()
->getDeclare();
reportUndefinedUses(liveAnalysis, bb, inst, referencedDcl, defs,
Opnd_pred);
}
for (unsigned i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++) {
G4_Operand *opnd = inst->getSrc(i);
if (opnd && opnd->isAddrExp() == false && opnd->getBase() &&
opnd->getBase()->isRegVar() &&
opnd->getBase()->isRegAllocPartaker()) {
referencedDcl = opnd->getBase()->asRegVar()->getDeclare();
reportUndefinedUses(liveAnalysis, bb, inst, referencedDcl, defs,
(Gen4_Operand_Number)(i + Opnd_src0));
}
}
// Dst/cond modifier opnds are defs
if (inst->getCondModBase() && inst->getCondMod()->getBase()->isRegVar() &&
inst->getCondMod()->getBase()->isRegAllocPartaker()) {
referencedDcl = inst->getCondMod()
->asCondMod()
->getBase()
->asRegVar()
->getDeclare();
updateDefSet(defs, referencedDcl);
}
if (inst->getDst() && inst->getDst()->getBase() &&
inst->getDst()->getBase()->isRegVar() &&
inst->getDst()->getBase()->isRegAllocPartaker()) {
referencedDcl = inst->getDst()->getBase()->asRegVar()->getDeclare();
updateDefSet(defs, referencedDcl);
}
}
}
VISA_DEBUG_VERBOSE(std::cout << "\n\n");
}
void GlobalRA::detectNeverDefinedUses() {
// Detect variables that are used but never defined in entire CFG.
// This does not use liveness information.
// Map all referenced decls from the symbol table as keys; the mapped boolean
// value indicates whether the dcl is defined in the kernel or not.
std::map<G4_Declare *, bool> vars;
std::map<G4_Declare *, bool>::iterator map_it;
for (auto bb : kernel.fg) {
for (G4_INST *inst : *bb) {
G4_Declare *referencedDcl = nullptr;
if (inst->getDst() && inst->getDst()->getBase() &&
inst->getDst()->getBase()->isRegVar()) {
referencedDcl = inst->getDst()->getBaseRegVarRootDeclare();
// Always insert top-most dcl
map_it = vars.find(referencedDcl);
if (map_it == vars.end()) {
vars.emplace(referencedDcl, true);
} else {
map_it->second = true;
}
}
if (inst->getCondModBase() && inst->getCondMod()->getBase()->isRegVar()) {
referencedDcl = inst->getCondMod()->getBaseRegVarRootDeclare();
map_it = vars.find(referencedDcl);
if (map_it == vars.end()) {
vars.emplace(referencedDcl, true);
} else {
map_it->second = true;
}
}
if (inst->getPredicate() && inst->getPredicate()->getBase() &&
inst->getPredicate()->getBase()->isRegVar()) {
referencedDcl = inst->getPredicate()->getBaseRegVarRootDeclare();
// Check whether dcl was already added to list.
// If not, add it with flag set to false to indicate
// that a use was found but a def hasn't been seen yet.
map_it = vars.find(referencedDcl);
if (map_it == vars.end()) {
vars.emplace(referencedDcl, false);
}
}
for (unsigned i = 0, numSrc = inst->getNumSrc(); i < numSrc; i++) {
G4_Operand *opnd = inst->getSrc(i);
if (opnd && opnd->getBase() && opnd->getBase()->isRegVar()) {
referencedDcl = opnd->getBaseRegVarRootDeclare();
map_it = vars.find(referencedDcl);
if (map_it == vars.end()) {
vars.emplace(referencedDcl, false);
}
}
}
}
}
VISA_DEBUG_VERBOSE(std::cout
<< "\n=== Variables used but never defined ===\n\n");
for (auto dcl : kernel.Declares) {
while (dcl->getAliasDeclare())
dcl = dcl->getAliasDeclare();
map_it = vars.find(dcl);
if (map_it != vars.end()) {
if (map_it->second == false && dcl->getRegFile() != G4_INPUT &&
dcl->getAddressed() == false) {
// No def found for this non-input variable in
// entire CFG so report it.
VISA_DEBUG_VERBOSE({
std::cout << dcl->getName();
if (dcl->getRegFile() == G4_GRF) {
std::cout << " (General)";
} else if (dcl->getRegFile() == G4_ADDRESS) {
std::cout << " (Address)";
} else if (dcl->getRegFile() == G4_FLAG) {
std::cout << " (Flag)";
}
std::cout << "\n";
});
}
}
}
VISA_DEBUG_VERBOSE(std::cout << "\n\n");
}
//
// Check the overlap of two source ranges and split them if they overlap,
// e.g., range1: 0~63, range2: 32~95 --> 0~31, 32~63, 64~95
// or range1: 0~63, range2: 32~63 --> 0~31, 32~63
//
VarRange *VarSplit::splitVarRange(VarRange *src1, VarRange *src2,
std::stack<VarRange *> *toDelete) {
VarRange *new_var_range = nullptr;
vISA_ASSERT(!(src1->leftBound == src2->leftBound &&
src1->rightBound == src2->rightBound),
"Same ranges can not be spiltted");
if (src1->leftBound > src2->rightBound ||
src1->rightBound < src2->leftBound) // No overlap
{
return NULL;
}
unsigned left1 = std::min(src1->leftBound, src2->leftBound); // left
unsigned right1 = std::max(src1->leftBound, src2->leftBound);
unsigned left2 = std::min(src1->rightBound, src2->rightBound); // right
unsigned right2 = std::max(src1->rightBound, src2->rightBound);
if (left1 == right1) // Same left
{
src1->leftBound = left1;
src1->rightBound = left2;
src2->leftBound = left2 + 1;
src2->rightBound = right2;
} else if (left2 == right2) // Same right
{
src1->leftBound = left1;
src1->rightBound = right1 - 1;
src2->leftBound = right1;
src2->rightBound = right2;
} else // No same boundary
{
src1->leftBound = left1; // Left one: in list already
src1->rightBound = right1 - 1;
src2->leftBound = left2 + 1; // Right one: keep in list
src2->rightBound = right2;
new_var_range = new VarRange;
new_var_range->leftBound = right1; // Middle one: need add one range object
new_var_range->rightBound = left2;
toDelete->push(new_var_range);
}
return new_var_range;
}
//
// Scan the range list and insert the new range into it.
// Range splitting is applied if required.
//
void VarSplit::rangeListSpliting(VAR_RANGE_LIST *rangeList, G4_Operand *opnd,
std::stack<VarRange *> *toDelete) {
VarRange *range = new VarRange;
range->leftBound = opnd->getLeftBound();
range->rightBound = opnd->getRightBound();
toDelete->push(range);
VAR_RANGE_LIST_ITER it = rangeList->begin();
// The ranges in the list are ordered from low to high
while (it != rangeList->end()) {
if ((*it)->leftBound == range->leftBound &&
((*it)->rightBound == range->rightBound)) {
// Same range exists in the list already
return;
}
if ((*it)->leftBound > range->rightBound) {
// The range item in the list is to the right of the current range, so insert
// before this position. Since the whole range is inserted first, all the
// ranges should be continuous.
vISA_ASSERT((*it)->leftBound - range->rightBound == 1,
"none continuous spliting happened\n");
rangeList->insert(it, range);
return;
}
// Overlap happened, do splitting.
//(*it) is updated to the left range
//"range" is updated to the right range
// If "newRange" is not NULL, it's the middle range.
VarRange *newRange = splitVarRange((*it), range, toDelete);
// Insert the middle one
it++;
if (newRange) {
it = rangeList->insert(it, newRange);
}
}
rangeList->push_back(range); // Insert the right one
return;
}
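// Compute the declare width (elements per row) and height (rows) for
// numberElements elements of the given type, assuming the variable starts at a
// GRF boundary.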
void VarSplit::getHeightWidth(G4_Type type, unsigned numberElements,
unsigned short &dclWidth,
unsigned short &dclHeight,
int &totalByteSize) const {
dclWidth = 1, dclHeight = 1;
totalByteSize = numberElements * TypeSize(type);
if (totalByteSize <= (int)kernel.numEltPerGRF<Type_UB>()) {
dclWidth = (uint16_t)numberElements;
} else {
// here we assume that the start point of the var is the beginning of a GRF?
// so subregister must be 0?
dclWidth = kernel.numEltPerGRF<Type_UB>() / TypeSize(type);
dclHeight = totalByteSize / kernel.numEltPerGRF<Type_UB>();
if (totalByteSize % kernel.numEltPerGRF<Type_UB>() != 0) {
dclHeight++;
}
}
}
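// Split a multi-GRF declare into sub-declares of 1 GRF (SIMD8) or 2 GRFs
// (SIMD16 and above), recording each sub-declare's byte offset within the
// parent. Only declares whose size is a whole multiple of the GRF size are
// split.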
void VarSplit::createSubDcls(G4_Kernel &kernel, G4_Declare *oldDcl,
std::vector<G4_Declare *> &splitDclList) {
if (oldDcl->getByteSize() <= kernel.numEltPerGRF<Type_UB>() ||
oldDcl->getByteSize() % kernel.numEltPerGRF<Type_UB>()) {
return;
}
int splitVarSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
for (unsigned i = 0, bSizePerGRFSize = (oldDcl->getByteSize() /
kernel.numEltPerGRF<Type_UB>());
i < bSizePerGRFSize; i += splitVarSize) {
G4_Declare *splitDcl = NULL;
unsigned leftBound = i * kernel.numEltPerGRF<Type_UB>();
unsigned rightBound =
(i + splitVarSize) * kernel.numEltPerGRF<Type_UB>() - 1;
unsigned short dclWidth = 0;
unsigned short dclHeight = 0;
int dclTotalSize = 0;
getHeightWidth(oldDcl->getElemType(),
(rightBound - leftBound + 1) / oldDcl->getElemSize(),
dclWidth, dclHeight, dclTotalSize);
const char *splitDclName = kernel.fg.builder->getNameString(
16, "split_%d_%s", i, oldDcl->getName());
splitDcl = kernel.fg.builder->createDeclare(
splitDclName, G4_GRF, dclWidth, dclHeight, oldDcl->getElemType());
gra.setSubOffset(splitDcl, leftBound);
splitDcl->copyAlign(oldDcl);
gra.copyAlignment(splitDcl, oldDcl);
unsigned nElementSize =
(rightBound - leftBound + 1) / oldDcl->getElemSize();
if ((rightBound - leftBound + 1) % oldDcl->getElemSize()) {
nElementSize++;
}
splitDcl->setTotalElems(nElementSize);
splitDclList.push_back(splitDcl);
}
return;
}
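// After a def of oldDcl, insert movs that copy the defined portion of oldDcl
// into each overlapping split sub-declare, preserving the def's mask options.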
void VarSplit::insertMovesToTemp(IR_Builder &builder, G4_Declare *oldDcl,
G4_Operand *dstOpnd, G4_BB *bb,
INST_LIST_ITER instIter,
std::vector<G4_Declare *> &splitDclList) {
G4_INST *inst = (*instIter);
INST_LIST_ITER iter = instIter;
iter++;
for (size_t i = 0, size = splitDclList.size(); i < size; i++) {
G4_Declare *subDcl = splitDclList[i];
unsigned leftBound = gra.getSubOffset(subDcl);
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
if (!(dstOpnd->getRightBound() < leftBound ||
rightBound < dstOpnd->getLeftBound())) {
unsigned maskFlag = (inst->getOption() & 0xFFF010C);
G4_DstRegRegion *dst = builder.createDstRegRegion(subDcl, 1);
auto src = builder.createSrc(
oldDcl->getRegVar(),
(gra.getSubOffset(subDcl)) / kernel.numEltPerGRF<Type_UB>(), 0,
builder.getRegionStride1(), oldDcl->getElemType());
G4_INST *splitInst = builder.createMov(
G4_ExecSize(subDcl->getTotalElems()), dst, src, maskFlag, false);
bb->insertBefore(iter, splitInst);
if (splitInst->isWriteEnableInst() && gra.EUFusionNoMaskWANeeded()) {
gra.addEUFusionNoMaskWAInst(bb, splitInst);
}
}
}
return;
}
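// Rewrite a source use of the parent declare to read from the sub-declares
// instead. If the use spans exactly splitSize GRFs, the source is redirected
// to the matching sub-declare; otherwise the overlapping sub-declares are
// first copied into a fresh temp and the source reads from that temp.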
void VarSplit::insertMovesFromTemp(G4_Kernel &kernel, G4_Declare *oldDcl,
int index, G4_Operand *srcOpnd, int pos,
G4_BB *bb, INST_LIST_ITER instIter,
std::vector<G4_Declare *> &splitDclList) {
G4_INST *inst = (*instIter);
int sizeInGRF = (srcOpnd->getRightBound() - srcOpnd->getLeftBound() +
kernel.numEltPerGRF<Type_UB>() - 1) /
kernel.numEltPerGRF<Type_UB>();
int splitSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
if (sizeInGRF != splitSize) {
unsigned short dclWidth = 0;
unsigned short dclHeight = 0;
int dclTotalSize = 0;
G4_SrcRegRegion *oldSrc = srcOpnd->asSrcRegRegion();
getHeightWidth(oldSrc->getType(),
(srcOpnd->getRightBound() - srcOpnd->getLeftBound() + 1) /
oldSrc->getElemSize(),
dclWidth, dclHeight, dclTotalSize);
const char *newDclName = kernel.fg.builder->getNameString(
16, "copy_%d_%s", index, oldDcl->getName());
G4_Declare *newDcl = kernel.fg.builder->createDeclare(
newDclName, G4_GRF, dclWidth, dclHeight, oldSrc->getType());
newDcl->copyAlign(oldDcl);
gra.copyAlignment(newDcl, oldDcl);
unsigned newLeftBound = 0;
for (size_t i = 0, size = splitDclList.size(); i < size; i++) {
G4_Declare *subDcl = splitDclList[i];
unsigned leftBound = gra.getSubOffset(subDcl);
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
if (!(srcOpnd->getRightBound() < leftBound ||
rightBound < srcOpnd->getLeftBound())) {
G4_DstRegRegion *dst = kernel.fg.builder->createDst(
newDcl->getRegVar(), newLeftBound / kernel.numEltPerGRF<Type_UB>(),
0, 1, oldSrc->getType());
newLeftBound += subDcl->getByteSize();
G4_SrcRegRegion *src = kernel.fg.builder->createSrc(
subDcl->getRegVar(), 0, 0, kernel.fg.builder->getRegionStride1(),
oldSrc->getType());
G4_INST *movInst =
kernel.fg.builder->createMov(G4_ExecSize(subDcl->getTotalElems()),
dst, src, InstOpt_WriteEnable, false);
bb->insertBefore(instIter, movInst);
if (gra.EUFusionNoMaskWANeeded()) {
gra.addEUFusionNoMaskWAInst(bb, movInst);
}
}
}
auto newSrc = kernel.fg.builder->createSrcRegRegion(
oldSrc->getModifier(), Direct, newDcl->getRegVar(), 0,
oldSrc->getSubRegOff(), oldSrc->getRegion(), newDcl->getElemType());
inst->setSrc(newSrc, pos);
} else {
for (size_t i = 0, size = splitDclList.size(); i < size; i++) {
G4_Declare *subDcl = splitDclList[i];
unsigned leftBound = gra.getSubOffset(subDcl);
unsigned rightBound = leftBound + subDcl->getByteSize() - 1;
if (!(srcOpnd->getRightBound() < leftBound ||
rightBound < srcOpnd->getLeftBound())) {
G4_SrcRegRegion *oldSrc = srcOpnd->asSrcRegRegion();
G4_SrcRegRegion *newSrc = kernel.fg.builder->createSrcRegRegion(
oldSrc->getModifier(), Direct, subDcl->getRegVar(), 0,
oldSrc->getSubRegOff(), oldSrc->getRegion(), oldSrc->getType());
inst->setSrc(newSrc, pos);
break;
}
}
}
return;
}
bool VarSplit::canDoGlobalSplit(IR_Builder &builder, G4_Kernel &kernel,
uint32_t sendSpillRefCount) {
if (!builder.getOption(vISA_GlobalSendVarSplit)) {
return false;
}
if (!builder.getOption(vISA_Debug) && // Not work in debug mode
kernel.getInt32KernelAttr(Attributes::ATTR_Target) ==
VISA_3D && // Only works for 3D/OCL/OGL
sendSpillRefCount) {
return true;
}
return false;
}
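// Global send-variable split: (1) collect send destinations that define a
// whole multi-GRF declare, (2) collect the partial source reads of those
// declares, (3) drop declares that are too small or whose def/use distance
// does not pay off, and (4) create the sub-declares and rewrite defs/uses
// via insertMovesToTemp/insertMovesFromTemp.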
void VarSplit::globalSplit(IR_Builder &builder, G4_Kernel &kernel) {
typedef std::list<
std::tuple<G4_BB *, G4_Operand *, int, unsigned, INST_LIST_ITER>>
SPLIT_OPERANDS;
typedef std::list<std::tuple<G4_BB *, G4_Operand *, int, unsigned,
INST_LIST_ITER>>::iterator SPLIT_OPERANDS_ITER;
typedef std::map<G4_RegVar *, SPLIT_OPERANDS> SPLIT_DECL_OPERANDS;
typedef std::map<G4_RegVar *, SPLIT_OPERANDS>::iterator
SPLIT_DECL_OPERANDS_ITER;
SPLIT_DECL_OPERANDS splitDcls;
unsigned instIndex = 0;
int splitSize = kernel.getSimdSize() == g4::SIMD8 ? 1 : 2;
for (auto bb : kernel.fg) {
for (INST_LIST_ITER it = bb->begin(), iend = bb->end(); it != iend;
++it, ++instIndex) {
G4_INST *inst = (*it);
G4_DstRegRegion *dst = inst->getDst();
if (inst->isLifeTimeEnd() || inst->isPseudoKill()) {
continue;
}
//
// process send destination operand
//
if (inst->isSend() &&
inst->getMsgDesc()->getDstLenRegs() > (size_t)splitSize &&
inst->asSendInst()->isDirectSplittableSend()) {
G4_DstRegRegion *dstrgn = dst;
G4_Declare *topdcl = GetTopDclFromRegRegion(dstrgn);
if (topdcl && dstrgn->getRegAccess() == Direct &&
!topdcl->getAddressed() && topdcl->getRegFile() != G4_INPUT &&
(dstrgn->getRightBound() - dstrgn->getLeftBound() + 1) ==
topdcl->getByteSize() &&
(dstrgn->getRightBound() - dstrgn->getLeftBound()) >
kernel.numEltPerGRF<Type_UB>()) {
// The tuple<G4_BB*, G4_Operand*, int pos, unsigned instIndex,
// INST_LIST_ITER> records the info needed for tuning and for generating the
// split operands/instructions.
splitDcls[topdcl->getRegVar()].push_front(
make_tuple(bb, dst, 0, instIndex, it));
}
}
}
}
instIndex = 0;
for (auto bb : kernel.fg) {
for (INST_LIST_ITER it = bb->begin(), end = bb->end(); it != end;
++it, ++instIndex) {
G4_INST *inst = (*it);
if (inst->isLifeTimeEnd() || inst->isPseudoKill()) {
continue;
}
//
// process each source operand
//
for (unsigned j = 0, numSrc = inst->getNumSrc(); j < numSrc; j++) {
G4_Operand *src = inst->getSrc(j);
if (src == NULL) {
continue;
}
if (src->isSrcRegRegion()) {
G4_Declare *topdcl = GetTopDclFromRegRegion(src);
if (topdcl && topdcl->getRegFile() != G4_INPUT &&
!topdcl->getAddressed() &&
splitDcls.find(topdcl->getRegVar()) != splitDcls.end() &&
((src->asSrcRegRegion()->getRightBound() -
src->asSrcRegRegion()->getLeftBound() + 1) <
topdcl->getByteSize()) &&
src->asSrcRegRegion()->getRegAccess() ==
Direct) // We don't split the indirect access
{
splitDcls[topdcl->getRegVar()].push_back(
make_tuple(bb, src, j, instIndex, it));
}
}
}
}
}
for (SPLIT_DECL_OPERANDS_ITER it = splitDcls.begin();
it != splitDcls.end();) {
unsigned srcIndex = 0xFFFFFFFF;
unsigned dstIndex = 0;
SPLIT_DECL_OPERANDS_ITER succIt = it;
succIt++;
G4_Declare *topDcl = it->first->getDeclare();
if (topDcl->getByteSize() <= kernel.numEltPerGRF<Type_UB>() * 2u) {
splitDcls.erase(it);
it = succIt;
continue;
}
bool hasSrcOperand = false;
for (SPLIT_OPERANDS_ITER vt = it->second.begin(); vt != it->second.end();
vt++) {
G4_BB *bb = nullptr;
G4_Operand *opnd = nullptr;
INST_LIST_ITER instIter;
int pos = 0;
unsigned iIndex = 0;
std::tie(bb, opnd, pos, iIndex, instIter) = (*vt);
if (opnd == nullptr) {
continue;
}
if (opnd->isDstRegRegion()) {
dstIndex = std::max(dstIndex, iIndex);
}
if (opnd->isSrcRegRegion()) {
srcIndex = std::min(srcIndex, iIndex);
hasSrcOperand = true;
}
}
if (!hasSrcOperand ||
(dstIndex > srcIndex && dstIndex - srcIndex < it->second.size() + 1)) {
splitDcls.erase(it);
it = succIt;
continue;
}
it++;
}
for (SPLIT_DECL_OPERANDS_ITER it = splitDcls.begin(); it != splitDcls.end();
it++) {
G4_Declare *topDcl = it->first->getDeclare();
std::vector<G4_Declare *> splitDclList;
splitDclList.clear();
createSubDcls(kernel, topDcl, splitDclList);
int srcIndex = 0;
for (SPLIT_OPERANDS_ITER vt = it->second.begin(); vt != it->second.end();
vt++) {
G4_BB *bb = nullptr;
G4_Operand *opnd = nullptr;
INST_LIST_ITER instIter;
int pos = 0;
unsigned instIndex = 0;
std::tie(bb, opnd, pos, instIndex, instIter) = (*vt);
if (opnd == nullptr) {
continue;
}
if (opnd->isDstRegRegion()) {
insertMovesToTemp(builder, topDcl, opnd, bb, instIter, splitDclList);
}
if (opnd->isSrcRegRegion()) {
insertMovesFromTemp(kernel, topDcl, srcIndex, opnd, pos, bb, instIter,
splitDclList);
}
srcIndex++;
}
}
return;
}
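// Local (per-BB) split: walk the block bottom-up, record the byte ranges with
// which each block-local, send-related GRF variable is accessed, and create a
// GRF-aligned partial declare for every distinct range so the pieces can be
// colored independently.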
void VarSplit::localSplit(IR_Builder &builder, G4_BB *bb) {
class CmpRegVarId {
public:
bool operator()(G4_RegVar *first, G4_RegVar *second) const {
return first->getDeclare()->getDeclId() <
second->getDeclare()->getDeclId();
}
};
std::map<G4_RegVar *, std::vector<std::pair<G4_Operand *, INST_LIST_ITER>>,
CmpRegVarId>
localRanges;
std::map<G4_RegVar *, std::vector<std::pair<G4_Operand *, INST_LIST_ITER>>,
CmpRegVarId>::iterator localRangesIt;
std::map<G4_RegVar *, VarRangeListPackage, CmpRegVarId> varRanges;
std::map<G4_RegVar *, VarRangeListPackage, CmpRegVarId>::iterator varRangesIt;
std::stack<VarRange *> toDelete;
// Skip BB if there are no sends.
bool hasSends = std::any_of(bb->begin(), bb->end(),
[](G4_INST *inst) { return inst->isSend(); });
if (!hasSends)
return;
//
// Iterate instruction in BB from back to front
//
for (INST_LIST::reverse_iterator rit = bb->rbegin(), rend = bb->rend();
rit != rend; ++rit) {
G4_INST *i = (*rit);
G4_DstRegRegion *dst = i->getDst();
if (i->isLifeTimeEnd() || i->isPseudoKill()) {
continue;
}
//
// process destination operand
//
if (dst) {
// It's an RA candidate
G4_Declare *topdcl = GetTopDclFromRegRegion(dst);
LocalLiveRange *topdclLR = nullptr;
// Local only
if ((topdcl && (topdclLR = gra.getLocalLR(topdcl)) &&
topdcl->getIsRefInSendDcl() && topdclLR->isLiveRangeLocal()) &&
topdcl->getRegFile() == G4_GRF) {
varRangesIt = varRanges.find(topdcl->getRegVar());
INST_LIST_ITER iterToInsert = rit.base();
iterToInsert--; // Point to the iterator of current instruction
if (varRangesIt == varRanges.end()) {
VarRange *new_range = new VarRange;
new_range->leftBound = 0;
new_range->rightBound = topdcl->getByteSize() - 1;
toDelete.push(new_range);
varRanges[topdcl->getRegVar()].list.push_back(new_range);
} else {
rangeListSpliting(&(varRanges[topdcl->getRegVar()].list), dst,
&toDelete);
}
localRanges[topdcl->getRegVar()].emplace_back(
dst, iterToInsert); // Ordered from back to front.
}
}
//
// process each source operand
//
for (unsigned j = 0, numSrc = i->getNumSrc(); j < numSrc; j++) {
G4_Operand *src = i->getSrc(j);
if (src == NULL) {
continue;
}
// Local only
if (src->isSrcRegRegion()) {
G4_Declare *topdcl = GetTopDclFromRegRegion(src);
LocalLiveRange *topdclLR = nullptr;
if (topdcl && (topdclLR = gra.getLocalLR(topdcl)) &&
topdcl->getIsRefInSendDcl() && topdclLR->isLiveRangeLocal() &&
topdcl->getRegFile() == G4_GRF) {
G4_VarBase *base = topdcl->getRegVar();
INST_LIST_ITER iterToInsert = rit.base();
iterToInsert--;
varRangesIt = varRanges.find(base->asRegVar());
if (varRangesIt == varRanges.end()) {
VarRange *new_range = new VarRange;
new_range->leftBound = 0;
new_range->rightBound = topdcl->getByteSize() - 1;
toDelete.push(new_range);
varRanges[topdcl->getRegVar()].list.push_back(new_range);
}
rangeListSpliting(&(varRanges[topdcl->getRegVar()].list), src,
&toDelete);
localRanges[topdcl->getRegVar()].emplace_back(
src, iterToInsert); // Ordered from back to front.
}
}
}
}
// Remove the variables that have no partial usage, or whose partial live
// ranges are too short
std::map<G4_RegVar *, VarRangeListPackage>::iterator it = varRanges.begin();
while (it != varRanges.end()) {
std::map<G4_RegVar *, VarRangeListPackage>::iterator succ_it = it;
succ_it++;
// No partial
if (it->second.list.size() <= 1) {
varRanges.erase(it);
it = succ_it;
continue;
}
// If the total byte size divided by the number of partial ranges is less
// than half a GRF, remove it
if (((*it->second.list.rbegin())->rightBound -
(*it->second.list.begin())->leftBound) /
it->second.list.size() <
kernel.numEltPerGRF<Type_UW>() * 2 / 2) {
varRanges.erase(it);
it = succ_it;
continue;
}
G4_Declare *topDcl = it->first->getDeclare();
bool aligned = true;
for (const VarRange *vr : it->second.list) {
unsigned leftBound = vr->leftBound;
unsigned rightBound = vr->rightBound;
int elementSize =
topDcl->getElemSize() > G4_WSIZE ? topDcl->getElemSize() : G4_WSIZE;
unsigned short elemsNum = (rightBound - leftBound + 1) / elementSize;
if (!elemsNum) {
aligned = false;
break;
}
// TODO: we can merge several unaligned sub declares into one aligned one,
// such as [0-1], [2-63] --> [0-63]
if (leftBound % kernel.numEltPerGRF<Type_UW>() ||
(rightBound + 1) % kernel.numEltPerGRF<Type_UW>()) {
aligned = false;
break;
}
}
if (!aligned) {
varRanges.erase(it);
it = succ_it;
continue;
}
it = succ_it;
}
int splitid = 0;
for (std::map<G4_RegVar *, VarRangeListPackage>::iterator it =
varRanges.begin();
it != varRanges.end(); it++) {
G4_Declare *topDcl = it->first->getDeclare();
const char *dclName = topDcl->getName();
topDcl->setIsSplittedDcl(true);
// Vertical split: variable split
unsigned splitVarNum = 0;
unsigned pre_rightBound = 0;
for (VAR_RANGE_LIST_ITER vt = it->second.list.begin();
vt != it->second.list.end(); vt++) {
unsigned leftBound = (*vt)->leftBound;
unsigned rightBound = (*vt)->rightBound;
int elementSize =
topDcl->getElemSize() > G4_WSIZE ? topDcl->getElemSize() : G4_WSIZE;
unsigned short elemsNum = (rightBound - leftBound + 1) / elementSize;
if (!elemsNum) {
vASSERT(false);
pre_rightBound = rightBound;
continue;
}
if (leftBound && pre_rightBound + 1 != leftBound) {
vASSERT(false);
}
pre_rightBound = rightBound;
std::stringstream nameStrm;
nameStrm << dclName << "_" << splitid << "_" << leftBound << "_"
<< rightBound << std::ends;
int nameLen = unsigned(nameStrm.str().length()) + 1;
const char *name = builder.getNameString(nameLen, "%s_%d_%d_%d", dclName,
splitid, leftBound, rightBound);
unsigned short dclWidth = 0;
unsigned short dclHeight = 0;
int dclTotalSize = 0;
getHeightWidth(topDcl->getElemType(),
(rightBound - leftBound + 1) / topDcl->getElemSize(),
dclWidth, dclHeight, dclTotalSize);
G4_Declare *partialDcl = builder.createDeclare(
name, G4_GRF, dclWidth, dclHeight, topDcl->getElemType());
gra.setSubOffset(partialDcl, leftBound);
partialDcl->setIsPartialDcl(true);
gra.setSplittedDeclare(partialDcl, topDcl);
unsigned nElementSize =
(rightBound - leftBound + 1) / topDcl->getElemSize();
if ((rightBound - leftBound + 1) % topDcl->getElemSize()) {
nElementSize++;
}
partialDcl->setTotalElems(nElementSize);
gra.addSubDcl(topDcl, partialDcl);
splitVarNum++;
VISA_DEBUG_VERBOSE(std::cout << "==> Sub Declare: " << splitid
<< "::" << name << "\n");
splitid++;
}
if (splitVarNum) {
gra.setSplitVarNum(topDcl, splitVarNum);
}
}
while (toDelete.size() > 0) {
delete toDelete.top();
toDelete.pop();
}
return;
}
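// Address, flag and scalar RA below share the same structure: run liveness
// and graph coloring for the selected register file, insert spill code on
// failure, and repeat until no new spill temps are created or the iteration
// limit is reached.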
void GlobalRA::addrRegAlloc() {
uint32_t addrSpillId = 0;
unsigned maxRAIterations = builder.getuint32Option(vISA_MaxRAIterations);
unsigned iterationNo = 0;
while (iterationNo < maxRAIterations) {
RA_TRACE(std::cout << "--address RA iteration " << iterationNo << "\n");
//
// choose reg vars whose reg file kind is ARF
//
LivenessAnalysis liveAnalysis(*this, G4_ADDRESS);
liveAnalysis.computeLiveness();
//
// if no reg var needs to be allocated, then skip reg allocation
//
if (liveAnalysis.getNumSelectedVar() > 0) {
GraphColor coloring(liveAnalysis, false, false);
if (!coloring.regAlloc(false, false, nullptr)) {
SpillManager spillARF(*this, coloring.getSpilledLiveRanges(),
addrSpillId);
spillARF.insertSpillCode();
addrSpillId = spillARF.getNextTempDclId();
//
// if new addr temps are created, we need to do RA again so that newly
// created temps can get registers. If there are no more newly created
// temps, we then commit reg assignments
//
if (spillARF.isAnyNewTempCreated() == false) {
coloring.confirmRegisterAssignments();
coloring.cleanupRedundantARFFillCode();
if ((builder.kernel.fg.getHasStackCalls() ||
builder.kernel.fg.getIsStackCallFunc())) {
coloring.addA0SaveRestoreCode();
}
break; // no more new addr temps; done with ARF allocation
}
} else // successfully allocate register without spilling
{
coloring.confirmRegisterAssignments();
coloring.cleanupRedundantARFFillCode();
if ((builder.kernel.fg.getHasStackCalls() ||
builder.kernel.fg.getIsStackCallFunc())) {
coloring.addA0SaveRestoreCode();
}
VISA_DEBUG_VERBOSE(detectUndefinedUses(liveAnalysis, kernel));
break; // done with ARF allocation
}
} else {
break; // no ARF allocation needed
}
kernel.dumpToFile("after.Address_RA." + std::to_string(iterationNo));
iterationNo++;
}
// Addr spill/fill
addVarToRA(kernel.Declares.back());
vISA_ASSERT(iterationNo < maxRAIterations, "Address RA has failed.");
}
void GlobalRA::flagRegAlloc() {
uint32_t flagSpillId = 0;
unsigned maxRAIterations = builder.getuint32Option(vISA_MaxRAIterations);
uint32_t iterationNo = 0;
bool spillingFlag = false;
while (iterationNo < maxRAIterations) {
RA_TRACE(std::cout << "--flag RA iteration " << iterationNo << "\n");
//
// choose reg vars whose reg file kind is FLAG
//
LivenessAnalysis liveAnalysis(*this, G4_FLAG);
liveAnalysis.computeLiveness();
//
// if no reg var needs to be allocated, then skip reg allocation
//
if (liveAnalysis.getNumSelectedVar() > 0) {
GraphColor coloring(liveAnalysis, false, false);
if (!coloring.regAlloc(false, false, nullptr)) {
SpillManager spillFlag(*this, coloring.getSpilledLiveRanges(),
flagSpillId);
spillFlag.insertSpillCode();
VISA_DEBUG_VERBOSE({
printf("FLAG Spill inst count: %d\n",
spillFlag.getNumFlagSpillStore());
printf("FLAG Fill inst count: %d\n", spillFlag.getNumFlagSpillLoad());
printf("*************************\n");
});
flagSpillId = spillFlag.getNextTempDclId();
spillingFlag = true;
if (spillFlag.isAnyNewTempCreated() == false) {
coloring.confirmRegisterAssignments();
if ((builder.kernel.fg.getHasStackCalls() ||
builder.kernel.fg.getIsStackCallFunc())) {
coloring.addFlagSaveRestoreCode();
}
break;
}
builder.getJitInfo()->stats.numFlagSpillStore +=
spillFlag.getNumFlagSpillStore();
builder.getJitInfo()->stats.numFlagSpillLoad +=
spillFlag.getNumFlagSpillLoad();
} else // successfully allocate register without spilling
{
coloring.confirmRegisterAssignments();
if ((builder.kernel.fg.getHasStackCalls() ||
builder.kernel.fg.getIsStackCallFunc())) {
coloring.addFlagSaveRestoreCode();
}
if (spillingFlag && builder.getOption(vISA_FlagSpillCodeCleanup)) {
CLEAN_NUM_PROFILE clean_num_profile;
FlagSpillCleanup f(*this);
f.spillFillCodeCleanFlag(builder, kernel, &clean_num_profile);
#ifdef DEBUG_VERBOSE_ON1
for (int i = 0; i < 3; i++) {
printf("Profiler %d Spill clean: %d\n", i,
clean_num_profile.spill_clean_num[i]);
printf("Profiler %d Fill clean: %d\n", i,
clean_num_profile.fill_clean_num[i]);
clean_num += clean_num_profile.spill_clean_num[i];
clean_num += clean_num_profile.fill_clean_num[i];
}
printf("**Flag clean num: %d\n", clean_num);
#endif
}
VISA_DEBUG_VERBOSE(detectUndefinedUses(liveAnalysis, kernel));
break; // done with FLAG allocation
}
} else {
break; // no FLAG allocation needed
}
kernel.dumpToFile("after.Flag_RA." + std::to_string(iterationNo));
iterationNo++;
}
// Flag spill/fill
addVarToRA(kernel.Declares.back());
vISA_ASSERT(iterationNo < maxRAIterations, "Flag RA has failed.");
}
void GlobalRA::scalarRegAlloc() {
uint32_t scalarSpillId = 0;
unsigned maxRAIterations = builder.getuint32Option(vISA_MaxRAIterations);
unsigned iterationNo = 0;
std::set<G4_Declare *> PreAssigned;
for (auto dcl : kernel.Declares) {
if (dcl->getRegFile() == G4_SCALAR) {
auto regVar = dcl->getRegVar();
if (regVar->isS0())
PreAssigned.insert(dcl);
}
}
while (iterationNo < maxRAIterations) {
RA_TRACE(std::cout << "--scalar RA iteration " << iterationNo << "\n");
//
// choose reg vars whose reg file kind is SCALAR
//
LivenessAnalysis liveAnalysis(*this, G4_SCALAR);
liveAnalysis.computeLiveness();
//
// if no reg var needs to be allocated, then skip reg allocation
//
if (liveAnalysis.getNumSelectedVar() > 0) {
GraphColor coloring(liveAnalysis, false, false);
if (!coloring.regAlloc(false, false, nullptr)) {
SpillManager spillScalar(*this, coloring.getSpilledLiveRanges(),
scalarSpillId);
spillScalar.insertSpillCode();
scalarSpillId = spillScalar.getNextTempDclId();
//
// if new scalar temps are created, we need to do RA again so that newly
// created temps can get registers. If there are no more newly created
// temps, we then commit reg assignments
//
if (spillScalar.isAnyNewTempCreated() == false) {
coloring.confirmRegisterAssignments();
if ((builder.kernel.fg.getHasStackCalls() ||
builder.kernel.fg.getIsStackCallFunc())) {
vASSERT(false &&
"missing code"); // coloring.addA0SaveRestoreCode();
}
break; // no more new scalar temps; done with scalar allocation
}
} else // successfully allocate register without spilling
{
coloring.confirmRegisterAssignments();
if ((builder.kernel.fg.getHasStackCalls() ||
builder.kernel.fg.getIsStackCallFunc())) {
vASSERT(false && "missing code"); // coloring.addA0SaveRestoreCode();
}
VISA_DEBUG_VERBOSE(detectUndefinedUses(liveAnalysis, kernel));
break; // done with scalar allocation
}
} else {
break; // no scalar allocation needed
}
iterationNo++;
}
kernel.dumpToFile("after.Scalar_RA." + std::to_string(iterationNo));
constexpr unsigned ScalarRegisterGRFBase = 96;
// change scalar-register assignment back to fixed GRF location so that
// the code can be simulated on platforms without a scalar pipe
for (G4_Declare *dcl : kernel.Declares) {
if (dcl->getRegFile() == G4_SCALAR && !PreAssigned.count(dcl)) {
auto regVar = dcl->getRegVar();
if (regVar->isS0()) {
auto offset = regVar->getPhyRegOff() * dcl->getElemSize();
unsigned int regNum = offset / builder.getGRFSize();
unsigned int subRegNum =
(offset % builder.getGRFSize()) / dcl->getElemSize();
regVar->setPhyReg(regPool.getGreg(regNum + ScalarRegisterGRFBase),
subRegNum);
dcl->setRegFile(G4_GRF);
}
}
}
kernel.dumpToFile("after.Scalar_Rename." + std::to_string(iterationNo));
vISA_ASSERT(iterationNo < maxRAIterations, "Scalar RA has failed.");
}
void GlobalRA::assignRegForAliasDcl() {
//
// assign Reg for Alias DCL
//
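// Example for GRF aliases (assuming a 64-byte GRF): with a combined byte
// offset of 72 and the parent assigned to r10, the alias lands in r11 at a
// subregister offset of 8 / elemSize.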
for (G4_Declare *dcl : kernel.Declares) {
G4_RegVar *AliasRegVar;
G4_RegVar *CurrentRegVar;
unsigned tempoffset;
if (dcl->getAliasDeclare() != NULL) {
AliasRegVar = dcl->getAliasDeclare()->getRegVar();
CurrentRegVar = dcl->getRegVar();
tempoffset = AliasRegVar->getPhyRegOff() *
AliasRegVar->getDeclare()->getElemSize() +
dcl->getAliasOffset();
if (AliasRegVar->getPhyReg() != NULL) {
//
// alias register assignment
//
if (CurrentRegVar->getDeclare()->useGRF()) {
// if the tempoffset is within one GRF
if (tempoffset < kernel.numEltPerGRF<Type_UW>() * 2u) {
CurrentRegVar->setPhyReg(
AliasRegVar->getPhyReg(),
tempoffset / CurrentRegVar->getDeclare()->getElemSize());
}
// tempoffset covers several GRFs
else {
unsigned additionalrow =
tempoffset / (kernel.numEltPerGRF<Type_UW>() * 2);
unsigned actualoffset =
tempoffset % (kernel.numEltPerGRF<Type_UW>() * 2);
bool valid = false;
unsigned originalrow = AliasRegVar->ExRegNum(valid);
vISA_ASSERT(valid == true, ERROR_REGALLOC);
CurrentRegVar->setPhyReg(
regPool.getGreg(originalrow + additionalrow),
actualoffset / CurrentRegVar->getDeclare()->getElemSize());
}
} else if (CurrentRegVar->getDeclare()->getRegFile() == G4_ADDRESS) {
vISA_ASSERT(tempoffset < builder.getNumAddrRegisters() * 2,
ERROR_REGALLOC); // Must hold tempoffset in one A0 reg
CurrentRegVar->setPhyReg(
AliasRegVar->getPhyReg(),
tempoffset / CurrentRegVar->getDeclare()->getElemSize());
} else if (CurrentRegVar->getDeclare()->getRegFile() == G4_SCALAR) {
if (builder.getuint32Option(vISA_ScalarPipe))
vISA_ASSERT(tempoffset < kernel.getSRFInWords() *2,
ERROR_REGALLOC);
else
vISA_ASSERT(tempoffset < builder.getScalarRegisterSizeInBytes(),
ERROR_REGALLOC);
CurrentRegVar->setPhyReg(
AliasRegVar->getPhyReg(),
tempoffset / CurrentRegVar->getDeclare()->getElemSize());
} else {
vISA_ASSERT(false, ERROR_REGALLOC);
}
} else {
if (dcl->isSpilled() == false)
dcl->setSpillFlag();
}
}
}
return;
}
void GlobalRA::removeSplitDecl() {
for (auto dcl : kernel.Declares) {
if (!getSubDclList(dcl).empty()) {
clearSubDcl(dcl);
dcl->setIsSplittedDcl(false);
}
}
kernel.Declares.erase(
std::remove_if(kernel.Declares.begin(), kernel.Declares.end(),
[](G4_Declare *dcl) { return dcl->getIsPartialDcl(); }),
kernel.Declares.end());
}
void GlobalRA::fastRADecision()
{
if (builder.getOption(vISA_SelectiveFastRA)) {
unsigned instNum = 0;
for (auto bb : kernel.fg) {
instNum += (int)bb->size();
}
if (instNum > builder.getOptions()->getuInt32Option(vISA_SelectiveRAInstThreshold)) {
useFastRA = true;
useHybridRAwithSpill = true;
} else {
useFastRA = false;
useHybridRAwithSpill = false;
}
RA_TRACE(std::cout << "\t--SelectiveFastRA decision: " << useFastRA << "\n");
} else {
useFastRA = builder.getOption(vISA_FastCompileRA);
useHybridRAwithSpill = builder.getOption(vISA_HybridRAWithSpill);
}
}
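// Try local RA first; on failure, either go straight to the with-spill flow
// (useHybridRAwithSpill) or attempt hybrid RA before falling back to full
// graph coloring in coloringRegAlloc().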
bool GlobalRA::tryHybridRA() {
copyMissingAlignment();
BankConflictPass bc(*this, false);
LocalRA lra(bc, *this);
if (lra.localRA()) {
return true;
}
if (useHybridRAwithSpill) {
insertPhyRegDecls();
} else {
if (hybridRA(lra)) {
return true;
}
}
return false;
}
bool GlobalRA::hybridRA(LocalRA &lra) {
RA_TRACE(std::cout << "--hybrid RA--\n");
uint32_t numOrigDcl = (uint32_t)kernel.Declares.size();
insertPhyRegDecls();
LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
liveAnalysis.computeLiveness();
if (liveAnalysis.getNumSelectedVar() > 0) {
RPE rpe(*this, &liveAnalysis);
rpe.run();
bool spillLikely =
kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
rpe.getMaxRP() >= kernel.getNumRegTotal() - 16;
if (spillLikely) {
RA_TRACE(std::cout << "\t--skip hybrid RA due to high pressure: "
<< rpe.getMaxRP() << "\n");
kernel.Declares.resize(numOrigDcl);
lra.undoLocalRAAssignments(false);
// We check the src/dst overlap WA here to keep the interference graph simple.
// When LRA is run, it sets augmentation alignment conservatively so
// that LRA assignments can co-exist with HRA assignments after
// augmentation is run. If we reset alignment here, it means that
// augmentation buckets are reset and alignment is copied over
// from original G4_Declare. This is correct behavior. However, when
// avoidSrcDstOverlap WA sees that src/dst of an instruction have no
// alignment, it forces an interference edge between them. This causes
// extra interferences in graph compared to case when we use conservative
// alignment computed in LRA. So when the WA is enabled, we avoid
// resetting alignment as it may produce better code.
if (!builder.avoidDstSrcOverlap() || use4GRFAlign)
copyAlignment();
return false;
}
GraphColor coloring(liveAnalysis, /*isHybrid*/ true, /*forceSpill*/ false);
generateForbiddenTemplates(0);
// FIXME: doBankConflictReduction and highInternalConflict are computed by
// local RA, they should be moved to some common code.
bool isColoringGood =
coloring.regAlloc(lra.doHybridBCR(), lra.hasHighInternalBC(), &rpe);
if (!isColoringGood) {
if (!kernel.getOption(vISA_Debug)) {
// Why?? Keep LRA results when -debug is passed
kernel.Declares.resize(numOrigDcl);
lra.undoLocalRAAssignments(false);
}
// Restore alignment in case LRA modified it
copyAlignment();
return false;
}
coloring.confirmRegisterAssignments();
if (kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc()) {
coloring.getSaveRestoreRegister();
addSaveRestoreCode(0);
}
if (verifyAugmentation) {
assignRegForAliasDcl();
verifyAugmentation->verify();
}
}
kernel.setRAType(lra.doHybridBCR() ? RA_Type::HYBRID_BC_RA
: RA_Type::HYBRID_RA);
return true;
}
//
// change single-element dcl for G4_GRF to G4_SCALAR
//
void GlobalRA::selectScalarCandidates() {
// collect root-declares that may be changed to scalar
std::set<G4_Declare *> candidates;
for (auto bb : kernel.fg) {
for (auto inst : *bb) {
G4_DstRegRegion *dst = inst->getDst();
if (dst && dst->getTopDcl()) {
auto rootDcl = dst->getTopDcl();
if (rootDcl->getRegFile() == G4_GRF && !candidates.count(rootDcl) &&
rootDcl->getNumRows() <= 1) {
bool isNoMaskInst =
(inst->isWriteEnableInst() || bb->isAllLaneActive());
if (inst->getExecSize() == g4::SIMD1 && isNoMaskInst) {
candidates.insert(rootDcl);
}
}
}
}
}
// filter candidates that we cannot handle
std::set<G4_Declare *> multiUseInputs;
std::set<G4_Declare *> visitedInputs;
for (auto bb : kernel.fg) {
for (auto inst : *bb) {
G4_DstRegRegion *dst = inst->getDst();
if (dst && dst->getTopDcl()) {
auto rootDcl = dst->getTopDcl();
if (candidates.count(rootDcl)) {
// when there is only one SRF, it is unrealistic to allow
// send to write to SRF
if (inst->isSend() && kernel.getSRFInWords()*2
<= builder.getScalarRegisterSizeInBytes()) {
candidates.erase(rootDcl);
}
bool isNoMaskInst =
(inst->isWriteEnableInst() || bb->isAllLaneActive());
// all writes to that top-dcl have to be simd1-nomask
if (!(inst->getExecSize() == g4::SIMD1 && isNoMaskInst)) {
candidates.erase(rootDcl);
}
}
}
// when there is only one SRF, it is also unrealistic to allow
// SRF as regular send source because it has to be 64-byte aligned
if (inst->isSend() && kernel.getSRFInWords() * 2 <=
builder.getScalarRegisterSizeInBytes()) {
for (int i = 0; i < inst->getNumSrc(); i++) {
auto src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion())
continue;
auto srcDcl = src->getTopDcl();
if (srcDcl && candidates.count(srcDcl))
candidates.erase(srcDcl);
}
}
// Also find all the input dcls that are used inside a loop,
// or used more than once. Skip moves and sends
if (inst->isRawMov() || inst->isSend())
continue;
for (int i = 0; i < inst->getNumSrc(); i++) {
auto src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion())
continue;
G4_SrcRegRegion *origSrc = src->asSrcRegRegion();
auto srcDcl = src->getTopDcl();
if (srcDcl && srcDcl->getRegFile() == G4_INPUT &&
srcDcl->getTotalElems() == 1 && origSrc && origSrc->isScalar()) {
// mark multi-use
if (bb->getNestLevel() > 0)
multiUseInputs.insert(srcDcl);
else if (visitedInputs.find(srcDcl) != visitedInputs.end())
multiUseInputs.insert(srcDcl);
// mark any use
visitedInputs.insert(srcDcl);
}
}
}
}
// update declares
for (auto dcl : kernel.Declares) {
auto rootDcl = dcl->getRootDeclare();
if (candidates.count(rootDcl)) {
dcl->setRegFile(G4_SCALAR);
}
}
G4_BB *entryBB = kernel.fg.getEntryBB();
auto insertIt = entryBB->begin();
for (INST_LIST_ITER IB = entryBB->end(); insertIt != IB; ++insertIt) {
G4_INST *tI = (*insertIt);
if (tI->isFlowControl() || tI == entryBB->back())
break;
}
std::map<G4_Declare *, G4_Declare *> inputMap;
for (auto bb : kernel.fg) {
INST_LIST_ITER it = bb->begin(), iEnd = bb->end();
INST_LIST_ITER next_iter = it;
for (; it != iEnd; it = next_iter) {
++next_iter;
G4_INST *inst = *it;
// skip moves and sends
if (inst->isRawMov() || inst->isSend())
continue;
// Add a move for a scalar input with multiple uses to save power.
// The move should be inserted into the entry-block before the first use.
for (int i = 0; i < inst->getNumSrc(); i++) {
auto src = inst->getSrc(i);
if (!src || !src->isSrcRegRegion())
continue;
G4_SrcRegRegion *origSrc = src->asSrcRegRegion();
auto srcDcl = src->getTopDcl();
if (srcDcl && origSrc && origSrc->isScalar() &&
multiUseInputs.find(srcDcl) != multiUseInputs.end()) {
vISA_ASSERT(!candidates.count(srcDcl),
"input-variable cannot be a scalar candidate");
// insert a move to a scalar-register candidate
auto subAlign = Get_G4_SubRegAlign_From_Size(
(uint16_t)origSrc->getElemSize(), builder.getPlatform(),
builder.getGRFAlign());
G4_Declare *tmpDcl = nullptr;
G4_SrcModifier modifier = origSrc->getModifier();
if (inputMap.find(srcDcl) != inputMap.end())
tmpDcl = inputMap[srcDcl];
else {
// create dcl for scalar
tmpDcl =
builder.createTempVar(g4::SIMD1, origSrc->getType(), subAlign);
tmpDcl->setRegFile(G4_SCALAR);
candidates.insert(tmpDcl);
inputMap[srcDcl] = tmpDcl;
addVarToRA(tmpDcl);
// create mov
origSrc->setModifier(Mod_src_undef);
G4_DstRegRegion *dst = builder.createDstRegRegion(tmpDcl, 1);
G4_INST *movInst = builder.createMov(g4::SIMD1, dst, origSrc,
InstOpt_WriteEnable, false);
// insert mov
if (bb == entryBB)
bb->insertBefore(it, movInst);
else
entryBB->insertBefore(insertIt, movInst);
}
G4_SrcRegRegion *newSrc = builder.createSrcRegRegion(
modifier, Direct, tmpDcl->getRegVar(), 0, 0,
builder.getRegionScalar(), tmpDcl->getElemType());
inst->setSrc(newSrc, i);
}
}
}
}
// Flag and address spill locations should now be scalar registers.
// Scalar registers can be spilled into GRF.
// need this? addrFlagSpillDcls.clear();
kernel.dumpToFile("after.select_scalar.");
}
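// Fail-safe RA reserves GRFs up front so spill/fill code can always be
// generated: new fail-safe RA uses getNumReservedGRFs(), the legacy path
// computes the size via determineSpillRegSize(), and stack-call kernels must
// keep the reservation below the callee-save register count.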
std::pair<unsigned, unsigned> GlobalRA::reserveGRFSpillReg(GraphColor &coloring) {
coloring.markFailSafeIter(true);
unsigned spillRegSize = 0;
unsigned indrSpillRegSize = 0;
if (kernel.getOption(vISA_NewFailSafeRA)) {
spillRegSize = getNumReservedGRFs();
} else {
determineSpillRegSize(spillRegSize, indrSpillRegSize);
}
if (builder.usesStack())
vISA_ASSERT(spillRegSize + indrSpillRegSize <
kernel.stackCall.getNumCalleeSaveRegs(),
"Invalid reserveSpillSize in fail-safe RA!");
coloring.setReserveSpillGRFCount(spillRegSize + indrSpillRegSize);
return std::make_pair(spillRegSize, indrSpillRegSize);
}
// pre-allocate the bits for forbidden registers which will not be used in
// register assignment.
// Note that the order of the calls matters; for example, all RCs inherit
// from RESERVEDGRF.
void GlobalRA::generateForbiddenTemplates(unsigned reserveSpillSize) {
fbdRegs.generateReservedGRFForbidden(reserveSpillSize);
fbdRegs.generateCallerSaveGRFForbidden();
fbdRegs.generateCalleeSaveGRFForbidden();
fbdRegs.generateEOTGRFForbidden();
fbdRegs.generateLastGRFForbidden();
fbdRegs.generateEOTLastGRFForbidden();
}
//
// Create variables that will be used in fail safe RA
//
void GlobalRA::createVariablesForHybridRAWithSpill() {
// To conduct fail safe RA in iteration 0, some variables need to be allocated
// first so that they can join RA and be used in the spill/fill directly.
addVarToRA(builder.getSpillFillHeader());
addVarToRA(builder.getOldA0Dot2Temp());
if (builder.hasScratchSurface() && !builder.getSpillSurfaceOffset()) {
vISA_ASSERT(builder.instList.empty(),
"Inst list should be empty at this point before creating "
"instruction that initializes SSO");
builder.initScratchSurfaceOffset();
addVarToRA(builder.getSpillSurfaceOffset());
if (!builder.instList.empty()) {
// If SSO is not yet initialized, insert the created
// instruction into the entry BB.
auto entryBB = builder.kernel.fg.getEntryBB();
auto iter = std::find_if(entryBB->begin(), entryBB->end(),
[](G4_INST *inst) { return !inst->isLabel(); });
entryBB->splice(iter, builder.instList);
}
}
// BuiltinR0 may be spilled which is not allowed.
// FIXME: BuiltinR0 spill cost has been set to MAX already,
// if it still spills, there is some issue in the cost model
builder.getBuiltinR0()->setLiveOut();
builder.getBuiltinR0()->getRegVar()->setPhyReg(builder.phyregpool.getGreg(0),
0);
}
void GlobalRA::initSRAsScratch() const {
// Verify old scratch dcl assignment before changing it
vISA_ASSERT(kernel.fg.scratchRegDcl->getRegVar()
->getPhyReg()
->asGreg()
->getRegNum() == kernel.stackCall.getSpillHeaderGRF(),
"unexpected assignment");
vISA_ASSERT(kernel.stackCall.getSpillHeaderGRF() ==
kernel.stackCall.getFPSPGRF(),
"expecting same GRF");
// Use last caller save GRF for spill/fill addr computation. Since this
// address is used as LSC header, we must use 0th sub-reg of reserved
// GRF.
kernel.fg.scratchRegDcl->getRegVar()->setPhyReg(
regPool.getGreg(kernel.stackCall.getCallerSaveLastGRF()), 0);
// Mark SR assignment as reserved so other variables don't try to
// use it.
kernel.fg.reserveSR = true;
}
void GlobalRA::stackCallSaveRestore(bool hasStackCall) {
//
// If the graph has stack calls, then add the caller-save/callee-save pseudo
// declares and code. This currently must be done after flag/addr RA due to
// the assumption about the location of the pseudo save/restore instructions
//
if (hasStackCall) {
addCallerSavePseudoCode();
// Only GENX sub-graphs require callee-save code.
if (builder.getIsKernel() == false) {
storeCEInProlog();
addCalleeSavePseudoCode();
addStoreRestoreToReturn();
}
if (!kernel.getOption(vISA_PreserveR0InR0)) {
// bind builtinR0 to the reserved stack call ABI GRF so that caller and
// callee can agree on which GRF to use for r0
builder.getBuiltinR0()->getRegVar()->setPhyReg(
builder.phyregpool.getGreg(kernel.stackCall.getThreadHeaderGRF()), 0);
}
}
}
int GlobalRA::doGlobalLinearScanRA() {
copyMissingAlignment();
BankConflictPass bc(*this, false);
LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
liveAnalysis.computeLiveness();
TIME_SCOPE(LINEARSCAN_RA);
LinearScanRA lra(bc, *this, liveAnalysis);
int ret = lra.doLinearScanRA();
if (ret == VISA_SUCCESS) {
expandSpillFillIntrinsics(nextSpillOffset);
assignRegForAliasDcl();
if (builder.getOption(vISA_verifyLinearScan)) {
resetGlobalRAStates();
markGraphBlockLocalVars();
LivenessAnalysis live(*this, G4_GRF | G4_INPUT, false, true);
live.computeLiveness();
GraphColor coloring(live, false, false);
coloring.createLiveRanges();
Interference intf(&live, *this);
intf.init();
intf.computeInterference();
if (kernel.getOption(vISA_DumpRAIntfGraph))
intf.dumpInterference();
intf.linearScanVerify();
}
return VISA_SUCCESS;
}
return ret;
}
void GlobalRA::incRABookKeeping() {
// Reset state of incremental RA here as we move from hybrid RA
// to global RA. Note that when moving from flag->address or from
// address->GRF RA, we don't need to explicitly reset state because
// incremental RA can deduce we're moving to RA for different
// variable class. But it cannot deduce so when moving from hybrid
// to global RA.
incRA.moveFromHybridToGlobalGRF();
// This part makes incremental RA a non-NFC change. The reason we need
// to do this is because variables that spill intrinsics use may end up
// getting extended in each RA iteration. Given that those variables
// are either r0 or scalars, we mark them as Output here so they're
// live-out throughout. To make this an NFC change, we can enable this
// block even when incremental RA is not enabled.
if (incRA.isEnabled()) {
builder.getBuiltinR0()->getRootDeclare()->setLiveOut();
builder.getSpillFillHeader();
bool initSS = builder.hasScratchSurface();
if (initSS) {
builder.initScratchSurfaceOffset();
builder.getOldA0Dot2Temp();
}
}
}
std::pair<bool, bool> GlobalRA::remat(bool fastCompile, bool rematDone,
LivenessAnalysis &liveAnalysis,
GraphColor &coloring, RPE &rpe) {
bool runRemat = kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM
? true
: kernel.getSimdSize() < kernel.numEltPerGRF<Type_UB>();
// -noremat takes precedence over -forceremat
bool rematOn = !kernel.getOption(vISA_Debug) &&
!kernel.getOption(vISA_NoRemat) &&
!kernel.getOption(vISA_FastSpill) && !fastCompile &&
(kernel.getOption(vISA_ForceRemat) || runRemat);
if (!rematDone && rematOn) {
RA_TRACE(std::cout << "\t--rematerialize\n");
Rematerialization remat(kernel, liveAnalysis, coloring, rpe, *this);
remat.run();
// Re-run GRA loop only if remat caused changes to IR
return std::make_pair(remat.getChangesMade(), true);
}
return std::make_pair(false, rematDone);
}
std::tuple<bool, bool, bool>
GlobalRA::alignedScalarSplit(bool fastCompile, bool alignedScalarSplitDone,
GraphColor &coloring) {
bool isEarlyExit = false;
if (kernel.getOption(vISA_SplitGRFAlignedScalar) && !fastCompile &&
!kernel.getOption(vISA_FastSpill) && !alignedScalarSplitDone) {
SplitAlignedScalars split(*this, coloring);
split.run();
// Re-run GRA loop if changes were made to IR
bool rerunGRA = split.getChangesMade();
kernel.dumpToFile("after.Split_Aligned_Scalar." +
std::to_string(getIterNo()));
#ifndef DLL_MODE
if (stopAfter("Split_Aligned_Scalar")) {
isEarlyExit = true;
}
#endif // DLL_MODE
return std::make_tuple(rerunGRA, true, isEarlyExit);
}
return std::make_tuple(false, alignedScalarSplitDone, false);
}
bool GlobalRA::globalSplit(VarSplit& splitPass, GraphColor& coloring) {
unsigned int sendAssociatedGRFSpillFillCount = 0;
// Calculate the spill caused by send to decide if global splitting is
// required or not
for (auto spilled : coloring.getSpilledLiveRanges()) {
auto spillDcl = spilled->getDcl();
if (spillDcl->getIsRefInSendDcl() && spillDcl->getNumRows() > 1) {
sendAssociatedGRFSpillFillCount += spilled->getRefCount();
}
}
if (getIterNo() ==
0 && // Only works when first iteration of Global RA failed.
!splitPass.didGlobalSplit && // Do only one time.
splitPass.canDoGlobalSplit(builder, kernel,
sendAssociatedGRFSpillFillCount)) {
RA_TRACE(std::cout << "\t--global send split\n");
splitPass.globalSplit(builder, kernel);
splitPass.didGlobalSplit = true;
// TODO: Since global split is rarely enabled, for now we skip
// incremental RA whenever it is enabled.
incRA.skipIncrementalRANextIter();
return true;
}
return false;
}
void GlobalRA::localSplit(bool fastCompile, VarSplit& splitPass) {
// Do variable splitting in each iteration
// Don't do when fast compile is required
if (builder.getOption(vISA_LocalDeclareSplitInGlobalRA) && !fastCompile) {
RA_TRACE(std::cout << "\t--split local send--\n");
for (auto bb : kernel.fg) {
splitPass.localSplit(builder, bb);
}
}
}
std::pair<bool, bool> GlobalRA::bankConflict() {
bool doBankConflictReduction = false, highInternalConflict = false;
if (builder.getOption(vISA_LocalBankConflictReduction) &&
builder.hasBankCollision()) {
bool reduceBCInRR = false;
bool reduceBCInTAandFF = false;
BankConflictPass bc(*this, true);
reduceBCInRR = bc.setupBankConflictsForKernel(
true, reduceBCInTAandFF, SECOND_HALF_BANK_START_GRF * 2,
highInternalConflict);
doBankConflictReduction = reduceBCInRR && reduceBCInTAandFF;
}
return std::make_pair(doBankConflictReduction, highInternalConflict);
}
bool GlobalRA::setupFailSafeIfNeeded(bool fastCompile, bool hasStackCall,
unsigned int maxRAIterations,
unsigned int failSafeRAIteration) {
bool reserveSpillReg = false;
bool allowAddrTaken = builder.getOption(vISA_FastSpill) || fastCompile ||
!kernel.getHasAddrTaken();
if (builder.getOption(vISA_FailSafeRA) &&
kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
kernel.getNumRegTotal() > 32 &&
!hasStackCall &&
((getIterNo() == maxRAIterations - 1) ||
(allowAddrTaken && getIterNo() == failSafeRAIteration))) {
RA_TRACE(std::cout << "\t--enable failSafe RA\n");
reserveSpillReg = true;
if (incRA.isEnabled()) {
incRA.skipIncrementalRANextIter();
}
if (builder.hasScratchSurface() && !hasStackCall) {
// Since this is fail safe RA iteration, we ensure the 2 special
// variables are created before coloring so spill code can use
// them, if needed.
auto a0Dot2Temp = kernel.fg.builder->getOldA0Dot2Temp();
addVarToRA(a0Dot2Temp);
if (builder.supportsLSC()) {
auto spillFillHdr = kernel.fg.builder->getSpillFillHeader();
addVarToRA(spillFillHdr);
}
}
}
return reserveSpillReg;
}
void GlobalRA::undefinedUses(bool rematDone, LivenessAnalysis& liveAnalysis) {
if (builder.getOption(vISA_DumpUndefUsesFromLiveness) && getIterNo() == 0 &&
!rematDone) {
liveAnalysis.reportUndefinedUses();
}
}
void GlobalRA::writeVerboseStatsNumVars(LivenessAnalysis &liveAnalysis,
FINALIZER_INFO *jitInfo) {
if (builder.getOption(vISA_DumpPerfStatsVerbose)) {
jitInfo->statsVerbose.varNum = liveAnalysis.getNumSelectedVar();
jitInfo->statsVerbose.globalVarNum = liveAnalysis.getNumSelectedGlobalVar();
}
}
void GlobalRA::writeVerboseRPEStats(RPE &rpe) {
if (builder.getOption(vISA_DumpPerfStatsVerbose) &&
builder.getJitInfo()->statsVerbose.RAIterNum == 1) {
builder.getJitInfo()->statsVerbose.maxRP = rpe.getMaxRP();
}
if (builder.getOption(vISA_DumpPerfStats)) {
builder.getJitInfo()->stats.maxGRFPressure = rpe.getMaxRP();
}
}
bool GlobalRA::VRTIncreasedGRF(GraphColor &coloring) {
if (kernel.useAutoGRFSelection()) {
bool infCostSpilled =
coloring.getSpilledLiveRanges().end() !=
std::find_if(coloring.getSpilledLiveRanges().begin(),
coloring.getSpilledLiveRanges().end(),
[](const LiveRange *spilledLR) {
return spilledLR->getSpillCost() == MAXSPILLCOST;
});
// Check if GRF can be increased to avoid large spills
if (canIncreaseGRF(computeSpillSize(coloring.getSpilledLiveRanges()),
infCostSpilled))
return true;
}
return false;
}
void GlobalRA::splitOnSpill(bool fastCompile, GraphColor &coloring,
LivenessAnalysis &liveAnalysis) {
if (!kernel.getOption(vISA_Debug) && getIterNo() == 0 && !fastCompile &&
kernel.getOption(vISA_DoSplitOnSpill)) {
RA_TRACE(std::cout << "\t--var split around loop\n");
LoopVarSplit loopSplit(kernel, &coloring, &liveAnalysis);
kernel.fg.getLoops().computePreheaders();
loopSplit.run();
}
}
bool GlobalRA::convertToFailSafe(bool reserveSpillReg, GraphColor &coloring,
LivenessAnalysis &liveAnalysis,
unsigned int nextSpillOffset) {
// Very few spills in this iter. Check if we can convert this to fail
// safe iter. By converting this iter to fail safe we can save (at
// least) 1 additional iter to allocate spilled temps. But converting to
// fail safe needs extra checks because no reserved GRF may exist at
// this point. So push/pop needs to succeed without additional GRF
// potentially.
if (!kernel.getOption(vISA_Debug) && getIterNo() >= 1 &&
kernel.getOption(vISA_NewFailSafeRA) && !reserveSpillReg &&
coloring.getSpilledLiveRanges().size() <= BoundedRA::MaxSpillNumVars &&
liveAnalysis.getNumSelectedVar() > BoundedRA::LargeProgramSize) {
// Stack call always has free GRF so it is safe to convert this iter
// to fail safe
if (builder.usesStack() ||
// If LSC has to be used for spill/fill then we need to ensure
// spillHeader is created
(useLscForNonStackCallSpillFill && builder.hasValidSpillFillHeader()) ||
// or if immediate can be folded in to LSC
canUseLscImmediateOffsetSpillFill ||
// If scratch is to be used then max spill offset must be within
// addressable range and r0 must be available as reserved. If r0
// is not reserved, we cannot convert the current iteration to fail
// safe because r0 may get assigned to other virtual variables.
((kernel.getOption(vISA_PreserveR0InR0) ||
builder.getBuiltinR0()->isOutput()) &&
(nextSpillOffset + BoundedRA::getNumPhyVarSlots(kernel)) <
SCRATCH_MSG_LIMIT)) {
// Few ranges are spilled but this was not executed as fail
// safe iteration. However, we have the capability of doing
// push/pop with new fail safe RA implementation. So for very
// few spills, we insert push/pop to free up some GRFs rather
// than executing a new RA iteration. When doing so, we mark
// this RA iteration as fail safe.
coloring.markFailSafeIter(true);
// No reserved GRFs
setNumReservedGRFsFailSafe(0);
RA_TRACE(std::cout << "\t--enabling new fail safe RA\n");
return true;
}
}
return reserveSpillReg;
}
std::pair<bool, unsigned int>
GlobalRA::abortOnSpill(unsigned int GRFSpillFillCount,
GraphColor &coloring) {
// Calculate the spill caused by send to decide if global splitting is
// required or not
for (auto spilled : coloring.getSpilledLiveRanges()) {
GRFSpillFillCount += spilled->getRefCount();
}
// vISA_AbortOnSpillThreshold is defined as [0..200]
// where 0 means abort on any spill and 200 means never abort
auto underSpillThreshold = [this](int numSpill, int asmCount,
GraphColor &coloring) {
int threshold = std::min(
builder.getOptions()->getuInt32Option(vISA_AbortOnSpillThreshold),
200u);
unsigned spillSize = computeSpillSize(coloring.getSpilledLiveRanges());
return (numSpill * 200) < (threshold * asmCount) ||
spillSize < kernel.grfMode.getSpillThreshold();
};
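// For example, with a threshold of 20 and 1000 unweighted instructions, up
// to 99 weighted spill/fill references still count as under the threshold
// (since 100 * 200 == 20 * 1000), ignoring the spill-size check.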
unsigned int instNum = instCount();
bool isUnderThreshold =
underSpillThreshold(GRFSpillFillCount, instNum, coloring);
isUnderThreshold = builder.getFreqInfoManager().underFreqSpillThreshold(
coloring.getSpilledLiveRanges(), instNum, GRFSpillFillCount,
isUnderThreshold);
if (isUnderThreshold) {
if (auto jitInfo = builder.getJitInfo()) {
jitInfo->avoidRetry = true;
}
}
if (builder.getOption(vISA_AbortOnSpill) && !isUnderThreshold) {
// update jit metadata information
if (auto jitInfo = builder.getJitInfo()) {
jitInfo->stats.spillMemUsed = 0;
jitInfo->stats.numAsmCountUnweighted = instNum;
jitInfo->stats.numGRFSpillFillWeighted = GRFSpillFillCount;
}
return std::make_pair(true, GRFSpillFillCount);
}
return std::make_pair(false, GRFSpillFillCount);
}
unsigned GlobalRA::computeSpillSize(std::list<LSLiveRange *> &spilledLRs) {
unsigned spillSize = 0;
for (auto lr : spilledLRs) {
spillSize += lr->getTopDcl()->getByteSize();
}
return spillSize;
}
unsigned GlobalRA::computeSpillSize(const LIVERANGE_LIST &spilledLRs) {
unsigned spillSize = 0;
for (auto lr : spilledLRs) {
spillSize += lr->getDcl()->getByteSize();
}
return spillSize;
}
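// Example with the default 12KB threshold and a zero global scratch offset:
// a first-iteration spill size of 8KB gives 8KB * 1.2 < 12KB, so compression
// is skipped, while an 11KB spill size exceeds the threshold and enables it.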
bool GlobalRA::spillSpaceCompression(int spillSize,
const int globalScratchOffset) {
if (builder.getOption(vISA_ForceSpillSpaceCompression) &&
(builder.getuint32Option(vISA_SpillSpaceCompressionThreshold) == 0))
return true;
int spillcompressionThreshold =
(int)builder.getuint32Option(vISA_SpillSpaceCompressionThreshold) * 1024;
// user disabled vISA_ForceSpillSpaceCompression and no threshold override.
if (spillcompressionThreshold == 0) {
spillcompressionThreshold = SCRATCH_COMPRESS_THRESHOLD;
}
// A factor of 1.2 is used to account for the space needed by the following
// iterations. Generally, most spills happen in the first iteration.
if ((spillSize * 1.2) <
(spillcompressionThreshold - globalScratchOffset)) {
return false;
}
return true;
}
void GlobalRA::verifyNoInfCostSpill(GraphColor& coloring, bool reserveSpillReg)
{
vISA_ASSERT(std::all_of(coloring.getSpilledLiveRanges().begin(),
coloring.getSpilledLiveRanges().end(),
[&](const LiveRange *spilledLR) {
// EOT spills even of infinite cost are
// specially handled in spill insertion when
// using old fail safe RA. So don't assert for
// such spills.
if (isEOTSpillWithFailSafeRA(builder, spilledLR,
reserveSpillReg) &&
!builder.getOption(vISA_NewFailSafeRA))
return true;
return spilledLR->getSpillCost() != MAXSPILLCOST;
}),
"Spilled inf spill cost range");
}
void GlobalRA::setupA0Dot2OnSpill(bool hasStackCall,
unsigned int nextSpillOffset,
int globalScratchOffset) {
if (builder.hasScratchSurface() && !hasStackCall &&
(nextSpillOffset + globalScratchOffset) >= SCRATCH_MSG_LIMIT) {
// Create a temp variable to store the old a0.2 - it is marked as live-in
// and live-out because the variable is emitted only post-RA to
// preserve the old value of a0.2.
kernel.fg.builder->getOldA0Dot2Temp();
} else if (useLscForNonStackCallSpillFill || useLscForScatterSpill) {
// Xe2+ LSC-based spill/fill needs the same as above
{
kernel.fg.builder->getOldA0Dot2Temp();
}
}
}
bool GlobalRA::spillCleanup(bool fastCompile, bool useScratchMsgForSpill,
bool hasStackCall, bool reserveSpillReg, RPE &rpe,
GraphColor &coloring,
LivenessAnalysis &liveAnalysis,
SpillManagerGRF &spillGRF) {
bool disableSpillCoalesce = builder.getOption(vISA_DisableSpillCoalescing) ||
builder.getOption(vISA_FastSpill) ||
fastCompile || builder.getOption(vISA_Debug) ||
// spill cleanup is not supported when we use oword
// msg for spill/fill for non-stack calls.
(!useScratchMsgForSpill && !hasStackCall);
if (!reserveSpillReg && !disableSpillCoalesce && builder.useSends()) {
RA_TRACE(std::cout << "\t--spill/fill cleanup\n");
CoalesceSpillFills c(kernel, liveAnalysis, coloring, spillGRF, getIterNo(),
rpe, *this);
c.run();
#ifndef DLL_MODE
if (stopAfter("spillCleanup")) {
return true;
}
#endif // DLL_MODE
}
return false;
}
std::tuple<bool, bool, bool, unsigned int, unsigned int>
GlobalRA::insertSpillCode(bool enableSpillSpaceCompression,
GraphColor &coloring, LivenessAnalysis &liveAnalysis,
RPE &rpe, unsigned int scratchOffset,
bool fastCompile, bool hasStackCall,
int globalScratchOffset, unsigned int nextSpillOffset,
bool reserveSpillReg, unsigned int spillRegSize,
unsigned int indrSpillRegSize,
bool useScratchMsgForSpill) {
if (getIterNo() == 0 && enableSpillSpaceCompression &&
kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D &&
!hasStackCall) {
enableSpillSpaceCompression = spillSpaceCompression(
computeSpillSize(coloring.getSpilledLiveRanges()), globalScratchOffset);
}
startTimer(TimerID::SPILL);
SpillManagerGRF spillGRF(*this, nextSpillOffset, &liveAnalysis,
coloring.getIntf(), &coloring.getSpilledLiveRanges(),
reserveSpillReg, spillRegSize, indrSpillRegSize,
enableSpillSpaceCompression, useScratchMsgForSpill);
if (kernel.getOption(vISA_SpillAnalysis)) {
spillAnalysis->Do(&liveAnalysis, &coloring, &spillGRF);
}
verifyNoInfCostSpill(coloring, reserveSpillReg);
bool success = spillGRF.insertSpillFillCode(&kernel, pointsToAnalysis);
nextSpillOffset = spillGRF.getNextOffset();
if (kernel.getOption(vISA_VerifyRA)) {
// For least false positives, turn off RMW opt and spill cleanup
verifySpillFill();
}
setupA0Dot2OnSpill(hasStackCall, nextSpillOffset, globalScratchOffset);
RA_TRACE({
auto &&spills = coloring.getSpilledLiveRanges();
std::cout << "\t--# variables spilled: " << spills.size() << "\n";
if (spills.size() < 100) {
std::cout << "\t--spilled variables: ";
for (auto &&lr : spills) {
std::cout << lr->getDcl()->getName() << " ";
}
std::cout << "\n";
}
std::cout << "\t--current spill size: " << nextSpillOffset << "\n";
});
if (!success) {
return std::make_tuple(false, enableSpillSpaceCompression, false,
scratchOffset, nextSpillOffset);
}
kernel.dumpToFile("after.Spill_GRF." + std::to_string(getIterNo() + 1));
#ifndef DLL_MODE
if (stopAfter("Spill_GRF")) {
return std::make_tuple(true, enableSpillSpaceCompression, true,
scratchOffset, nextSpillOffset);
}
#endif // DLL_MODE
scratchOffset = std::max(scratchOffset, spillGRF.getNextScratchOffset());
bool isEarlyExit =
spillCleanup(fastCompile, useScratchMsgForSpill, hasStackCall,
reserveSpillReg, rpe, coloring, liveAnalysis, spillGRF);
return std::make_tuple(true, enableSpillSpaceCompression, isEarlyExit,
scratchOffset, nextSpillOffset);
}
bool GlobalRA::rerunGRAIter(bool rerunGRA)
{
if (getIterNo() == 0 && (rerunGRA || kernel.getOption(vISA_forceBCR))) {
if (kernel.getOption(vISA_forceBCR)) {
// FIXME: We shouldn't modify options. Use local bool flag instead.
kernel.getOptions()->setOption(vISA_forceBCR, false);
}
return true;
}
return false;
}
//
// graph coloring entry point. returns nonzero if RA fails
//
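// Each GRF iteration below runs liveness, graph coloring and, on failure,
// spill code insertion; optional passes (remat, loop/global/local splits,
// aligned-scalar split) may force a re-run of the whole loop.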
int GlobalRA::coloringRegAlloc() {
VISA_DEBUG_VERBOSE({
std::cout << "\n=== Register Allocation ===\n";
if (builder.getIsKernel() == false) {
std::cout << "Function: " << kernel.getName() << "\n";
} else {
std::cout << "Kernel: " << kernel.getName() << "\n";
}
detectNeverDefinedUses();
});
#ifndef DLL_MODE
// Points-to analysis is done in RegAlloc.cpp just before constructing
// GlobalRA instance.
if (stopAfter("p2a")) {
pointsToAnalysis.dump(std::cout);
return VISA_EARLY_EXIT;
}
#endif // DLL_MODE
bool hasStackCall =
kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc();
fastRADecision();
bool hybridWithSpill = useHybridRAwithSpill &&
(!hasStackCall || builder.getOption(vISA_PartitionWithFastHybridRA));
useLocalRA = builder.getOption(vISA_LocalRA)
&& (kernel.fg.funcInfoTable.size() == 0
|| kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_3D
|| hybridWithSpill);
// this needs to be called before addr/flag RA since it changes their
// alignment as well
fixAlignment();
{
TIME_SCOPE(ADDR_FLAG_RA);
addVarToRA(kernel.Declares.back());
addrRegAlloc();
flagRegAlloc();
}
if (builder.getuint32Option(vISA_ScalarPipe)) {
selectScalarCandidates();
scalarRegAlloc();
}
// LSC messages are used when:
// a. Stack call is used on PVC+,
// b. Spill size exceeds what can be represented using hword msg on PVC+
// c. Xe2+ requires LSC stack (can force on DG2+ via -lscNonStackSpill)
if (builder.supportsLSC()) {
canUseLscImmediateOffsetSpillFill = LSCUsesImmOff(builder);
}
stackCallSaveRestore(hasStackCall);
if (kernel.getOption(vISA_SpillAnalysis)) {
spillAnalysis = std::make_unique<SpillAnalysis>();
}
if (kernel.fg.getIsStackCallFunc()) {
// Allocate space to store Frame Descriptor
nextSpillOffset += builder.numEltPerGRF<Type_UB>();
scratchOffset += builder.numEltPerGRF<Type_UB>();
if (kernel.getOption(vISA_storeCE)) {
nextSpillOffset += builder.numEltPerGRF<Type_UB>();
scratchOffset += builder.numEltPerGRF<Type_UB>();
}
}
// Global linear scan RA
if (builder.getOption(vISA_LinearScan) &&
builder.kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_3D) {
int success = doGlobalLinearScanRA();
if (success == VISA_SUCCESS)
return success;
else if (success == VISA_SPILL) {
return VISA_SPILL;
}
} else if (useLocalRA && !hasStackCall) {
if (tryHybridRA()) {
assignRegForAliasDcl();
return VISA_SUCCESS;
}
}
startTimer(TimerID::GRF_GLOBAL_RA);
unsigned maxRAIterations = builder.getuint32Option(vISA_MaxRAIterations);
unsigned iterationNo = 0;
int globalScratchOffset =
kernel.getInt32KernelAttr(Attributes::ATTR_SpillMemOffset);
bool useScratchMsgForSpill =
!hasStackCall &&
(globalScratchOffset < (int)(SCRATCH_MSG_LIMIT * 0.6)
// useScratchMsgForSpill is true for
// * scratch msg
// * LSC msg
// Spill insertion module decides whether to expand a fill/spill to
// scratch or LSC depending on spill offset. oword is supported for PVC
// but it is not emitted in favor of LSC.
|| builder.supportsLSC());
bool enableSpillSpaceCompression =
builder.getOption(vISA_SpillSpaceCompression);
uint32_t GRFSpillFillCount = 0;
if (builder.getFreqInfoManager().isFreqBasedSpillSelectionEnabled())
builder.getFreqInfoManager().initGRFSpillFillFreq();
unsigned fastCompileIter = 1;
bool fastCompile =
(useFastRA || useHybridRAwithSpill) &&
(!hasStackCall || builder.getOption(vISA_PartitionWithFastHybridRA));
if (fastCompile) {
fastCompileIter = 0;
}
if (kernel.getInt32KernelAttr(Attributes::ATTR_Target) == VISA_CM) {
maxRAIterations = 12;
}
unsigned failSafeRAIteration =
(builder.getOption(vISA_FastSpill) || fastCompile)
? fastCompileIter
: builder.getuint32Option(vISA_FailSafeRALimit);
if (failSafeRAIteration == 0) { // Fail safe RA directly in iteration 0, used
// for hybrid RA with spill
createVariablesForHybridRAWithSpill();
}
bool rematDone = false, alignedScalarSplitDone = false;
bool reserveSpillReg = false;
VarSplit splitPass(*this);
DynPerfModel perfModel(kernel);
FINALIZER_INFO *jitInfo = builder.getJitInfo();
incRABookKeeping();
while (iterationNo < maxRAIterations) {
jitInfo->statsVerbose.RAIterNum++;
if (builder.getOption(vISA_DynPerfModel)) {
perfModel.NumRAIters++;
}
RA_TRACE(std::cout << "--GRF RA iteration " << iterationNo << "--"
<< kernel.getName() << "\n");
setIterNo(iterationNo);
if (builder.getOption(vISA_clearScratchWritesBeforeEOT) &&
(globalScratchOffset + nextSpillOffset) > 0) {
// we need to set r0 be live out for this WA
builder.getBuiltinR0()->setLiveOut();
}
if (!useHybridRAwithSpill) {
resetGlobalRAStates();
// Identify the local variables to speedup following analysis
markGraphBlockLocalVars();
}
if (kernel.getOption(vISA_SpillAnalysis)) {
spillAnalysis->Clear();
}
localSplit(fastCompile, splitPass);
const auto [doBankConflictReduction, highInternalConflict] = bankConflict();
reserveSpillReg = setupFailSafeIfNeeded(
fastCompile, hasStackCall, maxRAIterations, failSafeRAIteration);
LivenessAnalysis liveAnalysis(*this, G4_GRF | G4_INPUT);
liveAnalysis.computeLiveness();
#ifndef DLL_MODE
if (stopAfter("Global_RA_liveness")) {
return VISA_EARLY_EXIT;
}
#endif // DLL_MODE
if (builder.getOption(vISA_dumpLiveness)) {
liveAnalysis.dump();
}
if (jitInfo->statsVerbose.RAIterNum == 1) {
writeVerboseStatsNumVars(liveAnalysis, jitInfo);
RA_TRACE(std::cout << "\t--# global variable: "
<< jitInfo->statsVerbose.globalVarNum << "\n");
}
#ifdef DEBUG_VERBOSE_ON
emitFGWithLiveness(liveAnalysis);
#endif
    // if no reg var needs to be allocated, then skip reg allocation
if (liveAnalysis.getNumSelectedVar() == 0)
break;
undefinedUses(rematDone, liveAnalysis);
// force spill should be done only for the 1st iteration
bool forceSpill =
iterationNo > 0 ? false : builder.getOption(vISA_ForceSpills);
RPE rpe(*this, &liveAnalysis);
if (!fastCompile) {
rpe.run();
writeVerboseRPEStats(rpe);
}
GraphColor coloring(liveAnalysis, false, forceSpill);
if (builder.getOption(vISA_dumpRPE) && iterationNo == 0 && !rematDone) {
coloring.dumpRPEToFile();
// dump pressure the first time we enter global RA
coloring.dumpRegisterPressure(std::cerr);
}
    // Get the size of registers reserved for spill
unsigned spillRegSize = 0;
unsigned indrSpillRegSize = 0;
if (reserveSpillReg) {
std::tie(spillRegSize, indrSpillRegSize) = reserveGRFSpillReg(coloring);
}
generateForbiddenTemplates(spillRegSize + indrSpillRegSize);
bool isColoringGood =
coloring.regAlloc(doBankConflictReduction, highInternalConflict, &rpe);
if (!isColoringGood) {
// When there are spills and -abortonspill is set, vISA will bump up the
// number of GRFs first and try to compile without spills under one of
// the following conditions:
// - Variable with inf spill cost, or
// - #GRFs selected and next larger one has same number of threads, or
// - Spill ratio is above threshold
// If none of the conditions is met, vISA will abort and return VISA_SPILL.
if (VRTIncreasedGRF(coloring))
continue;
bool rerunGRA1 = false, rerunGRA2 = false, rerunGRA3 = false,
isEarlyExit = false, abort = false, success = false;
std::tie(rerunGRA1, rematDone) = remat(fastCompile, rematDone, liveAnalysis, coloring, rpe);
std::tie(rerunGRA2, alignedScalarSplitDone, isEarlyExit) =
alignedScalarSplit(fastCompile, alignedScalarSplitDone, coloring);
#ifndef DLL_MODE
if (isEarlyExit) {
return VISA_EARLY_EXIT;
}
#endif // DLL_MODE
rerunGRA3 = globalSplit(splitPass, coloring);
if (rerunGRAIter(rerunGRA1 || rerunGRA2 || rerunGRA3))
continue;
splitOnSpill(fastCompile, coloring, liveAnalysis);
reserveSpillReg = convertToFailSafe(reserveSpillReg, coloring, liveAnalysis,
nextSpillOffset);
if (iterationNo == 0) {
// Dump out interference graph information of spill candidates
VISA_DEBUG_VERBOSE(reportSpillInfo(liveAnalysis, coloring));
}
std::tie(abort, GRFSpillFillCount) =
abortOnSpill(GRFSpillFillCount, coloring);
if (abort) {
// Early exit when -abortonspill is passed, instead of
// spending time inserting spill code and then aborting.
stopTimer(TimerID::GRF_GLOBAL_RA);
return VISA_SPILL;
}
std::tie(success, enableSpillSpaceCompression, isEarlyExit, scratchOffset,
nextSpillOffset) =
insertSpillCode(enableSpillSpaceCompression, coloring, liveAnalysis,
rpe, scratchOffset, fastCompile, hasStackCall,
globalScratchOffset, nextSpillOffset, reserveSpillReg,
spillRegSize, indrSpillRegSize,
useScratchMsgForSpill);
if (!success) {
iterationNo = maxRAIterations;
break;
}
#ifndef DLL_MODE
if (isEarlyExit)
return VISA_EARLY_EXIT;
#endif // DLL_MODE
++iterationNo;
if (iterationNo == builder.getuint32Option(vISA_FailSafeRALimit)) {
if (coloring.getSpilledLiveRanges().size() < 2) {
// give regular RA one more try as we are close to success
failSafeRAIteration++;
}
}
stopTimer(TimerID::SPILL);
}
// RA successfully allocates regs
if (isColoringGood == true || reserveSpillReg) {
coloring.confirmRegisterAssignments();
if (hasStackCall) {
        // spill/fill intrinsics expect the offset in HWords, so round up to 64
        // bytes but keep the value in OWord units. ToDo: we really need to
        // change everything to bytes for everyone's sanity..
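        // e.g., scratchOffset = 100 bytes -> ROUND(100, 64) = 128 bytes
        // -> 128 / 16 = 8 OWords.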
unsigned localSpillAreaOwordSize = ROUND(scratchOffset, 64) / 16;
coloring.getSaveRestoreRegister();
addSaveRestoreCode(localSpillAreaOwordSize);
}
if (kernel.getOption(vISA_DumpRegChart)) {
assignRegForAliasDcl();
// invoke before expanding spill/fill since
// it modifies IR
regChart->dumpRegChart(std::cerr, {}, 0);
}
if (builder.getOption(vISA_DynPerfModel)) {
perfModel.run();
}
expandSpillFillIntrinsics(nextSpillOffset);
VISA_DEBUG_VERBOSE(detectUndefinedUses(liveAnalysis, kernel));
if (nextSpillOffset) {
switch (kernel.getRAType()) {
case RA_Type::GRAPH_COLORING_RR_BC_RA:
kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_RR_BC_RA);
break;
case RA_Type::GRAPH_COLORING_FF_BC_RA:
kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_FF_BC_RA);
break;
case RA_Type::GRAPH_COLORING_RR_RA:
kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_RR_RA);
break;
case RA_Type::GRAPH_COLORING_FF_RA:
kernel.setRAType(RA_Type::GRAPH_COLORING_SPILL_FF_RA);
break;
default:
vISA_ASSERT_UNREACHABLE("invalid ra type");
break;
}
}
if (verifyAugmentation) {
assignRegForAliasDcl();
verifyAugmentation->verify();
}
break; // done
}
}
assignRegForAliasDcl();
stopTimer(TimerID::GRF_GLOBAL_RA);
//
// Report failure to allocate due to excessive register pressure.
//
if (!reserveSpillReg && (iterationNo == maxRAIterations)) {
std::stringstream spilledVars;
for (auto dcl : kernel.Declares) {
if (dcl->isSpilled() && dcl->getRegFile() == G4_GRF) {
spilledVars << dcl->getName() << "\t";
}
}
vISA_ASSERT(false, "%d GRF registers are NOT enough to compile kernel %s \
The maximum register pressure in the kernel is higher than the available \
physical registers in hardware (even with spill code). Please consider \
rewriting the kernel. Compiling with the symbolic register option and \
inspecting the spilled registers may help in determining the region of high \
pressure. The spilled virtual registers are as follows: %s.",
(kernel.getNumRegTotal() -
builder.getOptions()->getuInt32Option(vISA_ReservedGRFNum)),
kernel.getName(), spilledVars.str().c_str());
return VISA_SPILL;
}
// this includes vISA's scratch space use only and does not include whatever
// IGC may use for private memory
uint32_t spillMemUsed = builder.kernel.fg.frameSizeInOWord ?
(builder.kernel.fg.frameSizeInOWord * 16) : nextSpillOffset;
spillMemUsed = ROUND(spillMemUsed, kernel.numEltPerGRF<Type_UB>());
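  // e.g., frameSizeInOWord = 10 -> 160 bytes, which ROUND leaves at 160 with
  // 32-byte GRFs and rounds up to 192 with 64-byte GRFs.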
if (spillMemUsed &&
!(kernel.fg.getHasStackCalls() || kernel.fg.getIsStackCallFunc())) {
builder.criticalMsgStream()
<< "Spill memory used = " << spillMemUsed << " bytes for kernel "
<< kernel.getName()
<< "\n Compiling kernel with spill code may degrade performance."
<< " Please consider rewriting the kernel to use less registers.\n";
}
// update jit metadata information for spill
if (auto jitInfo = builder.getJitInfo()) {
    // jitInfo->spillMemUsed is the entire vISA stack size, including the
    // caller/callee save areas when caller/callee saves are present.
    // globalScratchOffset is in bytes, the others are in OWords.
//
// FIXME: globalScratchOffset must be 0 when having stack call, or
// there is a problem at stack setup
// (see GlobalRA::addGenxMainStackSetupCode)
//
// vISA stack
// globalScratchOffset -> ---------------------
// | spill |
// calleeSaveAreaOffset -> ---------------------
// | callee save |
// callerSaveAreaOffset -> ---------------------
// | caller save |
// frameSizeInOWord -> ---------------------
jitInfo->hasStackcalls = kernel.fg.getHasStackCalls();
// Each function reports its required stack size.
    // We will summarize the final stack size of the entire vISA module into
    // the main functions (ref: CISA_IR_Builder::summarizeFunctionInfo)
jitInfo->stats.spillMemUsed = spillMemUsed;
kernel.getGTPinData()->setScratchNextFree(spillMemUsed +
globalScratchOffset);
jitInfo->stats.numGRFSpillFillWeighted = GRFSpillFillCount;
}
if (builder.getOption(vISA_LocalDeclareSplitInGlobalRA)) {
removeSplitDecl();
}
if (builder.getOption(vISA_DynPerfModel)) {
perfModel.dump();
}
return VISA_SUCCESS;
}
// Insert declarations with pre-assigned registers into the kernel.
// This is needed for HRA; the fake declares will be removed at the end of
// HRA.
void GlobalRA::insertPhyRegDecls() {
int numGRF = kernel.getNumRegTotal();
std::vector<bool> grfUsed(numGRF, false);
GRFDclsForHRA.resize(numGRF);
for (auto curBB : kernel.fg) {
if (auto summary = getBBLRASummary(curBB)) {
for (int i = 0; i < numGRF; i++) {
if (summary->isGRFBusy(i)) {
grfUsed[i] = true;
}
}
}
}
// Insert declarations for each GRF that is used
unsigned numGRFsUsed = 0;
for (int i = 0; i < numGRF; i++) {
if (grfUsed[i] == true) {
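      // Create a one-GRF dword declare named "r<i>" and pin it to physical
      // GRF i, so GRFs consumed by local RA show up as pre-assigned declares
      // during HRA.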
const char *dclName = builder.getNameString(10, "r%d", i);
G4_Declare *phyRegDcl =
builder.createDeclare(dclName, G4_GRF, kernel.numEltPerGRF<Type_UD>(),
1, Type_D, Regular, NULL, NULL);
G4_Greg *phyReg = builder.phyregpool.getGreg(i);
phyRegDcl->getRegVar()->setPhyReg(phyReg, 0);
GRFDclsForHRA[i] = phyRegDcl;
addVarToRA(phyRegDcl);
numGRFsUsed++;
}
}
VISA_DEBUG(std::cout << "Local RA used " << numGRFsUsed << " GRFs\n");
}
void GraphColor::dumpRPEToFile() {
// Dump RPE output to file if asmName is set
auto *asmOutput = builder.getOptions()->getOptionCstr(VISA_AsmFileName);
if (asmOutput) {
std::string FN(asmOutput);
FN += ".rpe";
std::ofstream OF;
OF.open(FN, std::ofstream::out);
dumpRegisterPressure(OF);
OF.close();
}
}
void GraphColor::dumpRegisterPressure(std::ostream &OS) {
RPE rpe(gra, &liveAnalysis);
uint32_t max = 0;
std::vector<G4_INST *> maxInst;
rpe.run();
for (auto bb : builder.kernel.fg) {
OS << "BB " << bb->getId() << ": (Pred: ";
for (auto pred : bb->Preds) {
OS << pred->getId() << ",";
}
OS << " Succ: ";
for (auto succ : bb->Succs) {
OS << succ->getId() << ",";
}
OS << ")\n";
for (auto instIt = bb->begin(); instIt != bb->end(); ++instIt) {
auto *inst = *instIt;
uint32_t pressure = rpe.getRegisterPressure(inst);
if (pressure > max) {
max = pressure;
maxInst.clear();
maxInst.push_back(inst);
} else if (pressure == max) {
maxInst.push_back(inst);
}
if (kernel.getOption(vISA_EmitSrcFileLineToRPE))
bb->emitInstructionSourceLineMapping(OS, instIt);
OS << "[" << pressure << "] ";
inst->print(OS);
}
}
OS << "max pressure: " << max << ", " << maxInst.size() << " inst(s)\n";
for (auto inst : maxInst) {
inst->print(OS);
}
}
void GlobalRA::fixAlignment() {
// Copy over alignment from G4_RegVar to GlobalRA instance
  // The rest of RA shouldn't have to read/modify the alignment of G4_RegVar
copyAlignment();
for (auto dcl : kernel.Declares) {
if (dcl->getRegFile() & G4_FLAG) {
if (dcl->getByteSize() > 2 ||
(kernel.getSimdSize() == g4::SIMD32 &&
kernel.getInt32KernelAttr(Attributes::ATTR_Target) != VISA_CM))
setSubRegAlign(dcl, G4_SubReg_Align::Even_Word);
}
}
if (builder.getPlatform() == GENX_BDW) {
// BDW requires even_word alignment for scalar HF variables
for (auto dcl : kernel.Declares) {
if (dcl->getElemType() == Type_HF && dcl->getSubRegAlign() == Any) {
setSubRegAlign(dcl, Even_Word);
}
}
}
// ToDo: remove these as it should be done by HWConformity
for (auto BB : kernel.fg) {
for (auto inst : *BB) {
G4_DstRegRegion *dst = inst->getDst();
if (dst && dst->getTopDcl()) {
G4_RegVar *var = dst->getBase()->asRegVar();
if (inst->isSend() && dst->getRegAccess() == Direct) {
if (!var->isPhyRegAssigned()) {
setSubRegAlign(dst->getTopDcl(), builder.getGRFAlign());
}
}
if (!var->isPhyRegAssigned() && var->getDeclare()->getNumRows() <= 1 &&
dst->getRegAccess() == Direct &&
var->getDeclare()->getSubRegAlign() == Any) {
if (inst->isAccSrcInst()) {
setSubRegAlign(dst->getTopDcl(),
var->getDeclare()->getRegFile() != G4_ADDRESS
? builder.getGRFAlign()
: Eight_Word);
}
}
}
}
}
}
//
// DFS to check if there is any conflict in subroutine return location
//
bool GlobalRA::isSubRetLocConflict(G4_BB *bb, std::vector<unsigned> &usedLoc,
unsigned stackTop) {
auto &fg = kernel.fg;
if (bb->isAlreadyTraversed(fg.getTraversalNum()))
return false;
bb->markTraversed(fg.getTraversalNum());
G4_INST *lastInst = bb->size() == 0 ? NULL : bb->back();
if (lastInst && lastInst->isReturn()) {
if (lastInst->getPredicate() == NULL)
return false;
else {
return isSubRetLocConflict(bb->fallThroughBB(), usedLoc, stackTop);
}
} else if (lastInst && lastInst->isCall()) // need to traverse to next level
{
unsigned curSubRetLoc = getSubRetLoc(bb);
//
    // check for conflicts first
//
for (unsigned i = 0; i < stackTop; i++)
if (usedLoc[i] == curSubRetLoc)
return true;
//
// then traverse all the subroutines and return BB
//
usedLoc[stackTop] = curSubRetLoc;
unsigned afterCallId = bb->BBAfterCall()->getId();
    // A call can have 1 or 2 successors. If it has 1, that successor is the
    // sub-entry block; if it has 2, the call must be predicated. In the
    // predicated case, the 1st successor is the physically following BB and
    // the 2nd is the sub-entry.
if (lastInst->getPredicate()) {
vISA_ASSERT(bb->Succs.size() == 2,
"Expecting 2 successor BBs for predicated call");
if (isSubRetLocConflict(bb->Succs.back(), usedLoc, stackTop))
return true;
}
if (bb->BBAfterCall()->getId() == afterCallId) {
if (isSubRetLocConflict(bb->BBAfterCall(), usedLoc, stackTop))
return true;
}
} else {
for (G4_BB *succ : bb->Succs)
if (isSubRetLocConflict(succ, usedLoc, stackTop))
return true;
}
return false;
}
//
// The routine traverses all BBs that can be reached from the entry of a
// subroutine (not traversing into nested subroutine calls). We mark retLoc[bb] =
// entryId (to associate bb with the subroutine entry). When two subroutines
// share code, we return the location of the subroutine that was previously
// traversed so that the two routines can then use the same location to save
// their return addresses.
//
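// Illustrative example: if SubA (entry id 5) is traversed first and SubB
// (entry id 9) later reaches a block already marked with 5, the recursion
// returns 5 for SubB as well, so both subroutines end up saving their return
// addresses in the same location.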
unsigned GlobalRA::determineReturnAddrLoc(unsigned entryId,
std::vector<unsigned> &retLoc,
G4_BB *bb) {
auto &fg = kernel.fg;
if (bb->isAlreadyTraversed(fg.getTraversalNum()))
return retLoc[bb->getId()];
bb->markTraversed(fg.getTraversalNum());
if (retLoc[bb->getId()] != UNDEFINED_VAL)
return retLoc[bb->getId()];
retLoc[bb->getId()] = entryId;
G4_INST *lastInst = bb->size() == 0 ? NULL : bb->back();
if (lastInst && lastInst->isReturn()) {
if (!lastInst->getPredicate())
return entryId;
return determineReturnAddrLoc(entryId, retLoc, bb->fallThroughBB());
} else if (lastInst && lastInst->isCall()) {
// skip nested subroutine calls
return determineReturnAddrLoc(entryId, retLoc, bb->BBAfterCall());
}
unsigned sharedId = entryId;
for (G4_BB *succ : bb->Succs) {
unsigned loc = determineReturnAddrLoc(entryId, retLoc, succ);
if (loc != entryId) {
while (retLoc[loc] != loc) // find the root of subroutine loc
loc = retLoc[loc]; // follow the link to reach the root
if (sharedId == entryId) {
sharedId = loc;
} else if (sharedId != loc) {
//
        // The current subroutine shares code with two other subroutines; we
        // force all three of them to use the same location by linking them
        // together.
//
retLoc[loc] = sharedId;
}
}
}
return sharedId;
}
void GlobalRA::assignLocForReturnAddr() {
auto &fg = kernel.fg;
std::vector<unsigned> retLoc(fg.getNumBB(), UNDEFINED_VAL);
// a data structure for doing a quick map[id] ---> block
// FIXME: I have no idea why we need this vector, do we have to iterate the
// blocks by their id for some reason?
std::vector<G4_BB *> BBs(fg.getNumBB());
for (G4_BB *bb : fg) {
unsigned i = bb->getId();
BBs[i] = bb; // BBs are sorted by ID
}
//
  // First, keep the original algorithm unchanged to mark the retLoc
//
std::vector<G4_BB *> caller; // just to accelerate the algorithm later
for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++) {
G4_BB *bb = BBs[i];
if (bb->isEndWithCall() == false) {
continue;
}
#ifdef _DEBUG
G4_INST *last = bb->empty() ? NULL : bb->back();
vISA_ASSERT(last, ERROR_FLOWGRAPH);
#endif
caller.push_back(
bb); // record the callers, just to accelerate the algorithm
G4_BB *subEntry = bb->getCalleeInfo()->getInitBB();
if (retLoc[subEntry->getId()] !=
UNDEFINED_VAL) // a loc has been assigned to the subroutine
{
      // Need to setSubRetLoc if subEntry is part of another subroutine because,
// in the final phase, we use SubRetLoc != UNDEFINED_VAL to indicate
// a block is an entry of a subroutine.
setSubRetLoc(subEntry, retLoc[subEntry->getId()]);
} else {
fg.prepareTraversal();
unsigned loc =
determineReturnAddrLoc(subEntry->getId(), retLoc, subEntry);
if (loc != subEntry->getId()) {
retLoc[subEntry->getId()] = loc;
}
setSubRetLoc(subEntry, loc);
//
      // We do not merge indirect calls here, because that would create
      // additional (bb->getSubRetLoc() != bb->getId()) cases that defeat the
      // shared-code detection
//
}
    // retBB is the exit basic block of the callee, i.e., the block with a
    // return statement at the end
G4_BB *retBB = bb->getCalleeInfo()->getExitBB();
if (retLoc[retBB->getId()] == UNDEFINED_VAL) {
// retBB block was unreachable so retLoc element corresponding to that
// block was left undefined
retLoc[retBB->getId()] = getSubRetLoc(subEntry);
}
}
VISA_DEBUG_VERBOSE({
std::cout << "\nBefore merge indirect call:\n";
for (unsigned i = 0; i < fg.getNumBB(); i++)
if (retLoc[i] == UNDEFINED_VAL) {
std::cout << "BB" << i << ": X ";
} else {
std::cout << "BB" << i << ": " << retLoc[i] << " ";
}
std::cout << "\n";
});
//
  // This final phase is needed. Consider the following scenario: Sub2 shares
  // code with both Sub1 and Sub3. All three must use the same location to save
  // return addresses. If we traverse Sub1 then Sub3, retLoc[Sub1] and
  // retLoc[Sub3] each point to their own roots. As we traverse Sub2, code
  // sharing is detected, and we need this phase to make sure that Sub1 and
  // Sub3 use the same location.
//
for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++) {
G4_BB *bb = BBs[i];
if (getSubRetLoc(bb) != UNDEFINED_VAL) {
if (getSubRetLoc(bb) != bb->getId()) {
unsigned loc = bb->getId();
while (retLoc[loc] != loc) // not root
loc = retLoc[loc]; // follow the link to reach the root
}
}
}
//
// Merge the retLoc in indirect call cases
//
for (G4_BB *bb : caller) {
G4_INST *last = bb->empty() ? NULL : bb->back();
vISA_ASSERT(last, ERROR_FLOWGRAPH);
unsigned fallThroughId = bb->fallThroughBB() == NULL
? UNDEFINED_VAL
: bb->fallThroughBB()->getId();
if ((last && last->getPredicate() == NULL && bb->Succs.size() > 1) ||
(last && last->getPredicate() != NULL && bb->Succs.size() > 2)) {
//
      // Merge all subroutines into the last one; this is a trick to handle
      // the conditional call by using the last successor instead of the first.
//
unsigned masterEntryId = bb->Succs.back()->getId();
//
// find the root of the master subroutine
//
unsigned masterRetLoc = masterEntryId;
while (retLoc[masterRetLoc] != masterRetLoc)
masterRetLoc = retLoc[masterRetLoc];
//
// check other subroutines in one vertex
//
for (G4_BB *subBB : bb->Succs) {
if (subBB->getId() != masterEntryId &&
subBB->getId() != fallThroughId) {
//
// find the root of the current subroutine
//
unsigned loc = subBB->getId();
while (retLoc[loc] != loc)
loc = retLoc[loc];
//
          // Merge: make all the entries in retLoc whose value is loc point to
          // masterRetLoc. Suppose indirect call X calls subroutines A and B,
          // indirect call Y calls B and C, and indirect call Z calls C and D.
          // Before the merge, A~D are assigned different return locations.
          // Suppose we process the callers in the order X-->Z-->Y during the
          // merge: if we only modified the return locations of one indirect
          // call, we would fail to merge the return locations of A~D.
//
if (loc != masterRetLoc) {
for (unsigned i = 0; i < fg.getNumBB(); i++)
if (retLoc[i] == loc)
retLoc[i] = masterRetLoc;
}
}
}
}
}
VISA_DEBUG_VERBOSE({
std::cout << "\nAfter merge indirect call:\n";
for (unsigned i = 0; i < fg.getNumBB(); i++)
if (retLoc[i] == UNDEFINED_VAL) {
std::cout << "BB" << i << ": X ";
} else {
std::cout << "BB" << i << ": " << retLoc[i] << " ";
}
std::cout << "\n";
});
//
  // Assign the return location for subroutines first, and then check whether
  // it is wrong (due to a cycle in the call graph).
//
for (unsigned i = 0, bbNum = fg.getNumBB(); i < bbNum; i++) {
//
// reset the return BB's retLoc
//
unsigned loc = i;
if (retLoc[i] != UNDEFINED_VAL) {
while (retLoc[loc] != loc)
loc = retLoc[loc];
retLoc[i] = loc;
setSubRetLoc(BBs[i], retLoc[loc]);
}
}
for (G4_BB *bb : caller) {
//
// set caller BB's retLoc
//
#ifdef _DEBUG
G4_INST *last = bb->empty() ? NULL : bb->back();
vISA_ASSERT(last, ERROR_FLOWGRAPH);
#endif
G4_BB *subBB = bb->getCalleeInfo()->getInitBB();
//
  // 1: Must use retLoc here, because some subBB is also the caller of another
  // subroutine, so the entry loc in the BB may have been changed in this step.
  // 2: In some cases, the caller BB is also the entry BB. In that case, the
  // associated entry BB ID will be overwritten. However, this does not impact
  // the conflict detection and return location assignment, since we only
  // check the return BB and/or caller BB in those two modules.
//
setSubRetLoc(bb, retLoc[subBB->getId()]);
}
VISA_DEBUG_VERBOSE({
for (unsigned i = 0; i < fg.getNumBB(); i++) {
G4_BB *bb = BBs[i];
if (getSubRetLoc(bb) != UNDEFINED_VAL) {
if (!bb->empty() && bb->front()->isLabel()) {
std::cout << ((G4_Label *)bb->front()->getSrc(0))->getLabel()
<< " assigned location " << getSubRetLoc(bb) << "\n";
}
}
}
});
//
// detect the conflict (circle) at last
//
std::vector<unsigned> usedLoc(fg.getNumBB());
unsigned stackTop = 0;
for (G4_BB *bb : caller) {
//
    // Must re-start the traversal from each caller; otherwise we will miss
    // some cycle cases like TestRA_Call_1_1_3B, D, F, G, H.
//
fg.prepareTraversal();
usedLoc[stackTop] = getSubRetLoc(bb);
G4_BB *subEntry = bb->Succs.back();
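    // usedLoc[stackTop] already records this caller's return location; pass
    // stackTop + 1 so the recursive walk checks nested calls against it.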
if (isSubRetLocConflict(subEntry, usedLoc, stackTop + 1)) {
vISA_ASSERT(false, "ERROR: Fail to assign call-return variables due to "
"cycle in call graph!");
}
}
insertCallReturnVar();
}
void GlobalRA::insertCallReturnVar() {
for (auto bb : kernel.fg) {
G4_INST *last = bb->empty() ? NULL : bb->back();
if (last) {
if (last->isCall()) {
insertSaveAddr(bb);
} else {
if (last->isReturn()) {
// G4_BB_EXIT_TYPE is just a dummy BB, and the return will be the last
// inst in each of its predecessors
insertRestoreAddr(bb);
}
}
}
}
}
void GlobalRA::insertSaveAddr(G4_BB *bb) {
vISA_ASSERT(bb != NULL, ERROR_INTERNAL_ARGUMENT);
vISA_ASSERT(getSubRetLoc(bb) != UNDEFINED_VAL,
ERROR_FLOWGRAPH); // must have a assigned loc
G4_INST *last = bb->back();
vASSERT(last->isCall());
if (last->getDst() == NULL) {
unsigned loc = getSubRetLoc(bb);
G4_Declare *dcl = getRetDecl(loc);
last->setDest(builder.createDst(dcl->getRegVar(), 0, 0, 1,
Type_UD)); // RET__loc12<1>:ud
last->setExecSize(g4::SIMD2);
}
}
void GlobalRA::insertRestoreAddr(G4_BB *bb) {
vISA_ASSERT(bb != NULL, ERROR_INTERNAL_ARGUMENT);
G4_INST *last = bb->back();
vASSERT(last->isReturn());
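  // If the return has no source yet, make it read the saved address back from
  // the ret-location declare (<0;2,1> region, null dst, SIMD2), mirroring the
  // save emitted in insertSaveAddr.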
if (last->getSrc(0) == NULL) {
unsigned loc = getSubRetLoc(bb);
G4_Declare *dcl = getRetDecl(loc);
G4_SrcRegRegion *new_src = builder.createSrc(
dcl->getRegVar(), 0, 0, builder.createRegionDesc(0, 2, 1), Type_UD);
last->setSrc(new_src, 0);
last->setDest(builder.createNullDst(Type_UD));
last->setExecSize(g4::SIMD2);
}
}
// This function returns the weight of interference edge lr1--lr2,
// which is used for computing the degree of lr1.
//
// When there is no alignment restriction, we should use the normal weight,
// which is lr1_nreg + lr2_nreg - 1.
//
// Otherwise, we need to take into account additional space that may be
// required because of the alignment restriction. For example,
// if lr1 has even alignment and lr2 has no alignment restriction,
// we need to consider the following cases that would require the
// maximal available GRF space for successful allocation:
// 1) lr1's size is odd, lr2's size is odd and lr2's start position is even,
// the total space required would be (lr1_nreg + lr2_nreg + 1)
// 2) lr1's size is odd, lr2's size is even and lr2's start position is even,
// the total space required would be (lr1_nreg + lr2_nreg)
// 3) lr1's size is even, lr2's size is odd and lr2's start position is odd,
// the total space required would be (lr1_nreg + lr2_nreg)
// 4) lr1's size is even, lr2's size is even and lr2's start position is odd,
// the total space required would be (lr1_nreg + lr2_nreg + 1)
// The above logic can be simplified to the following formula:
// lr1_nreg + lr2_nreg + 1 - ((lr1_nreg + lr2_nreg) % 2)
//
// If both lr1 and lr2 have even alignment restriction,
// we need to consider the following cases that would require the
// maximal available GRF space for successful allocation:
// 1) lr1's size is odd, lr2's size is odd and lr2's start position is even,
// the total space required would be (lr1_nreg + lr2_nreg + 1)
// 2) lr1's size is odd, lr2's size is even and lr2's start position is even,
// the total space required would be (lr1_nreg + lr2_nreg)
// 3) lr1's size is even, lr2's size is odd and lr2's start position is even,
// the total space required would be (lr1_nreg + lr2_nreg)
// 4) lr1's size is even, lr2's size is even and lr2's start position is even,
// the total space required would be (lr1_nreg + lr2_nreg - 1)
// The above logic can be simplified to the following formula:
// lr1_nreg + lr2_nreg - 1 + (lr1_nreg % 2) + (lr2_nreg % 2)
//
// Note: Edge weight between 2 nodes is asymmetric and depends on ordering
// of nodes. Swapping lr1, lr2 and invoking edgeWeightGRF() may return
// different result. So using the correct order of lr1, lr2 during edge
// weight computation and later during simplification is necessary for
// correctness.
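// Worked example for the formulas above: with lr1 even-aligned and lr2
// unrestricted, lr1_nreg = 3 and lr2_nreg = 3 gives 3 + 3 + 1 - (6 % 2) = 7
// (case 1: both sizes odd, lr2 starting at an even GRF). With both
// even-aligned, lr1_nreg = 3 and lr2_nreg = 2 gives 3 + 2 - 1 + 1 + 0 = 5
// (case 2).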
//
template <bool Support4GRFAlign>
unsigned GraphColor::edgeWeightGRF(const LiveRange *lr1, const LiveRange *lr2) {
unsigned lr1_nreg = lr1->getNumRegNeeded();
unsigned lr2_nreg = lr2->getNumRegNeeded();
if constexpr (Support4GRFAlign) {
auto lr1Align = gra.getAugAlign(lr1->getDcl());
auto lr2Align = gra.getAugAlign(lr2->getDcl());
return edgeWeightWith4GRF(lr1Align, lr2Align, lr1_nreg, lr2_nreg);
} else {
bool lr1EvenAlign = gra.isEvenAligned<false>(lr1->getDcl());
bool lr2EvenAlign = gra.isEvenAligned<false>(lr2->getDcl());
return edgeWeightGRF(lr1EvenAlign, lr2EvenAlign, lr1_nreg, lr2_nreg);
}
}
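// A couple of worked cases reproducing the address-register formulas in
// edgeWeightARF below: Eight_Word vs Four_Word with lr1_nreg = 5,
// lr2_nreg = 3 takes the second branch since (8 - 5 % 8) % 8 = 3 < 4, giving
// 5 + 3 - 1 + 3 + (4 - 3 % 4) % 4 = 11; Four_Word vs Any with lr1_nreg = 2,
// lr2_nreg = 3 gives 2 + 3 + 3 - (5 % 4) = 7.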
unsigned GraphColor::edgeWeightARF(const LiveRange *lr1, const LiveRange *lr2) {
if (lr1->getRegKind() == G4_FLAG) {
G4_SubReg_Align lr1_align = gra.getSubRegAlign(lr1->getVar()->getDeclare());
G4_SubReg_Align lr2_align = gra.getSubRegAlign(lr2->getVar()->getDeclare());
unsigned lr1_nreg = lr1->getNumRegNeeded();
unsigned lr2_nreg = lr2->getNumRegNeeded();
if (lr1_align == Any) {
return lr1_nreg + lr2_nreg - 1;
} else if (lr1_align == Even_Word && lr2_align == Any) {
return lr1_nreg + lr2_nreg + 1 - ((lr1_nreg + lr2_nreg) % 2);
} else if (lr1_align == Even_Word && lr2_align == Even_Word) {
if (lr1_nreg % 2 == 0 && lr2_nreg % 2 == 0) {
return lr1_nreg + lr2_nreg - 2;
} else {
return lr1_nreg + lr2_nreg - 1 + (lr1_nreg % 2) + (lr2_nreg % 2);
}
} else {
vISA_ASSERT_UNREACHABLE(
"Found unsupported subRegAlignment in flag register allocation!");
return 0;
}
} else if (lr1->getRegKind() == G4_ADDRESS) {
G4_SubReg_Align lr1_align = gra.getSubRegAlign(lr1->getVar()->getDeclare());
G4_SubReg_Align lr2_align = gra.getSubRegAlign(lr2->getVar()->getDeclare());
unsigned lr1_nreg = lr1->getNumRegNeeded();
unsigned lr2_nreg = lr2->getNumRegNeeded();
if (lr1_align < lr2_align) {
G4_SubReg_Align tmp_align = lr1_align;
unsigned tmp_nreg = lr1_nreg;
lr1_align = lr2_align;
lr2_align = tmp_align;
lr1_nreg = lr2_nreg;
lr2_nreg = tmp_nreg;
}
if (lr1_align == Any) {
      // Any vs Any (after the swap above, lr2 cannot be more aligned than lr1)
return lr1_nreg + lr2_nreg - 1;
} else if (lr1_align == Four_Word && lr2_align == Any) {
// 4 vs Any
return lr1_nreg + lr2_nreg + 3 - (lr1_nreg + lr2_nreg) % 4;
} else if (lr1_align == Four_Word && lr2_align == Four_Word) {
// 4 vs 4
return lr1_nreg + lr2_nreg - 1 + (4 - lr1_nreg % 4) % 4 +
(4 - lr2_nreg % 4) % 4;
} else if (lr1_align == Eight_Word && lr2_align == Any) {
// 8 vs Any
return lr1_nreg + lr2_nreg + 7 - (lr1_nreg + lr2_nreg) % 8;
} else if (lr1_align == Eight_Word && lr2_align == Four_Word) {
// 8 vs 4
if (((8 - lr1_nreg % 8) % 8) >= 4)
return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 - 4;
return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 +
(4 - lr2_nreg % 4) % 4;
} else if (lr1_align == Eight_Word && lr2_align == Eight_Word) {
// 8 vs 8
return lr1_nreg + lr2_nreg - 1 + (8 - lr1_nreg % 8) % 8 +
(8 - lr2_nreg % 8) % 8;
} else if (lr1_align == Sixteen_Word && lr2_align == Any) {
// 16 vs Any
return lr1_nreg + lr2_nreg + 15 - (lr1_nreg + lr2_nreg) % 16;
} else if (lr1_align == Sixteen_Word && lr2_align == Four_Word) {
// 16 vs 4
if (((16 - lr1_nreg % 16) % 16) >= 4)
return lr1_nreg + lr2_nreg - 1 + (16 - lr1_nreg % 16) % 16 - 4;
return lr1_nreg + lr2_nreg - 1 + (16 - lr1_nreg % 16) % 16 +
(4 - lr2_nreg % 4) % 4;
} else if (lr1_align == Sixteen_Word && lr2_align == Eight_Word) {
// 16 vs 8
if (((16 - lr1_nreg % 16) % 16) >= 8)
return lr1_nreg + lr2_nreg - 1 + (16 - lr1_nreg % 16) % 16 - 8;
return lr1_nreg + lr2_nreg - 1 + (16 - lr1_nreg % 16) % 16 +
(8 - lr2_nreg % 8) % 8;
} else if (lr1_align == Sixteen_Word && lr2_align == Sixteen_Word) {
// 16 vs 16
return lr1_nreg + lr2_nreg - 1 + (16 - lr1_nreg % 16) % 16 +
(16 - lr2_nreg % 16) % 16;
} else {
vISA_ASSERT_UNREACHABLE(
"Found unsupported subRegAlignment in address register allocation!");
return 0;
}
}
else if (lr1->getRegKind() == G4_SCALAR) {
return edgeWeightGRF<false>(lr1, lr2); // treat scalar just like GRF
}
vISA_ASSERT_UNREACHABLE(
"Found unsupported ARF reg type in register allocation!");
return 0;
}
void GlobalRA::fixSrc0IndirFcall() {
// Indirect calls look like:
// mov (1|NM) V10 0x123456:ud
// fcall (1) dst V10 <-- V10 which is src0 contains %ip to jump to
//
// In this function, we want to set V10 to r125.0 which is same as dst of
// fcall as per ABI. This way, when inserting save/restore code around fcall,
// no special checks are needed to handle V10.
//
  // But this works only if V10 is local. If it is not local, we create a mov
  // that copies V10 into a new temp variable, and then we map this temp
  // variable to r125.0. Hopefully V10 being global is a rare occurrence.
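  // Sketch of the rewrite for the non-local case (register and variable names
  // are illustrative; the actual location comes from stackCall.getFPSPGRF()
  // and stackCall.subRegs.Ret_IP):
  //   mov (1|NM)  TMP(=r125.Ret_IP)  V10
  //   fcall (1)   dst                TMP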
for (auto bb : kernel.fg) {
if (bb->isEndWithFCall()) {
auto fcall = bb->back()->asCFInst();
if (!fcall->getSrc(0) || !fcall->getSrc(0)->isSrcRegRegion())
continue;
auto src0Rgn = fcall->getSrc(0)->asSrcRegRegion();
auto src0TypeSize = src0Rgn->getTypeSize();
auto src0Dcl = src0Rgn->getBase()->asRegVar()->getDeclare();
auto src0TopDcl = src0Rgn->getTopDcl();
if (src0Dcl != src0TopDcl || !isBlockLocal(src0TopDcl) ||
src0TopDcl->getNumElems() > 1) {
// create a copy
auto tmpDcl = kernel.fg.builder->createHardwiredDeclare(
1, src0Rgn->getType(), kernel.stackCall.getFPSPGRF(),
kernel.stackCall.subRegs.Ret_IP * TypeSize(Type_UD) / src0TypeSize);
auto dst = kernel.fg.builder->createDst(tmpDcl->getRegVar(),
src0Rgn->getType());
auto src = kernel.fg.builder->duplicateOperand(src0Rgn);
auto copy = kernel.fg.builder->createMov(g4::SIMD1, dst, src,
InstOpt_WriteEnable, false);
auto iter = std::find_if(bb->begin(), bb->end(),
[](G4_INST *inst) { return inst->isFCall(); });
bb->insertBefore(iter, copy);
auto newSrc = kernel.fg.builder->createSrc(
tmpDcl->getRegVar(), 0, 0, kernel.fg.builder->getRegionScalar(),
Type_UD);
fcall->setSrc(newSrc, 0);
} else {
auto fcallDstTypeSize = fcall->getDst()->getTypeSize();
vISA_ASSERT(fcallDstTypeSize == 4, "expecting DW type dst");
src0TopDcl->getRegVar()->setPhyReg(
fcall->getDst()->getBase()->asRegVar()->getPhyReg(),
fcall->getDst()->getBase()->asRegVar()->getPhyRegOff() *
fcallDstTypeSize / src0TypeSize);
}
}
}
}