intel-graphics-compiler/IGC/Compiler/CISACodeGen/EstimateFunctionSize.cpp

/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "Compiler/CISACodeGen/EstimateFunctionSize.h"
#include "Compiler/CodeGenContextWrapper.hpp"
#include "Compiler/MetaDataUtilsWrapper.h"
#include "Compiler/CodeGenPublic.h"
#include "Compiler/IGCPassSupport.h"
#include "common/igc_regkeys.hpp"
#include "common/LLVMWarningsPush.hpp"
#include "llvm/IR/Module.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/SyntheticCountsUtils.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvmWrapper/IR/BasicBlock.h"
#include "llvmWrapper/ADT/Optional.h"
#include "common/LLVMWarningsPop.hpp"
#include "Probe/Assertion.h"
#include <deque>
#include <cfloat>
#include <algorithm>
#include <cmath>
#include <optional>

using namespace llvm;
using namespace IGC;
using Scaled64 = ScaledNumber<uint64_t>;
char EstimateFunctionSize::ID = 0;

IGC_INITIALIZE_PASS_BEGIN(EstimateFunctionSize, "EstimateFunctionSize", "EstimateFunctionSize", false, true)
IGC_INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
IGC_INITIALIZE_PASS_END(EstimateFunctionSize, "EstimateFunctionSize", "EstimateFunctionSize", false, true)

/// Factory for the pass using the constructor's default arguments. The pass is
/// registered with the global PassRegistry before the instance is created.
llvm::ModulePass *IGC::createEstimateFunctionSizePass() {
  PassRegistry &Registry = *PassRegistry::getPassRegistry();
  initializeEstimateFunctionSizePass(Registry);
  return new EstimateFunctionSize();
}

/// Factory for a module-level analysis (AL_Module).
/// \param EnableStaticProfileGuidedTrimming forwarded to the constructor, where
///        it switches on static profile-guided trimming and its companion options.
llvm::ModulePass *IGC::createEstimateFunctionSizePass(bool EnableStaticProfileGuidedTrimming) {
  PassRegistry &Registry = *PassRegistry::getPassRegistry();
  initializeEstimateFunctionSizePass(Registry);
  const auto Level = EstimateFunctionSize::AnalysisLevel::AL_Module;
  return new EstimateFunctionSize(Level, EnableStaticProfileGuidedTrimming);
}

/// Factory for an explicit analysis level; static profile-guided trimming is
/// not force-enabled through this entry point.
llvm::ModulePass *IGC::createEstimateFunctionSizePass(EstimateFunctionSize::AnalysisLevel AL) {
  PassRegistry &Registry = *PassRegistry::getPassRegistry();
  initializeEstimateFunctionSizePass(Registry);
  const bool ForceSPGT = false;
  return new EstimateFunctionSize(AL, ForceSPGT);
}

/// Builds the pass and takes a snapshot of every IGC flag it consults, so the
/// rest of the pass works from member copies.
/// \param AL                                 analysis level to run at
/// \param EnableStaticProfileGuidedTrimming  when true, static profile-guided
///        trimming, leaf collapsing, size-contribution optimization and
///        loop-count-aware trimming are enabled regardless of their flags
EstimateFunctionSize::EstimateFunctionSize(AnalysisLevel AL, bool EnableStaticProfileGuidedTrimming)
    : ModulePass(ID), M(nullptr), AL(AL), tmpHasImplicitArg(false), HasRecursion(false), EnableSubroutine(false) {
  // Starting thresholds; runStaticAnalysis() may refine both later.
  threshold_func_freq = Scaled64::getLargest();
  thresholdForTrimming = Scaled64::get(IGC_GET_FLAG_VALUE(ControlInlineTinySizeForSPGT));

  // Kernel trimming.
  ControlKernelTotalSize = IGC_IS_FLAG_ENABLED(ControlKernelTotalSize);
  ControlUnitSize = IGC_IS_FLAG_ENABLED(ControlUnitSize);
  ControlInlineTinySize = IGC_GET_FLAG_VALUE(ControlInlineTinySize);
  UnitSizeThreshold = IGC_GET_FLAG_VALUE(UnitSizeThreshold);
  KernelTotalSizeThreshold = IGC_GET_FLAG_VALUE(KernelTotalSizeThreshold);
  ExpandedUnitSizeThreshold = IGC_GET_FLAG_VALUE(ExpandedUnitSizeThreshold);
  LargeKernelThresholdMultiplier = IGC_GET_FLAG_VALUE(LargeKernelThresholdMultiplier);
  SubroutineThreshold = IGC_GET_FLAG_VALUE(SubroutineThreshold);

  // Static profile-guided trimming. The first four options are forced on when
  // the caller requested it through the constructor argument.
  StaticProfileGuidedTrimming = IGC_IS_FLAG_ENABLED(StaticProfileGuidedTrimming) || EnableStaticProfileGuidedTrimming;
  EnableLeafCollapsing = IGC_IS_FLAG_ENABLED(EnableLeafCollapsing) || EnableStaticProfileGuidedTrimming;
  EnableSizeContributionOptimization =
      IGC_IS_FLAG_ENABLED(EnableSizeContributionOptimization) || EnableStaticProfileGuidedTrimming;
  LoopCountAwareTrimming = IGC_IS_FLAG_ENABLED(LoopCountAwareTrimming) || EnableStaticProfileGuidedTrimming;
  UseFrequencyInfoForSPGT = IGC_IS_FLAG_ENABLED(UseFrequencyInfoForSPGT);
  BlockFrequencySampling = IGC_IS_FLAG_ENABLED(BlockFrequencySampling);
  EnableGreedyTrimming = IGC_IS_FLAG_ENABLED(EnableGreedyTrimming);
  SizeWeightForSPGT = IGC_GET_FLAG_VALUE(SizeWeightForSPGT);
  FrequencyWeightForSPGT = IGC_GET_FLAG_VALUE(FrequencyWeightForSPGT);
  MetricForKernelSizeReduction = IGC_GET_FLAG_VALUE(MetricForKernelSizeReduction);
  ParameterForColdFuncThreshold = IGC_GET_FLAG_VALUE(ParameterForColdFuncThreshold);
  ControlInlineTinySizeForSPGT = IGC_GET_FLAG_VALUE(ControlInlineTinySizeForSPGT);
  MaxUnrollCountForFunctionSizeAnalysis = IGC_GET_FLAG_VALUE(MaxUnrollCountForFunctionSizeAnalysis);
  SkipTrimmingOneCopyFunction = IGC_GET_FLAG_VALUE(SkipTrimmingOneCopyFunction);
  SelectiveTrimming = IGC_GET_REGKEYSTRING(SelectiveTrimming);

  // Partitioning.
  PartitionUnit = IGC_IS_FLAG_ENABLED(PartitionUnit);
  StaticProfileGuidedPartitioning = IGC_IS_FLAG_ENABLED(StaticProfileGuidedPartitioning);

  // Implicit arguments and external functions.
  ForceInlineExternalFunctions = IGC_IS_FLAG_ENABLED(ForceInlineExternalFunctions);
  ForceInlineStackCallWithImplArg = IGC_IS_FLAG_ENABLED(ForceInlineStackCallWithImplArg);
  ControlInlineImplicitArgs = IGC_IS_FLAG_ENABLED(ControlInlineImplicitArgs);
}

/// Releases all per-function nodes and cached lists owned by the pass.
EstimateFunctionSize::~EstimateFunctionSize() {
  clear();
}

/// Declares that this pass preserves all analyses and lists the function
/// analyses it requires.
void EstimateFunctionSize::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesAll();
  // Same request order as before: loop info, branch probabilities, block
  // frequencies, scalar evolution.
  AU.addRequired<LoopInfoWrapperPass>()
      .addRequired<BranchProbabilityInfoWrapperPass>()
      .addRequired<BlockFrequencyInfoWrapperPass>()
      .addRequired<ScalarEvolutionWrapperPass>();
}

/// Pass entry point: drops any state left from a previous run, analyzes the
/// module and then runs the subroutine check.
/// \returns false, because the pass never reports the module as changed here.
bool EstimateFunctionSize::runOnModule(Module &Mod) {
  clear(); // also resets M, so it must precede the assignment below
  M = &Mod;

  analyze();
  checkSubroutine();

  const bool ModuleChanged = false;
  return ModuleChanged;
}

// Given a module, estimate the maximal function size with complete inlining.
/*
   A ----> B ----> C ---> D ---> F
    \       \       \
     \       \       \---> E
      \       \
       \       \---> C ---> D --> F
        \             \
         \----> F      \---> E
*/
// ExpandedSize(A) = size(A) + size(B) + 2 * size(C) + 2 * size(D)
//                   + 2 * size(E) + 3 * size(F)
//
// We compute the size as follows:
//
// (1) Initialize the data structure
//
// A --> {size(A), [B, F], [] }
// B --> {size(B), [C, C], [A] }
// C --> {size(C), [D, E], [B] }
// D --> {size(D), [F],    [C] }
// E --> {size(E), [],     [C] }
// F --> {size(F), [],     [A, D] }
//
// where the first list consists of functions to be expanded and the second list
// consists of its caller functions.
//
// (2) Traverse in a reverse topological order and expand each node

namespace {

// Debug-print helpers. Each macro prints `contents` (a `<<`-chain fragment) to
// dbgs() when the matching IGC print flag has any bit of `hex_val` set. The
// level is emitted in hexadecimal so that it agrees with the literal "0x"
// prefix (it used to be streamed in decimal, e.g. 0x10 showed up as "0x16").
//
// NOTE: the expansion is deliberately kept as a plain `if (...) { ... }`
// statement because call sites in this file invoke these macros without a
// trailing semicolon. Do not use them as the unbraced body of an if/else.
#define PrintPartitionUnit(hex_val, contents)                                                                          \
  if ((IGC_GET_FLAG_VALUE(PrintPartitionUnit) & (hex_val)) != 0) {                                                     \
    (dbgs() << "PartitionUnit0x").write_hex((hex_val)) << ": " << contents << "\n";                                    \
  }
#define PrintControlUnitSize(hex_val, contents)                                                                        \
  if ((IGC_GET_FLAG_VALUE(PrintControlUnitSize) & (hex_val)) != 0) {                                                   \
    (dbgs() << "ControlUnitSize0x").write_hex((hex_val)) << ": " << contents << "\n";                                  \
  }
#define PrintControlKernelTotalSize(hex_val, contents)                                                                 \
  if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & (hex_val)) != 0) {                                            \
    (dbgs() << "ControlKernelTotalSize0x").write_hex((hex_val)) << ": " << contents << "\n";                           \
  }
// Printed when either the kernel-total-size or the unit-size print flag matches.
#define PrintTrimUnit(hex_val, contents)                                                                               \
  if ((IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & (hex_val)) != 0 ||                                            \
      (IGC_GET_FLAG_VALUE(PrintControlUnitSize) & (hex_val)) != 0) {                                                   \
    (dbgs() << "TrimUnit0x").write_hex((hex_val)) << ": " << contents << "\n";                                         \
  }
#define PrintFunctionSizeAnalysis(hex_val, contents)                                                                   \
  if ((IGC_GET_FLAG_VALUE(PrintFunctionSizeAnalysis) & (hex_val)) != 0) {                                              \
    (dbgs() << "FunctionSizeAnalysis0x").write_hex((hex_val)) << ": " << contents << "\n";                             \
  }
#define PrintStaticProfileGuidedKernelSizeReduction(hex_val, contents)                                                 \
  if ((IGC_GET_FLAG_VALUE(PrintStaticProfileGuidedKernelSizeReduction) & (hex_val)) != 0) {                            \
    (dbgs() << "StaticProfileGuidedKernelSizeReduction0x").write_hex((hex_val)) << ": " << contents << "\n";           \
  }

/// Computes the weight used by static profile-guided trimming:
///   Size^SizeWeightForSPGT / Freq^FrequencyWeightForSPGT
/// \param Size                    size to weigh. Taken as 64-bit because callers
///                                pass 64-bit size contributions; the former
///                                `unsigned` parameter silently truncated them.
/// \param Freq                    static frequency; when zero the frequency
///                                term is skipped and only the size term is returned
/// \param SizeWeightForSPGT       exponent applied to the size
/// \param FrequencyWeightForSPGT  exponent applied to the frequency
static Scaled64 getSPGTWeight(uint64_t Size, Scaled64 Freq, unsigned SizeWeightForSPGT,
                              unsigned FrequencyWeightForSPGT) {
  const Scaled64 ScaledSize = Scaled64::get(Size);
  Scaled64 WeightedSize = Scaled64::getOne();
  for (unsigned i = 0; i < SizeWeightForSPGT; ++i)
    WeightedSize *= ScaledSize;

  // No frequency information: fall back to the size-only weight.
  if (Freq == 0)
    return WeightedSize;

  Scaled64 WeightedFreq = Scaled64::getOne();
  for (unsigned i = 0; i < FrequencyWeightForSPGT; ++i)
    WeightedFreq *= Freq;
  return WeightedSize / WeightedFreq;
}

/// Metric used to derive the cold-function frequency threshold.
enum StatiProfile_FLAG_t {
  /// \brief No metric is used. Chosen when only the static profile
  /// information is needed, without enforcement.
  SP_NO_METRIC = 0x0,
  /// \brief A normal distribution is used as the metric.
  SP_NORMAL_DISTRIBUTION = 0x1,
  /// \brief A long tail distribution is used as the metric.
  SP_LONGTAIL_DISTRIBUTION = 0x2,
  /// \brief An average percentage is used as the metric.
  SP_AVERAGE_PERCENTAGE = 0x4,
};

// Function Attribute Flag type
enum FA_FLAG_t {
  /// \brief The function is to be inlined, but it may still be trimmed or
  /// assigned a stack call.
  FA_BEST_EFFORT_INLINE = 0x00,
  /// \brief The function is to be inlined and the decision cannot be reverted.
  FA_FORCE_INLINE = 0x01,
  /// \brief The function will be trimmed.
  FA_TRIMMED = 0x02,
  /// \brief The node should be a stack call header.
  FA_STACKCALL = 0x04,
  /// \brief The node is a kernel entry. It will be affected by any schemes.
  FA_KERNEL_ENTRY = 0x08,
  /// \brief The node is an address taken function.
  FA_ADDR_TAKEN = 0x10,
};
/// Associate each function with a partially expanded size and remaining
/// unexpanded function list, etc.

/// Classification of a function with respect to trimming or partitioning.
/// Note that the value 0x1 is intentionally left unused, as in the original
/// numbering.
enum FUNCTION_TRAIT_FLAG_t {
  /// \brief The function does not need to be considered.
  FT_NOT_APPLICABLE = 0x00,
  /// \brief The function is not open to trimming or partitioning.
  FT_NOT_BEST_EFFORT = 0x02,
  /// \brief The function is in multiple kernels and is forced to be inlined.
  FT_MUL_KERNEL = 0x04,
  /// \brief The function is big enough to trim.
  FT_BIG_ENOUGH = 0x08,
  /// \brief The function is too tiny to be trimmed.
  FT_TOO_TINY = 0x10,
  /// \brief The function has a higher weight than the threshold.
  FT_HIGHER_WEIGHT = 0x20,
  /// \brief The function has a lower weight than the threshold.
  FT_LOWER_WEIGHT = 0x40,
};
/// Per-function node of the call graph built by this analysis. It ties a
/// function to its size estimates, static frequency estimates, inlining
/// attribute and weighted caller/callee edges.
///
/// The configuration members at the bottom are copies of pass-level options;
/// they default to "off"/0 and are meant to be filled in through setFlags().
struct FunctionNode {
  // Initializers are listed in member declaration order, which is the order in
  // which they actually run.
  FunctionNode(Function *F, std::size_t Size)
      : F(F), InitialSize(Size), UnitSize(Size), ExpandedSize(Size), SizeAfterCollapsing(Size), Inline_cnt(0),
        tmpSize(Size), FunctionAttr(0), staticFuncFreq(0, 0), CallingSubroutine(false), InMultipleUnit(false),
        HasImplicitArg(false), EntryFreq(0, 0) {}

  Function *F;

  /// \brief Initial size before partition
  uint32_t InitialSize;

  /// \brief The size of a compilation unit
  uint32_t UnitSize;

  /// \brief Expanded size when all functions in a unit below the node are expanded
  uint32_t ExpandedSize;

  /// \brief Body size reported by getPotentialBodySize() when leaf collapsing is enabled
  uint32_t SizeAfterCollapsing;

  /// \brief How many times the function is inlined at callsites.
  uint32_t Inline_cnt;

  /// \brief used to update unit size or expanded unit size in topological sort
  uint32_t tmpSize;

  /// \brief Function attribute. Holds exactly one FA_FLAG_t value: the setters
  /// below assign it and the queries compare it for equality.
  uint8_t FunctionAttr;

  /// \brief An estimated static function frequency
  Scaled64 staticFuncFreq;

  /// \brief A flag to indicate whether this node has a subroutine call before
  /// expanding.
  bool CallingSubroutine;

  /// \brief A flag to indicate whether it is located in multiple kernels or units
  bool InMultipleUnit;

  /// \brief Whether this function (or a callee expanded into it) uses implicit arguments
  bool HasImplicitArg;

  /// \brief Entry frequency of the function, as set by setEntryFrequency()
  Scaled64 EntryFreq;

  /// \brief Per basic block frequencies of this function
  std::unordered_map<llvm::BasicBlock *, Scaled64> blockFreqs;

  /// \brief All functions directly called in this function, with the accumulated call weight.
  std::unordered_map<FunctionNode *, uint16_t> CalleeList;

  /// \brief All functions that call this function F, with the accumulated call weight.
  std::unordered_map<FunctionNode *, uint16_t> CallerList;

  // Copies of the pass options. They are given defaults so that a node queried
  // before setFlags() never reads an indeterminate value.
  bool EnableLeafCollapsing = false;
  bool EnableSizeContributionOptimization = false;
  bool StaticProfileGuidedTrimming = false;
  bool UseFrequencyInfoForSPGT = false;
  bool ForceInlineExternalFunctions = false;
  unsigned ControlInlineTinySize = 0;
  bool ForceInlineStackCallWithImplArg = false;
  bool ControlInlineImplicitArgs = false;
  unsigned SizeWeightForSPGT = 0;
  unsigned FrequencyWeightForSPGT = 0;

  /// \brief Copies the pass options into this node.
  void setFlags(bool EnableLC, bool EnableSCO, bool SPGT, bool UseFreqInfo, bool ForceInlineExtFun, unsigned TinySize,
                bool InlineStkCallWithImplArg, bool InlineImplArgs, unsigned SizeWeight, unsigned FreqWeight) {
    EnableLeafCollapsing = EnableLC;
    EnableSizeContributionOptimization = EnableSCO;
    StaticProfileGuidedTrimming = SPGT;
    UseFrequencyInfoForSPGT = UseFreqInfo;
    ForceInlineExternalFunctions = ForceInlineExtFun;
    ControlInlineTinySize = TinySize;
    ForceInlineStackCallWithImplArg = InlineStkCallWithImplArg;
    ControlInlineImplicitArgs = InlineImplArgs;
    SizeWeightForSPGT = SizeWeight;
    FrequencyWeightForSPGT = FreqWeight;
  }

  void setStaticFuncFreq(Scaled64 freq) { staticFuncFreq = freq; }

  Scaled64 getStaticFuncFreq() { return staticFuncFreq; }

  std::string getStaticFuncFreqStr() { return staticFuncFreq.toString(); }

  /// \brief return the size used for Static Profile Guided Trimming
  uint64_t getPotentialBodySize() const { return EnableLeafCollapsing ? SizeAfterCollapsing : InitialSize; }

  /// \brief Body size multiplied by the inline count; a count of zero is treated as one copy.
  uint64_t getSizeContribution() const {
    return Inline_cnt == 0 ? getPotentialBodySize() : static_cast<uint64_t>(Inline_cnt) * getPotentialBodySize();
  }

  uint64_t getSizeForTrimming() const {
    return EnableSizeContributionOptimization ? getSizeContribution() : getPotentialBodySize();
  }

  /// \brief Weight compared against the trimming threshold: the SPGT weight
  /// when frequency information is in use, otherwise just the size.
  Scaled64 getWeightForTrimming() {
    if (StaticProfileGuidedTrimming && UseFrequencyInfoForSPGT) {
      return getSPGTWeight(getSizeForTrimming(), staticFuncFreq, SizeWeightForSPGT, FrequencyWeightForSPGT);
    }
    return Scaled64::get(getSizeForTrimming());
  }

  /// \brief A node becomes a leaf when all called functions are expanded.
  bool isLeaf() const { return CalleeList.empty(); }

  /// \brief Add a caller or callee.
  // A caller may call the same callee multiple times, e.g. A->{B,B,B}: A->CalleeList(B,B,B), B->CallerList(A,A,A)
  void addCallee(FunctionNode *G, unsigned weight) {
    IGC_ASSERT(G);
    if (CalleeList.find(G) == CalleeList.end()) // First time added, Initialize it
      CalleeList[G] = 0;
    CalleeList[G] += weight;
    CallingSubroutine = true;
  }
  void addCaller(FunctionNode *G, unsigned weight) {
    IGC_ASSERT(G);
    if (CallerList.find(G) == CallerList.end()) // First time added, Initialize it
      CallerList[G] = 0;
    CallerList[G] += weight;
  }

  void setKernelEntry() { FunctionAttr = FA_KERNEL_ENTRY; }
  void setAddressTaken() { FunctionAttr = FA_ADDR_TAKEN; }
  void setForceInline() {
    IGC_ASSERT(FunctionAttr != FA_KERNEL_ENTRY &&
               FunctionAttr != FA_ADDR_TAKEN); // Can't force inline a kernel entry or address taken function
    FunctionAttr = FA_FORCE_INLINE;
  }
  void setTrimmed() {
    IGC_ASSERT(FunctionAttr == FA_BEST_EFFORT_INLINE); // Only best effort inline function can be trimmed
    FunctionAttr = FA_TRIMMED;
  }
  void unsetTrimmed() {
    IGC_ASSERT(FunctionAttr == FA_TRIMMED); // Only a trimmed function can be reverted to best effort inline
    FunctionAttr = FA_BEST_EFFORT_INLINE;
  }

  void setStackCall() {
    // Can't assign stack call to force inlined function, kernel entry,
    // address taken functions and functions that already assigned stack call
    IGC_ASSERT(FunctionAttr == FA_BEST_EFFORT_INLINE || FunctionAttr == FA_TRIMMED);
    FunctionAttr = FA_STACKCALL;
  }

  void setEntryFrequency(uint64_t digit, uint16_t scale) { EntryFreq = Scaled64(digit, scale); }
  Scaled64 getEntryFrequency() { return EntryFreq; }

  bool isEntryFunc() const { return FunctionAttr == FA_KERNEL_ENTRY; }
  bool isAddrTakenFunc() const { return FunctionAttr == FA_ADDR_TAKEN; }
  bool isTrimmed() const { return FunctionAttr == FA_TRIMMED; }
  bool isForcedInlined() const { return FunctionAttr == FA_FORCE_INLINE; }
  bool isBestEffortInline() const { return FunctionAttr == FA_BEST_EFFORT_INLINE; }
  bool hasNoCaller() const { return isAddrTakenFunc() || isEntryFunc(); }
  bool willBeInlined() const { return isBestEffortInline() || isForcedInlined(); }
  bool isStackCallAssigned() const { return FunctionAttr == FA_STACKCALL; }
  /// \brief The best effort inline or manually trimmed functions can be assigned stack call
  bool canAssignStackCall() const {
    return FunctionAttr == FA_BEST_EFFORT_INLINE || FunctionAttr == FA_TRIMMED;
  }

  /// \brief Classifies the function for trimming; returns one FUNCTION_TRAIT_FLAG_t value.
  /// \param thresholdForTrimming weight below which the function is reported as FT_LOWER_WEIGHT
  ///        (only consulted when static profile-guided trimming is enabled)
  uint16_t getFunctionTrait(Scaled64 thresholdForTrimming) {
    if (FunctionAttr != FA_BEST_EFFORT_INLINE) // Only best effort inline can be trimmed
      return FT_NOT_BEST_EFFORT;
    // to allow trimming functions called from other kernels, set the regkey to false
    if (ForceInlineExternalFunctions && InMultipleUnit)
      return FT_MUL_KERNEL;

    uint64_t tinySize = ControlInlineTinySize;

    if (getPotentialBodySize() < tinySize) // It's too small to trim
      return FT_TOO_TINY;

    if (StaticProfileGuidedTrimming) {
      if (getWeightForTrimming() < thresholdForTrimming) {
        return FT_LOWER_WEIGHT;
      } else {
        return FT_HIGHER_WEIGHT;
      }
    }

    return FT_BIG_ENOUGH;
  }

  std::string getFuncAttrStr() const {
    switch (FunctionAttr) {
    case FA_BEST_EFFORT_INLINE:
      return "Best effort innline";
    case FA_FORCE_INLINE:
      return "Force innline";
    case FA_TRIMMED:
      return "Trimmed";
    case FA_STACKCALL:
      return "Stack call";
    case FA_KERNEL_ENTRY:
      return "Kernel entry";
    case FA_ADDR_TAKEN:
      return "Address taken";
    default:
      return "Wrong value";
    }
    return "";
  }

  /// \brief Prints a one-line summary of this node through PrintTrimUnit.
  /// \param type    print level handed to PrintTrimUnit
  /// \param message text placed at the start of the line
  void dumpFuncInfo(uint16_t type, std::string message) {
    std::string dumpInfo = message + ", ";
    dumpInfo += F->getName().str();
    dumpInfo += ", Function Attribute: ";
    dumpInfo += getFuncAttrStr();
    dumpInfo += ", Function size: ";
    dumpInfo += std::to_string(InitialSize);
    if (EnableLeafCollapsing) {
      dumpInfo += ", Size after collapsing: ";
      dumpInfo += std::to_string(SizeAfterCollapsing);
    }
    if (EnableSizeContributionOptimization) {
      dumpInfo += ", Size contribution: ";
      dumpInfo += std::to_string(getSizeContribution());
    }
    if (UseFrequencyInfoForSPGT) {
      dumpInfo += ", Freq: ";
      dumpInfo += getStaticFuncFreqStr();
    }
    if (StaticProfileGuidedTrimming) {
      dumpInfo += ", Weight: ";
      dumpInfo += getWeightForTrimming().toString();
    }
    PrintTrimUnit(type, dumpInfo);
  }

  /// \brief Top down bfs to find the size of a compilation unit.
  /// Sums InitialSize over this node and every node reachable through callee
  /// edges, without crossing into callees that are stack call heads. The
  /// result is stored in UnitSize and returned.
  uint32_t updateUnitSize() {
    std::unordered_set<FunctionNode *> visit;
    std::deque<FunctionNode *> TopDownQueue;
    TopDownQueue.push_back(this);
    visit.insert(this);
    uint32_t total = 0;
    PrintFunctionSizeAnalysis(0x4, "Functions in the unit " << F->getName().str());
    while (!TopDownQueue.empty()) {
      FunctionNode *Node = TopDownQueue.front();
      PrintFunctionSizeAnalysis(0x4, Node->F->getName().str() << ": " << Node->InitialSize);
      TopDownQueue.pop_front();
      total += Node->InitialSize;
      for (auto &Callee : Node->CalleeList) {
        FunctionNode *calleeNode = Callee.first;
        if (visit.find(calleeNode) != visit.end() ||
            calleeNode->isStackCallAssigned()) // Already processed or head of stack call
          continue;
        visit.insert(calleeNode);
        TopDownQueue.push_back(calleeNode);
      }
    }
    return UnitSize = total;
  }

  /// \brief A single step to expand F: accounts for `callee` being expanded
  /// into this node by adding its expanded size, once per recorded call
  /// weight, to tmpSize.
  void expand(FunctionNode *callee) {
    // When the collapsed callee has implicit arguments
    // the node will have implicit arguments too
    // In this scenario, when ControlInlineImplicitArgs is set
    // the node should be inlined unconditionally so exempt from a stackcall and trimming target
    if (HasImplicitArg == false && callee->HasImplicitArg == true) {
      HasImplicitArg = true;
      PrintFunctionSizeAnalysis(0x4, "Func " << this->F->getName().str() << " expands to has implicit arg due to "
                                             << callee->F->getName().str());

      if (!hasNoCaller()) // Can't inline kernel entry or address taken functions
      {
        if (isStackCallAssigned()) { // When stackcall is assigned we need to determine based on the flag
          if (ForceInlineStackCallWithImplArg)
            setForceInline();
        } else if (ControlInlineImplicitArgs) { // Force inline ordinary functions with implicit arguments
          setForceInline();
        }
      }
    }
    uint32_t sizeIncrease = callee->ExpandedSize * CalleeList[callee];
    tmpSize += sizeIncrease;
  }
#if defined(_DEBUG)
  void print(raw_ostream &os);

  void dump() { print(llvm::errs()); }
#endif
};

} // namespace
#if defined(_DEBUG)

/// Debug dump: the function name and initial size, then one line per callee
/// ("--->>>") and one line per caller ("<<<---").
void FunctionNode::print(raw_ostream &os) {
  auto PrintEdges = [&os](const char *Arrow, const std::unordered_map<FunctionNode *, uint16_t> &Edges) {
    for (const auto &Edge : Edges)
      os << Arrow << Edge.first->F->getName() << "\n";
  };

  os << "Function: " << F->getName() << ", " << InitialSize << "\n";
  PrintEdges("--->>>", CalleeList);
  PrintEdges("<<<---", CallerList);
}
#endif

void EstimateFunctionSize::clear() {
  M = nullptr;
  for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
    auto Node = (FunctionNode *)I->second;
    delete Node;
  }
  ECG.clear();
  kernelEntries.clear();
  stackCallFuncs.clear();
  addressTakenFuncs.clear();
}

/// Tells whether the callee of \p CI is one of the builtins listed below,
/// i.e. a call that makes the caller use an implicit argument.
/// \pre CI.getCalledFunction() is non-null (it is dereferenced unconditionally).
bool EstimateFunctionSize::matchImplicitArg(CallInst &CI) {
  const StringRef CalleeName = CI.getCalledFunction()->getName();

  const StringRef ImplicitArgBuiltins[] = {
      GET_LOCAL_ID_X,         GET_LOCAL_ID_Y,           GET_LOCAL_ID_Z,         GET_GROUP_ID,
      GET_LOCAL_THREAD_ID,    GET_GLOBAL_OFFSET,        GET_GLOBAL_SIZE,        GET_LOCAL_SIZE,
      GET_WORK_DIM,           GET_NUM_GROUPS,           GET_ENQUEUED_LOCAL_SIZE, GET_STAGE_IN_GRID_ORIGIN,
      GET_STAGE_IN_GRID_SIZE, GET_SYNC_BUFFER,          GET_ASSERT_BUFFER,
  };

  bool IsImplicitArg = false;
  for (const StringRef &Builtin : ImplicitArgBuiltins) {
    if (CalleeName.equals(Builtin)) {
      IsImplicitArg = true;
      break;
    }
  }

  // The message is gated twice: by bit 0x40 of PrintControlKernelTotalSize and
  // by the print macro's own flag check.
  if (IsImplicitArg && (IGC_GET_FLAG_VALUE(PrintControlKernelTotalSize) & 0x40) != 0) {
    PrintFunctionSizeAnalysis(0x8, "Matched implicit arg " << CalleeName.str());
  }
  return IsImplicitArg;
}

// Visit a call instruction to determine if implicit args are used by the
// caller. Indirect calls (no statically known callee) are ignored.
//
// The flag is only ever set here, never cleared: assigning the per-call result
// directly would let a later, non-matching call erase a match found earlier in
// the same function, so only the last call instruction would count.
// NOTE(review): this relies on tmpHasImplicitArg being reset to false before
// each function is visited; confirm at the visit() call site.
void EstimateFunctionSize::visitCallInst(CallInst &CI) {
  if (!CI.getCalledFunction()) {
    return;
  }
  // Check for implicit arg function calls
  if (matchImplicitArg(CI)) {
    tmpHasImplicitArg = true;
  }
}

void EstimateFunctionSize::updateStaticFuncFreq() {
  DenseMap<Function *, ScaledNumber<uint64_t>> Counts;
  auto MayHaveIndirectCalls = [](Function &F) {
    for (auto *U : F.users()) {
      if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
        return true;
    }
    return false;
  };
  uint64_t InitialSyntheticCount = 10;
  uint64_t InlineSyntheticCount = 15;
  uint64_t ColdSyntheticCount = 5;
  for (Function &F : *M) {
    uint64_t InitialCount = InitialSyntheticCount;
    if (F.empty() || F.isDeclaration())
      continue;
    if (F.hasFnAttribute(llvm::Attribute::AlwaysInline) || F.hasFnAttribute(llvm::Attribute::InlineHint)) {
      // Use a higher value for inline functions to account for the fact that
      // these are usually beneficial to inline.
      InitialCount = InlineSyntheticCount;
    } else if (F.hasLocalLinkage() && !MayHaveIndirectCalls(F)) {
      // Local functions without inline hints get counts only through
      // propagation.
      InitialCount = 0;
    } else if (F.hasFnAttribute(llvm::Attribute::Cold) || F.hasFnAttribute(llvm::Attribute::NoInline)) {
      // Use a lower value for noinline and cold functions.
      InitialCount = ColdSyntheticCount;
    }
    Counts[&F] = Scaled64(InitialCount, 0);
  }
  // Edge includes information about the source. Hence ignore the first
  // parameter.
  auto GetCallSiteProfCount = [&](const CallGraphNode *, const CallGraphNode::CallRecord &Edge) {
    std::optional<Scaled64> Res = std::nullopt;
    if (!Edge.first)
      return IGCLLVM::makeLLVMOptional(Res);
    CallBase &CB = *cast<CallBase>(*Edge.first);
    Function *Caller = CB.getCaller();
    BasicBlock *CSBB = CB.getParent();
    // Now compute the callsite count from relative frequency and
    // entry count:
    Scaled64 EntryFreq = get<FunctionNode>(Caller)->getEntryFrequency();
    Scaled64 BBCount = get<FunctionNode>(Caller)->blockFreqs[CSBB];
    IGC_ASSERT(EntryFreq != 0);
    BBCount /= EntryFreq;
    BBCount *= Counts[Caller];
    return IGCLLVM::makeLLVMOptional(std::optional<Scaled64>(BBCount));
  };
  CallGraph CG(*M);
  // Propgate the entry counts on the callgraph.
  SyntheticCountsUtils<const CallGraph *>::propagate(&CG, GetCallSiteProfCount,
                                                     [&](const CallGraphNode *N, Scaled64 New) {
                                                       auto F = N->getFunction();
                                                       if (!F || F->isDeclaration())
                                                         return;
                                                       Counts[F] += New;
                                                     });

  for (auto &F : M->getFunctionList()) {
    if (F.empty())
      continue;
    FunctionNode *Node = get<FunctionNode>(&F);

    if (Counts.find(&F) != Counts.end())
      Node->setStaticFuncFreq(Counts[&F]);
  }
  return;
}

void EstimateFunctionSize::runStaticAnalysis() {
  // Analyze function frequencies from SyntheticCountsPropagation
  PrintStaticProfileGuidedKernelSizeReduction(
      0x1, "------------------Static analysis start------------------") for (auto &F : M->getFunctionList()) {
    if (F.empty())
      continue;
    auto &BFI = getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
    FunctionNode *Node = get<FunctionNode>(&F);
    Node->setEntryFrequency(BFI.getEntryFreq(), 0);

    for (auto &B : F)
      Node->blockFreqs[&B] = Scaled64(BFI.getBlockFreq(&B).getFrequency(), 0);
  }
  updateStaticFuncFreq();
  std::vector<Scaled64> freqLog;
  if (BlockFrequencySampling) { // Set basic blocks as the sample space
    for (auto &F : M->getFunctionList()) {
      if (F.empty())
        continue;
      FunctionNode *Node = get<FunctionNode>(&F);
      Scaled64 EntryFreq = Node->getEntryFrequency();
      PrintStaticProfileGuidedKernelSizeReduction(0x1, "Function frequency of "
                                                           << Node->F->getName().str() << ": "
                                                           << Node->getStaticFuncFreqStr()) for (auto &B : F) {
        Scaled64 BBCount = Node->blockFreqs[&B];
        BBCount /= EntryFreq;
        BBCount *= Node->getStaticFuncFreq();
        PrintStaticProfileGuidedKernelSizeReduction(0x1, "Block frequency of " << B.getName().str() << ": "
                                                                               << BBCount.toString())

            if (BBCount > 0) // Can't represent 0 in log scale so ignore, better idea?
            freqLog.push_back(BBCount);
      }
    }
  } else {
    for (auto &F : M->getFunctionList()) {
      if (F.empty())
        continue;
      FunctionNode *Node = get<FunctionNode>(&F);
      PrintStaticProfileGuidedKernelSizeReduction(
          0x1, "Function frequency of "
                   << Node->F->getName().str() << ": "
                   << Node->getStaticFuncFreqStr()) if (Node->getStaticFuncFreq() >
                                                        0) // Can't represent 0 in log scale so ignore, better idea?
          freqLog.push_back(Node->getStaticFuncFreq());
    }
  }

  if ((MetricForKernelSizeReduction & SP_NORMAL_DISTRIBUTION) != 0 &&
      !freqLog.empty()) { // When using a normal distribution. Ignore when there are no frequency data
    IGC_ASSERT(ParameterForColdFuncThreshold >= 0 && ParameterForColdFuncThreshold <= 30);
    // Find a threshold from a normal distribution
    std::sort(freqLog.begin(), freqLog.end()); // Sort frequency data
    std::vector<double> freqLogDbl;
    std::unordered_map<double, Scaled64> map_log10_to_scaled64;
    double log10_2 = std::log10(2);
    for (Scaled64 &val : freqLog) // transform into log10 scale
    {
      double logedVal = std::log10(val.getDigits()) + val.getScale() * log10_2;
      map_log10_to_scaled64[logedVal] = val;
      freqLogDbl.push_back(logedVal);
    }
    double sum_val = std::accumulate(freqLogDbl.begin(), freqLogDbl.end(), 0.0);
    double mean = sum_val / freqLogDbl.size();
    double sq_sum = std::inner_product(
        freqLogDbl.begin(), freqLogDbl.end(), freqLogDbl.begin(), 0.0,
        [](double const &x, double const &y) { return x + y; },
        [mean](double const &x, double const &y) { return (x - mean) * (y - mean); });
    double standard_deviation = std::sqrt(sq_sum / freqLogDbl.size());
    float C = (float)ParameterForColdFuncThreshold /
              10; // Since 1 STD is too wide in the majority case, we need to scale down
    double threshold_log10 = mean - C * standard_deviation;
    auto it_lower = std::lower_bound(freqLogDbl.begin(), freqLogDbl.end(), threshold_log10);
    if (it_lower == freqLogDbl.end())
      threshold_func_freq = freqLog.back();
    else
      threshold_func_freq = map_log10_to_scaled64[*it_lower];
    PrintStaticProfileGuidedKernelSizeReduction(0x1, "Metric: Normal distribution");
    PrintStaticProfileGuidedKernelSizeReduction(0x1, "Sample count: " << freqLogDbl.size());
    PrintStaticProfileGuidedKernelSizeReduction(0x1, "Execution frequency mean (Log10 scale): " << mean);
    PrintStaticProfileGuidedKernelSizeReduction(0x1, "Standard deviation (Log10 scale): " << standard_deviation);
    PrintStaticProfileGuidedKernelSizeReduction(0x1, "Execution frequency threshold with Constant(C) "
                                                         << C << ": " << threshold_func_freq.toString());
  } else if ((MetricForKernelSizeReduction & SP_LONGTAIL_DISTRIBUTION) != 0 &&
             !freqLog.empty()) { // When using a long-tail distribution. Ignore when there are no frequency data
    IGC_ASSERT(ParameterForColdFuncThreshold > 0 && ParameterForColdFuncThreshold <= 100);
    // Find a threshold from a long tail distribution
    uint32_t threshold_cold = (uint32_t)ParameterForColdFuncThreshold;
    uint32_t C_pos = freqLog.size() * threshold_cold / 100;
    std::nth_element(freqLog.begin(), freqLog.begin() + C_pos, freqLog.end(),
                     [](Scaled64 &x, Scaled64 &y) { return x < y; }); // Low C%
    threshold_func_freq = freqLog[C_pos];
    PrintStaticProfileGuidedKernelSizeReduction(0x1, "Metric: Long tail distribution");
    PrintStaticProfileGuidedKernelSizeReduction(0x1, "Low " << threshold_cold << "% pos: " << C_pos << " out of "
                                                            << freqLog.size());
    PrintStaticProfileGuidedKernelSizeReduction(0x1, "Execution frequency threshold: " << threshold_func_freq);
  } else if ((MetricForKernelSizeReduction & SP_AVERAGE_PERCENTAGE) != 0 &&
             !freqLog.empty()) { // When using a average C%
    Scaled64 sum_val = std::accumulate(freqLog.begin(), freqLog.end(), Scaled64::getZero());
    Scaled64 mean = sum_val / Scaled64::get(freqLog.size());
    Scaled64 C = Scaled64::get(ParameterForColdFuncThreshold) / Scaled64::get(10); // Scale down /10
    IGC_ASSERT(C > 0 && C <= 100);
    threshold_func_freq = mean * (C / Scaled64::get(100));
    PrintStaticProfileGuidedKernelSizeReduction(0x1, "Metric: Average%");
    PrintStaticProfileGuidedKernelSizeReduction(0x1, "Average threshold * " << C.toString()
                                                                            << "%: " << threshold_func_freq.toString());
  }

  unsigned sizeThreshold = ControlInlineTinySizeForSPGT;
  if (UseFrequencyInfoForSPGT) {
    thresholdForTrimming = getSPGTWeight(sizeThreshold, threshold_func_freq, SizeWeightForSPGT, FrequencyWeightForSPGT);
  } else {
    thresholdForTrimming = Scaled64::get(sizeThreshold); // If we don't want to use freq data,
                                                         // just use size only
  }

  PrintStaticProfileGuidedKernelSizeReduction(0x1, "------------------Static analysis end------------------\n") return;
}

// Estimate the total iteration count of every loop in F and store it in LoopIterCnts.
//
// Loops are visited in preorder, so the entry of a parent loop exists before any of its
// sub-loops is handled. For a loop with a loop-invariant backedge-taken count the estimate is
//   parent estimate * min(smallest non-zero constant trip count, MaxUnrollCountForFunctionSizeAnalysis)
// or parent estimate * 5 when no constant trip count is known.
//
// NOTE(review): a loop without a loop-invariant backedge-taken count is recorded with an
// estimate of exactly one, i.e. the parent's estimate is not carried over. Confirm that this
// is intended for nested loops.
void EstimateFunctionSize::estimateTotalLoopIteration(llvm::Function &F, LoopInfo *LI) {
  auto &SE = getAnalysis<ScalarEvolutionWrapperPass>(F).getSE();

  for (Loop *CurLoop : LI->getLoopsInPreorder()) {
    // Estimate inherited from the enclosing loop (one for a top-level loop).
    Scaled64 OuterCnt = Scaled64::getOne();
    if (Loop *Outer = CurLoop->getParentLoop()) {
      IGC_ASSERT(LoopIterCnts.find(Outer) != LoopIterCnts.end());
      OuterCnt = LoopIterCnts[Outer];
    }

    // Text appended to the debug output describing how the estimate was obtained.
    StringRef Origin = " Back edge count not available";

    if (!SE.hasLoopInvariantBackedgeTakenCount(CurLoop)) {
      LoopIterCnts[CurLoop] = Scaled64::getOne();
    } else {
      // Smallest non-zero constant trip count over all exiting blocks (0 = none known).
      unsigned MinTrip = 0;
      SmallVector<BasicBlock *, 8> Exits;
      CurLoop->getExitingBlocks(Exits);
      for (BasicBlock *Exit : Exits) {
        const unsigned TC = SE.getSmallConstantTripCount(CurLoop, Exit);
        if (TC != 0 && (MinTrip == 0 || TC < MinTrip))
          MinTrip = TC;
      }

      if (MinTrip != 0) {
        // Cap the trip count at the configured maximum unroll count.
        unsigned UnrollCap = MaxUnrollCountForFunctionSizeAnalysis;
        MinTrip = std::min(MinTrip, UnrollCap);
        LoopIterCnts[CurLoop] = OuterCnt * Scaled64::get(MinTrip);
        Origin = " Trip count available";
      } else {
        // TODO: a fixed count of 5 is assumed when the exact number is unknown.
        LoopIterCnts[CurLoop] = OuterCnt * Scaled64::get(5);
        Origin = " Upper bound available";
      }
    }

    PrintFunctionSizeAnalysis(0x2, "Loop " << CurLoop->getName().str()
                                           << ": Loop Count = " << LoopIterCnts[CurLoop].toString()
                                           << ", Parent Loop Count = " << OuterCnt.toString() << Origin);
  }
}

void EstimateFunctionSize::analyze() {
  auto getSize = [&](llvm::Function &F) {
    std::size_t Size = 0;
    for (auto &BB : F) {
      std::size_t BlkSize = IGCLLVM::sizeWithoutDebug(&BB);
      Size += BlkSize;
    }
    return Size;
  };

  auto getSizeWithLoopCnt = [&](llvm::Function &F, LoopInfo &LI) {
    std::size_t Size = 0;
    for (auto &BB : F) {
      std::size_t BlkSize = IGCLLVM::sizeWithoutDebug(&BB);
      Loop *L = LI.getLoopFor(&BB);
      if (L) {
        BlkSize = BlkSize * LoopIterCnts[L].toInt<size_t>();
      }
      Size += BlkSize;
    }
    return Size;
  };

  auto MdWrapper = getAnalysisIfAvailable<MetaDataUtilsWrapper>();
  auto pMdUtils = MdWrapper->getMetaDataUtils();
  // Initialize the data structure. find all noinline and stackcall properties
  for (auto &F : M->getFunctionList()) {
    if (F.empty())
      continue;
    FunctionNode *node = nullptr;
    if (LoopCountAwareTrimming) {
      auto &LI = getAnalysis<LoopInfoWrapperPass>(F).getLoopInfo();
      estimateTotalLoopIteration(F, &LI);
      size_t FuncSize = getSize(F);
      size_t FuncSizeWithLoopCnt = getSizeWithLoopCnt(F, LI);
      node = new FunctionNode(&F, FuncSizeWithLoopCnt);
      PrintFunctionSizeAnalysis(0x1, "Function " << F.getName().str() << " Original Size: " << FuncSize
                                                 << " Size with Loop Iter: " << FuncSizeWithLoopCnt);
    } else {
      node = new FunctionNode(&F, getSize(F));
    }
    node->setFlags(EnableLeafCollapsing, EnableSizeContributionOptimization, StaticProfileGuidedTrimming,
                   UseFrequencyInfoForSPGT, ForceInlineExternalFunctions, ControlInlineTinySize,
                   ForceInlineStackCallWithImplArg, ControlInlineImplicitArgs, SizeWeightForSPGT,
                   FrequencyWeightForSPGT);
    bool isForceTrim = false;
    if (!SelectiveTrimming.empty()) {
      std::string functionToTrim = SelectiveTrimming;
      if (F.getName().str() == functionToTrim) {
        isForceTrim = true;
        PrintFunctionSizeAnalysis(0x1, "Force trimming (No inline) " << functionToTrim);
      }
    }
    ECG[&F] = node;
    if (isEntryFunc(pMdUtils, node->F)) { /// Entry function
      node->setKernelEntry();
      kernelEntries.push_back(node);
    } else if (F.hasFnAttribute("igc-force-stackcall")) {
      node->setStackCall();
    } else if (F.hasFnAttribute(llvm::Attribute::NoInline) || isForceTrim) {
      node->setTrimmed();
    } else if (F.hasFnAttribute(llvm::Attribute::AlwaysInline)) {
      node->setForceInline();
    }
    // Otherwise, the function attribute to be assigned is best effort
  }

  // Visit all call instructions and populate CG.
  for (auto &F : M->getFunctionList()) {
    if (F.empty())
      continue;
    FunctionNode *Node = get<FunctionNode>(&F);
    auto &LI = getAnalysis<LoopInfoWrapperPass>(F).getLoopInfo();
    for (auto U : F.users()) {
      // Other users (like bitcast/store) are ignored.
      if (auto *CI = dyn_cast<CallInst>(U)) {
        // G calls F, or G --> F
        BasicBlock *BB = CI->getParent();
        Function *G = BB->getParent();
        FunctionNode *GN = get<FunctionNode>(G);
        unsigned LoopCnt = 1;
        if (LoopCountAwareTrimming) {
          Loop *L = LI.getLoopFor(BB);
          if (L) {
            IGC_ASSERT(LoopIterCnts.find(L) != LoopIterCnts.end());
            LoopCnt = LoopIterCnts[L].toInt<size_t>();
          }
        }
        GN->addCallee(Node, LoopCnt);
        Node->addCaller(GN, LoopCnt);
      }
    }
  }

  // Find all address taken functions
  for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
    FunctionNode *Node = (FunctionNode *)I->second;
    // Address taken functions neither have callers nor is an entry function
    if (Node->CallerList.empty() && !Node->isEntryFunc())
      Node->setAddressTaken();
  }

  bool needImplAnalysis = ControlInlineImplicitArgs || ForceInlineStackCallWithImplArg;
  // check functions and mark those that use implicit args.
  PrintFunctionSizeAnalysis(0x1, "--------------------------Function size analysis start--------------------------");
  if (needImplAnalysis)
    performImplArgsAnalysis();

  // Update expanded and static unit size and propagate implicit argument information which might cancel some stackcalls
  for (void *entry : kernelEntries) {
    FunctionNode *kernelEntry = (FunctionNode *)entry;
    updateExpandedUnitSize(kernelEntry->F, true);
    kernelEntry->updateUnitSize();
    PrintFunctionSizeAnalysis(0x1, "Unit size (kernel entry) " << kernelEntry->F->getName().str() << ": "
                                                               << kernelEntry->UnitSize);
    PrintFunctionSizeAnalysis(0x1, "Expanded unit size (kernel entry) " << kernelEntry->F->getName().str() << ": "
                                                                        << kernelEntry->ExpandedSize);
  }

  // Find all survived stackcalls and address taken functions and update unit sizes
  for (auto I = ECG.begin(), E = ECG.end(); I != E; ++I) {
    FunctionNode *Node = (FunctionNode *)I->second;
    if (Node->isStackCallAssigned()) {
      stackCallFuncs.push_back(Node);
      Node->updateUnitSize();
      PrintFunctionSizeAnalysis(0x1, "Unit size (stack call) " << Node->F->getName().str() << ": " << Node->UnitSize);
    } else if (Node->isAddrTakenFunc()) {
      addressTakenFuncs.push_back(Node);
      updateExpandedUnitSize(Node->F, true);
      Node->updateUnitSize();
      PrintFunctionSizeAnalysis(0x1,
                                "Unit size (address taken) " << Node->F->getName().str() << ": " << Node->UnitSize);
      PrintFunctionSizeAnalysis(0x1, "Expanded unit size (address taken) " << Node->F->getName().str() << ": "
                                                                           << Node->ExpandedSize);
    }
  }
  PrintFunctionSizeAnalysis(0x1, "Function count= " << ECG.size());
  PrintFunctionSizeAnalysis(0x1, "Kernel count= " << kernelEntries.size());
  PrintFunctionSizeAnalysis(0x1, "Manual stack call count= " << stackCallFuncs.size());
  PrintFunctionSizeAnalysis(0x1, "Address taken function call count= " << addressTakenFuncs.size());
  PrintFunctionSizeAnalysis(0x1, "--------------------------Function size analysis end--------------------------\n");
  return;
}

// Visit every function of the call graph and record whether it uses an implicit argument
// (the visitor reports this through tmpHasImplicitArg). Functions that do, and that have at
// least one caller, are force-inlined depending on the configuration:
//   - stack call functions only when ForceInlineStackCallWithImplArg is set,
//   - all other functions only when ControlInlineImplicitArgs is set.
void EstimateFunctionSize::performImplArgsAnalysis() {
  // Running number used only in the debug output; it keeps counting across invocations.
  static int cnt = 0;

  for (auto &Entry : ECG) {
    FunctionNode *Node = (FunctionNode *)Entry.second;
    IGC_ASSERT(Node);

    tmpHasImplicitArg = false;
    visit(Node->F);
    if (!tmpHasImplicitArg) // The function doesn't have an implicit argument: skip
      continue;

    Node->HasImplicitArg = true;
    const char *Name = Node->isLeaf() ? "Leaf" : "nonLeaf";
    PrintFunctionSizeAnalysis(0x8, Name << " Func " << ++cnt << " " << Node->F->getName().str()
                                        << " calls implicit args so HasImplicitArg");

    // Can't inline kernel entry or address taken functions
    if (Node->hasNoCaller())
      continue;

    if (Node->isStackCallAssigned()) {
      // For an assigned stack call the dedicated flag decides.
      if (ForceInlineStackCallWithImplArg)
        Node->setForceInline();
    } else if (ControlInlineImplicitArgs) {
      // Force inline ordinary functions with implicit arguments
      Node->setForceInline();
    }
  }
}

/// \brief Return the estimated maximal function size after complete inlining.
std::size_t EstimateFunctionSize::getMaxExpandedSize() const {
  uint32_t MaxSize = 0;
  for (auto I : kernelEntries) {
    FunctionNode *Node = (FunctionNode *)I;
    MaxSize = std::max(MaxSize, Node->ExpandedSize);
  }
  for (auto I : addressTakenFuncs) {
    FunctionNode *Node = (FunctionNode *)I;
    MaxSize = std::max(MaxSize, Node->ExpandedSize);
  }
  return MaxSize;
}

void EstimateFunctionSize::checkSubroutine() {
  auto CGW = getAnalysisIfAvailable<CodeGenContextWrapper>();
  if (!CGW)
    return;

  EnableSubroutine = true;
  CodeGenContext *pContext = CGW->getCodeGenContext();
  if (pContext->type != ShaderType::OPENCL_SHADER && pContext->type != ShaderType::COMPUTE_SHADER &&
      pContext->type != ShaderType::RAYTRACING_SHADER)
    EnableSubroutine = false;

  if (EnableSubroutine) {
    uint32_t subroutineThreshold = SubroutineThreshold;
    uint32_t expandedMaxSize = getMaxExpandedSize();

    if (AL != AL_Module) // at the second call of EstimationFucntionSize, halve the threshold
      subroutineThreshold = subroutineThreshold >> 1;

    if (expandedMaxSize <= subroutineThreshold) {
      PrintTrimUnit(0x1, "No need to reduce the kernel size. (The max expanded kernel size is small) "
                             << expandedMaxSize << " < " << subroutineThreshold) if (!HasRecursion) EnableSubroutine =
          false;
    } else if (AL == AL_Module &&
               IGC_IS_FLAG_DISABLED(DisableAddingAlwaysAttribute)) { // kernel trimming and partitioning only kick in at
                                                                     // the first EstimationFunctionSize
      // Analyze Function/Block frequencies

      if (StaticProfileGuidedPartitioning ||
          StaticProfileGuidedTrimming) // Either a normal or long-tail distribution is enabled
        runStaticAnalysis();

      // If the max unit size exceeds threshold, do partitioning
      if (PartitionUnit) {
        PrintPartitionUnit(0x1, "--------------------------Partition unit start--------------------------");
        uint32_t unitThreshold = UnitSizeThreshold;
        uint32_t maxUnitSize = getMaxUnitSize();
        if (maxUnitSize > unitThreshold) {
          PrintPartitionUnit(0x1, "Max unit size " << maxUnitSize << " is larger than the threshold (to partition) "
                                                   << unitThreshold) partitionKernel();
        } else {
          PrintPartitionUnit(0x1, "Max unit size " << maxUnitSize
                                                   << " is smaller than the threshold (No partitioning needed) "
                                                   << unitThreshold)
        }
        PrintPartitionUnit(0x1, "--------------------------Partition unit end--------------------------\n");
      }

      PrintTrimUnit(0x1, "Need to reduce the kernel size. (The max expanded kernel size is large) "
                             << expandedMaxSize << " > " << subroutineThreshold)
          PrintTrimUnit(
              0x1,
              "-----------------------------Trimming start-----------------------------") if (ControlKernelTotalSize) {
        reduceKernelSize();
      }
      else if (ControlUnitSize) {
        reduceCompilationUnitSize();
      }
      PrintTrimUnit(0x1, "-----------------------------Trimming end-----------------------------\n")
    }
  }
  IGC_ASSERT(!HasRecursion || EnableSubroutine);
  return;
}

std::size_t EstimateFunctionSize::getExpandedSize(const Function *F) const {
  // IGC_ASSERT(IGC_IS_FLAG_DISABLED(ControlKernelTotalSize));
  auto I = ECG.find((Function *)F);
  if (I != ECG.end()) {
    FunctionNode *Node = (FunctionNode *)I->second;
    IGC_ASSERT(F == Node->F);
    return Node->ExpandedSize;
  }
  return std::numeric_limits<std::size_t>::max();
}

bool EstimateFunctionSize::onlyCalledOnce(const Function *F, const Function *CallerF) {
  // IGC_ASSERT(IGC_IS_FLAG_DISABLED(ControlKernelTotalSize));
  auto I = ECG.find((Function *)F);
  if (I != ECG.end()) {
    auto *Node = (FunctionNode *)I->second;
    IGC_ASSERT(F == Node->F);
    // one call-site and not a recursion
    if (Node->CallerList.size() == 1 && Node->CallerList.begin()->second == 1 &&
        Node->CallerList.begin()->first != Node) {
      return true;
    }
    // OpenCL specific, called once by passed kernel
    auto *MdWrapper = getAnalysisIfAvailable<MetaDataUtilsWrapper>();
    if (MdWrapper) {
      auto *pMdUtils = MdWrapper->getMetaDataUtils();
      for (const auto &[Caller, CallCount] : Node->CallerList) {
        if (CallCount > 1 && Caller->F == CallerF) {
          return false;
        }
        if (!isEntryFunc(pMdUtils, Caller->F)) {
          return false;
        }
      }
      return true;
    }
  }
  return false;
}

void EstimateFunctionSize::reduceKernelSize() {
  uint32_t threshold = KernelTotalSizeThreshold;
  llvm::SmallVector<void *, 64> unitHeads;
  for (auto node : kernelEntries)
    unitHeads.push_back((FunctionNode *)node);
  for (auto node : addressTakenFuncs)
    unitHeads.push_back((FunctionNode *)node);
  trimCompilationUnit(unitHeads, threshold, true);
  return;
}

// Return the isTrimmed() state of the call graph node that belongs to F.
bool EstimateFunctionSize::isTrimmedFunction(llvm::Function *F) {
  FunctionNode *Node = get<FunctionNode>(F);
  return Node->isTrimmed();
}

// Return true when at least one kernel entry has an ExpandedSize above
// KernelTotalSizeThreshold * LargeKernelThresholdMultiplier.
bool EstimateFunctionSize::isLargeKernelThresholdExceeded() const {
  return std::any_of(kernelEntries.begin(), kernelEntries.end(), [&](auto *entry) {
    auto *kernelNode = (FunctionNode *)entry;
    return kernelNode->ExpandedSize > KernelTotalSizeThreshold * LargeKernelThresholdMultiplier;
  });
}

// Prepare a bottom-up (reverse topological) traversal of the kernel/unit headed by `root`.
//
// FunctionsInKernel maps every function node of the unit to the number of callee edges that
// still have to be processed; it also serves as the membership test for the unit.
// BottomUpQueue receives the nodes whose counter is zero, i.e. the leaves from which the
// traversal starts. As a side effect tmpSize of every visited node is reset to InitialSize.
// Unless ignoreStackCallBoundary is set, callees that are assigned stack calls are treated as
// heads of other units: they are not entered and their edge is dropped from the counter.
void EstimateFunctionSize::initializeTopologicalVisit(Function *root,
                                                      std::unordered_map<void *, uint32_t> &FunctionsInKernel,
                                                      std::deque<void *> &BottomUpQueue, bool ignoreStackCallBoundary) {
  FunctionNode *head = get<FunctionNode>(root);
  std::deque<FunctionNode *> worklist;
  worklist.push_back(head);
  FunctionsInKernel[head] = head->CalleeList.size();

  // Breadth-first walk from the head towards the leaves.
  while (!worklist.empty()) {
    FunctionNode *cur = worklist.front();
    worklist.pop_front();
    cur->tmpSize = cur->InitialSize;

    for (auto &edge : cur->CalleeList) {
      FunctionNode *next = edge.first;
      if (FunctionsInKernel.count(next) != 0) // already part of the unit
        continue;

      const bool isOtherUnitHead = !ignoreStackCallBoundary && next->isStackCallAssigned();
      if (isOtherUnitHead) {
        FunctionsInKernel[cur] -= 1; // Ignore different compilation unit
        continue;
      }

      FunctionsInKernel[next] = next->CalleeList.size(); // Update the number of edges to callees
      worklist.push_back(next);
    }

    // No children, or all children are compilation unit heads: leaf node
    if (FunctionsInKernel[cur] == 0)
      BottomUpQueue.push_back(cur);
  }
}

// Compute a weight for the call graph reachable from `root`.
//
// Every reachable node is visited once. The result is
//   S * S * Freq
// where S is the sum of getSizeContribution()^2 over all visited nodes and Freq is the sum of
// getStaticFuncFreq() over the visited nodes that will not be inlined.
llvm::ScaledNumber<uint64_t> EstimateFunctionSize::calculateTotalWeight(Function *root) {
  FunctionNode *head = get<FunctionNode>(root);

  std::unordered_set<void *> seen;
  std::deque<void *> pending;
  seen.insert(head);
  pending.push_back(head);

  Scaled64 sizeSqSum = Scaled64::getZero();
  Scaled64 subroutineFreqSum = Scaled64::getZero();

  while (!pending.empty()) {
    auto *cur = static_cast<FunctionNode *>(pending.front());
    pending.pop_front();

    sizeSqSum += Scaled64::get(cur->getSizeContribution() * cur->getSizeContribution());
    if (!cur->willBeInlined())
      subroutineFreqSum += cur->getStaticFuncFreq();

    for (auto &edge : cur->CalleeList) {
      FunctionNode *next = edge.first;
      // insert() reports whether the callee is seen for the first time.
      if (seen.insert(next).second)
        pending.push_back(next);
    }
  }
  return sizeSqSum * sizeSqSum * subroutineFreqSum;
}

// Recompute Inline_cnt, the number of times a function will be inlined, for every function
// reachable from `root`.
//
// Pass 1 walks the graph top-down, resets Inline_cnt and counts for each function how many
// caller edges it has inside this kernel. Pass 2 walks again from the root and enqueues a
// callee only after its last caller edge has been consumed. For every edge whose callee will
// be inlined, the callee's Inline_cnt grows by the edge's call count multiplied by the
// caller's Inline_cnt (a caller count of 0 is treated as 1).
void EstimateFunctionSize::updateInlineCnt(Function *root) {
  FunctionNode *head = get<FunctionNode>(root);

  // Number of not yet processed caller edges of a function within the kernel boundary.
  std::unordered_map<void *, uint32_t> pendingCallers;
  pendingCallers[head] = 0;

  std::unordered_set<void *> seen;
  seen.insert(head);

  std::deque<void *> worklist;
  worklist.push_back(head);

  // Pass 1: initialization for the topological traversal below.
  while (!worklist.empty()) {
    auto *cur = static_cast<FunctionNode *>(worklist.front());
    worklist.pop_front();
    cur->Inline_cnt = 0;
    for (auto &edge : cur->CalleeList) {
      FunctionNode *callee = edge.first;
      // operator[] creates a missing counter with the value 0 before it is incremented.
      pendingCallers[callee] += 1;
      if (seen.insert(callee).second)
        worklist.push_back(callee);
    }
  }

  // Pass 2: propagate inline counts from callers to callees.
  worklist.push_back(head);
  while (!worklist.empty()) {
    auto *cur = static_cast<FunctionNode *>(worklist.front());
    worklist.pop_front();
    for (auto &edge : cur->CalleeList) {
      FunctionNode *callee = edge.first;
      uint16_t callCnt = edge.second;

      uint32_t &remaining = pendingCallers[callee];
      IGC_ASSERT(remaining != 0);
      remaining -= 1;

      if (callee->willBeInlined())
        callee->Inline_cnt += callCnt * (cur->Inline_cnt == 0 ? 1 : cur->Inline_cnt);
      if (remaining == 0)
        worklist.push_back(callee);
    }
  }
}

// This function compute the size of each function when must-be-inlined functions are all inlined
// must-be-inlined functions are two kinds: 1) have force-inline attribute, 2) small leaf functions
// Functions with those two kinds should be inlined no matter what the reason is.
// When all small leaf functions are inlined and collapsed, there may be a set of new leaf functions
// So, the algorithm repeat collapsing small leaf functions until only large leaf functions are left
void EstimateFunctionSize::UpdateSizeAfterCollapsing(std::deque<void *> &nodesToProcess,
                                                     std::unordered_set<void *> &funcsInKernel) {
  for (auto n : funcsInKernel) {
    // Initialize the size after inlining
    FunctionNode *Node = (FunctionNode *)n;
    Node->SizeAfterCollapsing = Node->InitialSize;
  }
  std::unordered_map<FunctionNode *, uint16_t> remainingCallee;
  std::unordered_set<FunctionNode *> hasCalleesAfterInline;

  while (!nodesToProcess.empty()) {
    FunctionNode *Node = (FunctionNode *)nodesToProcess.front();
    nodesToProcess.pop_front();
    bool hasCallee = hasCalleesAfterInline.find(Node) != hasCalleesAfterInline.end();
    if (Node->willBeInlined() && !hasCallee && Node->SizeAfterCollapsing < ControlInlineTinySizeForSPGT) {
      if (!Node->isForcedInlined()) {
        PrintTrimUnit(0x8, "Small leaf functions should always be inlined"
                               << Node->F->getName().str() << ", Size after Inline: " << Node->SizeAfterCollapsing);
        Node->setForceInline(); // If the node is supposed to have no callee in the end and small size, it should be
                                // inlined
      }
    }

    for (const auto &c : Node->CallerList) {
      FunctionNode *caller = c.first;
      uint16_t call_cnt = c.second;
      if (funcsInKernel.find(caller) ==
          funcsInKernel.end()) // This caller must not be in the currently processing kernel
        continue;

      if (remainingCallee.find(caller) == remainingCallee.end())
        remainingCallee[caller] = caller->CalleeList.size();
      remainingCallee[caller] -= 1;

      if (remainingCallee[caller] == 0)
        nodesToProcess.push_back((FunctionNode *)caller);

      if (Node->isForcedInlined()) { // Will be inlined in any case
        caller->SizeAfterCollapsing += Node->SizeAfterCollapsing * call_cnt;
        if (hasCallee) // Fucntion that already has force inline might have callee
          hasCalleesAfterInline.insert(caller);
      } else { // Otherwise we don't know, so conservatively mark it having callees
        hasCalleesAfterInline.insert(caller);
      }
    }
  }
  return;
}

// Compute the total size of the unit headed by F when all to-be-inlined functions are
// expanded into their callers.
//
// The unit is traversed bottom-up starting from its leaves. Each node's ExpandedSize is set
// from its tmpSize; a node that will be inlined is expanded into each caller inside the unit,
// while a node that will not be inlined contributes its ExpandedSize to the unit total.
// Nodes left unprocessed at the end indicate a cycle and set HasRecursion.
// The total is stored in the head's ExpandedSize and returned.
uint32_t EstimateFunctionSize::updateExpandedUnitSize(Function *F, bool ignoreStackCallBoundary) {
  FunctionNode *head = get<FunctionNode>(F);
  std::unordered_map<void *, uint32_t> pendingCallees; // unprocessed callee edges per node
  std::deque<void *> ready;                            // nodes whose callees are all processed
  initializeTopologicalVisit(head->F, pendingCallees, ready, ignoreStackCallBoundary);

  uint32_t expandedTotal = 0;
  while (!ready.empty()) {
    auto *cur = static_cast<FunctionNode *>(ready.front());
    ready.pop_front();
    IGC_ASSERT(pendingCallees[cur] == 0);
    pendingCallees.erase(cur);

    cur->ExpandedSize = cur->tmpSize; // Update the size of an expanded chunk
    if (!cur->willBeInlined()) {
      expandedTotal += cur->ExpandedSize;
      PrintTrimUnit(0x10, "Expansion stop at " << cur->F->getName().str() << ", Attribute: " << cur->getFuncAttrStr()
                                               << ", Chunck size: " << cur->ExpandedSize
                                               << ", Total chunck size: " << expandedTotal);
    }

    for (const auto &edge : cur->CallerList) {
      FunctionNode *caller = edge.first;
      auto slot = pendingCallees.find(caller);
      if (slot == pendingCallees.end()) {
        // The caller is not tracked in this unit: the node is shared with another unit.
        cur->InMultipleUnit = true;
        continue;
      }

      slot->second -= 1;
      if (slot->second == 0)
        ready.push_back(caller);
      if (cur->willBeInlined())
        caller->expand(cur); // collapse and update tmpSize of the caller
    }
  }

  // Has recursion
  if (!pendingCallees.empty())
    HasRecursion = true;

  PrintTrimUnit(0x10, "Final expanded size of " << head->F->getName().str() << ": " << expandedTotal);
  head->ExpandedSize = expandedTotal;
  return expandedTotal;
}

// Partition the unit headed by F using a bottom-up heuristic.
//
// The functions of the unit are visited from the leaves towards F (stack-call boundaries are
// respected). A function becomes a new stack call, and thereby the head of its own unit, when
// it can be assigned a stack call, its unit size exceeds UnitSizeThreshold, and its static
// function frequency is below threshold_func_freq. Otherwise its unit size is added to the
// tmpSize of each of its callers inside the unit.
//
// \param F             head of the unit to partition.
// \param stackCall_cnt incremented once for every function newly marked as stack call.
// \return the largest unit size seen among the unit head and the newly created stack calls.
uint32_t EstimateFunctionSize::bottomUpHeuristic(Function *F, uint32_t &stackCall_cnt) {
  uint32_t threshold = UnitSizeThreshold;
  std::deque<void *> BottomUpQueue;
  std::unordered_map<void *, uint32_t> FunctionsInUnit; // Set of functions in the boundary of a kernel. Record
                                                        // unprocessed callee counter for topological sort.
  initializeTopologicalVisit(F, FunctionsInUnit, BottomUpQueue, false);
  FunctionNode *unitHeader = get<FunctionNode>(F);
  uint32_t max_unit_size = 0;
  while (!BottomUpQueue.empty()) {
    FunctionNode *Node = (FunctionNode *)BottomUpQueue.front();
    BottomUpQueue.pop_front();
    // A node is only queued after all of its callees inside the unit were processed.
    IGC_ASSERT(FunctionsInUnit[Node] == 0);
    FunctionsInUnit.erase(Node);
    Node->UnitSize = Node->tmpSize; // Update the size

    if (Node == unitHeader) // The last node to process is the unit header
    {
      max_unit_size = std::max(max_unit_size, Node->updateUnitSize());
      continue;
    }

    // The conditions are evaluated left to right with short-circuiting: updateUnitSize() is
    // only called when the accumulated UnitSize already exceeds the threshold.
    // NOTE(review): updateUnitSize() presumably recomputes the exact unit size and stores it
    // in UnitSize - confirm against its definition.
    bool beStackCall = Node->canAssignStackCall() && Node->UnitSize > threshold && Node->updateUnitSize() > threshold &&
                       Node->getStaticFuncFreq() < threshold_func_freq;

    if (beStackCall) {
      PrintPartitionUnit(0x4, "Stack call marked " << Node->F->getName().str() << " Unit size: " << Node->UnitSize
                                                   << " > Threshold " << threshold
                                                   << " Function frequency: " << Node->getStaticFuncFreqStr() << " < "
                                                   << threshold_func_freq.toString())
          stackCallFuncs.push_back(Node); // We have a new unit head
      Node->setStackCall();
      max_unit_size = std::max(max_unit_size, Node->UnitSize);
      stackCall_cnt += 1;
    } else {
      // Only diagnostics: report which of the conditions above prevented the stack call.
      // Note that the second branch calls updateUnitSize() again when UnitSize > threshold.
      if (!Node->canAssignStackCall()) {
        PrintPartitionUnit(0x4, "Stack call not marked: not best effort or trimmed " << Node->F->getName().str())
      } else if (Node->UnitSize <= threshold || Node->updateUnitSize() <= threshold) {
        PrintPartitionUnit(0x4, "Stack call not marked: unit size too small " << Node->F->getName().str())
      } else {
        PrintPartitionUnit(0x4, "Stack call not marked: too many function frequencies "
                                    << Node->getStaticFuncFreqStr() << " > " << threshold_func_freq.toString() << " "
                                    << Node->F->getName().str())
      }
    }

    for (const auto &c : Node->CallerList) {
      FunctionNode *caller = c.first;
      if (FunctionsInUnit.find(caller) == FunctionsInUnit.end()) // The caller is in another kernel, skip
        continue;
      FunctionsInUnit[caller] -= 1;
      if (FunctionsInUnit[caller] == 0) // All callees of the caller are processed: become leaf.
        BottomUpQueue.push_back(caller);
      // A node that stays in this unit adds its size to the caller; a new stack call does not.
      if (!beStackCall)
        caller->tmpSize += Node->UnitSize;
    }
  }
  return max_unit_size;
}

// For all function F : F->Us = size(F), F->U# = 0 // unit size and unit number
// For each kernel K
//     kernelSize = K->UnitSize // O(C)
//     IF(kernelSize > T)
//         workList = ReverseTopoOrderList(K)  // Bottom up traverse
//         WHILE(worklist not empty) // O(N)
//             remove F from worklist
//             //F->Us might be overestimated due to overcounting issue -> recompute F->Us to find the actual size
//             IF(F->Us > T || recompute(F->Us) > T) {   // recompute(F->Us): O(N) only when F->Us is larger than T
//                 mark F as stackcall;
//                 Add F to end of headList;
//                 continue;
//             }
//             Foreach F->callers P{ P->Us += F->Us; }
//         ENDWHILE
//     ENDIF
// ENDFOR
void EstimateFunctionSize::partitionKernel() {
  uint32_t threshold = UnitSizeThreshold;
  uint32_t max_unit_size = 0;
  uint32_t stackCall_cnt = 0;

  // Iterate over kernel
  llvm::SmallVector<void *, 64> unitHeads;
  for (auto node : kernelEntries)
    unitHeads.push_back((FunctionNode *)node);
  for (auto node : stackCallFuncs)
    unitHeads.push_back((FunctionNode *)node);
  for (auto node : addressTakenFuncs)
    unitHeads.push_back((FunctionNode *)node);

  for (auto node : unitHeads) {
    FunctionNode *UnitHead = (FunctionNode *)node;
    if (UnitHead->UnitSize <= threshold) // Unit size is within threshold, skip
    {
      max_unit_size = std::max(max_unit_size, UnitHead->UnitSize);
      continue;
    }
    PrintPartitionUnit(0x2, "Partition Kernel " << UnitHead->F->getName().str()
                                                << " Original Unit Size: " << UnitHead->UnitSize)
        uint32_t size_after_partition = bottomUpHeuristic(UnitHead->F, stackCall_cnt);
    max_unit_size = std::max(max_unit_size, size_after_partition);
    PrintPartitionUnit(0x2, "Unit size after partitioning: " << size_after_partition)
  }
  float threshold_err = (float)(max_unit_size - threshold) / threshold * 100;
  PrintPartitionUnit(0x2, "Max unit size: " << max_unit_size << " Threshold Error Rate: " << threshold_err << "%");
  PrintPartitionUnit(0x2, "Stack call cnt: " << stackCall_cnt);
  return;
}

// Work same as reduceKernel except for stackcall functions
void EstimateFunctionSize::reduceCompilationUnitSize() {
  uint32_t threshold = ExpandedUnitSizeThreshold;
  llvm::SmallVector<void *, 64> unitHeads;
  for (auto node : kernelEntries)
    unitHeads.push_back((FunctionNode *)node);
  for (auto node : stackCallFuncs)
    unitHeads.push_back((FunctionNode *)node);
  for (auto node : addressTakenFuncs)
    unitHeads.push_back((FunctionNode *)node);

  trimCompilationUnit(unitHeads, threshold, false);
  return;
}

// Traverse the unit headed by `root` top-down and collect the functions that meet the
// trimming criteria.
//
// \param root                    head of the kernel/unit to inspect.
// \param trimming_pool           receives the functions that are good to trim. With
//                                EnableGreedyTrimming its previous content is replaced by all
//                                best-effort-inline functions of the unit; otherwise
//                                candidates are appended.
// \param tiny_fn_trimming_pool   receives functions classified as too tiny (not used with
//                                EnableGreedyTrimming).
// \param ignoreStackCallBoundary when false, callees that are assigned stack calls are not
//                                entered.
// \param func_cnt                increased by the number of functions visited.
//
// Depending on the configuration the inline counts (EnableSizeContributionOptimization) and
// the sizes after leaf collapsing (EnableLeafCollapsing) are refreshed before classification.
void EstimateFunctionSize::getFunctionsToTrim(llvm::Function *root, llvm::SmallVector<void *, 64> &trimming_pool,
                                              llvm::SmallVector<void *, 64> &tiny_fn_trimming_pool,
                                              bool ignoreStackCallBoundary, uint32_t &func_cnt) {
  FunctionNode *unitHead = get<FunctionNode>(root);
  std::unordered_set<void *> visited;
  std::deque<FunctionNode *> TopDownQueue;
  TopDownQueue.push_back(unitHead);
  visited.insert((void *)unitHead);

  SmallVector<FunctionNode *, 64> funcsInKernel;
  uint64_t tinySizeThreshold = ControlInlineTinySize;

  // Functions without any callee; starting points for leaf collapsing.
  std::deque<void *> bottomUpQueue;

  // Profile function information in the kernel boundary
  while (!TopDownQueue.empty()) {
    FunctionNode *Node = TopDownQueue.front();
    TopDownQueue.pop_front();
    for (auto &Callee : Node->CalleeList) {
      FunctionNode *calleeNode = Callee.first;
      if (visited.find((void *)calleeNode) != visited.end() ||
          (!ignoreStackCallBoundary && calleeNode->isStackCallAssigned()))
        continue;
      visited.insert((void *)calleeNode);
      TopDownQueue.push_back(calleeNode);
    }

    funcsInKernel.push_back(Node);
    if (Node->CalleeList.empty())
      bottomUpQueue.push_back((void *)Node);
  }
  func_cnt += visited.size();

  if (EnableSizeContributionOptimization)
    updateInlineCnt(root);
  if (EnableLeafCollapsing)
    UpdateSizeAfterCollapsing(bottomUpQueue, visited);

  if (EnableGreedyTrimming) {
    // Replace the pool with the best-effort-inline functions of this unit. Only capacity is
    // reserved up front: constructing the vector with a size would create that many null
    // entries ahead of the elements appended through back_inserter.
    trimming_pool.clear();
    trimming_pool.reserve(funcsInKernel.size());
    // Node with best effort and larger size contribution could be trimmed
    llvm::copy_if(funcsInKernel, std::back_inserter(trimming_pool),
                  [](FunctionNode *node) { return node->isBestEffortInline(); });
    return;
  }

  // Find all functions that meet trimming criteria

  for (FunctionNode *Node : funcsInKernel) {
    uint16_t func_trait = Node->getFunctionTrait(thresholdForTrimming);
    switch (func_trait) {
    case FT_NOT_BEST_EFFORT:
      Node->dumpFuncInfo(0x4, "Can't trim (not best effort inline)");
      break;
    case FT_MUL_KERNEL:
      Node->dumpFuncInfo(0x4, "Can't trim (in multiple kernels)");
      break;
    case FT_BIG_ENOUGH: // Functions are big enough to trim
      trimming_pool.push_back(Node);
      Node->dumpFuncInfo(0x4, "Good to trim (Big enough > " + std::to_string(tinySizeThreshold) + ")");
      break;
    case FT_TOO_TINY:
      // Small functions will be trimmed in special case if kernel still far exceeds threshold
      tiny_fn_trimming_pool.push_back(Node);
      Node->dumpFuncInfo(0x4, "Can't trim (Too tiny < " + std::to_string(tinySizeThreshold) + ")");
      break;
    case FT_HIGHER_WEIGHT:
      trimming_pool.push_back(Node);
      Node->dumpFuncInfo(0x4, "Good to trim (High weight > " + thresholdForTrimming.toString() + ")");
      break;
    case FT_LOWER_WEIGHT:
      Node->dumpFuncInfo(0x4, "Can't trim (Low weight < " + thresholdForTrimming.toString() + ")");
      break;
    default:
      PrintTrimUnit(0x4, "Something goes wrong with the function property");
      break;
    }
  }
  return;
}

// Trim kernels / units by cancelling the inlining of candidate functions one by one until the expanded size of
// each unit is within the threshold.
/*
For all F: F->ToBeInlined = True
For each kernel K
     kernelTotalSize = updateExpandedUnitSize(K)  // O(C) >= O(N*logN)
     IF (kernelTotalSize > T)
         workList = trimming candidates of K, ordered by trimming weight // O(N*logN)
         WHILE (workList not empty) // O(N)
             remove F from workList
             F->ToBeInlined = False
             kernelTotalSize = updateExpandedUnitSize(K)
             IF (kernelTotalSize <= T) break
         ENDWHILE
     Inline functions with ToBeInlined = True
     Inline functions with single caller // done
*/
// Parameters:
//   unitHeads               - entry nodes (FunctionNode* stored as void*) of the kernels / units to examine.
//   threshold               - a unit is trimmed only while its expanded size is strictly greater than this value.
//   ignoreStackCallBoundary - forwarded unchanged to the size-update and trimming helpers.
void EstimateFunctionSize::trimCompilationUnit(llvm::SmallVector<void *, 64> &unitHeads, uint32_t threshold,
                                               bool ignoreStackCallBoundary) {
  llvm::SmallVector<FunctionNode *, 64> unitsToTrim;
  // Extract kernels / units that are larger than threshold
  for (auto node : unitHeads) {
    FunctionNode *unitEntry = static_cast<FunctionNode *>(node);
    // Partitioning can add more stackcalls. So need to recompute the expanded unit size.
    updateExpandedUnitSize(unitEntry->F, ignoreStackCallBoundary);
    if (unitEntry->ExpandedSize > threshold) {
      PrintTrimUnit(0x2, "Kernel / Unit " << unitEntry->F->getName().str() << " expSize= " << unitEntry->ExpandedSize
                                          << " > " << threshold);
      unitsToTrim.push_back(unitEntry);
    } else {
      PrintTrimUnit(0x2, "Kernel / Unit " << unitEntry->F->getName().str() << " expSize= " << unitEntry->ExpandedSize
                                          << " <= " << threshold);
    }
  }

  if (unitsToTrim.empty()) {
    PrintTrimUnit(0x2, "Kernels / Units become no longer big enough to be trimmed (affected by partitioning)");
    return;
  }

  // Handle the largest units first.
  std::sort(unitsToTrim.begin(), unitsToTrim.end(),
            [](const FunctionNode *LHS, const FunctionNode *RHS) { return LHS->ExpandedSize > RHS->ExpandedSize; });

  // Iterate over units
  for (auto unit : unitsToTrim) {
    // A kernel size can be reduced by a function that is trimmed at previous kernels, so recompute it.
    size_t expandedUnitSize = updateExpandedUnitSize(unit->F, ignoreStackCallBoundary);
    PrintTrimUnit(0x2, "Trimming kernel / unit " << unit->F->getName().str() << " expanded size= "
                                                 << expandedUnitSize);
    if (expandedUnitSize <= threshold) {
      PrintTrimUnit(0x2, "Kernel / unit " << unit->F->getName().str() << ": The expanded unit size(" << expandedUnitSize
                                          << ") is smaller than threshold(" << threshold << ")");
      continue;
    }
    PrintTrimUnit(0x2, "Kernel size is bigger than threshold");

    SmallVector<void *, 64> trimming_pool;
    SmallVector<void *, 64> tiny_fn_trimming_pool;
    uint32_t func_cnt = 0;
    getFunctionsToTrim(unit->F, trimming_pool, tiny_fn_trimming_pool, ignoreStackCallBoundary, func_cnt);
    PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << " has " << trimming_pool.size()
                                        << " functions for trimming out of " << func_cnt);
    if (trimming_pool.empty()) {
      PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << " size " << unit->ExpandedSize
                                          << " has no sorted list");
      continue; // Nothing landed in the main trimming pool for this unit.
    }

    uint64_t size_before_trimming = unit->ExpandedSize;
    if (EnableGreedyTrimming) {
      performGreedyTrimming(unit->F, trimming_pool, threshold, ignoreStackCallBoundary);
    } else {
      performTrimming(unit->F, trimming_pool, threshold, ignoreStackCallBoundary);
      // Only when the unit is still far above the threshold are the tiny functions trimmed as well.
      if (ignoreStackCallBoundary && unit->ExpandedSize > threshold * LargeKernelThresholdMultiplier) {
        PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << ": Size: " << unit->ExpandedSize
                                            << " is much larger than threshold, trimming small functions as well.");
        performTrimming(unit->F, tiny_fn_trimming_pool, threshold, ignoreStackCallBoundary);
      }
    }

    // A unit whose size equals the threshold is acceptable, matching the "> threshold" selection above.
    if (unit->ExpandedSize <= threshold) {
      PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << ": The size becomes below threshold");
    } else {
      PrintTrimUnit(0x2, "Kernel / Unit "
                             << unit->F->getName().str()
                             << ": The size is still above threshold even though all candidates are trimmed");
    }

    PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << " final size " << unit->ExpandedSize
                                        << " reduced from " << size_before_trimming);
  }
}

// Greedily trim functions of the unit rooted at `head`.
//
// In every round each remaining candidate is tentatively marked as trimmed, the total weight of the unit is
// re-evaluated via calculateTotalWeight(), and the tentative mark is reverted. The candidate yielding the smallest
// total weight (strictly smaller than the current one) is then trimmed for real. The process stops once no candidate
// lowers the total weight or no candidate is left.
//
// Parameters:
//   head                    - entry function of the kernel / unit.
//   functions_to_trim       - pool of FunctionNode* (stored as void*) eligible for trimming; null entries are ignored.
//   threshold               - not consulted by this routine; the loop is driven by the total weight only.
//   ignoreStackCallBoundary - forwarded to updateExpandedUnitSize() once trimming is finished.
void EstimateFunctionSize::performGreedyTrimming(Function *head, llvm::SmallVector<void *, 64> &functions_to_trim,
                                                 uint32_t threshold, bool ignoreStackCallBoundary) {

  llvm::SmallVector<FunctionNode *, 64> candidates;
  llvm::SmallVector<FunctionNode *, 64> funcWithNoEffect;

  // A function whose size contribution equals its body size cannot shrink the unit by being trimmed.
  for (auto f : functions_to_trim) {
    FunctionNode *func = static_cast<FunctionNode *>(f);
    if (func == nullptr) // The pool holds untyped pointers; never dereference an empty slot.
      continue;
    if (func->getSizeContribution() != func->getPotentialBodySize()) {
      candidates.push_back(func);
    } else {
      funcWithNoEffect.push_back(func);
    }
  }
  const size_t total_cand = candidates.size() + funcWithNoEffect.size();

  uint32_t total_trim_cnt = 0;
  while (!candidates.empty()) {
    Scaled64 minWeight = calculateTotalWeight(head);
    FunctionNode *bestForTrim = nullptr;
    Scaled64 weightBeforeTrim = minWeight;
    PrintTrimUnit(0x8, "Trimming candidate count: " << candidates.size());
    for (auto func : candidates) {
      // Tentatively trim the candidate and measure the resulting total weight.
      func->setTrimmed();
      // Update inline count
      updateInlineCnt(head);
      // calculate weight
      Scaled64 weight = calculateTotalWeight(head);
      if (weight < minWeight) {
        minWeight = weight;
        bestForTrim = func;
      }
      // Revert the tentative trim before probing the next candidate.
      func->unsetTrimmed();
      updateInlineCnt(head);
    }
    PrintTrimUnit(0x8, "Total weight before trim: " << weightBeforeTrim.toString()
                                                    << " Total weight after trim: " << minWeight.toString());
    if (bestForTrim == nullptr) // Trimming none of the candidates lowers the total weight
      break;
    PrintTrimUnit(0x8, "Trim the function " << bestForTrim->F->getName().str()
                                            << ", Function Attribute: " << bestForTrim->getFuncAttrStr()
                                            << ", Function size: " << bestForTrim->InitialSize
                                            << ", Size after inlining: " << bestForTrim->SizeAfterCollapsing
                                            << ", Size contribution: " << bestForTrim->getSizeContribution()
                                            << ", Freq: " << bestForTrim->getStaticFuncFreqStr()
                                            << ", Weight: " << bestForTrim->getWeightForTrimming().toString());

    bestForTrim->setTrimmed();
    updateInlineCnt(head);
    total_trim_cnt += 1;
    PrintTrimUnit(0x8,
                  "The size contribution of the trimmed function changes to " << bestForTrim->getSizeContribution());

    // Re-partition the remaining candidates: trimming one function may change the contribution of the others.
    llvm::SmallVector<FunctionNode *, 64> new_candidates;
    for (auto func : candidates) {
      // The function just trimmed is done: probing it again would revert its trimmed state through
      // unsetTrimmed(), and it must not be reported as "not trimmed" below.
      if (func == bestForTrim)
        continue;
      if (func->getSizeContribution() != func->getPotentialBodySize()) {
        new_candidates.push_back(func);
      } else {
        funcWithNoEffect.push_back(func);
      }
    }
    candidates = std::move(new_candidates);
  }
  updateExpandedUnitSize(head, ignoreStackCallBoundary);
  for (FunctionNode *trimNoGain : candidates) // Those remaining candidates will likely degrade performance
  {
    PrintTrimUnit(0x8, "Dont't trim (Performance penalty is higher than size reduction)"
                           << trimNoGain->F->getName().str() << ", Function Attribute: " << trimNoGain->getFuncAttrStr()
                           << ", Function size: " << trimNoGain->InitialSize
                           << ", Size after inlining: " << trimNoGain->SizeAfterCollapsing << ", Size contribution: "
                           << trimNoGain->getSizeContribution() << ", Freq: " << trimNoGain->getStaticFuncFreqStr()
                           << ", Weight: " << trimNoGain->getWeightForTrimming().toString());
  }
  for (FunctionNode *trimNoGain : funcWithNoEffect) // The kernel size will not change when those functions are trimmed
  {
    PrintTrimUnit(0x8, "Dont't trim (Trimming doesn't give size reduction)"
                           << trimNoGain->F->getName().str() << ", Function Attribute: " << trimNoGain->getFuncAttrStr()
                           << ", Function size: " << trimNoGain->InitialSize
                           << ", Size after inlining: " << trimNoGain->SizeAfterCollapsing << ", Size contribution: "
                           << trimNoGain->getSizeContribution() << ", Freq: " << trimNoGain->getStaticFuncFreqStr()
                           << ", Weight: " << trimNoGain->getWeightForTrimming().toString());
  }
  PrintTrimUnit(0x8, "In total, " << total_trim_cnt << " function(s) are trimmed out of " << total_cand);
  return;
}

// Trim functions of the unit rooted at `head` until its expanded size is within `threshold` or the pool is empty.
//
// Each iteration picks the function with the highest trimming weight from the pool, marks it as trimmed and
// recomputes the expanded size of the unit.
//
// Parameters:
//   head                    - entry function of the kernel / unit.
//   functions_to_trim       - pool of FunctionNode* (stored as void*); examined entries are removed from it.
//   threshold               - trimming stops as soon as the expanded unit size is no longer greater than this value.
//   ignoreStackCallBoundary - forwarded to updateExpandedUnitSize().
void EstimateFunctionSize::performTrimming(Function *head, llvm::SmallVector<void *, 64> &functions_to_trim,
                                           uint32_t threshold, bool ignoreStackCallBoundary) {
  FunctionNode *unitHead = get<FunctionNode>(head);
  uint32_t total_cand = functions_to_trim.size();
  uint32_t total_trim_cnt = 0;

  // Keep trimming while the unit is still larger than the threshold. A unit whose size equals the threshold is
  // acceptable, which is the same criterion used to decide whether a unit needs trimming in the first place.
  while (!functions_to_trim.empty() && unitHead->ExpandedSize > threshold) {
    // The pool is re-sorted by trimming weight on every iteration, so the entry taken below is always the one
    // with the highest weight at this point.
    std::sort(functions_to_trim.begin(), functions_to_trim.end(), [](void *LHS, void *RHS) {
      return static_cast<FunctionNode *>(LHS)->getWeightForTrimming() <
             static_cast<FunctionNode *>(RHS)->getWeightForTrimming();
    });
    FunctionNode *functionToTrim = static_cast<FunctionNode *>(functions_to_trim.back()); // Highest weight first
    functions_to_trim.pop_back();
    uint64_t original_expandedSize = unitHead->ExpandedSize;

    if (EnableSizeContributionOptimization) {
      uint64_t size_contribution = functionToTrim->getSizeContribution();
      uint64_t FuncSize = functionToTrim->getPotentialBodySize();
      // Skip a function whose size contribution equals its body size when that body is below the
      // SkipTrimmingOneCopyFunction limit.
      if (FuncSize == size_contribution && FuncSize < SkipTrimmingOneCopyFunction) {
        functionToTrim->dumpFuncInfo(0x8, "Don't trim (Same size contribution and too small)");
        continue;
      }
      functionToTrim->dumpFuncInfo(0x8, "Trim the function");
      functionToTrim->setTrimmed();
      updateInlineCnt(head);
      PrintTrimUnit(0x8, "The size contribution of the trimmed function changes to "
                             << functionToTrim->getSizeContribution());
    } else {
      functionToTrim->dumpFuncInfo(0x8, "Trim the function");
      functionToTrim->setTrimmed();
    }
    total_trim_cnt += 1;
    // After trimming, update expanded size
    updateExpandedUnitSize(head, ignoreStackCallBoundary);
    PrintTrimUnit(0x8, "The kernel size is reduced after trimming from " << original_expandedSize << " to "
                                                                         << unitHead->ExpandedSize);
  }
  PrintTrimUnit(0x8, "In total, " << total_trim_cnt << " function(s) are trimmed out of " << total_cand);
  return;
}

// Returns the stack-call-assigned state recorded on the FunctionNode associated with F.
bool EstimateFunctionSize::isStackCallAssigned(llvm::Function *F) {
  return get<FunctionNode>(F)->isStackCallAssigned();
}

uint32_t EstimateFunctionSize::getMaxUnitSize() {
  uint32_t max_val = 0;
  for (auto kernelEntry : kernelEntries) // For all kernel, update unitsize
  {
    FunctionNode *head = (FunctionNode *)kernelEntry;
    max_val = std::max(max_val, head->UnitSize);
  }
  for (auto stackCallFunc : stackCallFuncs) // For all address taken functions, update unitsize
  {
    FunctionNode *head = (FunctionNode *)stackCallFunc;
    max_val = std::max(max_val, head->UnitSize);
  }
  for (auto addrTakenFunc : addressTakenFuncs) // For all address taken functions, update unitsize
  {
    FunctionNode *head = (FunctionNode *)addrTakenFunc;
    max_val = std::max(max_val, head->UnitSize);
  }
  return max_val;
}