intel-graphics-compiler/IGC/Compiler/GenTTI.cpp

/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include <utility>
#include "Compiler/GenTTI.h"
#include "GenISAIntrinsics/GenIntrinsics.h"
#include "GenISAIntrinsics/GenIntrinsicInst.h"
#include "Compiler/CodeGenPublic.h"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "common/LLVMWarningsPush.hpp"
#include "llvm/Config/llvm-config.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Support/InstructionCost.h"
#include "llvmWrapper/Transforms/Utils/LoopUtils.h"
#include "common/LLVMWarningsPop.hpp"
#include <algorithm>
using namespace llvm;
using namespace IGC;
#define DEBUG_TYPE "GENtti"
namespace llvm {
bool GenIntrinsicsTTIImpl::isLoweredToCall(const Function *F) {
if (GenISAIntrinsic::isIntrinsic(F))
return false;
return BaseT::isLoweredToCall(F);
}
// CFG simplification may produce illegal integer types while simplifying switch
// instructions. Set this to false unless IGC legalization can fix them.
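//
// One way this can happen (an illustrative sketch, not asserted by this code): SimplifyCFG can encode a small
// switch as a single wide-integer "bitmap" lookup table, e.g. a 7-entry table of i8 results becomes one i56
// constant that is indexed with shift/trunc, and such odd integer widths may not survive IGC legalization.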
bool GenIntrinsicsTTIImpl::shouldBuildLookupTables() { return false; }
void *GenIntrinsicsTTIImpl::getAdjustedAnalysisPointer(const void *ID) {
if (ID == &TargetTransformInfoWrapperPass::ID)
return (TargetTransformInfo *)this;
return this;
}
bool isSendMessage(const llvm::GenIntrinsicInst *inst) {
if (isa<SamplerLoadIntrinsic>(inst) || isa<SampleIntrinsic>(inst) || isa<LdRawIntrinsic>(inst) ||
isa<InfoIntrinsic>(inst) || isa<SamplerGatherIntrinsic>(inst)) {
return true;
}
GenISAIntrinsic::ID ID = inst->getIntrinsicID();
if (/*ID == llvm::GenISAIntrinsic::GenISA_typedwrite ||
ID == llvm::GenISAIntrinsic::GenISA_typedread ||*/
ID == llvm::GenISAIntrinsic::GenISA_URBRead || isURBWriteIntrinsic(inst) ||
ID == llvm::GenISAIntrinsic::GenISA_ldstructured) {
return true;
}
return false;
}
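// Rough static instruction count of a function, skipping debug/pseudo instructions. When CheckSendMsg is set,
// each send-like GenISA intrinsic is weighted as 5 instructions (the instruction itself plus 4 extra) as a
// rough proxy for its higher cost.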
unsigned countTotalInstructions(const Function *F, bool CheckSendMsg = true) {
unsigned EstimatedInstCnt = 0;
for (const auto &BB : *F) {
for (const auto &II : BB) {
if (IGCLLVM::isDebugOrPseudoInst(II))
continue;
if (auto pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(&II)) {
if (CheckSendMsg && isSendMessage(pIntrinsic)) {
EstimatedInstCnt += 4;
}
}
EstimatedInstCnt++;
}
}
return EstimatedInstCnt;
}
unsigned GenIntrinsicsTTIImpl::getFlatAddressSpace() { return ADDRESS_SPACE_PRIVATE; }
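// Returns true when the GEP offset (ignoring the pointer base) is built only from constants and
// add-recurrences of loop L (or of a loop that contains L). Illustrative IR, not taken from any particular
// workload, for which this returns true:
//
// Loop:
//   %idx = phi i32 [ 0, %entry ], [ %inc, %Loop ]   ; SCEV {0,+,1}<%Loop>
//   %gep = getelementptr [64 x float], ptr %buf, i64 0, i32 %idx
//
// If the index instead came from, e.g., a load (an scUnknown SCEV), the traversal bails out and this returns
// false.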
bool GenIntrinsicsTTIImpl::isGEPLoopConstDerived(GetElementPtrInst *GEP, const Loop *L, ScalarEvolution &SE) {
if (!GEP)
return false;
const SCEV *SGEP = SE.getSCEV(GEP);
if (auto *AR = dyn_cast<SCEVAddRecExpr>(SGEP)) {
if (AR->getLoop() == L)
return true;
}
// Don't let the pointer base interfere with the traversal: some frontends
// generate GEPs without the inbounds flag.
const SCEV *SGEPMinusPointerBase = SE.removePointerBase(SGEP);
struct CheckConstDerived {
bool TraversalDone = false;
bool AddRecFound = false;
bool isConstDerived = true;
const Loop *L = nullptr;
const SCEV *S = nullptr;
CheckConstDerived(const Loop *L) : L(L) {}
bool setNotConstDerived() {
TraversalDone = true;
isConstDerived = false;
return false;
}
bool follow(const SCEV *S) {
switch (S->getSCEVType()) {
case scConstant:
case scPtrToInt:
case scTruncate:
case scZeroExtend:
case scSignExtend:
case scAddExpr:
case scMulExpr:
case scUMaxExpr:
case scSMaxExpr:
case scUMinExpr:
case scSMinExpr:
case scSequentialUMinExpr:
case scUDivExpr:
return true;
case scAddRecExpr: {
const auto *ARLoop = cast<SCEVAddRecExpr>(S)->getLoop();
if (L && (ARLoop == L || ARLoop->contains(L))) {
AddRecFound = true;
return false; // Don't traverse into it
}
return setNotConstDerived();
}
case scUnknown:
case scCouldNotCompute:
return setNotConstDerived();
}
llvm_unreachable("Unknown SCEV kind!");
}
bool isDone() { return TraversalDone; }
};
CheckConstDerived CCD(L);
SCEVTraversal<CheckConstDerived> ST(CCD);
ST.visitAll(SGEPMinusPointerBase);
return (CCD.isConstDerived && CCD.AddRecFound);
}
void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) {
bool IsJointMatrixApplyLoop = false;
for (auto BB : L->blocks()) {
for (auto &I : *BB) {
if (auto *MD = I.getMetadata("joint_matrix_apply")) {
IsJointMatrixApplyLoop = true;
break;
}
}
if (IsJointMatrixApplyLoop) {
break;
}
}
unsigned LoopUnrollThreshold = ctx->m_DriverInfo.GetLoopUnrollThreshold();
bool UnrollLoopForCodeSizeOnly =
IGC_IS_FLAG_ENABLED(UnrollLoopForCodeSizeOnly) || (!ctx->m_retryManager.IsFirstTry());
// override the LoopUnrollThreshold if the registry key is set
if (IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold) != 0) {
LoopUnrollThreshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold);
} else {
if (ctx->type == ShaderType::COMPUTE_SHADER && ctx->getModuleMetaData()->csInfo.SetLoopUnrollThreshold > 0) {
LoopUnrollThreshold = ctx->getModuleMetaData()->csInfo.SetLoopUnrollThreshold;
} else if ((ctx->type == ShaderType::PIXEL_SHADER || ctx->type == ShaderType::RAYTRACING_SHADER) &&
ctx->getModuleMetaData()->compOpt.SetLoopUnrollThreshold > 0) {
LoopUnrollThreshold = ctx->getModuleMetaData()->compOpt.SetLoopUnrollThreshold;
} else if (IsJointMatrixApplyLoop) {
// For joint_matrix_apply loops, we want to unroll as much as possible, so set a high threshold.
// On the other hand, unrolling huge loops can lead to unreasonable compile time,
// so we cannot simply use UINT_MAX.
// One case where we hit the limit is printing a 32x64-element accumulator
// using joint_matrix_apply with sycl::stream.
// If you increase this limit, please verify that printing with sycl::stream still works.
LoopUnrollThreshold = 20000;
}
}
// Special case when DP emulation is needed.
if (ctx->m_hasDPEmu && IGC_IS_FLAG_ENABLED(SelectiveLoopUnrollForDPEmu)) {
bool hasDPInst = false;
for (auto BB : L->blocks()) {
for (auto &I : *BB) {
switch (I.getOpcode()) {
case Instruction::FMul:
case Instruction::FAdd:
case Instruction::FSub:
case Instruction::FDiv:
hasDPInst = I.getType()->isDoubleTy();
break;
case Instruction::FCmp:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPTrunc:
hasDPInst = I.getOperand(0)->getType()->isDoubleTy();
break;
case Instruction::UIToFP:
case Instruction::SIToFP:
case Instruction::FPExt:
hasDPInst = I.getType()->isDoubleTy();
break;
case Instruction::Call: {
if (isa<GenIntrinsicInst>(&I) || isa<IntrinsicInst>(&I)) {
CallInst *callI = cast<CallInst>(&I);
hasDPInst =
(callI->getType()->isDoubleTy() || std::any_of(callI->arg_begin(), callI->arg_end(),
[](Value *v) { return v->getType()->isDoubleTy(); }));
}
break;
}
default:
break;
}
if (hasDPInst)
break;
}
if (hasDPInst)
break;
}
if (hasDPInst) {
// Disable unroll
UP.Threshold = 0;
UP.OptSizeThreshold = 0;
UP.Count = 1;
UP.MaxCount = 1;
UP.Partial = false;
UP.Runtime = false;
return;
}
}
unsigned totalInstCountInShader = countTotalInstructions(L->getBlocks()[0]->getParent());
uint32_t registerPressureEst =
(uint32_t)(IGC_GET_FLAG_VALUE(SetRegisterPressureThresholdForLoopUnroll) * (ctx->getNumGRFPerThread() / 128.0));
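// The flag value is scaled by the GRF count relative to a 128-GRF baseline; e.g. (illustrative numbers only)
// a flag value of 100 yields an estimate of 100 on a 128-GRF configuration and 200 on a 256-GRF one.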
bool lowPressure = (this->ctx->m_tempCount < registerPressureEst) && (totalInstCountInShader < LoopUnrollThreshold);
// For OCL shaders, do a two-step loop unrolling. The first
// unrolling is simple and full, and the second runs after
// LICM, which allows partial unrolling. Same for other APIs?
if (lowPressure || (ctx->type == ShaderType::OPENCL_SHADER))
{
UP.Threshold = LoopUnrollThreshold;
UP.PartialThreshold = LoopUnrollThreshold;
UP.Partial = true;
} else // for high register-pressure shaders, limit unrolling to small loops and only fully unroll
{
UP.Threshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThresholdForHighRegPressure);
// This is similar to the LLVM OptForSize scenario in LoopUnrollPass
UP.MaxPercentThresholdBoost = IGC_GET_FLAG_VALUE(SetLoopUnrollMaxPercentThresholdBoostForHighRegPressure);
}
unsigned MaxTripCount = SE.getSmallConstantMaxTripCount(L);
const unsigned MaxTripCountToUseUpperBound = 4;
if (MaxTripCount && MaxTripCount <= MaxTripCountToUseUpperBound) {
UP.UpperBound = true;
UP.Force = true;
}
if (UnrollLoopForCodeSizeOnly) {
UP.Threshold = getLoopSize(L, *this) + 1;
UP.MaxPercentThresholdBoost = 100;
UP.Partial = false;
}
// For every load/store that (through a GEP) is
// 1. accessing a fixed-size alloca, and
// 2. using an index derived only from the loop induction variable.
// For example,
//
// bb:
// %ALLOC = alloca [32 x float], align 4
// Loop1:
// %i8 = phi i32 [ 0, %bb ], [ %i23, %Loop1 ]
// %i19 = getelementptr [32 x float], ptr %ALLOC, i64 0, i64 %i8
// %i23 = add i32 %i8, 1
// %i14 = fmul ...
// store float %i14, ptr %i19, align 4
// %i24 = icmp eq i32 %i23, 32
// br i1 %i24, label %..., label %Loop1
// ...
// Loop5:
// %i93 = phi i32 [ %i115, %Loop5 ], [ 0, %... ]
// %i99 = getelementptr [32 x float], ptr %ALLOC, i64 0, i64 %i93
// %i103 = load float, ptr %i99, align 4
// %i107 = fmul float %i103, 0x3F699999A0000000
// %i115 = add i32 %i93, 1
// %i116 = icmp eq i32 %i115, 32
// br i1 %i116, label %bb117, label %Loop5
//
// Fully unrolling both loops lets the SROA pass eliminate the entire access chain of the alloca. This is one of the
// highest-impact yet very common patterns across all application types. In many cases, especially when the only
// values stored into the alloca are compiler-detectable constants, these loops need to be unrolled regardless of how
// high the register pressure is.
//
// TODO: Add an analysis pass that links allocas with loops globally so that they are either unrolled together
// or not at all. It could also perform some global cost estimation.
const unsigned UnrollMaxCountForAlloca = IGC_GET_FLAG_VALUE(PromoteLoopUnrollwithAllocaCountThreshold);
bool AllocaFound = false;
if (MaxTripCount && MaxTripCount <= UnrollMaxCountForAlloca &&
IGC_IS_FLAG_ENABLED(EnablePromoteLoopUnrollwithAlloca)) {
unsigned int ThresholdBoost = 0;
for (auto BB : L->blocks()) {
for (auto &I : *BB) {
AllocaInst *AI = nullptr;
GetElementPtrInst *GEP = nullptr;
if (auto *LI = dyn_cast<LoadInst>(&I))
AI = dyn_cast<AllocaInst>(LI->getPointerOperand());
else if ((GEP = dyn_cast<GetElementPtrInst>(&I))) {
// Test if the GEP index is a function of the loop induction variable.
if (!isGEPLoopConstDerived(GEP, L, SE))
continue;
auto *SBase = dyn_cast<SCEVUnknown>(SE.getPointerBase(SE.getSCEV(GEP)));
// The pointer base is not guaranteed to be an SCEVUnknown; guard against a null dyn_cast result.
if (!SBase)
continue;
AI = dyn_cast<AllocaInst>(SBase->getValue());
} else
continue;
if (!AI)
continue;
// Skip allocas that are not fixed-size or not in the entry block.
// TODO: Can an alloca with a fixed size not reside in the entry block?
if (!AI->isStaticAlloca())
continue;
// Using the alloca size in bytes as the threshold boost is admittedly a bit of a heuristic.
unsigned AllocaSize = *(AI->getAllocationSizeInBits(DL)) / 8;
// Assume every iteration consumes 8 bytes (64 bits).
if (AllocaSize / 8 > UnrollMaxCountForAlloca)
continue;
ThresholdBoost += AllocaSize;
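// Illustrative arithmetic for the [32 x float] example above: the alloca occupies 32 * 4 = 128 bytes, the
// assumed iteration count 128 / 8 = 16 is checked against the flag value, and this access contributes a
// threshold boost of 128.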
if (GEP)
isGEPLoopInduction[GEP] = true;
AllocaFound = true;
}
}
if (AllocaFound) {
// The LLVM default is only 10; boost it to UnrollMaxCountForAlloca.
UP.MaxIterationsCountToAnalyze = UnrollMaxCountForAlloca;
UP.Threshold += ThresholdBoost;
UP.UpperBound = true;
UP.Force = !UnrollLoopForCodeSizeOnly;
LLVM_DEBUG(dbgs() << "Increasing L:" << L->getName() << " threshold to " << UP.Threshold
<< " due to Alloca accessed by:");
for (const auto &pair : isGEPLoopInduction)
LLVM_DEBUG(dbgs() << " " << pair.first->getName());
LLVM_DEBUG(dbgs() << " \n");
}
}
if (IGC_IS_FLAG_ENABLED(UnrollLoopForCodeSizeOnly))
return;
unsigned sendMessage = 0;
unsigned TripCount = 0;
BasicBlock *ExitingBlock = L->getLoopLatch();
if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
ExitingBlock = L->getExitingBlock();
if (ExitingBlock)
TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
// Do not enable partial unrolling if the loop counter is a float; it can cause precision issues.
if (ExitingBlock) {
if (UP.Partial) {
IGCLLVM::TerminatorInst *Term = ExitingBlock->getTerminator();
if (BranchInst *BI = dyn_cast<BranchInst>(Term)) {
if (dyn_cast<FCmpInst>(BI->getCondition())) {
UP.Partial = false;
return;
}
}
}
// Heuristic: disable loop unrolling for a single, short-BB loop containing a
// barrier. Unrolling such a loop won't remove the dependency imposed by that
// barrier and only potentially adds register pressure.
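// A hypothetical OpenCL C shape of the loop this heuristic targets:
//
//   for (int i = 0; i < n; ++i) {
//     partial[lid] += src[i * stride + lid];
//     barrier(CLK_LOCAL_MEM_FENCE); // mangled as _Z7barrierj
//   }
//
// Every unrolled copy would still contain the barrier, so unrolling gains nothing here.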
if (L->getNumBlocks() == 1) {
BasicBlock *BB = *L->block_begin();
SmallPtrSet<const Value *, 32> EphValues;
CodeMetrics Metrics;
Metrics.analyzeBasicBlock(BB, *this, EphValues);
if (Metrics.NumInsts < 50) {
for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
CallInst *Call = dyn_cast<CallInst>(I);
if (!Call)
continue;
Function *F = Call->getCalledFunction();
if (!F)
continue;
// FIXME: Should the barrier already be inlined even in two-phase
// inlining?
if (F->getName() == "_Z7barrierj") {
// Disable loop unrolling for a short loop with a
// barrier, where we prefer wider SIMD to mitigate
// the barrier overhead.
UP.Count = 1;
UP.MaxCount = UP.Count;
UP.Partial = false;
UP.Runtime = false;
return;
}
}
}
}
}
// Skip non-simple loop.
if (L->getNumBlocks() != 1) {
if (IGC_IS_FLAG_ENABLED(EnableAdvRuntimeUnroll) && IGCLLVM::isInnermost(L)) {
auto countNonPHI = [](BasicBlock *BB) {
// Count the number of instructions in the basic block without dbg instructions
unsigned InstCountInBB = BB->sizeWithoutDebug();
unsigned PHIs = 0;
for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
if (!isa<PHINode>(&*BI))
break;
++PHIs;
}
return InstCountInBB - PHIs;
};
auto hasLoad = [](BasicBlock *BB) {
for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI)
if (isa<LoadInst>(&*BI))
return true;
return false;
};
auto hasStore = [](BasicBlock *BB) {
for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI)
if (isa<StoreInst>(&*BI))
return true;
return false;
};
auto hasCall = [](BasicBlock *BB) {
for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI)
if (isa<CallInst>(&*BI) && !isa<IntrinsicInst>(&*BI) && !isa<GenIntrinsicInst>(&*BI))
return true;
return false;
};
// For innermost loop, allow certain patterns.
unsigned Count = 0;
bool HasCall = false;
bool HasStore = false;
bool MayHasLoadInHeaderOnly = true;
for (auto BI = L->block_begin(), BE = L->block_end(); BI != BE; ++BI) {
Count += countNonPHI(*BI);
HasCall |= hasCall(*BI);
HasStore |= hasStore(*BI);
if (L->getHeader() != *BI)
MayHasLoadInHeaderOnly &= !hasLoad(*BI);
}
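// Illustrative qualifying shape (not from any particular workload): a small search loop whose header loads
// one element per iteration and whose remaining blocks only compare and advance the induction variable, with
// no calls, no stores, and well under 100 non-PHI instructions.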
// Runtime unroll it.
if (!HasCall && !HasStore && MayHasLoadInHeaderOnly && Count < 100) {
unsigned C = IGC_GET_FLAG_VALUE(AdvRuntimeUnrollCount);
if (C == 0)
C = 4;
UP.Runtime = true;
UP.Count = C;
UP.MaxCount = UP.Count;
// The following is only available and required from LLVM 3.7+.
UP.AllowExpensiveTripCount = true;
}
}
return;
}
llvm::BasicBlock::InstListType::iterator I;
llvm::BasicBlock *loopBlock = L->getBlocks()[0];
int instCount =
std::distance(loopBlock->instructionsWithoutDebug().begin(), loopBlock->instructionsWithoutDebug().end());
// Check if the specific basic block has block read or write.
auto hasBlockReadWrite = [](BasicBlock *BB) {
for (auto I = BB->begin(), E = BB->end(); I != E; ++I)
if (auto GII = dyn_cast<GenIntrinsicInst>(I))
switch (GII->getIntrinsicID()) {
case GenISAIntrinsic::GenISA_simdBlockRead:
case GenISAIntrinsic::GenISA_simdBlockWrite:
return true;
default:
break;
}
return false;
};
// Skip the following logic for OCL. Apparently, it is designed to prevent
// loops in 3D shaders from being unrolled so aggressively that the shader
// binary size increases dramatically. So far, OCL has no such concern and,
// if it ever does, more factors need to be considered. Just skip it for OCL.
if (ctx->type == ShaderType::OPENCL_SHADER &&
// Only try to fully unroll a small loop with a known but small trip count.
// This is PURELY a heuristic.
((TripCount != 0 && TripCount <= 40 && instCount < 40) || hasBlockReadWrite(L->getHeader())) &&
// FIXME: WA for cases where the compiler is running with a smaller stack size and
// we run into a stack overflow.
!ctx->m_DriverInfo.HasSmallStack()) {
return;
}
for (I = loopBlock->begin(); I != loopBlock->end(); I++) {
if (const auto pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(I)) {
if (isSendMessage(pIntrinsic)) {
sendMessage++;
}
}
}
unsigned int estimateUnrolledInstCount = (instCount + sendMessage * 4) * TripCount;
unsigned int unrollLimitInstCount =
LoopUnrollThreshold > totalInstCountInShader ? LoopUnrollThreshold - totalInstCountInShader : 0;
bool limitUnrolling = (estimateUnrolledInstCount > unrollLimitInstCount) || (TripCount > unrollLimitInstCount) ||
(instCount + sendMessage * 4 > unrollLimitInstCount);
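// Illustrative arithmetic with hypothetical numbers: LoopUnrollThreshold == 4000 and totalInstCountInShader ==
// 3500 leave a budget of 500; a loop with instCount == 30, sendMessage == 2 (weighted 30 + 2 * 4 = 38) and
// TripCount == 20 would expand to 38 * 20 = 760 > 500, so unrolling is limited below.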
// If the loop doesn't contain send messages (e.g., sample), only apply the limit below and skip the remaining
// unrolling parameter changes.
if (!sendMessage) {
// if the estimated unrolled instruction count is larger than the unrolling threshold, limit unrolling.
if (limitUnrolling) {
UP.Count = MIN(unrollLimitInstCount / (instCount + sendMessage * 4), 4);
if (TripCount != 0)
while (UP.Count != 0 && TripCount % UP.Count != 0)
UP.Count--;
UP.MaxCount = UP.Count;
}
return;
}
// If the TripCount is known and the estimated unrolled count exceeds LoopUnrollThreshold, set the unrolling count
// to 4.
if (limitUnrolling) {
UP.Count = MIN(TripCount, 4);
UP.MaxCount = UP.Count;
}
unsigned int runtimeUnroll = IGC_GET_FLAG_VALUE(RuntimeLoopUnrolling); // 0: default, 1: on, 2: off
if (runtimeUnroll == 2) {
return;
} else if (runtimeUnroll == 0) {
// Do not enable runtime unrolling if the loop is long or the trip count is already known.
// Skip this check if RuntimeLoopUnrolling is set to force it on.
if (instCount > 35 || TripCount) {
return;
}
}
if (!limitUnrolling) {
UP.Runtime = true;
UP.Count = 4;
UP.MaxCount = UP.Count;
// The following is only available and required from LLVM 3.7+.
UP.AllowExpensiveTripCount = true;
}
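// If a SPIR-V max-iterations hint is attached to the loop, cap MaxCount with it. Illustrative loop metadata
// (not from any particular module) that this matches:
//
//   br i1 %cond, label %exit, label %loop, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"spv.loop.iterations.max", i32 16}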
if (MDNode *LoopID = L->getLoopID()) {
const llvm::StringRef maxIterMetadataNames = "spv.loop.iterations.max";
for (unsigned i = 0; i < LoopID->getNumOperands(); ++i) {
if (MDNode *MD = llvm::dyn_cast<MDNode>(LoopID->getOperand(i))) {
if (MDString *S = llvm::dyn_cast<MDString>(MD->getOperand(0))) {
if (maxIterMetadataNames.equals(S->getString())) {
UP.MaxCount = static_cast<unsigned>(mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue());
}
}
}
}
}
}
// [LLVM-UPGRADE] Peeling information was separated
// https://github.com/llvm/llvm-project/commit/e541e1b757237172c247904b670c9894d6b3759d
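// Illustrative loop metadata (not from any particular module) consumed below:
//   !{!"spv.loop.peel.count", i32 2}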
void GenIntrinsicsTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
llvm::TargetTransformInfo::PeelingPreferences &PP) {
if (MDNode *LoopID = L->getLoopID()) {
const llvm::StringRef peelCountMetadataNames = "spv.loop.peel.count";
for (unsigned i = 0; i < LoopID->getNumOperands(); ++i) {
if (MDNode *MD = llvm::dyn_cast<MDNode>(LoopID->getOperand(i))) {
if (MDString *S = llvm::dyn_cast<MDString>(MD->getOperand(0))) {
if (peelCountMetadataNames.equals(S->getString())) {
PP.AllowPeeling = true;
PP.PeelCount = static_cast<unsigned>(mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue());
}
}
}
}
}
}
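// A convergent call that only accesses inaccessible memory (e.g. a barrier-like intrinsic) is assumed to be
// sensitive to where it executes, so hoisting it is treated as not profitable.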
bool GenIntrinsicsTTIImpl::isProfitableToHoist(Instruction *I) {
if (auto *CI = dyn_cast<CallInst>(I)) {
if (CI->isConvergent() && CI->onlyAccessesInaccessibleMemory()) {
return false;
}
}
return BaseT::isProfitableToHoist(I);
}
// TODO: Upon the complete removal of pre-LLVM 14 conditions, move to 'getInstructionCost' per LLVM 16 API
llvm::InstructionCost GenIntrinsicsTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands,
TTI::TargetCostKind CostKind) {
return GenIntrinsicsTTIImpl::internalCalculateCost(U, Operands, CostKind);
}
#if LLVM_VERSION_MAJOR >= 16
llvm::InstructionCost GenIntrinsicsTTIImpl::getInstructionCost(const User *U, ArrayRef<const Value *> Operands,
TTI::TargetCostKind CostKind) {
return GenIntrinsicsTTIImpl::internalCalculateCost(U, Operands, CostKind);
}
#endif
llvm::InstructionCost GenIntrinsicsTTIImpl::internalCalculateCost(const User *U, ArrayRef<const Value *> Operands,
TTI::TargetCostKind CostKind) {
// The extra cost of speculative execution for math intrinsics
if (auto *II = dyn_cast_or_null<IntrinsicInst>(U)) {
if (Intrinsic::ID IID = II->getIntrinsicID()) {
switch (IID) {
case Intrinsic::cos:
case Intrinsic::sin:
case Intrinsic::sqrt:
return TTI::TCC_Expensive;
default:
break;
}
}
}
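// Loads/stores through GEPs recorded in isGEPLoopInduction (see getUnrollingPreferences) are expected to be
// eliminated by SROA once their loop is fully unrolled, so count them as free here.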
if (IGC_IS_FLAG_ENABLED(EnablePromoteLoopUnrollwithAlloca)) {
const GetElementPtrInst *GEP = nullptr;
if (Operator::getOpcode(U) == Instruction::Load)
GEP = dyn_cast<GetElementPtrInst>(cast<LoadInst>(U)->getPointerOperand());
if (Operator::getOpcode(U) == Instruction::Store)
GEP = dyn_cast<GetElementPtrInst>(cast<StoreInst>(U)->getPointerOperand());
if (GEP) {
if (isGEPLoopInduction.find(GEP) != isGEPLoopInduction.end())
return TTI::TCC_Free;
}
}
const Function *F = dyn_cast<Function>(U);
if (F != nullptr) {
IGC::CodeGenContext *CGC = this->ctx;
if (!CGC->enableFunctionCall() && !GenISAIntrinsic::isIntrinsic(F) && !F->isIntrinsic()) {
// If subroutine calls are not enabled but we still have function calls, they
// are not inlined, e.g. due to two-phase inlining. Return the function size
// instead to avoid under-estimating the cost of the function call.
//
// FIXME: We need to collect the cost along the call graph. However, as
// LLVM's inliner currently only supports bottom-up inlining, that's not a
// big issue so far.
//
// FIXME: We also need to consider the case where subroutine calls are
// enabled.
unsigned FuncSize = countTotalInstructions(F, false);
return TargetTransformInfo::TCC_Basic * FuncSize;
}
}
return BaseT::getInstructionCost(U, Operands, CostKind);
}
// Stripped-down version of ApproximateLoopSize from LLVM's LoopUnrollPass.
unsigned getLoopSize(const Loop *L, const TargetTransformInfo &TTI) {
SmallPtrSet<const Value *, 32> EphValues;
CodeMetrics Metrics;
for (BasicBlock *BB : L->blocks())
Metrics.analyzeBasicBlock(BB, TTI, EphValues);
InstructionCost LoopSize;
LoopSize = Metrics.NumInsts;
LoopSize = (LoopSize > 3/*BEInsns + 1*/) ? LoopSize : 3;
return *LoopSize.getValue();
}
} // namespace llvm