/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include <utility>

#include "Compiler/GenTTI.h"
#include "GenISAIntrinsics/GenIntrinsics.h"
#include "GenISAIntrinsics/GenIntrinsicInst.h"
#include "Compiler/CodeGenPublic.h"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"

#include "common/LLVMWarningsPush.hpp"
#include "llvm/Config/llvm-config.h"

#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Support/InstructionCost.h"
#include "llvmWrapper/Transforms/Utils/LoopUtils.h"
#include "common/LLVMWarningsPop.hpp"

#include <algorithm>

using namespace llvm;
using namespace IGC;

#define DEBUG_TYPE "GENtti"

namespace llvm {

bool GenIntrinsicsTTIImpl::isLoweredToCall(const Function *F) {
  if (GenISAIntrinsic::isIntrinsic(F))
    return false;
  return BaseT::isLoweredToCall(F);
}

// CFG simplification may produce illegal integer types while simplifying switch
// instructions. Set this to false unless IGC legalization can fix them.
bool GenIntrinsicsTTIImpl::shouldBuildLookupTables() { return false; }

void *GenIntrinsicsTTIImpl::getAdjustedAnalysisPointer(const void *ID) {
  if (ID == &TargetTransformInfoWrapperPass::ID)
    return (TargetTransformInfo *)this;
  return this;
}

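// Returns true if the given GenISA intrinsic is expected to be lowered to a
// send message (sampler, raw surface, URB, or structured-buffer access).
// Send messages are weighted more heavily than plain ALU instructions by the
// size-estimation and unrolling heuristics below.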
bool isSendMessage(const llvm::GenIntrinsicInst *inst) {
  if (isa<SamplerLoadIntrinsic>(inst) || isa<SampleIntrinsic>(inst) || isa<LdRawIntrinsic>(inst) ||
      isa<InfoIntrinsic>(inst) || isa<SamplerGatherIntrinsic>(inst)) {
    return true;
  }

  GenISAIntrinsic::ID ID = inst->getIntrinsicID();
  if (/*ID == llvm::GenISAIntrinsic::GenISA_typedwrite ||
      ID == llvm::GenISAIntrinsic::GenISA_typedread ||*/
      ID == llvm::GenISAIntrinsic::GenISA_URBRead || isURBWriteIntrinsic(inst) ||
      ID == llvm::GenISAIntrinsic::GenISA_ldstructured) {
    return true;
  }

  return false;
}

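// Estimates the number of instructions in F, skipping debug and pseudo
// instructions. When CheckSendMsg is set, each send-message intrinsic counts
// as roughly five instructions (1 + 4 extra) to better reflect its final
// code-size impact.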
unsigned countTotalInstructions(const Function *F, bool CheckSendMsg = true) {
  unsigned EstimatedInstCnt = 0;
  for (const auto &BB : *F) {
    for (const auto &II : BB) {
      if (IGCLLVM::isDebugOrPseudoInst(II))
        continue;

      if (auto pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(&II)) {
        if (CheckSendMsg && isSendMessage(pIntrinsic)) {
          EstimatedInstCnt += 4;
        }
      }
      EstimatedInstCnt++;
    }
  }
  return EstimatedInstCnt;
}

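// Report the address space used for "flat" pointers in TTI queries
// (consulted by passes such as InferAddressSpaces).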
unsigned GenIntrinsicsTTIImpl::getFlatAddressSpace() { return ADDRESS_SPACE_PRIVATE; }

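// Returns true if, with respect to loop L, the address computed by GEP is
// built only from constants and the induction of L (or an enclosing loop), as
// determined by ScalarEvolution. For example (illustrative IR):
//
//   %idx = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
//   %p = getelementptr [32 x float], ptr %alloca, i64 0, i32 %idx
//
// would qualify, whereas an index loaded from memory would not.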
bool GenIntrinsicsTTIImpl::isGEPLoopConstDerived(GetElementPtrInst *GEP, const Loop *L, ScalarEvolution &SE) {
  if (!GEP)
    return false;

  const SCEV *SGEP = SE.getSCEV(GEP);

  if (auto *AR = dyn_cast<SCEVAddRecExpr>(SGEP)) {
    if (AR->getLoop() == L)
      return true;
  }

  // Don't let the pointer base interfere with the traversal; some frontends
  // generate GEPs without the inbounds flag.
  const SCEV *SGEPMinusPointerBase = SE.removePointerBase(SGEP);

  struct CheckConstDerived {
    bool TraversalDone = false;
    bool AddRecFound = false;
    bool isConstDerived = true;

    const Loop *L = nullptr;
    const SCEV *S = nullptr;

    CheckConstDerived(const Loop *L) : L(L) {}

    bool setNotConstDerived() {
      TraversalDone = true;
      isConstDerived = false;
      return false;
    }

    bool follow(const SCEV *S) {
      switch (S->getSCEVType()) {
      case scConstant:
      case scPtrToInt:
      case scTruncate:
      case scZeroExtend:
      case scSignExtend:
      case scAddExpr:
      case scMulExpr:
      case scUMaxExpr:
      case scSMaxExpr:
      case scUMinExpr:
      case scSMinExpr:
      case scSequentialUMinExpr:
      case scUDivExpr:
        return true;

      case scAddRecExpr: {
        const auto *ARLoop = cast<SCEVAddRecExpr>(S)->getLoop();
        if (L && (ARLoop == L || ARLoop->contains(L))) {
          AddRecFound = true;
          return false; // Don't traverse into it
        }

        return setNotConstDerived();
      }

      case scUnknown:
      case scCouldNotCompute:
        return setNotConstDerived();
      }
      llvm_unreachable("Unknown SCEV kind!");
    }

    bool isDone() { return TraversalDone; }
  };

  CheckConstDerived CCD(L);
  SCEVTraversal<CheckConstDerived> ST(CCD);
  ST.visitAll(SGEPMinusPointerBase);
  return (CCD.isConstDerived && CCD.AddRecFound);
}

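// Tunes LLVM's unrolling preferences for IGC: the base threshold comes from
// driver/registry settings and is then adjusted for joint_matrix_apply loops,
// double-precision emulation, register-pressure estimates, loops that index a
// fixed-size alloca with their induction variable, and loops dominated by
// send messages.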
void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
                                                   OptimizationRemarkEmitter *ORE) {
  bool IsJointMatrixApplyLoop = false;
  for (auto BB : L->blocks()) {
    for (auto &I : *BB) {
      if (auto *MD = I.getMetadata("joint_matrix_apply")) {
        IsJointMatrixApplyLoop = true;
        break;
      }
    }
    if (IsJointMatrixApplyLoop) {
      break;
    }
  }

  unsigned LoopUnrollThreshold = ctx->m_DriverInfo.GetLoopUnrollThreshold();
  bool UnrollLoopForCodeSizeOnly =
      IGC_IS_FLAG_ENABLED(UnrollLoopForCodeSizeOnly) || (!ctx->m_retryManager.IsFirstTry());

  // Override the LoopUnrollThreshold if the registry key is set.
  if (IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold) != 0) {
    LoopUnrollThreshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThreshold);
  } else {
    if (ctx->type == ShaderType::COMPUTE_SHADER && ctx->getModuleMetaData()->csInfo.SetLoopUnrollThreshold > 0) {
      LoopUnrollThreshold = ctx->getModuleMetaData()->csInfo.SetLoopUnrollThreshold;
    } else if ((ctx->type == ShaderType::PIXEL_SHADER || ctx->type == ShaderType::RAYTRACING_SHADER) &&
               ctx->getModuleMetaData()->compOpt.SetLoopUnrollThreshold > 0) {
      LoopUnrollThreshold = ctx->getModuleMetaData()->compOpt.SetLoopUnrollThreshold;
    } else if (IsJointMatrixApplyLoop) {
      // For joint_matrix_apply loops we want to unroll as much as possible, so set a high threshold.
      // On the other hand, unrolling huge loops can lead to unreasonable compile times,
      // so we cannot simply use UINT_MAX.
      // One case where we hit the limit is printing a 32x64-element accumulator
      // using joint_matrix_apply with sycl::stream.
      // If you increase this limit, please test that printing with sycl::stream still works.
      LoopUnrollThreshold = 20000;
    }
  }

  // Special case when DP emulation is needed.
  if (ctx->m_hasDPEmu && IGC_IS_FLAG_ENABLED(SelectiveLoopUnrollForDPEmu)) {
    bool hasDPInst = false;
    for (auto BB : L->blocks()) {
      for (auto &I : *BB) {
        switch (I.getOpcode()) {
        case Instruction::FMul:
        case Instruction::FAdd:
        case Instruction::FSub:
        case Instruction::FDiv:
          hasDPInst = I.getType()->isDoubleTy();
          break;
        case Instruction::FCmp:
        case Instruction::FPToUI:
        case Instruction::FPToSI:
        case Instruction::FPTrunc:
          hasDPInst = I.getOperand(0)->getType()->isDoubleTy();
          break;
        case Instruction::UIToFP:
        case Instruction::SIToFP:
        case Instruction::FPExt:
          hasDPInst = I.getType()->isDoubleTy();
          break;
        case Instruction::Call: {
          if (isa<GenIntrinsicInst>(&I) || isa<IntrinsicInst>(&I)) {
            CallInst *callI = cast<CallInst>(&I);
            hasDPInst =
                (callI->getType()->isDoubleTy() || std::any_of(callI->arg_begin(), callI->arg_end(),
                                                               [](Value *v) { return v->getType()->isDoubleTy(); }));
          }
          break;
        }
        default:
          break;
        }

        if (hasDPInst)
          break;
      }
      if (hasDPInst)
        break;
    }
    if (hasDPInst) {
      // Disable unroll
      UP.Threshold = 0;
      UP.OptSizeThreshold = 0;
      UP.Count = 1;
      UP.MaxCount = 1;
      UP.Partial = false;
      UP.Runtime = false;
      return;
    }
  }

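  // Estimate whether we can afford aggressive unrolling: compare the current
  // temporary count against a register-pressure threshold scaled linearly
  // with the GRF count per thread (the flag value is treated as a 128-GRF
  // baseline), and the shader's estimated instruction count against the
  // unroll threshold.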
  unsigned totalInstCountInShader = countTotalInstructions(L->getBlocks()[0]->getParent());
  uint32_t registerPressureEst =
      (uint32_t)(IGC_GET_FLAG_VALUE(SetRegisterPressureThresholdForLoopUnroll) * (ctx->getNumGRFPerThread() / 128.0));
  bool lowPressure = (this->ctx->m_tempCount < registerPressureEst) && (totalInstCountInShader < LoopUnrollThreshold);
  // For OCL shaders, do a two-step loop unrolling. The first
  // unrolling is simple and full, and the second runs after
  // LICM, which allows partial unrolling. Same for other APIs?
  if (lowPressure || (ctx->type == ShaderType::OPENCL_SHADER)) {
    UP.Threshold = LoopUnrollThreshold;
    UP.PartialThreshold = LoopUnrollThreshold;
    UP.Partial = true;
  } else {
    // For high register pressure shaders, limit unrolling to small loops and only fully unroll.
    UP.Threshold = IGC_GET_FLAG_VALUE(SetLoopUnrollThresholdForHighRegPressure);
    // This is similar to the LLVM OptForSize scenario in LoopUnrollPass.
    UP.MaxPercentThresholdBoost = IGC_GET_FLAG_VALUE(SetLoopUnrollMaxPercentThresholdBoostForHighRegPressure);
  }

  unsigned MaxTripCount = SE.getSmallConstantMaxTripCount(L);
  const unsigned MaxTripCountToUseUpperBound = 4;
  if (MaxTripCount && MaxTripCount <= MaxTripCountToUseUpperBound) {
    UP.UpperBound = true;
    UP.Force = true;
  }

  if (UnrollLoopForCodeSizeOnly) {
    UP.Threshold = getLoopSize(L, *this) + 1;
    UP.MaxPercentThresholdBoost = 100;
    UP.Partial = false;
  }

  // For all loads/stores that, through a GEP,
  // 1. access a fixed-size alloca, and
  // 2. use an index derived only from the loop induction variable.
  // For example,
  //
  // bb:
  //   %ALLOC = alloca [32 x float], align 4
  // Loop1:
  //   %i8 = phi i32 [ 0, %bb ], [ %i23, %Loop1 ]
  //   %i19 = getelementptr [32 x float], ptr %ALLOC, i64 0, i64 %i8
  //   %i23 = add i32 %i8, 1
  //   %i14 = fmul ...
  //   store float %i14, ptr %i19, align 4
  //   %i24 = icmp eq i32 %i23, 32
  //   br i1 %i24, label %..., label %Loop1
  // ...
  // Loop5:
  //   %i93 = phi i32 [ %i115, %Loop5 ], [ 0, %... ]
  //   %i99 = getelementptr [32 x float], ptr %ALLOC, i64 0, i64 %i93
  //   %i103 = load float, ptr %i99, align 4
  //   %i107 = fmul float %i103, 0x3F699999A0000000
  //   %i115 = add i32 %i93, 1
  //   %i116 = icmp eq i32 %i115, 32
  //   br i1 %i116, label %bb117, label %Loop5
  //
  // Fully unrolling both loops lets the SROA pass eliminate the entire access chain of the alloca. This is one of the
  // most impactful yet very common patterns across all application types. In many cases, especially when the only
  // values stored into the alloca are compiler-detectable constants, these loops need to be unrolled regardless of
  // how high the register pressure is.
  //
  // TODO: Add an analysis pass to link allocas with loops globally so that they are either unrolled together or not
  // at all. It could potentially do some global cost estimation.
  const unsigned UnrollMaxCountForAlloca = IGC_GET_FLAG_VALUE(PromoteLoopUnrollwithAllocaCountThreshold);
  bool AllocaFound = false;
  if (MaxTripCount && MaxTripCount <= UnrollMaxCountForAlloca &&
      IGC_IS_FLAG_ENABLED(EnablePromoteLoopUnrollwithAlloca)) {
    unsigned int ThresholdBoost = 0;
    for (auto BB : L->blocks()) {
      for (auto &I : *BB) {
        AllocaInst *AI = nullptr;
        GetElementPtrInst *GEP = nullptr;

        if (auto *LI = dyn_cast<LoadInst>(&I))
          AI = dyn_cast<AllocaInst>(LI->getPointerOperand());
        else if ((GEP = dyn_cast<GetElementPtrInst>(&I))) {
          // Test if the GEP index is a function of the loop induction variable.
          if (!isGEPLoopConstDerived(GEP, L, SE))
            continue;

          auto *SBase = dyn_cast<SCEVUnknown>(SE.getPointerBase(SE.getSCEV(GEP)));
          AI = dyn_cast<AllocaInst>(SBase->getValue());
        } else
          continue;

        if (!AI)
          continue;
        // Not a fixed size or not in the entry block.
        // TODO: Can an alloca with a fixed size not reside in the entry block?
        if (!AI->isStaticAlloca())
          continue;

        // Using the alloca size in bytes as the threshold boost seems a bit tricky.
        unsigned AllocaSize = *(AI->getAllocationSizeInBits(DL)) / 8;
        // Assume every iteration consumes 8 bytes of the alloca.
        if (AllocaSize / 8 > UnrollMaxCountForAlloca)
          continue;

        ThresholdBoost += AllocaSize;
        if (GEP)
          isGEPLoopInduction[GEP] = true;
        AllocaFound = true;
      }
    }
    if (AllocaFound) {
      // LLVM defaults to analyzing only 10 iterations; boost it to UnrollMaxCountForAlloca.
      UP.MaxIterationsCountToAnalyze = UnrollMaxCountForAlloca;
      UP.Threshold += ThresholdBoost;
      UP.UpperBound = true;
      UP.Force = UnrollLoopForCodeSizeOnly ? false : true;

      LLVM_DEBUG(dbgs() << "Increasing L:" << L->getName() << " threshold to " << UP.Threshold
                        << " due to Alloca accessed by:");
      for (const auto &pair : isGEPLoopInduction)
        LLVM_DEBUG(dbgs() << " " << pair.first->getName());
      LLVM_DEBUG(dbgs() << " \n");
    }
  }

  if (IGC_IS_FLAG_ENABLED(UnrollLoopForCodeSizeOnly))
    return;

  unsigned sendMessage = 0;
  unsigned TripCount = 0;
  BasicBlock *ExitingBlock = L->getLoopLatch();
  if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
    ExitingBlock = L->getExitingBlock();
  if (ExitingBlock)
    TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);

  // Do not enable partial unrolling if the loop counter is a float; it can cause precision issues.
  if (ExitingBlock) {
    if (UP.Partial) {
      IGCLLVM::TerminatorInst *Term = ExitingBlock->getTerminator();
      if (BranchInst *BI = dyn_cast<BranchInst>(Term)) {
        if (dyn_cast<FCmpInst>(BI->getCondition())) {
          UP.Partial = false;
          return;
        }
      }
    }
    // Heuristic to disable loop unrolling for a single short BB loop with a
    // barrier. Unrolling such a loop won't remove the dependency created by
    // the barrier; it may only add register pressure.
    if (L->getNumBlocks() == 1) {
      BasicBlock *BB = *L->block_begin();

      SmallPtrSet<const Value *, 32> EphValues;
      CodeMetrics Metrics;
      Metrics.analyzeBasicBlock(BB, *this, EphValues);
      if (Metrics.NumInsts < 50) {
        for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
          CallInst *Call = dyn_cast<CallInst>(I);
          if (!Call)
            continue;
          Function *F = Call->getCalledFunction();
          if (!F)
            continue;
          // FIXME: Shall we already inline barrier even in two-phase
          // inlining?
          if (F->getName() == "_Z7barrierj") {
            // Disable loop unrolling for a short loop with a barrier, where we
            // prefer wider SIMD to mitigate the barrier overhead.
            UP.Count = 1;
            UP.MaxCount = UP.Count;
            UP.Partial = false;
            UP.Runtime = false;
            return;
          }
        }
      }
    }
  }

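  // Multi-block loops: only consider advanced runtime unrolling of an
  // innermost loop that contains no user calls and no stores, has loads (if
  // any) only in the header, and has fewer than ~100 non-PHI instructions.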
  // Skip non-simple loops.
  if (L->getNumBlocks() != 1) {
    if (IGC_IS_FLAG_ENABLED(EnableAdvRuntimeUnroll) && IGCLLVM::isInnermost(L)) {
      auto countNonPHI = [](BasicBlock *BB) {
        // Count the number of instructions in the basic block, excluding debug instructions and PHIs.
        unsigned InstCountInBB = BB->sizeWithoutDebug();
        unsigned PHIs = 0;
        for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
          if (!isa<PHINode>(&*BI))
            break;
          ++PHIs;
        }
        return InstCountInBB - PHIs;
      };
      auto hasLoad = [](BasicBlock *BB) {
        for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI)
          if (isa<LoadInst>(&*BI))
            return true;
        return false;
      };
      auto hasStore = [](BasicBlock *BB) {
        for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI)
          if (isa<StoreInst>(&*BI))
            return true;
        return false;
      };
      auto hasCall = [](BasicBlock *BB) {
        for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI)
          if (isa<CallInst>(&*BI) && !isa<IntrinsicInst>(&*BI) && !isa<GenIntrinsicInst>(&*BI))
            return true;
        return false;
      };
      // For the innermost loop, allow certain patterns.
      unsigned Count = 0;
      bool HasCall = false;
      bool HasStore = false;
      bool MayHasLoadInHeaderOnly = true;
      for (auto BI = L->block_begin(), BE = L->block_end(); BI != BE; ++BI) {
        Count += countNonPHI(*BI);
        HasCall |= hasCall(*BI);
        HasStore |= hasStore(*BI);
        if (L->getHeader() != *BI)
          MayHasLoadInHeaderOnly &= !hasLoad(*BI);
      }
      // Runtime unroll it.
      if (!HasCall && !HasStore && MayHasLoadInHeaderOnly && Count < 100) {
        unsigned C = IGC_GET_FLAG_VALUE(AdvRuntimeUnrollCount);
        if (C == 0)
          C = 4;
        UP.Runtime = true;
        UP.Count = C;
        UP.MaxCount = UP.Count;
        // The following is only available and required from LLVM 3.7+.
        UP.AllowExpensiveTripCount = true;
      }
    }
    return;
  }

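  // Single-block loops: estimate the unrolled size from the block's
  // instruction count and its send-message density, and cap the unroll count
  // so the whole shader stays within LoopUnrollThreshold.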
  llvm::BasicBlock::InstListType::iterator I;
  llvm::BasicBlock *loopBlock = L->getBlocks()[0];
  int instCount =
      std::distance(loopBlock->instructionsWithoutDebug().begin(), loopBlock->instructionsWithoutDebug().end());

  // Check if the given basic block has a block read or write.
  auto hasBlockReadWrite = [](BasicBlock *BB) {
    for (auto I = BB->begin(), E = BB->end(); I != E; ++I)
      if (auto GII = dyn_cast<GenIntrinsicInst>(I))
        switch (GII->getIntrinsicID()) {
        case GenISAIntrinsic::GenISA_simdBlockRead:
        case GenISAIntrinsic::GenISA_simdBlockWrite:
          return true;
        default:
          break;
        }
    return false;
  };
  // Skip the following logic for OCL. Apparently, it's designed to prevent
  // loops in 3D shaders from being aggressively unrolled and increasing shader
  // binary size dramatically. So far, OCL doesn't have such a concern and, if
  // we need to consider that, more factors need consideration. Just skip
  // that for OCL.
  if (ctx->type == ShaderType::OPENCL_SHADER &&
      // Only try to fully unroll small loops with a known but small trip count.
      // This is PURELY heuristic.
      ((TripCount != 0 && TripCount <= 40 && instCount < 40) || hasBlockReadWrite(L->getHeader())) &&
      // FIXME: WA for cases where the compiler is running with a smaller stack size
      // we run into stack overflow in
      !ctx->m_DriverInfo.HasSmallStack()) {
    return;
  }

  for (I = loopBlock->begin(); I != loopBlock->end(); I++) {
    if (const auto pIntrinsic = llvm::dyn_cast<llvm::GenIntrinsicInst>(I)) {
      if (isSendMessage(pIntrinsic)) {
        sendMessage++;
      }
    }
  }

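  // Budget the unroll against the whole shader: each send message is counted
  // as ~4 extra instructions, and the remaining budget is LoopUnrollThreshold
  // minus the instructions already present in the shader.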
  unsigned int estimateUnrolledInstCount = (instCount + sendMessage * 4) * TripCount;
  unsigned int unrollLimitInstCount =
      LoopUnrollThreshold > totalInstCountInShader ? LoopUnrollThreshold - totalInstCountInShader : 0;
  bool limitUnrolling = (estimateUnrolledInstCount > unrollLimitInstCount) || (TripCount > unrollLimitInstCount) ||
                        (instCount + sendMessage * 4 > unrollLimitInstCount);

  // If the loop doesn't contain send messages, only apply the size limit (if
  // needed) and skip the remaining unrolling parameter changes.
  if (!sendMessage) {
    // If the estimated unrolled instruction count is larger than the unrolling threshold, limit unrolling.
    if (limitUnrolling) {
      UP.Count = MIN(unrollLimitInstCount / (instCount + sendMessage * 4), 4);
      if (TripCount != 0)
        while (UP.Count != 0 && TripCount % UP.Count != 0)
          UP.Count--;
      UP.MaxCount = UP.Count;
    }
    return;
  }

  // If the TripCount is known and the estimated unrolled count exceeds LoopUnrollThreshold, set the unroll count to 4.
  if (limitUnrolling) {
    UP.Count = MIN(TripCount, 4);
    UP.MaxCount = UP.Count;
  }

  unsigned int runtimeUnroll = IGC_GET_FLAG_VALUE(RuntimeLoopUnrolling); // 0: default, 1: on, 2: off
  if (runtimeUnroll == 2) {
    return;
  } else if (runtimeUnroll == 0) {
    // Do not enable runtime unrolling if the loop is long or the trip count is already known.
    // Skip this check if RuntimeLoopUnrolling is set to force on.
    if (instCount > 35 || TripCount) {
      return;
    }
  }

  if (!limitUnrolling) {
    UP.Runtime = true;
    UP.Count = 4;
    UP.MaxCount = UP.Count;
    // The following is only available and required from LLVM 3.7+.
    UP.AllowExpensiveTripCount = true;
  }

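  // Honor the SPIR-V max-iterations hint if present. The loop metadata is
  // expected to look roughly like (illustrative):
  //   !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"spv.loop.iterations.max", i32 16}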
  if (MDNode *LoopID = L->getLoopID()) {
    const llvm::StringRef maxIterMetadataNames = "spv.loop.iterations.max";
    for (unsigned i = 0; i < LoopID->getNumOperands(); ++i) {
      if (MDNode *MD = llvm::dyn_cast<MDNode>(LoopID->getOperand(i))) {
        if (MDString *S = llvm::dyn_cast<MDString>(MD->getOperand(0))) {
          if (maxIterMetadataNames.equals(S->getString())) {
            UP.MaxCount = static_cast<unsigned>(mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue());
          }
        }
      }
    }
  }
}

// [LLVM-UPGRADE] Peeling information was separated
// https://github.com/llvm/llvm-project/commit/e541e1b757237172c247904b670c9894d6b3759d

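// Honor the SPIR-V peel-count hint: loop metadata of the (illustrative) form
//   !{!"spv.loop.peel.count", i32 N}
// enables peeling of N iterations.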
void GenIntrinsicsTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                                 llvm::TargetTransformInfo::PeelingPreferences &PP) {
  if (MDNode *LoopID = L->getLoopID()) {
    const llvm::StringRef peelCountMetadataNames = "spv.loop.peel.count";

    for (unsigned i = 0; i < LoopID->getNumOperands(); ++i) {
      if (MDNode *MD = llvm::dyn_cast<MDNode>(LoopID->getOperand(i))) {
        if (MDString *S = llvm::dyn_cast<MDString>(MD->getOperand(0))) {
          if (peelCountMetadataNames.equals(S->getString())) {
            PP.AllowPeeling = true;
            PP.PeelCount = static_cast<unsigned>(mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue());
          }
        }
      }
    }
  }
}

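// Treat convergent calls that only access inaccessible memory (e.g.
// barrier-like intrinsics) as not profitable to hoist, since moving them
// across control flow can change cross-thread behavior rather than save work.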
bool GenIntrinsicsTTIImpl::isProfitableToHoist(Instruction *I) {
  if (auto *CI = dyn_cast<CallInst>(I)) {
    if (CI->isConvergent() && CI->onlyAccessesInaccessibleMemory()) {
      return false;
    }
  }
  return BaseT::isProfitableToHoist(I);
}

// TODO: Upon the complete removal of pre-LLVM 14 conditions, move to 'getInstructionCost' per LLVM 16 API
llvm::InstructionCost GenIntrinsicsTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands,
                                                        TTI::TargetCostKind CostKind) {
  return GenIntrinsicsTTIImpl::internalCalculateCost(U, Operands, CostKind);
}

#if LLVM_VERSION_MAJOR >= 16
llvm::InstructionCost GenIntrinsicsTTIImpl::getInstructionCost(const User *U, ArrayRef<const Value *> Operands,
                                                               TTI::TargetCostKind CostKind) {
  return GenIntrinsicsTTIImpl::internalCalculateCost(U, Operands, CostKind);
}
#endif

llvm::InstructionCost GenIntrinsicsTTIImpl::internalCalculateCost(const User *U, ArrayRef<const Value *> Operands,
                                                                  TTI::TargetCostKind CostKind) {
  // The extra cost of speculative execution for math intrinsics.
  if (auto *II = dyn_cast_or_null<IntrinsicInst>(U)) {
    if (Intrinsic::ID IID = II->getIntrinsicID()) {
      switch (IID) {
      case Intrinsic::cos:
      case Intrinsic::sin:
      case Intrinsic::sqrt:
        return TTI::TCC_Expensive;
      default:
        break;
      }
    }
  }

  if (IGC_IS_FLAG_ENABLED(EnablePromoteLoopUnrollwithAlloca)) {
    const GetElementPtrInst *GEP = nullptr;
    if (Operator::getOpcode(U) == Instruction::Load)
      GEP = dyn_cast<GetElementPtrInst>(cast<LoadInst>(U)->getPointerOperand());
    if (Operator::getOpcode(U) == Instruction::Store)
      GEP = dyn_cast<GetElementPtrInst>(cast<StoreInst>(U)->getPointerOperand());

    if (GEP) {
      if (isGEPLoopInduction.find(GEP) != isGEPLoopInduction.end())
        return TTI::TCC_Free;
    }
  }

  const Function *F = dyn_cast<Function>(U);
  if (F != nullptr) {
    IGC::CodeGenContext *CGC = this->ctx;
    if (!CGC->enableFunctionCall() && !GenISAIntrinsic::isIntrinsic(F) && !F->isIntrinsic()) {
      // If subroutine calls are not enabled but we still have function calls,
      // they are not inlined, e.g. due to two-phase inlining. Return the
      // function size instead to avoid under-estimating the cost of the call.
      //
      // FIXME: We need to collect the cost following the call graph. However,
      // as LLVM's inliner currently only supports bottom-up inlining, that's
      // not a big issue so far.
      //
      // FIXME: We also need to consider the case where subroutine calls are
      // enabled.
      unsigned FuncSize = countTotalInstructions(F, false);
      return TargetTransformInfo::TCC_Basic * FuncSize;
    }
  }

  return BaseT::getInstructionCost(U, Operands, CostKind);
}

// Stripped from LLVM's LoopUnrollPass::ApproximateLoopSize.
unsigned getLoopSize(const Loop *L, const TargetTransformInfo &TTI) {
  SmallPtrSet<const Value *, 32> EphValues;

  CodeMetrics Metrics;
  for (BasicBlock *BB : L->blocks())
    Metrics.analyzeBasicBlock(BB, TTI, EphValues);

  InstructionCost LoopSize;
  LoopSize = Metrics.NumInsts;

  LoopSize = (LoopSize > 3 /*BEInsns + 1*/) ? LoopSize : 3;
  return *LoopSize.getValue();
}

} // namespace llvm