Files
intel-graphics-compiler/IGC/Compiler/CISACodeGen/Simd32Profitability.cpp
Ashwini Kardile 68a90e4539 KW fix for uninitialized variables
KW fix for uninitialized variables
2021-10-19 06:41:49 +02:00

1209 lines
40 KiB
C++

/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2021 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "Compiler/CISACodeGen/Simd32Profitability.hpp"
#include "Compiler/CodeGenPublic.h"
#include "Compiler/IGCPassSupport.h"
#include "Compiler/CISACodeGen/Platform.hpp"
#include "common/LLVMWarningsPush.hpp"
#include <llvmWrapper/IR/DerivedTypes.h>
#include <llvmWrapper/Transforms/Utils/LoopUtils.h>
#include <llvm/IR/InstIterator.h>
#include <llvm/IR/Operator.h>
#include <llvmWrapper/IR/DerivedTypes.h>
#include "common/LLVMWarningsPop.hpp"
#include "GenISAIntrinsics/GenIntrinsics.h"
#include "GenISAIntrinsics/GenIntrinsicInst.h"
#include "Probe/Assertion.h"
using namespace llvm;
using namespace IGC;
using namespace IGC::IGCMD;
// Register pass to igc-opt
#define PASS_FLAG "simd32-profit"
#define PASS_DESCRIPTION "Check SIMD32 Profitability for OpenCL"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(Simd32ProfitabilityAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_END(Simd32ProfitabilityAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
char Simd32ProfitabilityAnalysis::ID = 0;
const unsigned BRANCHY_MINPATH = 8;
Simd32ProfitabilityAnalysis::Simd32ProfitabilityAnalysis()
: FunctionPass(ID), F(nullptr), PDT(nullptr), LI(nullptr),
pMdUtils(nullptr), WI(nullptr), m_isSimd32Profitable(true),
m_isSimd16Profitable(true) {
initializeSimd32ProfitabilityAnalysisPass(*PassRegistry::getPassRegistry());
}
static std::tuple<Value* /*INIT*/, Value* /*CURR*/, Value* /*STEP*/, Value* /*NEXT*/>
getInductionVariable(Loop* L) {
BasicBlock* H = L->getHeader();
BasicBlock* Incoming = 0, *Backedge = 0;
pred_iterator PI = pred_begin(H);
IGC_ASSERT_MESSAGE(PI != pred_end(H), "Loop must have at least one backedge!");
Backedge = *PI++;
if (PI == pred_end(H)) // dead loop
return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
Incoming = *PI++;
if (PI != pred_end(H)) // multiple backedges?
return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
if (L->contains(Incoming)) {
if (L->contains(Backedge))
return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
std::swap(Incoming, Backedge);
}
else if (!L->contains(Backedge))
return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
// Loop over all of the PHI nodes, looking for an indvar.
for (auto I = H->begin(); isa<PHINode>(I); ++I) {
PHINode* PN = cast<PHINode>(I);
if (auto Inc = dyn_cast<Instruction>(PN->getIncomingValueForBlock(Backedge))) {
if (Inc->getOpcode() == Instruction::Add && Inc->getOperand(0) == PN) {
return
std::make_tuple(PN->getIncomingValueForBlock(Incoming), PN,
Inc->getOperand(1), Inc);
}
}
}
return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
}
enum {
LOOPCOUNT_LIKELY_SMALL,
LOOPCOUNT_LIKELY_LARGE,
LOOPCOUNT_UNKNOWN
};
static bool isSignedPredicate(CmpInst::Predicate Pred) {
switch (Pred) {
default: break;
case CmpInst::ICMP_EQ:
case CmpInst::ICMP_NE:
case CmpInst::ICMP_SGT:
case CmpInst::ICMP_SLT:
case CmpInst::ICMP_SGE:
case CmpInst::ICMP_SLE:
return true;
}
return false;
}
static bool isUnsignedPredicate(CmpInst::Predicate Pred) {
switch (Pred) {
default: break;
case CmpInst::ICMP_EQ:
case CmpInst::ICMP_NE:
case CmpInst::ICMP_UGT:
case CmpInst::ICMP_ULT:
case CmpInst::ICMP_UGE:
case CmpInst::ICMP_ULE:
return true;
}
return false;
}
static bool hasSameSignedness(CmpInst::Predicate LHS, CmpInst::Predicate RHS) {
if (isSignedPredicate(LHS) && isSignedPredicate(RHS))
return true;
if (isUnsignedPredicate(LHS) && isUnsignedPredicate(RHS))
return true;
return false;
}
static std::tuple<Value*, Value*, Value*, bool>
isOutOfRangeComparison(Value* Cond) {
BinaryOperator* BO = dyn_cast<BinaryOperator>(Cond);
if (!BO || BO->getOpcode() != Instruction::Or)
return std::make_tuple(nullptr, nullptr, nullptr, false);
ICmpInst* LHS = dyn_cast<ICmpInst>(BO->getOperand(0));
ICmpInst* RHS = dyn_cast<ICmpInst>(BO->getOperand(1));
if (!LHS || !RHS)
return std::make_tuple(nullptr, nullptr, nullptr, false);
CmpInst::Predicate P0 = LHS->getPredicate();
CmpInst::Predicate P1 = RHS->getPredicate();
if (!hasSameSignedness(P0, P1))
return std::make_tuple(nullptr, nullptr, nullptr, false);
// Simplify the checking since they have the same signedness.
P0 = ICmpInst::getSignedPredicate(P0);
P1 = ICmpInst::getSignedPredicate(P1);
if (!(P0 == CmpInst::ICMP_SLT || P0 == CmpInst::ICMP_SLE)) {
std::swap(LHS, RHS);
std::swap(P0, P1);
}
if (!(P0 == CmpInst::ICMP_SLT || P0 == CmpInst::ICMP_SLE) ||
!(P1 == CmpInst::ICMP_SGT || P1 == CmpInst::ICMP_SGE))
return std::make_tuple(nullptr, nullptr, nullptr, false);
if (LHS->getOperand(0) != RHS->getOperand(0))
return std::make_tuple(nullptr, nullptr, nullptr, false);
return std::make_tuple(LHS->getOperand(0),
LHS->getOperand(1), RHS->getOperand(1),
isSignedPredicate(LHS->getPredicate()));
}
static Value* getLoopCounter(Loop* L, Value* X) {
BasicBlock* H = L->getHeader();
BasicBlock* Incoming = 0, *Backedge = 0;
pred_iterator PI = pred_begin(H);
IGC_ASSERT_MESSAGE(PI != pred_end(H), "Loop must have at least one backedge!");
Backedge = *PI++;
if (PI == pred_end(H)) // dead loop
return nullptr;
Incoming = *PI++;
if (PI != pred_end(H)) // multiple backedges?
return nullptr;
if (L->contains(Incoming)) {
if (L->contains(Backedge))
return nullptr;
std::swap(Incoming, Backedge);
}
else if (!L->contains(Backedge))
return nullptr;
for (auto I = H->begin(); isa<PHINode>(I); ++I) {
PHINode* PN = cast<PHINode>(I);
if (X == PN->getIncomingValueForBlock(Backedge))
return PN;
}
return nullptr;
}
static std::tuple<int, int>
countOperands(Value* V, Value* LHS, Value* RHS) {
if (V == LHS || V == RHS)
return std::make_tuple((V == LHS), (V == RHS));
// Count LHS, RHS in an expression like m*L + n*R +/- C, where C is
// constant.
BinaryOperator* BO = dyn_cast<BinaryOperator>(V);
if (!BO ||
(BO->getOpcode() != Instruction::Add &&
BO->getOpcode() != Instruction::Sub &&
BO->getOpcode() != Instruction::Shl &&
BO->getOpcode() != Instruction::Xor))
return std::make_tuple(0, 0);
if (BO->getOpcode() == Instruction::Shl) {
ConstantInt* CI = dyn_cast<ConstantInt>(BO->getOperand(1));
if (!CI)
return std::make_tuple(0, 0);
int L = 0, R = 0;
std::tie(L, R) = countOperands(BO->getOperand(0), LHS, RHS);
uint64_t ShAmt = CI->getZExtValue();
return std::make_tuple((L << ShAmt), (R << ShAmt));
}
if (BO->getOpcode() == Instruction::Xor) {
ConstantInt* CI = dyn_cast<ConstantInt>(BO->getOperand(1));
if (!CI || CI->getSExtValue() != -1)
return std::make_tuple(0, 0);
int L = 0, R = 0;
std::tie(L, R) = countOperands(BO->getOperand(0), LHS, RHS);
return std::make_tuple(-L, -R);
}
IGC_ASSERT((BO->getOpcode() == Instruction::Add) || (BO->getOpcode() == Instruction::Sub));
if (isa<Constant>(BO->getOperand(1)))
return countOperands(BO->getOperand(0), LHS, RHS);
int L0 = 0, L1 = 0;
std::tie(L0, L1) = countOperands(BO->getOperand(0), LHS, RHS);
int R0 = 0, R1 = 0;
std::tie(R0, R1) = countOperands(BO->getOperand(1), LHS, RHS);
if (BO->getOpcode() == Instruction::Add)
return std::make_tuple(L0 + R0, L1 + R1);
IGC_ASSERT(BO->getOpcode() == Instruction::Sub);
return std::make_tuple(L0 - R0, L1 - R1);
}
static bool isNegatedByLB(Value* V, Value* X, Value* LB) {
// Check if `V` is calculated as LB - X +/- C, where C is constant.
int L = 0, R = 0;
std::tie(L, R) = countOperands(V, LB, X);
return (L == 1) && (R == -1);
}
static bool isNegatedBy2UB(Value* V, Value* X, Value* UB) {
// Check if `V` is calculated as 2UB - X +/- C, where C is constant.
int L = 0, R = 0;
std::tie(L, R) = countOperands(V, UB, X);
return (L == 2) && (R == -1);
}
unsigned Simd32ProfitabilityAnalysis::estimateLoopCount_CASE1(Loop* L) {
BasicBlock* Exit = L->getExitingBlock();
if (!Exit)
return LOOPCOUNT_UNKNOWN;
BranchInst* Br = dyn_cast<BranchInst>(Exit->getTerminator());
if (!Br || !Br->isConditional())
return LOOPCOUNT_UNKNOWN;
if (!L->contains(Br->getSuccessor(0)))
return LOOPCOUNT_UNKNOWN;
Value* X = nullptr, * LB = nullptr, * UB = nullptr;
bool Signed = false;
std::tie(X, LB, UB, Signed) = isOutOfRangeComparison(Br->getCondition());
if (!X) {
ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
if (!Cmp)
return LOOPCOUNT_UNKNOWN;
switch (Cmp->getPredicate()) {
default:
return LOOPCOUNT_UNKNOWN;
case CmpInst::ICMP_UGT:
case CmpInst::ICMP_UGE:
// A smart use of unsigned comparison on signed values to perform a
// out-of-range change of (0, N).
break;
}
X = Cmp->getOperand(0);
LB = Constant::getNullValue(X->getType());
UB = Cmp->getOperand(1);
Signed = true;
}
Value* LC = getLoopCounter(L, X);
if (!LC)
return LOOPCOUNT_UNKNOWN;
if (PHINode * PN = dyn_cast<PHINode>(X)) {
if (PN->getNumIncomingValues() != 2)
return LOOPCOUNT_UNKNOWN;
BasicBlock* BB0 = PN->getIncomingBlock(0);
BasicBlock* IfBB = BB0->getSinglePredecessor();
if (!IfBB)
return LOOPCOUNT_UNKNOWN;
Br = dyn_cast<BranchInst>(IfBB->getTerminator());
if (!Br || !Br->isConditional())
return LOOPCOUNT_UNKNOWN;
ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
if (!Cmp)
return LOOPCOUNT_UNKNOWN;
CmpInst::Predicate Pred = Cmp->getPredicate();
Value* LHS = Cmp->getOperand(0);
Value* RHS = Cmp->getOperand(1);
if (LHS != LC) {
std::swap(LHS, RHS);
Pred = CmpInst::getSwappedPredicate(Pred);
}
if (LHS != LC)
return LOOPCOUNT_UNKNOWN;
if (!Signed)
Pred = ICmpInst::getSignedPredicate(Pred);
if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_SLE)
return LOOPCOUNT_UNKNOWN;
if (RHS != LB)
return LOOPCOUNT_UNKNOWN;
Value* X0 = PN->getIncomingValue(0);
Value* X1 = PN->getIncomingValue(1);
if (!isNegatedByLB(X0, LC, LB))
return LOOPCOUNT_UNKNOWN;
if (!isNegatedBy2UB(X1, LC, UB))
return LOOPCOUNT_UNKNOWN;
}
else if (BinaryOperator * BO = dyn_cast<BinaryOperator>(X)) {
if (BO->getOpcode() != Instruction::Sub)
return LOOPCOUNT_UNKNOWN;
if (BO->getOperand(1) != LC)
return LOOPCOUNT_UNKNOWN;
SelectInst* SI = dyn_cast<SelectInst>(BO->getOperand(0));
if (!SI)
return LOOPCOUNT_UNKNOWN;
ICmpInst* Cmp = dyn_cast<ICmpInst>(SI->getCondition());
if (!Cmp)
return LOOPCOUNT_UNKNOWN;
CmpInst::Predicate Pred = Cmp->getPredicate();
Value* LHS = Cmp->getOperand(0);
Value* RHS = Cmp->getOperand(1);
if (LHS != LC) {
std::swap(LHS, RHS);
Pred = CmpInst::getSwappedPredicate(Pred);
}
if (LHS != LC)
return LOOPCOUNT_UNKNOWN;
if (!Signed)
Pred = ICmpInst::getSignedPredicate(Pred);
Value* X0 = SI->getTrueValue();
Value* X1 = SI->getFalseValue();
if (Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE) {
std::swap(X0, X1);
Pred = CmpInst::getInversePredicate(Pred);
}
if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_SLE)
return LOOPCOUNT_UNKNOWN;
if (RHS != LB)
return LOOPCOUNT_UNKNOWN;
int L0 = 0, R0 = 0;
std::tie(L0, R0) = countOperands(X0, LB, nullptr);
int L1 = 0, R1 = 0;
std::tie(L1, R1) = countOperands(X1, UB, nullptr);
if (L0 != 1 || L1 != 2)
return LOOPCOUNT_UNKNOWN;
}
else
return LOOPCOUNT_UNKNOWN;
// Ok, we found a loop of the following pattern:
//
// do {
// if (x < 0) {
// x = 0 - x +/- c0;
// } else {
// x = 2 * UB - x +/- c1;
// }
// } while (x < LB || x > UB);
//
// such loop will run only once or twice when non-arbitary large `x`. If a
// non-uniform loop only runs several iterations, divergence cost due to
// SIMD32 could be ignored.
return LOOPCOUNT_LIKELY_SMALL;
}
unsigned Simd32ProfitabilityAnalysis::estimateLoopCount_CASE2(Loop* L) {
SmallVector<BasicBlock*, 8> ExitingBBs;
L->getExitingBlocks(ExitingBBs);
Value* Init = nullptr, * Curr= nullptr, * Next= nullptr, * Step= nullptr;
std::tie(Init, Curr, Step, Next) = getInductionVariable(L);
if (!Init || !Curr || !Step || !Next)
return LOOPCOUNT_UNKNOWN;
ConstantInt* I0 = dyn_cast<ConstantInt>(Init);
ConstantInt* S0 = dyn_cast<ConstantInt>(Step);
if (!I0 || !S0)
return LOOPCOUNT_UNKNOWN;
for (auto BB : ExitingBBs) {
BranchInst* Br = dyn_cast<BranchInst>(BB->getTerminator());
if (!Br || !Br->isConditional())
continue;
if (!L->contains(Br->getSuccessor(0))) // Not condition of `continue`.
continue;
ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
if (!WI->isUniform(Br)) {
BinaryOperator* BO = dyn_cast<BinaryOperator>(Br->getCondition());
if (!BO)
continue;
if (BO->getOpcode() != Instruction::And)
continue;
ICmpInst* Cond = nullptr;
ICmpInst* Op0 = dyn_cast<ICmpInst>(BO->getOperand(0));
if (Op0 && WI->isUniform(Op0))
Cond = Op0;
if (!Cond) {
ICmpInst* Op1 = dyn_cast<ICmpInst>(BO->getOperand(1));
if (Op1 && WI->isUniform(Op1))
Cond = Op1;
}
if (!Cond)
continue;
Cmp = Cond;
}
if (!Cmp)
continue;
CmpInst::Predicate Pred = Cmp->getPredicate();
switch (Pred) {
default:
// TODO: Handle more predicates.
continue;
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_ULT:
break;
}
Value* Op0 = Cmp->getOperand(0);
Value* Op1 = Cmp->getOperand(1);
if (Op0 != Next)
continue;
ConstantInt* E0 = dyn_cast<ConstantInt>(Op1);
if (!E0)
continue;
ConstantInt* N = dyn_cast<ConstantInt>(
Pred == ICmpInst::ICMP_SLT
? ConstantExpr::getSDiv(ConstantExpr::getSub(E0, I0), S0)
: ConstantExpr::getUDiv(ConstantExpr::getSub(E0, I0), S0));
if (!N)
continue;
if (N->getValue().slt(0))
continue;
if (N->getValue().slt(100))
return LOOPCOUNT_LIKELY_SMALL;
}
// Ok, we found a non-uniform loop with multiple exiting conditions.
// However, one of them is uniform one and has small loop count.
return LOOPCOUNT_UNKNOWN;
}
unsigned Simd32ProfitabilityAnalysis::estimateLoopCount(Loop* L) {
unsigned Ret;
Ret = estimateLoopCount_CASE1(L);
if (Ret != LOOPCOUNT_UNKNOWN)
return Ret;
Ret = estimateLoopCount_CASE2(L);
if (Ret != LOOPCOUNT_UNKNOWN)
return Ret;
return Ret;
}
static Value* getLoopCount(Value* Start, Value* End) {
// Poorman's loop count checking as we need to check that result with WIA.
ConstantInt* CStart = dyn_cast<ConstantInt>(Start);
ConstantInt* CEnd = dyn_cast<ConstantInt>(End);
if (CStart && CEnd)
return ConstantExpr::getSub(CEnd, CStart);
if (CStart && CStart->isNullValue())
return End;
BinaryOperator* BO = dyn_cast<BinaryOperator>(End);
if (!BO || BO->getOpcode() != Instruction::Add)
return nullptr;
Value* Op0 = BO->getOperand(0);
Value* Op1 = BO->getOperand(1);
if (Op0 != Start)
std::swap(Op0, Op1);
if (Op0 == Start)
return Op1;
return nullptr;
}
/// hasIEEESqrtOrDivFunc - Check whether IEEE correctly-rounded SQRT or DIV is
/// used in the given function.
static bool hasIEEESqrtOrDivFunc(const Function& F) {
for (auto& BB : F)
for (auto& I : BB) {
const GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(&I);
if (!GII)
continue;
switch (GII->getIntrinsicID()) {
case GenISAIntrinsic::GenISA_IEEE_Sqrt:
case GenISAIntrinsic::GenISA_IEEE_Divide:
return true;
default: break;
}
}
return false;
}
/// hasSubGroupFunc - Check whether subgroup functions are used in the given
/// function.
static bool hasSubGroupFunc(const Function& F)
{
for (auto& BB : F)
{
for (auto& I : BB)
{
if (isSubGroupIntrinsic(&I))
{
return true;
}
}
}
return false;
}
bool Simd32ProfitabilityAnalysis::runOnFunction(Function& F)
{
this->F = &F;
CodeGenContext* context = nullptr;
context = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
if (context->type == ShaderType::OPENCL_SHADER)
{
PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
WI = &getAnalysis<WIAnalysis>();
pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
m_isSimd16Profitable = checkSimd16Profitable(context);
m_isSimd32Profitable = m_isSimd16Profitable && checkSimd32Profitable(context);
}
else if (context->type == ShaderType::PIXEL_SHADER)
{
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
m_isSimd32Profitable = checkPSSimd32Profitable();
}
return false;
}
static bool isPayloadHeader(Value* V) {
Argument* Arg = dyn_cast<Argument>(V);
if (!Arg || !Arg->hasName())
return false;
IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
if (!VTy || VTy->getNumElements() != 8 ||
!VTy->getElementType()->isIntegerTy(32))
return false;
return Arg->getName() != "payloadHeader";
}
static bool isR0(Value* V) {
Argument* Arg = dyn_cast<Argument>(V);
if (!Arg || !Arg->hasName())
return false;
IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
if (!VTy || VTy->getNumElements() != 8 ||
!VTy->getElementType()->isIntegerTy(32))
return false;
return Arg->getName() != "r0";
}
static bool isEnqueuedLocalSize(Value* V) {
Argument* Arg = dyn_cast<Argument>(V);
if (!Arg || !Arg->hasName())
return false;
IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
if (!VTy || VTy->getNumElements() != 3 ||
!VTy->getElementType()->isIntegerTy(32))
return false;
return Arg->getName() != "enqueuedLocalSize";
}
static bool isGetGroupIdX(Value* V) {
auto EEI = dyn_cast<ExtractElementInst>(V);
if (!EEI)
return false;
if (!EEI->getType()->isIntegerTy(32))
return false;
auto CI = dyn_cast<Constant>(EEI->getOperand(1));
if (!CI || !CI->isOneValue())
return false;
return isR0(EEI->getOperand(0));
}
static bool isGetEnqueuedLocalSizeX(Value* V) {
auto EEI = dyn_cast<ExtractElementInst>(V);
if (!EEI)
return false;
if (!EEI->getType()->isIntegerTy(32))
return false;
auto CI = dyn_cast<Constant>(EEI->getOperand(1));
if (!CI || !CI->isNullValue())
return false;
return isEnqueuedLocalSize(EEI->getOperand(0));
}
static bool isGetLocalIdX(Value* V) {
if (auto ZEI = dyn_cast<ZExtInst>(V))
return isGetLocalIdX(ZEI->getOperand(0));
Argument* Arg = dyn_cast<Argument>(V);
if (!Arg || !Arg->hasName())
return false;
if (!Arg->getType()->isIntegerTy(16))
return false;
return Arg->getName() == "localIdX";
}
static bool isGetGlobalOffsetX(Value* V) {
auto EEI = dyn_cast<ExtractElementInst>(V);
if (!EEI)
return false;
if (!EEI->getType()->isIntegerTy(32))
return false;
auto CI = dyn_cast<Constant>(EEI->getOperand(1));
if (!CI || !CI->isNullValue())
return false;
return isPayloadHeader(EEI->getOperand(0));
}
static bool isGetGlobalIdX(Value* V) {
// GlobalIdX = GroupIdX * EnqueuedLocalSizeX + LocalIdX + GlobalOffsetX
auto BO = dyn_cast<BinaryOperator>(V);
if (!BO || BO->getOpcode() != Instruction::Add)
return false;
auto BO1 = dyn_cast<BinaryOperator>(BO->getOperand(0));
auto A0 = BO->getOperand(1);
if (!BO1) {
BO1 = dyn_cast<BinaryOperator>(BO->getOperand(1));
A0 = BO->getOperand(0);
}
if (!BO1 || BO1->getOpcode() != Instruction::Add)
return false;
auto BO2 = dyn_cast<BinaryOperator>(BO1->getOperand(0));
auto A1 = BO1->getOperand(1);
if (!BO2) {
BO2 = dyn_cast<BinaryOperator>(BO1->getOperand(1));
A1 = BO1->getOperand(0);
}
if (!BO2 || BO2->getOpcode() != Instruction::Mul)
return false;
auto M0 = BO2->getOperand(0);
auto M1 = BO2->getOperand(1);
if (!((isGetGroupIdX(M0) && isGetEnqueuedLocalSizeX(M1)) ||
(isGetGroupIdX(M1) && isGetEnqueuedLocalSizeX(M0))))
return false;
return ((isGetLocalIdX(A0) && isGetGlobalOffsetX(A1)) ||
(isGetLocalIdX(A1) && isGetGlobalOffsetX(A0)));
}
bool Simd32ProfitabilityAnalysis::isSelectBasedOnGlobalIdX(Value* V) {
PHINode* PN = dyn_cast<PHINode>(V);
while (!PN) {
auto BO = dyn_cast<BinaryOperator>(V);
if (!BO || BO->getOpcode() != Instruction::Shl)
return false;
if (!isa<Constant>(BO->getOperand(1)))
return false;
V = BO->getOperand(0);
PN = dyn_cast<PHINode>(V);
}
if (PN->getNumIncomingValues() != 2)
return false;
auto Op0 = PN->getIncomingValue(0);
if (!WI->isUniform(Op0))
return false;
auto Op1 = PN->getIncomingValue(1);
if (!WI->isUniform(Op1))
return false;
auto BB0 = PN->getIncomingBlock(0);
auto BB1 = PN->getIncomingBlock(1);
auto IfBB = BB0->getSinglePredecessor();
if (!IfBB || IfBB == BB1->getSinglePredecessor())
return false;
auto Br = dyn_cast<BranchInst>(IfBB->getTerminator());
if (!Br || !Br->isConditional())
return false;
ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
if (!Cmp)
return false;
Value* LHS = Cmp->getOperand(0);
Value* RHS = Cmp->getOperand(1);
switch (Cmp->getPredicate()) {
default:
return false;
case CmpInst::ICMP_SLT:
case CmpInst::ICMP_SLE:
break;
case CmpInst::ICMP_SGT:
case CmpInst::ICMP_SGE:
std::swap(LHS, RHS);
break;
}
if (!WI->isUniform(RHS))
return false;
return !isGetGlobalIdX(LHS);
}
bool Simd32ProfitabilityAnalysis::checkSimd32Profitable(CodeGenContext* ctx)
{
// If a kernel is too big, it would probably have enough work for EUs
// even without simd32; and simd32 would have more visa variables than
// 64K limit (ocl c99 64 bit PrintHalf/half8.c for example); thus make
// sense to skip simd32.
size_t programSize = 0;
for (Function::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI)
{
BasicBlock* BB = &*FI;
programSize += BB->size();
}
if (programSize > 8000)
{
return false;
}
// If we have workgroup size (or workgroup size hint) metadata, check whether the X dimension
// is expected to be of size 16 or below. If it is, no point in using SIMD32, we'll just
// get empty lanes.
auto funcInfoMD = pMdUtils->findFunctionsInfoItem(F);
if (funcInfoMD != pMdUtils->end_FunctionsInfo())
{
ThreadGroupSizeMetaDataHandle tgSize = funcInfoMD->second->getThreadGroupSize();
ThreadGroupSizeMetaDataHandle tgSizeHint = funcInfoMD->second->getThreadGroupSizeHint();
if (ctx->getModuleMetaData()->csInfo.maxWorkGroupSize && ctx->getModuleMetaData()->csInfo.maxWorkGroupSize <= 16)
return false;
if ((tgSize->hasValue() && (tgSize->getXDim() * tgSize->getYDim() * tgSize->getZDim()) <= 16) ||
(tgSizeHint->hasValue() && (tgSizeHint->getXDim() * tgSizeHint->getYDim() * tgSizeHint->getZDim()) <= 16)) {
return false;
}
}
// WORKAROUND - Skip SIMD32 if subgroup functions are present.
if (hasSubGroupFunc(*F)) {
return false;
}
const CPlatform* platform = &ctx->platform;
switch (platform->GetPlatformFamily()) {
case IGFX_GEN9_CORE:
/* TODO: Try to apply for platform->getPlatformInfo().eProductFamily ==
* IGFX_BROXTON only. */
// FALL THROUGH
case IGFX_GEN10_CORE:
if (hasIEEESqrtOrDivFunc(*F)) {
return false;
}
break;
default:
break;
}
// END OF WORKAROUND
// Ok, that's not the case.
// Now, check whether we have any non-uniform loops.
// The idea is that if there are divergenet loops, then SIMD32 will be harmful,
// because we'll waste time running loops with very few full lanes.
// If there are no divergent loops, SIMD32 is worth a shot. It still may not
// be selected, due to spills.
for (LoopInfo::iterator li = LI->begin(), le = LI->end(); li != le; ++li) {
llvm::Loop* loop = *li;
SmallVector<BasicBlock*, 8> exitingBlocks;
loop->getExitingBlocks(exitingBlocks);
bool AllUniform = true;
for (auto BBI = exitingBlocks.begin(), BBE = exitingBlocks.end(); BBI != BBE; ++BBI) {
BasicBlock* block = *BBI;
Instruction* term = block->getTerminator();
if (!WI->isUniform(term)) {
auto Br = dyn_cast<BranchInst>(term);
// Check special case for non-uniform loop where, except the
// initial, current, and next values, STEP and COUNT are
// uniform. In such a case, the loop is only diverged at the
// termination. It should be still profitable to be compiled
// into SIMD32 mode.
if (Br && Br->isConditional()) {
auto ICmp = dyn_cast<ICmpInst>(Br->getCondition());
if (ICmp) {
Value* Init = nullptr, * Curr = nullptr, * Step= nullptr, * Next = nullptr;
std::tie(Init, Curr, Step, Next)
= getInductionVariable(loop);
if (Init && Curr && Next && Step &&
WI->isUniform(Step)) {
auto Op0 = ICmp->getOperand(0);
auto Op1 = ICmp->getOperand(1);
if (SExtInst *SI0 = dyn_cast<SExtInst>(Op0))
Op0 = SI0->getOperand(0);
if (SExtInst *SI1 = dyn_cast<SExtInst>(Op1))
Op1 = SI1->getOperand(0);
if (Op0 != Next && Op0 != Curr)
std::swap(Op0, Op1);
// Skip non-uniform loop which only terminates on
// comparison between non-uniform induction variable
// and uniform value.
if (Op0 == Next || Op0 == Curr) {
// TODO: Need to check whether Init is linear to
// global/local ID. However, that checking is not
// that straightforward before code emitter.
if (WI->isUniform(Op1))
continue;
// TODO: Eable IndVarSimplify to simlify the
// following check.
if (Value * Count = getLoopCount(Init, Op1)) {
if (WI->isUniform(Count))
continue;
if (isSelectBasedOnGlobalIdX(Count))
continue;
}
}
}
}
}
AllUniform = false;
break;
}
}
if (!AllUniform) {
switch (estimateLoopCount(loop)) {
case LOOPCOUNT_LIKELY_LARGE:
case LOOPCOUNT_UNKNOWN:
return false;
case LOOPCOUNT_LIKELY_SMALL:
break;
}
}
}
return true;
}
/// Cyclomatic complexity measures of the number of linearly independent paths
/// through a region.
///
/// M = a * E - N + 2 where
/// E = the number of edges of the graph
/// N = the number of nodes of the graph
/// a = scalar factor (1 for uniform branches).
///
/// We focus on loops instead of the entire program, since cyclomatic
/// complexity is roughly linear when concatenating two programs, i.e.
/// CC(F # G) = (E1 + E2 + 1) - (N1 + N2) + 2
/// = (E1 - N1 + 2) + (E2 - N2 + 2) - 1
/// = CC(F) + CC(G) - 1.
///
static const unsigned CYCLOMATIC_COMPLEXITY_THRESHOLD = 200;
unsigned Simd32ProfitabilityAnalysis::getLoopCyclomaticComplexity() {
unsigned MaxCC = 0;
for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
Loop* L = *I;
unsigned CC = 2;
for (auto BI = L->block_begin(), BE = L->block_end(); BI != BE; ++BI) {
BasicBlock* BB = *BI;
IGCLLVM::TerminatorInst* TI = BB->getTerminator();
bool IsUniform = WI->isUniform(TI);
CC += TI->getNumSuccessors() * (IsUniform ? 1 : 2);
}
CC -= L->getNumBlocks();
MaxCC = std::max(CC, MaxCC);
}
return MaxCC;
}
static unsigned getNumOfNonUniformExits(Loop* L, WIAnalysis* WI) {
SmallVector<BasicBlock*, 8> ExistingBlocks;
L->getExitingBlocks(ExistingBlocks);
unsigned Count = 0;
for (auto BB : ExistingBlocks) {
IGCLLVM::TerminatorInst* TI = BB->getTerminator();
bool IsUniform = WI->isUniform(TI);
Count += !IsUniform;
}
return Count;
}
/// Check if a loop or its subloop has multiple non-uniform exists.
static bool hasMultipleExits(Loop* L, WIAnalysis* WI) {
if (getNumOfNonUniformExits(L, WI) > 1)
return true;
for (auto InnerL : L->getSubLoops())
if (hasMultipleExits(InnerL, WI))
return true;
return false;
}
/// Given a loop, return nested (inner) loops with multiple non-uniform exits.
/// E.g. assume L2, L3, L5, L7 are only loops with multiple non-uniform exists
/// L1
/// L2
/// L3
/// L4
/// L5
/// L6
/// L7
/// then it returns {L2, L5}
///
static void getNestedLoopsWithMultpleExists(Loop* L, WIAnalysis* WI,
SmallVectorImpl<Loop*>& Result) {
if (getNumOfNonUniformExits(L, WI) > 1) {
for (auto InnerL : L->getSubLoops()) {
if (hasMultipleExits(InnerL, WI)) {
Result.push_back(L);
return;
}
}
// Only a single level, do not add into the result.
return;
}
// Outer loop is normal. Check its inner loop structure, recursively.
for (auto InnerL : L->getSubLoops())
getNestedLoopsWithMultpleExists(InnerL, WI, Result);
}
/// Check if loops with multiple exists dominate the entire function.
static bool hasNestedLoopsWithMultipleExits(Function* F, LoopInfo* LI,
WIAnalysis* WI) {
// Find top level nested loops with multiple non-uniform exists.
SmallVector<Loop*, 8> Loops;
for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
Loop* L = *I;
getNestedLoopsWithMultpleExists(L, WI, Loops);
}
// Sum the IR size of these loops.
unsigned LoopSize = 0;
for (auto L : Loops)
for (auto BB : L->getBlocks())
LoopSize += (unsigned)BB->size();
// Check the ratio between nested loops with multiple exists and the total
// number of instructions. A higher ratio means these loops dominate this
// kernel.
unsigned FuncSize = 0;
for (auto& BB : F->getBasicBlockList())
FuncSize += (unsigned)BB.size();
bool retVal = false;
if (FuncSize > 0)
{
retVal = float(LoopSize) / FuncSize >= 0.7f;
}
return retVal;
}
static bool hasLongStridedLdStInLoop(Function* F, LoopInfo* LI, WIAnalysis* WI) {
SmallVector<Loop*, 32> Loops;
// Collect innermost simple loop.
for (auto I = LI->begin(), E = LI->end(); I != E; ++I) {
auto L = *I;
if (!IGCLLVM::isInnermost(L))
continue;
if (L->getNumBlocks() != 2)
continue;
auto* Latch = L->getLoopLatch();
if (!Latch || !Latch->front().isTerminator())
continue;
Loops.push_back(L);
}
unsigned LDs = 0;
unsigned STs = 0;
for (auto L : Loops) {
auto BB = L->getHeader();
for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
if (auto LD = dyn_cast<LoadInst>(&*I)) {
VectorType* VTy = dyn_cast<VectorType>(LD->getType());
if (!VTy || IGCLLVM::GetVectorTypeBitWidth(VTy) <= 128)
continue;
if (WI->isUniform(LD))
continue;
++LDs;
}
if (auto ST = dyn_cast<StoreInst>(&*I)) {
Value* Ptr = ST->getPointerOperand();
Value* Val = ST->getValueOperand();
VectorType* VTy = dyn_cast<VectorType>(Val->getType());
if (!VTy || IGCLLVM::GetVectorTypeBitWidth(VTy) <= 128)
continue;
if (WI->isUniform(Ptr))
continue;
++STs;
}
}
if (LDs > 3 || STs > 3)
return true;
}
return false;
}
bool Simd32ProfitabilityAnalysis::checkSimd16Profitable(CodeGenContext* ctx) {
if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x1) &&
getLoopCyclomaticComplexity() >= CYCLOMATIC_COMPLEXITY_THRESHOLD) {
return false;
}
if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x2) &&
hasNestedLoopsWithMultipleExits(F, LI, WI)) {
return false;
}
// If there's wider vector load/store in a loop, skip SIMD16.
if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x4) &&
hasLongStridedLdStInLoop(F, LI, WI)) {
return false;
}
auto hasDouble = [](Function& F) {
for (auto& BB : F)
for (auto& I : BB) {
if (I.getType()->isDoubleTy())
return true;
for (Value* V : I.operands())
if (V->getType()->isDoubleTy())
return true;
}
return false;
};
const CPlatform* platform = &ctx->platform;
if (platform->GetPlatformFamily() == IGFX_GEN9_CORE &&
platform->getPlatformInfo().eProductFamily == IGFX_GEMINILAKE &&
hasDouble(*F)) {
return false;
}
return true;
}
bool Simd32ProfitabilityAnalysis::checkPSSimd32Profitable()
{
unsigned int numberInstructions = 0;
unsigned int numberOfHalfInstructions = 0;
unsigned int numberOfCmp = 0;
unsigned int numberOfSample = 0;
unsigned int numberOfBB = 0;
BasicBlock* returnBlock = nullptr;
bool hasDiscard = F->getParent()->getNamedMetadata("KillPixel") != nullptr;
for (Function::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI)
{
for (auto II = FI->begin(), IE = FI->end(); II != IE; ++II)
{
if (II->getType() == Type::getHalfTy(F->getContext()))
{
numberOfHalfInstructions++;
}
if (isa<CmpInst>(*II))
{
numberOfCmp++;
}
if (isSampleLoadGather4InfoInstruction(&(*II)))
{
numberOfSample++;
}
numberInstructions++;
}
if (isa<ReturnInst>(FI->getTerminator()))
{
returnBlock = &(*FI);
}
numberOfBB++;
}
if (numberInstructions > 4000 || numberInstructions == 0)
{
return false;
}
// Original SIMD32 heurtistic
// if 1BB, short, has sample, no discard, no cmp, enable SIMD32
// skip cmp to avoid flag spill
if (!hasDiscard && numberOfCmp == 0 && numberOfSample > 0 && numberOfBB == 1 && numberInstructions < 80)
{
return true;
}
// disable SIMD32 for shader with multiple render target as it puts pressure on the render cache
unsigned int numberRTWrite = 0;
for (auto it = returnBlock->begin(), ie = returnBlock->end(); it != ie; ++it)
{
if (GenIntrinsicInst * intr = dyn_cast<GenIntrinsicInst>(it))
{
if (intr->getIntrinsicID() == GenISAIntrinsic::GenISA_RTWrite)
{
numberRTWrite++;
}
}
}
if (numberRTWrite > 1)
{
return false;
}
// Case where we expect to be bound by pixel dispatch time. For small shaderd without IO
// It is better to go with SIMD32
if (returnBlock == &F->getEntryBlock() && !hasDiscard)
{
bool hasIO = false;
unsigned int numberInstructions = returnBlock->size();
if (numberInstructions < 10)
{
for (auto II = returnBlock->begin(), IE = returnBlock->end(); II != IE; ++II)
{
if (II->mayReadOrWriteMemory() && !isa<RTWritIntrinsic>(II))
{
hasIO = true;
break;
}
if (isa<SampleIntrinsic>(II) ||
isa<SamplerLoadIntrinsic>(II) ||
isa<InfoIntrinsic>(II) ||
isa<SamplerGatherIntrinsic>(II))
{
hasIO = true;
break;
}
}
if (!hasIO)
{
// for small program without IO using SIMD32 allows hiding the thread dispatch time
return true;
}
}
}
if (IGC_IS_FLAG_ENABLED(PSSIMD32HeuristicFP16))
{
// If we have a large ratio of half use SIMD32 to hide latency better
float ratioHalf = (float)numberOfHalfInstructions / (float)numberInstructions;
if (ratioHalf >= 0.5f)
{
return true;
}
}
if (IGC_IS_FLAG_ENABLED(PSSIMD32HeuristicLoopAndDiscard))
{
// If we have a discard and the first block is small we may be bound by PSD so we try to enable SIMD32
if (hasDiscard)
{
BasicBlock& entryBB = F->getEntryBlock();
if (!isa<ReturnInst>(entryBB.getTerminator()) && entryBB.size() < 50)
{
return true;
}
}
// If we have a loop with high latency enable SIMD32 to reduce latency
unsigned int numberOfInstructions = 0;
unsigned int numberOfHighLatencyInst = 0;
for (LoopInfo::iterator li = LI->begin(), le = LI->end(); li != le; ++li)
{
llvm::Loop* loop = *li;
for (auto BI = loop->block_begin(), BE = loop->block_end(); BI != BE; ++BI)
{
for (auto II = (*BI)->begin(), IE = (*BI)->end(); II != IE; ++II)
{
if (isa<SampleIntrinsic>(II))
{
numberOfHighLatencyInst++;
}
numberOfInstructions++;
}
}
}
if (numberOfInstructions < 85 && numberOfHighLatencyInst >= 1)
{
// high latency small loop
return true;
}
}
return false;
}