/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "Compiler/CISACodeGen/Simd32Profitability.hpp"
#include "Compiler/CodeGenPublic.h"
#include "Compiler/IGCPassSupport.h"
#include "Compiler/CISACodeGen/Platform.hpp"
#include "common/LLVMWarningsPush.hpp"
#include <llvmWrapper/IR/DerivedTypes.h>
#include <llvmWrapper/Transforms/Utils/LoopUtils.h>
#include <llvm/IR/InstIterator.h>
#include <llvm/IR/Operator.h>
#include "common/LLVMWarningsPop.hpp"
#include "GenISAIntrinsics/GenIntrinsics.h"
#include "GenISAIntrinsics/GenIntrinsicInst.h"
#include "Probe/Assertion.h"

using namespace llvm;
using namespace IGC;
using namespace IGC::IGCMD;

// Register pass to igc-opt
#define PASS_FLAG "simd32-profit"
#define PASS_DESCRIPTION "Check SIMD32 Profitability for OpenCL"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS true
IGC_INITIALIZE_PASS_BEGIN(Simd32ProfitabilityAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
IGC_INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_END(Simd32ProfitabilityAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

char Simd32ProfitabilityAnalysis::ID = 0;

const unsigned BRANCHY_MINPATH = 8;

Simd32ProfitabilityAnalysis::Simd32ProfitabilityAnalysis()
    : FunctionPass(ID), F(nullptr), PDT(nullptr), LI(nullptr),
      pMdUtils(nullptr), WI(nullptr), m_isSimd32Profitable(true),
      m_isSimd16Profitable(true) {
    initializeSimd32ProfitabilityAnalysisPass(*PassRegistry::getPassRegistry());
}

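/// getInductionVariable - Try to find a simple induction variable in the loop
/// header, i.e. a PHI node PN whose value coming in from the backedge is an
/// increment of the form NEXT = add PN, STEP. Returns (INIT, CURR, STEP, NEXT),
/// or all nullptrs when the loop has an unexpected shape (dead loop, multiple
/// backedges, or no such PHI).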
static std::tuple<Value* /*INIT*/, Value* /*CURR*/, Value* /*STEP*/, Value* /*NEXT*/>
getInductionVariable(Loop* L) {
    BasicBlock* H = L->getHeader();

    BasicBlock* Incoming = 0, *Backedge = 0;
    pred_iterator PI = pred_begin(H);
    IGC_ASSERT_MESSAGE(PI != pred_end(H), "Loop must have at least one backedge!");
    Backedge = *PI++;
    if (PI == pred_end(H)) // dead loop
        return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
    Incoming = *PI++;
    if (PI != pred_end(H)) // multiple backedges?
        return std::make_tuple(nullptr, nullptr, nullptr, nullptr);

    if (L->contains(Incoming)) {
        if (L->contains(Backedge))
            return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
        std::swap(Incoming, Backedge);
    }
    else if (!L->contains(Backedge))
        return std::make_tuple(nullptr, nullptr, nullptr, nullptr);

    // Loop over all of the PHI nodes, looking for an indvar.
    for (auto I = H->begin(); isa<PHINode>(I); ++I) {
        PHINode* PN = cast<PHINode>(I);
        if (auto Inc = dyn_cast<Instruction>(PN->getIncomingValueForBlock(Backedge))) {
            if (Inc->getOpcode() == Instruction::Add && Inc->getOperand(0) == PN) {
                return
                    std::make_tuple(PN->getIncomingValueForBlock(Incoming), PN,
                                    Inc->getOperand(1), Inc);
            }
        }
    }

    return std::make_tuple(nullptr, nullptr, nullptr, nullptr);
}

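// Rough classification of a loop's expected trip count. A non-uniform loop
// that is likely to run only a few iterations is tolerated by the SIMD32
// heuristic; anything else counts against SIMD32.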
enum {
    LOOPCOUNT_LIKELY_SMALL,
    LOOPCOUNT_LIKELY_LARGE,
    LOOPCOUNT_UNKNOWN
};

static bool isSignedPredicate(CmpInst::Predicate Pred) {
    switch (Pred) {
    default: break;
    case CmpInst::ICMP_EQ:
    case CmpInst::ICMP_NE:
    case CmpInst::ICMP_SGT:
    case CmpInst::ICMP_SLT:
    case CmpInst::ICMP_SGE:
    case CmpInst::ICMP_SLE:
        return true;
    }
    return false;
}

static bool isUnsignedPredicate(CmpInst::Predicate Pred) {
    switch (Pred) {
    default: break;
    case CmpInst::ICMP_EQ:
    case CmpInst::ICMP_NE:
    case CmpInst::ICMP_UGT:
    case CmpInst::ICMP_ULT:
    case CmpInst::ICMP_UGE:
    case CmpInst::ICMP_ULE:
        return true;
    }
    return false;
}

static bool hasSameSignedness(CmpInst::Predicate LHS, CmpInst::Predicate RHS) {
    if (isSignedPredicate(LHS) && isSignedPredicate(RHS))
        return true;
    if (isUnsignedPredicate(LHS) && isUnsignedPredicate(RHS))
        return true;
    return false;
}

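/// isOutOfRangeComparison - Match a condition of the form
///   (X < LB) || (X > UB)
/// where both comparisons (strict or non-strict) have the same signedness.
/// Returns (X, LB, UB, isSigned) on success, or nullptrs otherwise.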
static std::tuple<Value*, Value*, Value*, bool>
isOutOfRangeComparison(Value* Cond) {
    BinaryOperator* BO = dyn_cast<BinaryOperator>(Cond);
    if (!BO || BO->getOpcode() != Instruction::Or)
        return std::make_tuple(nullptr, nullptr, nullptr, false);

    ICmpInst* LHS = dyn_cast<ICmpInst>(BO->getOperand(0));
    ICmpInst* RHS = dyn_cast<ICmpInst>(BO->getOperand(1));

    if (!LHS || !RHS)
        return std::make_tuple(nullptr, nullptr, nullptr, false);

    CmpInst::Predicate P0 = LHS->getPredicate();
    CmpInst::Predicate P1 = RHS->getPredicate();

    if (!hasSameSignedness(P0, P1))
        return std::make_tuple(nullptr, nullptr, nullptr, false);

    // Simplify the checking since they have the same signedness.
    P0 = ICmpInst::getSignedPredicate(P0);
    P1 = ICmpInst::getSignedPredicate(P1);

    if (!(P0 == CmpInst::ICMP_SLT || P0 == CmpInst::ICMP_SLE)) {
        std::swap(LHS, RHS);
        std::swap(P0, P1);
    }
    if (!(P0 == CmpInst::ICMP_SLT || P0 == CmpInst::ICMP_SLE) ||
        !(P1 == CmpInst::ICMP_SGT || P1 == CmpInst::ICMP_SGE))
        return std::make_tuple(nullptr, nullptr, nullptr, false);

    if (LHS->getOperand(0) != RHS->getOperand(0))
        return std::make_tuple(nullptr, nullptr, nullptr, false);

    return std::make_tuple(LHS->getOperand(0),
                           LHS->getOperand(1), RHS->getOperand(1),
                           isSignedPredicate(LHS->getPredicate()));
}

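/// getLoopCounter - Return the header PHI node whose incoming value from the
/// backedge is X, i.e. the loop-carried variable that is updated to X on each
/// iteration. Returns nullptr if the loop has an unexpected shape.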
static Value* getLoopCounter(Loop* L, Value* X) {
    BasicBlock* H = L->getHeader();

    BasicBlock* Incoming = 0, *Backedge = 0;
    pred_iterator PI = pred_begin(H);
    IGC_ASSERT_MESSAGE(PI != pred_end(H), "Loop must have at least one backedge!");
    Backedge = *PI++;
    if (PI == pred_end(H)) // dead loop
        return nullptr;
    Incoming = *PI++;
    if (PI != pred_end(H)) // multiple backedges?
        return nullptr;

    if (L->contains(Incoming)) {
        if (L->contains(Backedge))
            return nullptr;
        std::swap(Incoming, Backedge);
    }
    else if (!L->contains(Backedge))
        return nullptr;

    for (auto I = H->begin(); isa<PHINode>(I); ++I) {
        PHINode* PN = cast<PHINode>(I);
        if (X == PN->getIncomingValueForBlock(Backedge))
            return PN;
    }

    return nullptr;
}

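/// countOperands - Determine the coefficients (m, n) such that V computes
/// m*LHS + n*RHS +/- C for some constant C, looking through Add, Sub, Shl by a
/// constant, and Xor with -1 (bitwise negation, i.e. negation up to a
/// constant). For example, for V = (LHS << 1) - RHS + 4 it returns (2, -1).
/// Returns (0, 0) when V does not match this shape.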
static std::tuple<int, int>
countOperands(Value* V, Value* LHS, Value* RHS) {
    if (V == LHS || V == RHS)
        return std::make_tuple((V == LHS), (V == RHS));

    // Count LHS, RHS in an expression like m*L + n*R +/- C, where C is
    // constant.
    BinaryOperator* BO = dyn_cast<BinaryOperator>(V);
    if (!BO ||
        (BO->getOpcode() != Instruction::Add &&
         BO->getOpcode() != Instruction::Sub &&
         BO->getOpcode() != Instruction::Shl &&
         BO->getOpcode() != Instruction::Xor))
        return std::make_tuple(0, 0);

    if (BO->getOpcode() == Instruction::Shl) {
        ConstantInt* CI = dyn_cast<ConstantInt>(BO->getOperand(1));
        if (!CI)
            return std::make_tuple(0, 0);
        int L = 0, R = 0;
        std::tie(L, R) = countOperands(BO->getOperand(0), LHS, RHS);
        uint64_t ShAmt = CI->getZExtValue();
        return std::make_tuple((L << ShAmt), (R << ShAmt));
    }

    if (BO->getOpcode() == Instruction::Xor) {
        ConstantInt* CI = dyn_cast<ConstantInt>(BO->getOperand(1));
        if (!CI || CI->getSExtValue() != -1)
            return std::make_tuple(0, 0);
        int L = 0, R = 0;
        std::tie(L, R) = countOperands(BO->getOperand(0), LHS, RHS);
        return std::make_tuple(-L, -R);
    }

    IGC_ASSERT((BO->getOpcode() == Instruction::Add) || (BO->getOpcode() == Instruction::Sub));

    if (isa<Constant>(BO->getOperand(1)))
        return countOperands(BO->getOperand(0), LHS, RHS);
    int L0 = 0, L1 = 0;
    std::tie(L0, L1) = countOperands(BO->getOperand(0), LHS, RHS);
    int R0 = 0, R1 = 0;
    std::tie(R0, R1) = countOperands(BO->getOperand(1), LHS, RHS);
    if (BO->getOpcode() == Instruction::Add)
        return std::make_tuple(L0 + R0, L1 + R1);

    IGC_ASSERT(BO->getOpcode() == Instruction::Sub);
    return std::make_tuple(L0 - R0, L1 - R1);
}

static bool isNegatedByLB(Value* V, Value* X, Value* LB) {
    // Check if `V` is calculated as LB - X +/- C, where C is constant.
    int L = 0, R = 0;
    std::tie(L, R) = countOperands(V, LB, X);
    return (L == 1) && (R == -1);
}

static bool isNegatedBy2UB(Value* V, Value* X, Value* UB) {
    // Check if `V` is calculated as 2UB - X +/- C, where C is constant.
    int L = 0, R = 0;
    std::tie(L, R) = countOperands(V, UB, X);
    return (L == 2) && (R == -1);
}

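/// estimateLoopCount_CASE1 - Recognize a "reflect back into range" loop: a
/// do-while loop that keeps folding `x` back into [LB, UB] and exits once it
/// lands inside the range (the exact pattern is spelled out in the comment at
/// the end of this function). Such a loop iterates only a handful of times,
/// so its divergence is cheap.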
unsigned Simd32ProfitabilityAnalysis::estimateLoopCount_CASE1(Loop* L) {
    BasicBlock* Exit = L->getExitingBlock();
    if (!Exit)
        return LOOPCOUNT_UNKNOWN;

    BranchInst* Br = dyn_cast<BranchInst>(Exit->getTerminator());
    if (!Br || !Br->isConditional())
        return LOOPCOUNT_UNKNOWN;
    if (!L->contains(Br->getSuccessor(0)))
        return LOOPCOUNT_UNKNOWN;

    Value* X = nullptr, * LB = nullptr, * UB = nullptr;
    bool Signed = false;
    std::tie(X, LB, UB, Signed) = isOutOfRangeComparison(Br->getCondition());
    if (!X) {
        ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
        if (!Cmp)
            return LOOPCOUNT_UNKNOWN;
        switch (Cmp->getPredicate()) {
        default:
            return LOOPCOUNT_UNKNOWN;
        case CmpInst::ICMP_UGT:
        case CmpInst::ICMP_UGE:
            // A smart use of unsigned comparison on signed values to perform
            // an out-of-range check against (0, N).
            break;
        }
        X = Cmp->getOperand(0);
        LB = Constant::getNullValue(X->getType());
        UB = Cmp->getOperand(1);
        Signed = true;
    }

    Value* LC = getLoopCounter(L, X);
    if (!LC)
        return LOOPCOUNT_UNKNOWN;

    if (PHINode* PN = dyn_cast<PHINode>(X)) {
        if (PN->getNumIncomingValues() != 2)
            return LOOPCOUNT_UNKNOWN;
        BasicBlock* BB0 = PN->getIncomingBlock(0);
        BasicBlock* IfBB = BB0->getSinglePredecessor();
        if (!IfBB)
            return LOOPCOUNT_UNKNOWN;
        Br = dyn_cast<BranchInst>(IfBB->getTerminator());
        if (!Br || !Br->isConditional())
            return LOOPCOUNT_UNKNOWN;
        ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
        if (!Cmp)
            return LOOPCOUNT_UNKNOWN;
        CmpInst::Predicate Pred = Cmp->getPredicate();
        Value* LHS = Cmp->getOperand(0);
        Value* RHS = Cmp->getOperand(1);
        if (LHS != LC) {
            std::swap(LHS, RHS);
            Pred = CmpInst::getSwappedPredicate(Pred);
        }
        if (LHS != LC)
            return LOOPCOUNT_UNKNOWN;
        if (!Signed)
            Pred = ICmpInst::getSignedPredicate(Pred);
        if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_SLE)
            return LOOPCOUNT_UNKNOWN;
        if (RHS != LB)
            return LOOPCOUNT_UNKNOWN;

        Value* X0 = PN->getIncomingValue(0);
        Value* X1 = PN->getIncomingValue(1);
        if (!isNegatedByLB(X0, LC, LB))
            return LOOPCOUNT_UNKNOWN;
        if (!isNegatedBy2UB(X1, LC, UB))
            return LOOPCOUNT_UNKNOWN;
    }
    else if (BinaryOperator* BO = dyn_cast<BinaryOperator>(X)) {
        if (BO->getOpcode() != Instruction::Sub)
            return LOOPCOUNT_UNKNOWN;
        if (BO->getOperand(1) != LC)
            return LOOPCOUNT_UNKNOWN;
        SelectInst* SI = dyn_cast<SelectInst>(BO->getOperand(0));
        if (!SI)
            return LOOPCOUNT_UNKNOWN;
        ICmpInst* Cmp = dyn_cast<ICmpInst>(SI->getCondition());
        if (!Cmp)
            return LOOPCOUNT_UNKNOWN;
        CmpInst::Predicate Pred = Cmp->getPredicate();
        Value* LHS = Cmp->getOperand(0);
        Value* RHS = Cmp->getOperand(1);
        if (LHS != LC) {
            std::swap(LHS, RHS);
            Pred = CmpInst::getSwappedPredicate(Pred);
        }
        if (LHS != LC)
            return LOOPCOUNT_UNKNOWN;
        if (!Signed)
            Pred = ICmpInst::getSignedPredicate(Pred);
        Value* X0 = SI->getTrueValue();
        Value* X1 = SI->getFalseValue();
        if (Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE) {
            std::swap(X0, X1);
            Pred = CmpInst::getInversePredicate(Pred);
        }
        if (Pred != CmpInst::ICMP_SLT && Pred != CmpInst::ICMP_SLE)
            return LOOPCOUNT_UNKNOWN;
        if (RHS != LB)
            return LOOPCOUNT_UNKNOWN;
        int L0 = 0, R0 = 0;
        std::tie(L0, R0) = countOperands(X0, LB, nullptr);
        int L1 = 0, R1 = 0;
        std::tie(L1, R1) = countOperands(X1, UB, nullptr);
        if (L0 != 1 || L1 != 2)
            return LOOPCOUNT_UNKNOWN;
    }
    else
        return LOOPCOUNT_UNKNOWN;

    // Ok, we found a loop of the following pattern:
    //
    // do {
    //   if (x < 0) {
    //     x = 0 - x +/- c0;
    //   } else {
    //     x = 2 * UB - x +/- c1;
    //   }
    // } while (x < LB || x > UB);
    //
    // Such a loop runs only once or twice for a non-arbitrarily large `x`. If
    // a non-uniform loop only runs a few iterations, the divergence cost due
    // to SIMD32 can be ignored.
    return LOOPCOUNT_LIKELY_SMALL;
}

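/// estimateLoopCount_CASE2 - For a loop whose induction variable has constant
/// initial value and step, look for an exiting condition (possibly the
/// uniform half of an `and`) that compares the induction variable's next
/// value against a compile-time constant bound; if the derived trip count is
/// a non-negative constant below 100, report the loop as likely small.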
unsigned Simd32ProfitabilityAnalysis::estimateLoopCount_CASE2(Loop* L) {
    SmallVector<BasicBlock*, 8> ExitingBBs;
    L->getExitingBlocks(ExitingBBs);

    Value* Init = nullptr, * Curr = nullptr, * Next = nullptr, * Step = nullptr;
    std::tie(Init, Curr, Step, Next) = getInductionVariable(L);
    if (!Init || !Curr || !Step || !Next)
        return LOOPCOUNT_UNKNOWN;
    ConstantInt* I0 = dyn_cast<ConstantInt>(Init);
    ConstantInt* S0 = dyn_cast<ConstantInt>(Step);
    if (!I0 || !S0)
        return LOOPCOUNT_UNKNOWN;

    for (auto BB : ExitingBBs) {
        BranchInst* Br = dyn_cast<BranchInst>(BB->getTerminator());
        if (!Br || !Br->isConditional())
            continue;
        if (!L->contains(Br->getSuccessor(0))) // Not a loop-continuation condition.
            continue;
        ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
        if (!WI->isUniform(Br)) {
            BinaryOperator* BO = dyn_cast<BinaryOperator>(Br->getCondition());
            if (!BO)
                continue;
            if (BO->getOpcode() != Instruction::And)
                continue;
            ICmpInst* Cond = nullptr;
            ICmpInst* Op0 = dyn_cast<ICmpInst>(BO->getOperand(0));
            if (Op0 && WI->isUniform(Op0))
                Cond = Op0;
            if (!Cond) {
                ICmpInst* Op1 = dyn_cast<ICmpInst>(BO->getOperand(1));
                if (Op1 && WI->isUniform(Op1))
                    Cond = Op1;
            }
            if (!Cond)
                continue;
            Cmp = Cond;
        }
        if (!Cmp)
            continue;
        CmpInst::Predicate Pred = Cmp->getPredicate();
        switch (Pred) {
        default:
            // TODO: Handle more predicates.
            continue;
        case ICmpInst::ICMP_SLT:
        case ICmpInst::ICMP_ULT:
            break;
        }
        Value* Op0 = Cmp->getOperand(0);
        Value* Op1 = Cmp->getOperand(1);
        if (Op0 != Next)
            continue;
        ConstantInt* E0 = dyn_cast<ConstantInt>(Op1);
        if (!E0)
            continue;
        ConstantInt* N = dyn_cast<ConstantInt>(
            Pred == ICmpInst::ICMP_SLT
                ? ConstantExpr::getSDiv(ConstantExpr::getSub(E0, I0), S0)
                : ConstantExpr::getUDiv(ConstantExpr::getSub(E0, I0), S0));
        if (!N)
            continue;
        if (N->getValue().slt(0))
            continue;
        if (N->getValue().slt(100))
            return LOOPCOUNT_LIKELY_SMALL;
    }

    // None of the exiting conditions turned out to be a uniform comparison
    // with a small constant trip count.
    return LOOPCOUNT_UNKNOWN;
}

unsigned Simd32ProfitabilityAnalysis::estimateLoopCount(Loop* L) {
    unsigned Ret;

    Ret = estimateLoopCount_CASE1(L);
    if (Ret != LOOPCOUNT_UNKNOWN)
        return Ret;

    Ret = estimateLoopCount_CASE2(L);
    if (Ret != LOOPCOUNT_UNKNOWN)
        return Ret;

    return Ret;
}

static Value* getLoopCount(Value* Start, Value* End) {
    // Poor man's loop-count computation; the caller still needs to check the
    // result with WIAnalysis.
    ConstantInt* CStart = dyn_cast<ConstantInt>(Start);
    ConstantInt* CEnd = dyn_cast<ConstantInt>(End);
    if (CStart && CEnd)
        return ConstantExpr::getSub(CEnd, CStart);

    if (CStart && CStart->isNullValue())
        return End;

    BinaryOperator* BO = dyn_cast<BinaryOperator>(End);
    if (!BO || BO->getOpcode() != Instruction::Add)
        return nullptr;

    Value* Op0 = BO->getOperand(0);
    Value* Op1 = BO->getOperand(1);
    if (Op0 != Start)
        std::swap(Op0, Op1);
    if (Op0 == Start)
        return Op1;

    return nullptr;
}

/// hasIEEESqrtOrDivFunc - Check whether IEEE correctly-rounded SQRT or DIV is
/// used in the given function.
static bool hasIEEESqrtOrDivFunc(const Function& F) {
    for (auto& BB : F)
        for (auto& I : BB) {
            const GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(&I);
            if (!GII)
                continue;
            switch (GII->getIntrinsicID()) {
            case GenISAIntrinsic::GenISA_IEEE_Sqrt:
            case GenISAIntrinsic::GenISA_IEEE_Divide:
                return true;
            default: break;
            }
        }
    return false;
}

/// hasSubGroupFunc - Check whether subgroup functions are used in the given
/// function.
static bool hasSubGroupFunc(const Function& F)
{
    for (auto& BB : F)
    {
        for (auto& I : BB)
        {
            if (isSubGroupIntrinsic(&I))
            {
                return true;
            }
        }
    }

    return false;
}

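/// runOnFunction - Compute the profitability flags. For OpenCL kernels both
/// SIMD16 and SIMD32 profitability are evaluated (SIMD32 is only considered
/// when SIMD16 already looks profitable); for pixel shaders only the SIMD32
/// heuristic is run. The results are cached in m_isSimd16Profitable and
/// m_isSimd32Profitable for later queries.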
bool Simd32ProfitabilityAnalysis::runOnFunction(Function& F)
{
    this->F = &F;
    CodeGenContext* context = nullptr;
    context = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
    if (context->type == ShaderType::OPENCL_SHADER)
    {
        PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
        LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
        WI = &getAnalysis<WIAnalysis>();
        pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
        m_isSimd16Profitable = checkSimd16Profitable(context);
        m_isSimd32Profitable = m_isSimd16Profitable && checkSimd32Profitable(context);
    }
    else if (context->type == ShaderType::PIXEL_SHADER)
    {
        LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
        m_isSimd32Profitable = checkPSSimd32Profitable();
    }
    return false;
}

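// The helpers below pattern-match values derived from the implicit kernel
// arguments (payloadHeader, r0, enqueuedLocalSize, localIdX) so that
// isGetGlobalIdX() can recognize the usual expansion
//   GlobalIdX = GroupIdX * EnqueuedLocalSizeX + LocalIdX + GlobalOffsetX
// which isSelectBasedOnGlobalIdX() then uses to spot loop counts that are
// selected based on a comparison against the X global id.
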
static bool isPayloadHeader(Value* V) {
    Argument* Arg = dyn_cast<Argument>(V);
    if (!Arg || !Arg->hasName())
        return false;
    IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
    if (!VTy || VTy->getNumElements() != 8 ||
        !VTy->getElementType()->isIntegerTy(32))
        return false;
    return Arg->getName() == "payloadHeader";
}

static bool isR0(Value* V) {
    Argument* Arg = dyn_cast<Argument>(V);
    if (!Arg || !Arg->hasName())
        return false;
    IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
    if (!VTy || VTy->getNumElements() != 8 ||
        !VTy->getElementType()->isIntegerTy(32))
        return false;
    return Arg->getName() == "r0";
}

static bool isEnqueuedLocalSize(Value* V) {
    Argument* Arg = dyn_cast<Argument>(V);
    if (!Arg || !Arg->hasName())
        return false;
    IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Arg->getType());
    if (!VTy || VTy->getNumElements() != 3 ||
        !VTy->getElementType()->isIntegerTy(32))
        return false;
    return Arg->getName() == "enqueuedLocalSize";
}

static bool isGetGroupIdX(Value* V) {
    auto EEI = dyn_cast<ExtractElementInst>(V);
    if (!EEI)
        return false;
    if (!EEI->getType()->isIntegerTy(32))
        return false;
    auto CI = dyn_cast<Constant>(EEI->getOperand(1));
    if (!CI || !CI->isOneValue())
        return false;
    return isR0(EEI->getOperand(0));
}

static bool isGetEnqueuedLocalSizeX(Value* V) {
    auto EEI = dyn_cast<ExtractElementInst>(V);
    if (!EEI)
        return false;
    if (!EEI->getType()->isIntegerTy(32))
        return false;
    auto CI = dyn_cast<Constant>(EEI->getOperand(1));
    if (!CI || !CI->isNullValue())
        return false;
    return isEnqueuedLocalSize(EEI->getOperand(0));
}

static bool isGetLocalIdX(Value* V) {
    if (auto ZEI = dyn_cast<ZExtInst>(V))
        return isGetLocalIdX(ZEI->getOperand(0));
    Argument* Arg = dyn_cast<Argument>(V);
    if (!Arg || !Arg->hasName())
        return false;
    if (!Arg->getType()->isIntegerTy(16))
        return false;
    return Arg->getName() == "localIdX";
}

static bool isGetGlobalOffsetX(Value* V) {
    auto EEI = dyn_cast<ExtractElementInst>(V);
    if (!EEI)
        return false;
    if (!EEI->getType()->isIntegerTy(32))
        return false;
    auto CI = dyn_cast<Constant>(EEI->getOperand(1));
    if (!CI || !CI->isNullValue())
        return false;
    return isPayloadHeader(EEI->getOperand(0));
}

static bool isGetGlobalIdX(Value* V) {
    // GlobalIdX = GroupIdX * EnqueuedLocalSizeX + LocalIdX + GlobalOffsetX
    auto BO = dyn_cast<BinaryOperator>(V);
    if (!BO || BO->getOpcode() != Instruction::Add)
        return false;

    auto BO1 = dyn_cast<BinaryOperator>(BO->getOperand(0));
    auto A0 = BO->getOperand(1);
    if (!BO1) {
        BO1 = dyn_cast<BinaryOperator>(BO->getOperand(1));
        A0 = BO->getOperand(0);
    }
    if (!BO1 || BO1->getOpcode() != Instruction::Add)
        return false;

    auto BO2 = dyn_cast<BinaryOperator>(BO1->getOperand(0));
    auto A1 = BO1->getOperand(1);
    if (!BO2) {
        BO2 = dyn_cast<BinaryOperator>(BO1->getOperand(1));
        A1 = BO1->getOperand(0);
    }
    if (!BO2 || BO2->getOpcode() != Instruction::Mul)
        return false;

    auto M0 = BO2->getOperand(0);
    auto M1 = BO2->getOperand(1);

    if (!((isGetGroupIdX(M0) && isGetEnqueuedLocalSizeX(M1)) ||
          (isGetGroupIdX(M1) && isGetEnqueuedLocalSizeX(M0))))
        return false;

    return ((isGetLocalIdX(A0) && isGetGlobalOffsetX(A1)) ||
            (isGetLocalIdX(A1) && isGetGlobalOffsetX(A0)));
}

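/// isSelectBasedOnGlobalIdX - Return true if V (possibly shifted left by a
/// constant) is a PHI node that merges two uniform values from an if/else
/// whose condition compares GlobalIdX against a uniform bound. Loop counts of
/// this shape only diverge through the global-id comparison, which the SIMD32
/// heuristic is willing to tolerate.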
bool Simd32ProfitabilityAnalysis::isSelectBasedOnGlobalIdX(Value* V) {
    PHINode* PN = dyn_cast<PHINode>(V);
    while (!PN) {
        auto BO = dyn_cast<BinaryOperator>(V);
        if (!BO || BO->getOpcode() != Instruction::Shl)
            return false;
        if (!isa<Constant>(BO->getOperand(1)))
            return false;
        V = BO->getOperand(0);
        PN = dyn_cast<PHINode>(V);
    }

    if (PN->getNumIncomingValues() != 2)
        return false;

    auto Op0 = PN->getIncomingValue(0);
    if (!WI->isUniform(Op0))
        return false;
    auto Op1 = PN->getIncomingValue(1);
    if (!WI->isUniform(Op1))
        return false;

    auto BB0 = PN->getIncomingBlock(0);
    auto BB1 = PN->getIncomingBlock(1);
    auto IfBB = BB0->getSinglePredecessor();
    if (!IfBB || IfBB != BB1->getSinglePredecessor())
        return false;
    auto Br = dyn_cast<BranchInst>(IfBB->getTerminator());
    if (!Br || !Br->isConditional())
        return false;

    ICmpInst* Cmp = dyn_cast<ICmpInst>(Br->getCondition());
    if (!Cmp)
        return false;
    Value* LHS = Cmp->getOperand(0);
    Value* RHS = Cmp->getOperand(1);
    switch (Cmp->getPredicate()) {
    default:
        return false;
    case CmpInst::ICMP_SLT:
    case CmpInst::ICMP_SLE:
        break;
    case CmpInst::ICMP_SGT:
    case CmpInst::ICMP_SGE:
        std::swap(LHS, RHS);
        break;
    }
    if (!WI->isUniform(RHS))
        return false;
    return isGetGlobalIdX(LHS);
}

bool Simd32ProfitabilityAnalysis::checkSimd32Profitable(CodeGenContext* ctx)
{
    // If a kernel is too big, it probably has enough work to keep the EUs busy
    // even without SIMD32; in addition, SIMD32 would need more vISA variables
    // than the 64K limit allows (e.g. OCL C99 64-bit PrintHalf/half8.c), so it
    // makes sense to skip SIMD32.
    size_t programSize = 0;
    for (Function::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI)
    {
        BasicBlock* BB = &*FI;
        programSize += BB->size();
    }
    if (programSize > 8000)
    {
        return false;
    }

    // If we have workgroup size (or workgroup size hint) metadata, check whether
    // the total workgroup size is expected to be 16 or below. If it is, there is
    // no point in using SIMD32; we would just get empty lanes.
    auto funcInfoMD = pMdUtils->findFunctionsInfoItem(F);
    if (funcInfoMD != pMdUtils->end_FunctionsInfo())
    {
        ThreadGroupSizeMetaDataHandle tgSize = funcInfoMD->second->getThreadGroupSize();
        ThreadGroupSizeMetaDataHandle tgSizeHint = funcInfoMD->second->getThreadGroupSizeHint();

        if (ctx->getModuleMetaData()->csInfo.maxWorkGroupSize && ctx->getModuleMetaData()->csInfo.maxWorkGroupSize <= 16)
            return false;

        if ((tgSize->hasValue() && (tgSize->getXDim() * tgSize->getYDim() * tgSize->getZDim()) <= 16) ||
            (tgSizeHint->hasValue() && (tgSizeHint->getXDim() * tgSizeHint->getYDim() * tgSizeHint->getZDim()) <= 16)) {
            return false;
        }
    }

    // WORKAROUND - Skip SIMD32 if subgroup functions are present.
    if (hasSubGroupFunc(*F)) {
        return false;
    }

    const CPlatform* platform = &ctx->platform;
    switch (platform->GetPlatformFamily()) {
    case IGFX_GEN9_CORE:
        /* TODO: Try to apply for platform->getPlatformInfo().eProductFamily ==
         * IGFX_BROXTON only. */
        // FALL THROUGH
    case IGFX_GEN10_CORE:
        if (hasIEEESqrtOrDivFunc(*F)) {
            return false;
        }
        break;
    default:
        break;
    }
    // END OF WORKAROUND

    // Ok, that's not the case.
    // Now, check whether we have any non-uniform loops.
    // The idea is that if there are divergent loops, then SIMD32 will be harmful,
    // because we'll waste time running loops with very few full lanes.
    // If there are no divergent loops, SIMD32 is worth a shot. It still may not
    // be selected, due to spills.
    for (LoopInfo::iterator li = LI->begin(), le = LI->end(); li != le; ++li) {
        llvm::Loop* loop = *li;

        SmallVector<BasicBlock*, 8> exitingBlocks;
        loop->getExitingBlocks(exitingBlocks);

        bool AllUniform = true;
        for (auto BBI = exitingBlocks.begin(), BBE = exitingBlocks.end(); BBI != BBE; ++BBI) {
            BasicBlock* block = *BBI;

            Instruction* term = block->getTerminator();
            if (!WI->isUniform(term)) {
                auto Br = dyn_cast<BranchInst>(term);
                // Check the special case of a non-uniform loop where, apart
                // from the initial, current, and next values, STEP and COUNT
                // are uniform. Such a loop only diverges at its termination,
                // so it should still be profitable to compile in SIMD32 mode.
                if (Br && Br->isConditional()) {
                    auto ICmp = dyn_cast<ICmpInst>(Br->getCondition());
                    if (ICmp) {
                        Value* Init = nullptr, * Curr = nullptr, * Step = nullptr, * Next = nullptr;
                        std::tie(Init, Curr, Step, Next)
                            = getInductionVariable(loop);
                        if (Init && Curr && Next && Step &&
                            WI->isUniform(Step)) {
                            auto Op0 = ICmp->getOperand(0);
                            auto Op1 = ICmp->getOperand(1);
                            if (SExtInst* SI0 = dyn_cast<SExtInst>(Op0))
                                Op0 = SI0->getOperand(0);
                            if (SExtInst* SI1 = dyn_cast<SExtInst>(Op1))
                                Op1 = SI1->getOperand(0);
                            if (Op0 != Next && Op0 != Curr)
                                std::swap(Op0, Op1);
                            // Tolerate a non-uniform loop which only terminates
                            // on a comparison between the non-uniform induction
                            // variable and a uniform value.
                            if (Op0 == Next || Op0 == Curr) {
                                // TODO: Need to check whether Init is linear to
                                // global/local ID. However, that checking is not
                                // that straightforward before code emitter.
                                if (WI->isUniform(Op1))
                                    continue;
                                // TODO: Enable IndVarSimplify to simplify the
                                // following check.
                                if (Value* Count = getLoopCount(Init, Op1)) {
                                    if (WI->isUniform(Count))
                                        continue;
                                    if (isSelectBasedOnGlobalIdX(Count))
                                        continue;
                                }
                            }
                        }
                    }
                }
                AllUniform = false;
                break;
            }
        }
        if (!AllUniform) {
            switch (estimateLoopCount(loop)) {
            case LOOPCOUNT_LIKELY_LARGE:
            case LOOPCOUNT_UNKNOWN:
                return false;
            case LOOPCOUNT_LIKELY_SMALL:
                break;
            }
        }
    }

    return true;
}

/// Cyclomatic complexity measures the number of linearly independent paths
/// through a region.
///
/// M = a * E - N + 2 where
/// E = the number of edges of the graph
/// N = the number of nodes of the graph
/// a = scalar factor (1 for uniform branches, 2 for non-uniform ones).
///
/// We focus on loops instead of the entire program, since cyclomatic
/// complexity is roughly linear when concatenating two programs, i.e.
/// CC(F # G) = (E1 + E2 + 1) - (N1 + N2) + 2
///           = (E1 - N1 + 2) + (E2 - N2 + 2) - 1
///           = CC(F) + CC(G) - 1.
///
static const unsigned CYCLOMATIC_COMPLEXITY_THRESHOLD = 200;

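/// getLoopCyclomaticComplexity - Compute the maximum weighted cyclomatic
/// complexity over all top-level loops, counting each edge out of a
/// non-uniform terminator twice. As a rough worked example (assuming this
/// block structure), a loop body that is a single if/else diamond with a
/// non-uniform condition plus a uniform latch has N = 4 blocks and weighted
/// E = 2*2 + 1 + 1 + 2 = 8, so CC = 8 - 4 + 2 = 6.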
unsigned Simd32ProfitabilityAnalysis::getLoopCyclomaticComplexity() {
    unsigned MaxCC = 0;
    for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
        Loop* L = *I;
        unsigned CC = 2;
        for (auto BI = L->block_begin(), BE = L->block_end(); BI != BE; ++BI) {
            BasicBlock* BB = *BI;
            IGCLLVM::TerminatorInst* TI = BB->getTerminator();
            bool IsUniform = WI->isUniform(TI);
            CC += TI->getNumSuccessors() * (IsUniform ? 1 : 2);
        }
        CC -= L->getNumBlocks();
        MaxCC = std::max(CC, MaxCC);
    }
    return MaxCC;
}

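/// getNumOfNonUniformExits - Count how many exiting blocks of the loop have a
/// non-uniform terminator.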
static unsigned getNumOfNonUniformExits(Loop* L, WIAnalysis* WI) {
    SmallVector<BasicBlock*, 8> ExitingBlocks;
    L->getExitingBlocks(ExitingBlocks);
    unsigned Count = 0;
    for (auto BB : ExitingBlocks) {
        IGCLLVM::TerminatorInst* TI = BB->getTerminator();
        bool IsUniform = WI->isUniform(TI);
        Count += !IsUniform;
    }

    return Count;
}

/// Check if a loop or any of its subloops has multiple non-uniform exits.
static bool hasMultipleExits(Loop* L, WIAnalysis* WI) {
    if (getNumOfNonUniformExits(L, WI) > 1)
        return true;
    for (auto InnerL : L->getSubLoops())
        if (hasMultipleExits(InnerL, WI))
            return true;
    return false;
}

/// Given a loop, return nested (inner) loops with multiple non-uniform exits.
/// E.g. assume L2, L3, L5, L7 are the only loops with multiple non-uniform
/// exits in
/// L1
///   L2
///     L3
///       L4
///   L5
///     L6
///       L7
/// then it returns {L2, L5}.
///
static void getNestedLoopsWithMultipleExits(Loop* L, WIAnalysis* WI,
                                            SmallVectorImpl<Loop*>& Result) {
    if (getNumOfNonUniformExits(L, WI) > 1) {
        for (auto InnerL : L->getSubLoops()) {
            if (hasMultipleExits(InnerL, WI)) {
                Result.push_back(L);
                return;
            }
        }
        // Only a single level, do not add it into the result.
        return;
    }

    // The outer loop is normal. Check its inner loop structure, recursively.
    for (auto InnerL : L->getSubLoops())
        getNestedLoopsWithMultipleExits(InnerL, WI, Result);
}

/// Check if loops with multiple exits dominate the entire function.
static bool hasNestedLoopsWithMultipleExits(Function* F, LoopInfo* LI,
                                            WIAnalysis* WI) {
    // Find top-level nested loops with multiple non-uniform exits.
    SmallVector<Loop*, 8> Loops;
    for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
        Loop* L = *I;
        getNestedLoopsWithMultipleExits(L, WI, Loops);
    }

    // Sum the IR size of these loops.
    unsigned LoopSize = 0;
    for (auto L : Loops)
        for (auto BB : L->getBlocks())
            LoopSize += (unsigned)BB->size();

    // Check the ratio between nested loops with multiple exits and the total
    // number of instructions. A higher ratio means these loops dominate this
    // kernel.
    unsigned FuncSize = 0;
    for (auto& BB : F->getBasicBlockList())
        FuncSize += (unsigned)BB.size();

    bool retVal = false;
    if (FuncSize > 0)
    {
        retVal = float(LoopSize) / FuncSize >= 0.7f;
    }

    return retVal;
}

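/// hasLongStridedLdStInLoop - Look at simple innermost loops (two blocks, with
/// a latch that contains only its terminator) and check whether they contain
/// more than three non-uniform vector loads or stores wider than 128 bits
/// (counted cumulatively). The SIMD16 heuristic rejects such kernels.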
static bool hasLongStridedLdStInLoop(Function* F, LoopInfo* LI, WIAnalysis* WI) {
    SmallVector<Loop*, 32> Loops;
    // Collect innermost simple loops.
    for (auto I = LI->begin(), E = LI->end(); I != E; ++I) {
        auto L = *I;
        if (!IGCLLVM::isInnermost(L))
            continue;
        if (L->getNumBlocks() != 2)
            continue;
        auto* Latch = L->getLoopLatch();
        if (!Latch || !Latch->front().isTerminator())
            continue;
        Loops.push_back(L);
    }
    unsigned LDs = 0;
    unsigned STs = 0;
    for (auto L : Loops) {
        auto BB = L->getHeader();
        for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
            if (auto LD = dyn_cast<LoadInst>(&*I)) {
                VectorType* VTy = dyn_cast<VectorType>(LD->getType());
                if (!VTy || IGCLLVM::GetVectorTypeBitWidth(VTy) <= 128)
                    continue;
                if (WI->isUniform(LD))
                    continue;
                ++LDs;
            }
            if (auto ST = dyn_cast<StoreInst>(&*I)) {
                Value* Ptr = ST->getPointerOperand();
                Value* Val = ST->getValueOperand();
                VectorType* VTy = dyn_cast<VectorType>(Val->getType());
                if (!VTy || IGCLLVM::GetVectorTypeBitWidth(VTy) <= 128)
                    continue;
                if (WI->isUniform(Ptr))
                    continue;
                ++STs;
            }
        }
        if (LDs > 3 || STs > 3)
            return true;
    }
    return false;
}

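/// checkSimd16Profitable - Heuristics gating SIMD16 (and therefore SIMD32)
/// compilation, individually selectable through OCLSIMD16SelectionMask:
///   0x1 - reject kernels whose loops exceed the cyclomatic complexity
///         threshold,
///   0x2 - reject kernels dominated by nested loops with multiple non-uniform
///         exits,
///   0x4 - reject kernels with long strided vector loads/stores in loops.
/// Additionally, kernels using double precision are rejected on GEMINILAKE.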
bool Simd32ProfitabilityAnalysis::checkSimd16Profitable(CodeGenContext* ctx) {
    if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x1) &&
        getLoopCyclomaticComplexity() >= CYCLOMATIC_COMPLEXITY_THRESHOLD) {
        return false;
    }

    if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x2) &&
        hasNestedLoopsWithMultipleExits(F, LI, WI)) {
        return false;
    }

    // If there are wide vector loads/stores in a loop, skip SIMD16.
    if ((IGC_GET_FLAG_VALUE(OCLSIMD16SelectionMask) & 0x4) &&
        hasLongStridedLdStInLoop(F, LI, WI)) {
        return false;
    }

    auto hasDouble = [](Function& F) {
        for (auto& BB : F)
            for (auto& I : BB) {
                if (I.getType()->isDoubleTy())
                    return true;
                for (Value* V : I.operands())
                    if (V->getType()->isDoubleTy())
                        return true;
            }
        return false;
    };

    const CPlatform* platform = &ctx->platform;
    if (platform->GetPlatformFamily() == IGFX_GEN9_CORE &&
        platform->getPlatformInfo().eProductFamily == IGFX_GEMINILAKE &&
        hasDouble(*F)) {
        return false;
    }

    return true;
}

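/// checkPSSimd32Profitable - Pixel-shader SIMD32 heuristics. Roughly: prefer
/// SIMD32 for small single-block shaders with sampling and no discard or cmp,
/// for tiny shaders with no memory I/O (to hide thread dispatch time), for
/// half-float heavy shaders (behind PSSIMD32HeuristicFP16), and for shaders
/// with a discard and a small entry block or with small high-latency loops
/// (behind PSSIMD32HeuristicLoopAndDiscard). Very large shaders and shaders
/// writing multiple render targets are rejected.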
bool Simd32ProfitabilityAnalysis::checkPSSimd32Profitable()
{
    unsigned int numberInstructions = 0;
    unsigned int numberOfHalfInstructions = 0;
    unsigned int numberOfCmp = 0;
    unsigned int numberOfSample = 0;
    unsigned int numberOfBB = 0;
    BasicBlock* returnBlock = nullptr;
    bool hasDiscard = F->getParent()->getNamedMetadata("KillPixel") != nullptr;
    for (Function::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI)
    {
        for (auto II = FI->begin(), IE = FI->end(); II != IE; ++II)
        {
            if (II->getType() == Type::getHalfTy(F->getContext()))
            {
                numberOfHalfInstructions++;
            }
            if (isa<CmpInst>(*II))
            {
                numberOfCmp++;
            }
            if (isSampleLoadGather4InfoInstruction(&(*II)))
            {
                numberOfSample++;
            }
            numberInstructions++;
        }
        if (isa<ReturnInst>(FI->getTerminator()))
        {
            returnBlock = &(*FI);
        }
        numberOfBB++;
    }
    if (numberInstructions > 4000 || numberInstructions == 0)
    {
        return false;
    }

    // Original SIMD32 heuristic:
    // if the shader is a single short BB with sampling, no discard and no cmp,
    // enable SIMD32. Skip cmp to avoid flag spills.
    if (!hasDiscard && numberOfCmp == 0 && numberOfSample > 0 && numberOfBB == 1 && numberInstructions < 80)
    {
        return true;
    }

    // Disable SIMD32 for shaders with multiple render targets as it puts
    // pressure on the render cache.
    unsigned int numberRTWrite = 0;
    for (auto it = returnBlock->begin(), ie = returnBlock->end(); it != ie; ++it)
    {
        if (GenIntrinsicInst* intr = dyn_cast<GenIntrinsicInst>(it))
        {
            if (intr->getIntrinsicID() == GenISAIntrinsic::GenISA_RTWrite)
            {
                numberRTWrite++;
            }
        }
    }
    if (numberRTWrite > 1)
    {
        return false;
    }

    // Case where we expect to be bound by pixel dispatch time. For small
    // shaders without I/O it is better to go with SIMD32.
    if (returnBlock == &F->getEntryBlock() && !hasDiscard)
    {
        bool hasIO = false;
        unsigned int numberInstructions = returnBlock->size();
        if (numberInstructions < 10)
        {
            for (auto II = returnBlock->begin(), IE = returnBlock->end(); II != IE; ++II)
            {
                if (II->mayReadOrWriteMemory() && !isa<RTWritIntrinsic>(II))
                {
                    hasIO = true;
                    break;
                }
                if (isa<SampleIntrinsic>(II) ||
                    isa<SamplerLoadIntrinsic>(II) ||
                    isa<InfoIntrinsic>(II) ||
                    isa<SamplerGatherIntrinsic>(II))
                {
                    hasIO = true;
                    break;
                }
            }
            if (!hasIO)
            {
                // For a small program without I/O, using SIMD32 allows hiding
                // the thread dispatch time.
                return true;
            }
        }
    }

    if (IGC_IS_FLAG_ENABLED(PSSIMD32HeuristicFP16))
    {
        // If we have a large ratio of half instructions, use SIMD32 to hide
        // latency better.
        float ratioHalf = (float)numberOfHalfInstructions / (float)numberInstructions;
        if (ratioHalf >= 0.5f)
        {
            return true;
        }
    }

    if (IGC_IS_FLAG_ENABLED(PSSIMD32HeuristicLoopAndDiscard))
    {
        // If we have a discard and the first block is small, we may be bound
        // by the PSD, so try to enable SIMD32.
        if (hasDiscard)
        {
            BasicBlock& entryBB = F->getEntryBlock();
            if (!isa<ReturnInst>(entryBB.getTerminator()) && entryBB.size() < 50)
            {
                return true;
            }
        }

        // If we have a loop with high latency, enable SIMD32 to reduce latency.
        unsigned int numberOfInstructions = 0;
        unsigned int numberOfHighLatencyInst = 0;
        for (LoopInfo::iterator li = LI->begin(), le = LI->end(); li != le; ++li)
        {
            llvm::Loop* loop = *li;
            for (auto BI = loop->block_begin(), BE = loop->block_end(); BI != BE; ++BI)
            {
                for (auto II = (*BI)->begin(), IE = (*BI)->end(); II != IE; ++II)
                {
                    if (isa<SampleIntrinsic>(II))
                    {
                        numberOfHighLatencyInst++;
                    }
                    numberOfInstructions++;
                }
            }
        }
        if (numberOfInstructions < 85 && numberOfHighLatencyInst >= 1)
        {
            // High-latency small loop.
            return true;
        }
    }
    return false;
}