/*========================== begin_copyright_notice ============================
Copyright (C) 2017-2023 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "common/LLVMWarningsPush.hpp"
#include <llvmWrapper/Analysis/InstructionSimplify.h>
#include <llvmWrapper/Analysis/TargetLibraryInfo.h>
#include <llvmWrapper/Analysis/AliasSetTracker.h>
#include <llvm/Analysis/InstructionSimplify.h>
#include <llvm/Analysis/ScalarEvolution.h>
#include <llvm/Analysis/ScalarEvolutionExpressions.h>
#include <llvm/Analysis/ValueTracking.h>
#include <llvm/IR/DataLayout.h>
#include <llvm/IR/Function.h>
#include <llvm/IR/GetElementPtrTypeIterator.h>
#include <llvm/IR/GlobalAlias.h>
#include <llvmWrapper/IR/IRBuilder.h>
#include <llvm/Pass.h>
#include <llvmWrapper/Support/Alignment.h>
#include <llvmWrapper/IR/DerivedTypes.h>
#include <llvm/Support/Debug.h>
#include <llvm/Support/DebugCounter.h>
#include <llvm/Support/raw_ostream.h>
#include "llvm/Support/CommandLine.h"
#include <llvm/Transforms/Utils/Local.h>
#include <optional>
#include "common/LLVMWarningsPop.hpp"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/CISACodeGen/OpenCLKernelCodeGen.hpp"
#include "Compiler/CISACodeGen/SLMConstProp.hpp"
#include "Compiler/IGCPassSupport.h"
#include "Compiler/MetaDataUtilsWrapper.h"
#include "Compiler/CISACodeGen/WIAnalysis.hpp"
#include "Compiler/InitializePasses.h"
#include "Compiler/CISACodeGen/MemOpt.h"
#include "Probe/Assertion.h"
#include <DebugInfo/DwarfDebug.hpp>
#include "MemOptUtils.h"
using namespace llvm;
using namespace IGC;
using namespace IGC::IGCMD;
static cl::opt<bool> EnableRemoveRedBlockreads("remove-red-blockreads", cl::init(false), cl::Hidden,
cl::desc("Enable removal of redundant blockread instructions."));
DEBUG_COUNTER(MergeLoadCounter, "memopt-merge-load", "Controls count of merged loads");
DEBUG_COUNTER(MergeStoreCounter, "memopt-merge-store", "Controls count of merged stores");
namespace {
// This pass merges consecutive loads/stores within a BB when it's safe:
// - Two loads (one of them is denoted as the leading load if it happens
// before the other one in the program order) are safe to be merged, i.e.
// the non-leading load is merged into the leading load, iff there's no
// memory dependency between them which may result in a different loading
// result.
// - Two stores (one of them is denoted as the tailing store if it happens
// after the other one in the program order) are safe to be merged, i.e.
// the non-tailing store is merged into the tailing one, iff there's no
// memory dependency between them which may result in a different result.
//
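// For illustration only (hypothetical IR, not taken from a specific test):
// two adjacent i32 loads from the same base are merged into one <2 x i32>
// load and the original results are recovered with extractelement.
//
// before:
// %a = load i32, i32 addrspace(1)* %p, align 4
// %q = getelementptr i32, i32 addrspace(1)* %p, i64 1
// %b = load i32, i32 addrspace(1)* %q, align 4
//
// after:
// %vp = bitcast i32 addrspace(1)* %p to <2 x i32> addrspace(1)*
// %v = load <2 x i32>, <2 x i32> addrspace(1)* %vp, align 4
// %a = extractelement <2 x i32> %v, i32 0
// %b = extractelement <2 x i32> %v, i32 1
//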
class MemOpt : public FunctionPass {
const DataLayout *DL;
AliasAnalysis *AA;
ScalarEvolution *SE;
WIAnalysis *WI;
CodeGenContext *CGC;
TargetLibraryInfo *TLI;
bool AllowNegativeSymPtrsForLoad = false;
bool AllowVector8LoadStore = false;
// Map of profit vector lengths per scalar type. Each entry specifies the
// profit vector length of a given scalar type.
// NOTE: Prepare the profit vector lengths in the *DESCENDING* order.
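// For example, with buildProfitVectorLengths() below, ProfitVectorLengths[32]
// holds {4, 3, 2} (preceded by 8 when AllowVector8LoadStore is set), so for
// 32-bit scalars the pass first tries to form a 4-element vector, then 3,
// then 2.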
typedef DenseMap<unsigned int, SmallVector<unsigned, 4>> ProfitVectorLengthsMap;
ProfitVectorLengthsMap ProfitVectorLengths;
// A list of memory references (within a BB) with the distance to the beginning of the BB.
typedef std::vector<std::pair<Instruction *, unsigned>> MemRefListTy;
typedef std::vector<Instruction *> TrivialMemRefListTy;
// ALoadInst, Offset, MemRefListTy::iterator
typedef SmallVector<std::tuple<Instruction *, int64_t, MemRefListTy::iterator>, 8> MergeVector;
public:
static char ID;
MemOpt(bool AllowNegativeSymPtrsForLoad = false, bool AllowVector8LoadStore = false)
: FunctionPass(ID), DL(nullptr), AA(nullptr), SE(nullptr), WI(nullptr), CGC(nullptr),
AllowNegativeSymPtrsForLoad(AllowNegativeSymPtrsForLoad), AllowVector8LoadStore(AllowVector8LoadStore) {
initializeMemOptPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
StringRef getPassName() const override { return "MemOpt"; }
private:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<CodeGenContextWrapper>();
AU.addRequired<MetaDataUtilsWrapper>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<WIAnalysis>();
}
void buildProfitVectorLengths(Function &F);
bool mergeLoad(ALoadInst &LeadingLoad, MemRefListTy::iterator MI, MemRefListTy &MemRefs, TrivialMemRefListTy &ToOpt);
bool mergeStore(AStoreInst &LeadingStore, MemRefListTy::iterator MI, MemRefListTy &MemRefs,
TrivialMemRefListTy &ToOpt);
bool removeRedBlockRead(GenIntrinsicInst *LeadingLoad, MemRefListTy::iterator MI, MemRefListTy &MemRefs,
TrivialMemRefListTy &ToOpt, unsigned &SimdSize);
std::optional<unsigned> chainedSelectAndPhis(Instruction *Inst, unsigned depth,
llvm::DenseMap<Instruction *, unsigned> &depthTracking);
void removeVectorBlockRead(Instruction *BlockReadToOptimize, Instruction *BlockReadToRemove, Value *SgId,
llvm::IRBuilder<> &Builder, unsigned &sg_size);
void removeScalarBlockRead(Instruction *BlockReadToOptimize, Instruction *BlockReadToRemove, Value *SgId,
llvm::IRBuilder<> &Builder);
Value *getShuffle(Value *ShflId, Instruction *BlockReadToOptimize, Value *SgId, llvm::IRBuilder<> &Builder,
unsigned &ToOptSize);
unsigned getNumElements(Type *Ty) {
return Ty->isVectorTy() ? (unsigned)cast<IGCLLVM::FixedVectorType>(Ty)->getNumElements() : 1;
}
Type *getVectorElementType(Type *Ty) const {
return isa<VectorType>(Ty) ? cast<VectorType>(Ty)->getElementType() : Ty;
}
bool hasSameSize(Type *A, Type *B) const {
// Shortcut if A is equal to B.
if (A == B)
return true;
return DL->getTypeStoreSize(A) == DL->getTypeStoreSize(B);
}
Value *createBitOrPointerCast(Value *V, Type *DestTy, IGCIRBuilder<> &Builder) {
if (V->getType() == DestTy)
return V;
if (V->getType()->isPointerTy() && DestTy->isPointerTy()) {
PointerType *SrcPtrTy = cast<PointerType>(V->getType());
PointerType *DstPtrTy = cast<PointerType>(DestTy);
if (SrcPtrTy->getPointerAddressSpace() != DstPtrTy->getPointerAddressSpace())
return Builder.CreateAddrSpaceCast(V, DestTy);
}
if (V->getType()->isPointerTy()) {
if (DestTy->isIntegerTy()) {
return Builder.CreatePtrToInt(V, DestTy);
} else if (DestTy->isFloatingPointTy()) {
uint32_t Size = (uint32_t)DestTy->getPrimitiveSizeInBits();
Value *Cast = Builder.CreatePtrToInt(V, Builder.getIntNTy(Size));
return Builder.CreateBitCast(Cast, DestTy);
}
}
if (DestTy->isPointerTy()) {
if (V->getType()->isIntegerTy()) {
return Builder.CreateIntToPtr(V, DestTy);
} else if (V->getType()->isFloatingPointTy()) {
uint32_t Size = (uint32_t)V->getType()->getPrimitiveSizeInBits();
Value *Cast = Builder.CreateBitCast(V, Builder.getIntNTy(Size));
return Builder.CreateIntToPtr(Cast, DestTy);
}
}
return Builder.CreateBitCast(V, DestTy);
}
/**
* @brief Creates a new merge value for merged load from a set of predicated loads' merge values.
*
* This function constructs a new combined merge value by merging the merge values of multiple predicated load
* intrinsics. Merge value from each input predicated load is inserted into the appropriate position in the resulting
* merge vector value, based on its offset and the scalar size. The function handles both scalar and vector merge
* input values.
*
* @param MergeValTy The type of the merged value to be created.
* @param LoadsToMerge A vector of tuples, each containing a load instruction and its associated offset.
* @param LdScalarSize The size (in bytes) of the scalar element being loaded in the combined load.
* @param NumElts Number of elements in the merged value vector.
* @return Value* The newly created merged value, or nullptr if we are merging generic loads, not predicated.
*/
Value *CreateNewMergeValue(IGCIRBuilder<> &Builder, Type *MergeValTy, const MergeVector &LoadsToMerge,
unsigned LdScalarSize, unsigned &NumElts) {
Value *NewMergeValue = UndefValue::get(MergeValTy);
unsigned Pos = 0;
int64_t FirstOffset = std::get<1>(LoadsToMerge.front());
for (auto &I : LoadsToMerge) {
PredicatedLoadIntrinsic *PLI = ALoadInst::get(std::get<0>(I))->getPredicatedLoadIntrinsic();
if (!PLI)
return nullptr;
Value *MergeValue = PLI->getMergeValue();
unsigned MergeValNumElements = getNumElements(MergeValue->getType());
Type *MergeValScalarTy = MergeValTy->getScalarType();
Pos = unsigned((std::get<1>(I) - FirstOffset) / LdScalarSize);
if (MergeValNumElements == 1) {
IGC_ASSERT_MESSAGE(Pos < NumElts, "Index is larger than the number of elements, we cannot update merge value.");
MergeValue = createBitOrPointerCast(MergeValue, MergeValScalarTy, Builder);
NewMergeValue = Builder.CreateInsertElement(NewMergeValue, MergeValue, Builder.getInt32(Pos));
continue;
}
IGC_ASSERT_MESSAGE(Pos + MergeValNumElements <= NumElts,
"Index is larger than the number of elements, we cannot update merge value.");
for (unsigned i = 0; i < MergeValNumElements; ++i) {
Value *ExtractValue = Builder.CreateExtractElement(MergeValue, Builder.getInt32(i));
ExtractValue = createBitOrPointerCast(ExtractValue, MergeValScalarTy, Builder);
NewMergeValue = Builder.CreateInsertElement(NewMergeValue, ExtractValue, Builder.getInt32(Pos + i));
}
}
return NewMergeValue;
}
bool isSafeToMergeLoad(const ALoadInst &Ld, const SmallVectorImpl<Instruction *> &checkList) const;
bool isSafeToMergeStores(const SmallVectorImpl<std::tuple<Instruction *, int64_t, MemRefListTy::iterator>> &Stores,
const SmallVectorImpl<Instruction *> &checkList) const;
bool shouldSkip(const Value *Ptr) const {
PointerType *PtrTy = cast<PointerType>(Ptr->getType());
unsigned AS = PtrTy->getPointerAddressSpace();
if (AS != ADDRESS_SPACE_PRIVATE) {
if (CGC->type != ShaderType::OPENCL_SHADER) {
// For non-OpenCL shader, skip constant buffer accesses.
bool DirectIndex = false;
unsigned BufID = 0;
BufferType BufTy = DecodeAS4GFXResource(AS, DirectIndex, BufID);
if (BufTy == CONSTANT_BUFFER && UsesTypedConstantBuffer(CGC, BufTy))
return true;
}
return false;
}
return false;
}
/// Skip irrelevant instructions.
bool shouldSkip(const Instruction *I) const {
if (!I->mayReadOrWriteMemory())
return true;
if (auto GInst = dyn_cast<GenIntrinsicInst>(I)) {
if (GInst->getIntrinsicID() == GenISAIntrinsic::GenISA_simdBlockRead ||
GInst->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad ||
GInst->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore) {
return shouldSkip(I->getOperand(0));
}
}
if (auto LD = dyn_cast<LoadInst>(I))
return shouldSkip(LD->getPointerOperand());
if (auto ST = dyn_cast<StoreInst>(I))
return shouldSkip(ST->getPointerOperand());
return false;
}
template <typename AccessInstruction>
bool
checkAlignmentBeforeMerge(const AccessInstruction &inst,
SmallVector<std::tuple<Instruction *, int64_t, MemRefListTy::iterator>, 8> &AccessIntrs,
unsigned &NumElts) {
auto alignment = inst.getAlignmentValue();
if (alignment == 0) {
// SROA LLVM pass may sometimes set a load/store alignment to 0. It happens when
// deduced alignment (based on GEP instructions) matches an alignment specified
// in the datalayout for a specific type. It can be problematic as MemOpt merging
// logic is implemented in a way that the product of merging inherits its alignment
// from the leading load/store. This results in creating a memory instruction of a
// different type, without an alignment set, so the information about the
// correct alignment gets lost.
CGC->EmitWarning("MemOpt expects alignment to be always explicitly set for the leading instruction!");
}
if (alignment < 4 && !WI->isUniform(inst.inst())) {
llvm::Type *dataType = inst.getValue()->getType();
unsigned scalarTypeSizeInBytes = unsigned(DL->getTypeSizeInBits(dataType->getScalarType()) / 8);
// Need the first offset value (not necessarily zero)
int64_t firstOffset = std::get<1>(AccessIntrs[0]);
int64_t mergedSize = 0;
for (auto rit = AccessIntrs.rbegin(), rie = AccessIntrs.rend(); rit != rie; ++rit) {
int64_t accessSize = 0;
int64_t cur_offset = std::get<1>(*rit);
auto accessInst = std::get<0>(*rit);
auto AI = AccessInstruction::get(accessInst);
accessSize = int64_t(DL->getTypeSizeInBits(AI->getValue()->getType())) / 8;
mergedSize = cur_offset - firstOffset + accessSize;
// limit the size of merge when alignment < 4
if (mergedSize > 8)
AccessIntrs.pop_back();
else
break;
}
if (AccessIntrs.size() < 2)
return false;
for (auto rit = AccessIntrs.rbegin(), rie = AccessIntrs.rend(); rit != rie; ++rit) {
if (AccessInstruction::get(std::get<0>(*rit))->getAlignmentValue() >= 4)
return false;
}
// Recompute the number of elements of the merged vector from the merged
// byte size (distance from the first offset to the end of the last access).
NumElts = unsigned(mergedSize / scalarTypeSizeInBytes);
}
return true;
}
// This is for enabling the mergeload improvement (comparing the GEP's last
// index instead), as it requires turning off GEP canonicalization.
bool EnableCanonicalizeGEP() const {
IGC_ASSERT(CGC != nullptr);
// The new mergeload improvement is intended for PVC+ for now.
if (CGC->platform.getPlatformInfo().eProductFamily != IGFX_PVC && !CGC->platform.isProductChildOf(IGFX_PVC)) {
// No mergeload improvement
return true;
}
switch (IGC_GET_FLAG_VALUE(MemOptGEPCanon)) {
case 1:
return false;
case 2: {
if (CGC->type == ShaderType::OPENCL_SHADER)
return false;
break;
}
default:
break;
}
return true;
}
/// Canonicalize the calculation of 64-bit pointers by performing the
/// following transformations to help SCEV identify the constant offset
/// between pointers.
///
/// (sext (add.nsw LHS RHS)) => (add.nsw (sext LHS) (sext RHS))
/// (zext (add.nuw LHS RHS)) => (add.nuw (zext LHS) (zext RHS))
///
/// For SLM (and potentially private) memory, we could ignore `nsw`/`nuw`
/// as there are only 32 significant bits.
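///
/// For illustration only (hypothetical IR), canonicalization rewrites
///   %s  = add nsw i32 %a, %b
///   %ix = sext i32 %s to i64
/// into
///   %a64 = sext i32 %a to i64
///   %b64 = sext i32 %b to i64
///   %ix  = add nsw i64 %a64, %b64
/// so SCEV can derive a constant distance between two such indices.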
bool canonicalizeGEP64(Instruction *) const;
/// Optimize the calculation of 64-bit pointer by performing the following
/// transformations to reduce instruction strength.
///
/// (add.nsw (sext LHS) (sext RHS)) => (sext (add.nsw LHS RHS))
/// (add.nuw (zext LHS) (zext RHS)) => (zext (add.nuw LHS RHS))
///
/// In fact, this is the reverse operation of 64-bit pointer
/// canonicalization, which helps SCEV analysis but increases instruction
/// strength on 64-bit integer operations.
bool optimizeGEP64(Instruction *) const;
};
template <int M> struct less_tuple {
template <typename T> bool operator()(const T &LHS, const T &RHS) const {
return std::get<M>(LHS) < std::get<M>(RHS);
}
};
// SymbolicPtr represents how a pointer is calculated from the following
// equation:
//
// Ptr := BasePtr + \sum_i Scale_i * Index_i + Offset
//
// where Scale_i and Offset are constants.
//
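// For illustration only (hypothetical pointer): for a GEP such as
//   %p = getelementptr float, float addrspace(1)* %base, i64 %idx
// where %idx = sext i32 (add nsw i32 %i, 2), the decomposition would be
//   BasePtr = %base, one Term with Index = %i (sign-extended), Scale = 4,
//   and Offset = 8 (2 elements * 4 bytes).
// Two pointers sharing the same BasePtr and Terms then differ by the
// constant delta of their Offsets, which getConstantOffset() reports.
//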
enum ExtensionKind {
EK_NotExtended,
EK_SignExt,
EK_ZeroExt,
};
typedef PointerIntPair<Value *, 2, ExtensionKind> SymbolicIndex;
struct Term {
SymbolicIndex Idx;
int64_t Scale;
bool operator==(const Term &Other) const { return Idx == Other.Idx && Scale == Other.Scale; }
bool operator!=(const Term &Other) const { return !operator==(Other); }
};
struct SymbolicPointer {
const Value *BasePtr;
int64_t Offset;
SmallVector<Term, 8> Terms;
bool getConstantOffset(SymbolicPointer &Other, int64_t &Off);
static Value *getLinearExpression(Value *Val, APInt &Scale, APInt &Offset, ExtensionKind &Extension, unsigned Depth,
const DataLayout *DL);
static bool decomposePointer(const Value *Ptr, SymbolicPointer &SymPtr, CodeGenContext *DL);
static const unsigned MaxLookupSearchDepth = 6;
private:
void saveTerm(Value *Src, int64_t IndexScale, uint64_t Scale, int64_t IndexOffset, ExtensionKind Extension,
unsigned int ptrSize);
bool checkTerms(const Term *T, const Term *OtherT, int64_t &Off) const;
};
} // namespace
FunctionPass *IGC::createMemOptPass(bool AllowNegativeSymPtrsForLoad, bool AllowVector8LoadStore) {
return new MemOpt(AllowNegativeSymPtrsForLoad, AllowVector8LoadStore);
}
#define PASS_FLAG "igc-memopt"
#define PASS_DESC "IGC Memory Optimization"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(MemOpt, PASS_FLAG, PASS_DESC, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
IGC_INITIALIZE_PASS_END(MemOpt, PASS_FLAG, PASS_DESC, PASS_CFG_ONLY, PASS_ANALYSIS)
char MemOpt::ID = 0;
void MemOpt::buildProfitVectorLengths(Function &F) {
ProfitVectorLengths.clear();
if (AllowVector8LoadStore) {
ProfitVectorLengths[64].push_back(4);
ProfitVectorLengths[32].push_back(8);
}
// 64-bit integer
ProfitVectorLengths[64].push_back(2);
// 32-bit integer and Float
ProfitVectorLengths[32].push_back(4);
ProfitVectorLengths[32].push_back(3);
ProfitVectorLengths[32].push_back(2);
// 16-bit integer and Half
ProfitVectorLengths[16].push_back(8);
ProfitVectorLengths[16].push_back(6);
ProfitVectorLengths[16].push_back(4);
ProfitVectorLengths[16].push_back(2);
// 8-bit integer
ProfitVectorLengths[8].push_back(16);
ProfitVectorLengths[8].push_back(12);
ProfitVectorLengths[8].push_back(8);
ProfitVectorLengths[8].push_back(4);
ProfitVectorLengths[8].push_back(2);
}
bool MemOpt::runOnFunction(Function &F) {
// Skip non-kernel functions.
MetaDataUtils *MDU = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
auto FII = MDU->findFunctionsInfoItem(&F);
if (FII == MDU->end_FunctionsInfo())
return false;
DL = &F.getParent()->getDataLayout();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
WI = &getAnalysis<WIAnalysis>();
CGC = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
if (ProfitVectorLengths.empty())
buildProfitVectorLengths(F);
// If LdStCombining is on, no need to do memopt.
const bool DisableMergeStore = (doLdStCombine(CGC) && IGC_IS_FLAG_ENABLED(DisableMergeStore));
bool Changed = false;
IGC::IGCMD::FunctionInfoMetaDataHandle funcInfoMD = MDU->getFunctionsInfoItem(&F);
unsigned SimdSize = funcInfoMD->getSubGroupSize()->getSIMDSize();
for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) {
// Find all instructions with memory reference. Remember the distance one
// by one.
BasicBlock *BB = &*BBI;
MemRefListTy MemRefs;
TrivialMemRefListTy MemRefsToOptimize;
unsigned Distance = 0;
for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI, ++Distance) {
Instruction *I = &(*BI);
// Make sure we don't count debug info intrinsics
// This is required to keep debug and non-debug optimizations identical
if (isDbgIntrinsic(I)) {
Distance--;
continue;
}
// Skip irrelevant instructions.
if (shouldSkip(I))
continue;
MemRefs.push_back(std::make_pair(I, Distance));
}
// Skip BBs with fewer than 2 memory references.
if (MemRefs.size() < 2)
continue;
if (EnableCanonicalizeGEP()) {
// Canonicalize 64-bit GEP to help SCEV find constant offset by
// distributing `zext`/`sext` over safe expressions.
for (auto &M : MemRefs)
Changed |= canonicalizeGEP64(M.first);
}
for (auto MI = MemRefs.begin(), ME = MemRefs.end(); MI != ME; ++MI) {
Instruction *I = MI->first;
// Skip already merged one.
if (!I)
continue;
if (auto ALI = ALoadInst::get(I); ALI.has_value())
Changed |= mergeLoad(ALI.value(), MI, MemRefs, MemRefsToOptimize);
else if (auto ASI = AStoreInst::get(I); ASI.has_value()) {
if (!DisableMergeStore)
Changed |= mergeStore(ASI.value(), MI, MemRefs, MemRefsToOptimize);
} else if (EnableRemoveRedBlockreads) {
if (GenIntrinsicInst *GInst = dyn_cast<GenIntrinsicInst>(I)) {
if (GInst->getIntrinsicID() == GenISAIntrinsic::GenISA_simdBlockRead) {
Changed |= removeRedBlockRead(GInst, MI, MemRefs, MemRefsToOptimize, SimdSize);
}
}
}
}
if (EnableCanonicalizeGEP()) {
// Optimize 64-bit GEP to reduce strength by factoring out `zext`/`sext`
// over safe expressions.
for (auto I : MemRefsToOptimize)
Changed |= optimizeGEP64(I);
}
}
DL = nullptr;
AA = nullptr;
SE = nullptr;
return Changed;
}
// This function removes redundant blockread instructions
// if they read from addresses with the same base.
// It replaces each redundant blockread with a set of shuffle instructions.
//
// For example,
//
// before:
// %0 = inttoptr i64 %i64input to i32 addrspace(1)*
// %1 = inttoptr i64 %i64input to i8 addrspace(1)*
// %2 = call i32 @llvm.genx.GenISA.simdBlockRead.i32.p1i32(i32 addrspace(1)* %0)
// %3 = call i8 @llvm.genx.GenISA.simdBlockRead.i8.p1i8(i8 addrspace(1)* %1)
// store i32 %2, i32 addrspace(1)* %i32addr, align 4
// store i8 %3, i8 addrspace(1)* %i8addr, align 1
//
// after:
// %0 = inttoptr i64 %i64input to i32 addrspace(1)*
// %1 = inttoptr i64 %i64input to i8 addrspace(1)*
// %2 = call i32 @llvm.genx.GenISA.simdBlockRead.i32.p1i32(i32 addrspace(1)* %0)
// %3 = call i16 @llvm.genx.GenISA.simdLaneId()
// %4 = zext i16 %3 to i32
// %5 = lshr i32 %4, 2
// %6 = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %2, i32 %5, i32 0)
// %7 = and i32 %4, 3
// %8 = mul i32 %7, 8
// %9 = lshr i32 %6, %8
// %10 = trunc i32 %9 to i8
// store i32 %2, i32 addrspace(1)* %i32addr, align 4
// store i8 %10, i8 addrspace(1)* %i8addr, align 1
bool MemOpt::removeRedBlockRead(GenIntrinsicInst *LeadingBlockRead, MemRefListTy::iterator aMI, MemRefListTy &MemRefs,
TrivialMemRefListTy &ToOpt, unsigned &sg_size) {
MemRefListTy::iterator MI = aMI;
const unsigned Limit = IGC_GET_FLAG_VALUE(MemOptWindowSize);
const unsigned windowEnd = Limit + MI->second;
auto ME = MemRefs.end();
MemoryLocation LeadingBlockReadMemLoc = getLocation(cast<Instruction>(LeadingBlockRead), TLI);
Type *LeadingBlockReadType = LeadingBlockRead->getType();
Value *LeadingBlockReadBase = LeadingBlockRead->getOperand(0)->stripPointerCasts();
Instruction *BlockReadToOptimize = LeadingBlockRead;
MemRefListTy::iterator MIToOpt = aMI;
llvm::SmallVector<std::tuple<Instruction *, MemRefListTy::iterator>, 8> BlockReadToRemove;
uint64_t MaxBlockReadSize = LeadingBlockReadType->getPrimitiveSizeInBits();
// Go through MemRefs to collect blockreads that can be removed.
for (++MI; MI != ME && MI->second <= windowEnd; ++MI) {
Instruction *NextMemRef = MI->first;
if (!NextMemRef) {
continue;
}
if (GenIntrinsicInst *GInst = dyn_cast<GenIntrinsicInst>(NextMemRef)) {
if (GInst->getIntrinsicID() == GenISAIntrinsic::GenISA_simdBlockRead) {
Type *GInstType = GInst->getType();
uint64_t NextSize = GInstType->getPrimitiveSizeInBits();
Value *NextBlockReadBase = NextMemRef->getOperand(0)->stripPointerCasts();
if (isa<IntToPtrInst>(LeadingBlockReadBase) && isa<IntToPtrInst>(NextBlockReadBase)) {
LeadingBlockReadBase = cast<IntToPtrInst>(LeadingBlockReadBase)->getOperand(0);
NextBlockReadBase = cast<IntToPtrInst>(NextBlockReadBase)->getOperand(0);
}
if (LeadingBlockReadBase == NextBlockReadBase) {
if (NextSize > MaxBlockReadSize) {
BlockReadToRemove.push_back(std::make_tuple(BlockReadToOptimize, MIToOpt));
MaxBlockReadSize = NextSize;
BlockReadToOptimize = NextMemRef;
MIToOpt = MI;
} else {
BlockReadToRemove.push_back(std::make_tuple(NextMemRef, MI));
}
}
}
} else if (NextMemRef->mayWriteToMemory()) {
MemoryLocation WriteInstrMemLoc = getLocation(NextMemRef, TLI);
if (!WriteInstrMemLoc.Ptr || !LeadingBlockReadMemLoc.Ptr || AA->alias(WriteInstrMemLoc, LeadingBlockReadMemLoc)) {
break;
}
}
}
if (BlockReadToRemove.size() == 0) {
return false;
}
IRBuilder<> Builder(LeadingBlockRead);
// Hoist the blockread that we will keep to the position of the leading blockread.
if (BlockReadToOptimize != LeadingBlockRead) {
Type *ArgType = BlockReadToOptimize->getOperand(0)->getType();
BlockReadToOptimize->moveBefore(LeadingBlockRead);
Builder.SetInsertPoint(BlockReadToOptimize);
Value *BitCast = Builder.CreateBitCast(LeadingBlockRead->getOperand(0), ArgType);
BlockReadToOptimize->setOperand(0, BitCast);
aMI->first = BlockReadToOptimize;
}
Builder.SetInsertPoint(BlockReadToOptimize->getNextNonDebugInstruction());
Value *subgroupLocalInvocationId = nullptr;
// Go through the collected blockreads to replace them with shuffles
for (const auto &ITuple : BlockReadToRemove) {
Instruction *I = std::get<0>(ITuple);
if (BlockReadToOptimize != I) {
if (!subgroupLocalInvocationId) {
Function *simdLaneIdIntrinsic =
GenISAIntrinsic::getDeclaration(BlockReadToOptimize->getModule(), GenISAIntrinsic::GenISA_simdLaneId);
subgroupLocalInvocationId =
Builder.CreateZExtOrTrunc(Builder.CreateCall(simdLaneIdIntrinsic), Builder.getInt32Ty());
}
// Case when one of the blockreads is a vector
if (I->getType()->isVectorTy() || BlockReadToOptimize->getType()->isVectorTy()) {
MemOpt::removeVectorBlockRead(BlockReadToOptimize, I, subgroupLocalInvocationId, Builder, sg_size);
} // Case when both blockreads are scalars
else {
MemOpt::removeScalarBlockRead(BlockReadToOptimize, I, subgroupLocalInvocationId, Builder);
}
std::get<1>(ITuple)->first = nullptr;
I->eraseFromParent();
Builder.SetInsertPoint(BlockReadToOptimize->getNextNonDebugInstruction());
}
}
aMI->first = BlockReadToOptimize;
return true;
}
// Removes redundant blockread if both blockreads are scalar.
void MemOpt::removeScalarBlockRead(Instruction *BlockReadToOptimize, Instruction *BlockReadToRemove, Value *SgId,
llvm::IRBuilder<> &Builder) {
Type *BlockReadToOptType = BlockReadToOptimize->getType();
unsigned ToOptSize = (unsigned)(BlockReadToOptType->getPrimitiveSizeInBits());
Type *BlockReadToRemoveType = BlockReadToRemove->getType();
int rat = (int)(ToOptSize / (2 * BlockReadToRemoveType->getPrimitiveSizeInBits()));
Value *LShr = Builder.CreateLShr(SgId, Builder.getInt32(rat));
Value *shuffle = getShuffle(LShr, BlockReadToOptimize, SgId, Builder, ToOptSize);
Value *and_instr = Builder.CreateAnd(SgId, Builder.getInt32(rat * 2 - 1));
Value *shift = Builder.CreateMul(and_instr, Builder.getInt32((int)(BlockReadToRemoveType->getPrimitiveSizeInBits())));
Value *extr_elem = Builder.CreateLShr(shuffle, Builder.CreateZExtOrTrunc(shift, BlockReadToOptType));
Value *TypeConvInstr = Builder.CreateTrunc(extr_elem, cast<Type>(BlockReadToRemoveType));
BlockReadToRemove->replaceAllUsesWith(TypeConvInstr);
}
// Removes redundant blockreads if one of the pair is a vector blockread.
void MemOpt::removeVectorBlockRead(Instruction *BlockReadToOptimize, Instruction *BlockReadToRemove, Value *SgId,
llvm::IRBuilder<> &Builder, unsigned &sg_size) {
Type *BlockReadToOptType = BlockReadToOptimize->getType();
Type *BlockReadToRemoveType = BlockReadToRemove->getType();
unsigned ToOptSize = BlockReadToOptType->getScalarSizeInBits();
if (BlockReadToOptType->getScalarSizeInBits() < BlockReadToRemoveType->getScalarSizeInBits()) {
unsigned step = BlockReadToRemoveType->getScalarSizeInBits() / BlockReadToOptType->getScalarSizeInBits();
unsigned ToRemoveNumElem = getNumElements(BlockReadToRemoveType);
Type *ElemType = getVectorElementType(BlockReadToRemoveType);
Function *shufflefn =
GenISAIntrinsic::getDeclaration(BlockReadToOptimize->getModule(), GenISAIntrinsic::GenISA_WaveShuffleIndex,
getVectorElementType(BlockReadToOptType));
unsigned LimitElem = step * ToRemoveNumElem;
std::vector<Instruction *> ExtractElemInstrVector;
// Extracting elements from BlockReadToOptimize to use them in shuffles
for (unsigned i = 0; i < LimitElem; i++) {
Instruction *ExtrElemInstr =
cast<Instruction>(Builder.CreateExtractElement(BlockReadToOptimize, Builder.getInt32(i)));
ExtractElemInstrVector.push_back(ExtrElemInstr);
}
Type *NewType = VectorType::get(getVectorElementType(BlockReadToOptType), LimitElem * sg_size, false);
std::vector<Instruction *> ShuffleInstrVector;
Value *CollectedData = nullptr;
// Generating set of shuffles and collecting them in vector
for (unsigned index = 0; index < LimitElem; index++) {
for (unsigned id = 0; id < sg_size; id++) {
SmallVector<Value *, 3> Args;
Args.push_back(cast<Value>(ExtractElemInstrVector[index]));
Args.push_back(Builder.getInt32(id));
Args.push_back(Builder.getInt32(0));
if (index == 0 && id == 0) {
Value *ShuffleInstr = Builder.CreateCall(shufflefn, Args);
Value *InsertIndex = cast<Value>(Builder.getInt64(0));
CollectedData = Builder.CreateInsertElement(UndefValue::get(NewType), ShuffleInstr, InsertIndex);
} else {
Value *ShuffleInstr = Builder.CreateCall(shufflefn, Args);
Value *InsertIndex = cast<Value>(Builder.getInt64(id + index * sg_size));
CollectedData = Builder.CreateInsertElement(CollectedData, ShuffleInstr, InsertIndex);
}
}
}
Value *offset = Builder.CreateMul(SgId, Builder.getInt32(step));
Type *TypeVectForBitCast = VectorType::get(getVectorElementType(BlockReadToOptType), step, false);
Value *ResVect = nullptr;
// Reconstruct the result of the blockread being removed
for (unsigned k = 0; k < ToRemoveNumElem; k++) {
Value *VectForBitCast = nullptr;
Value *Index = Builder.CreateAdd(offset, Builder.getInt32(k * sg_size * step));
for (unsigned i = 0; i < step; i++) {
Value *AddInstr = Builder.CreateAdd(Index, Builder.getInt32(i));
Value *extr_elem = cast<Instruction>(Builder.CreateExtractElement(CollectedData, AddInstr));
if (i == 0) {
VectForBitCast = Builder.CreateInsertElement(UndefValue::get(TypeVectForBitCast), extr_elem,
cast<Value>(Builder.getInt64(0)));
} else {
VectForBitCast = Builder.CreateInsertElement(VectForBitCast, extr_elem, cast<Value>(Builder.getInt64(i)));
}
}
Value *BitCastInstr = Builder.CreateBitCast(VectForBitCast, ElemType);
if (BlockReadToRemoveType->isVectorTy()) {
if (k == 0) {
ResVect = Builder.CreateInsertElement(UndefValue::get(BlockReadToRemoveType), BitCastInstr,
cast<Value>(Builder.getInt64(0)));
} else {
ResVect = Builder.CreateInsertElement(ResVect, BitCastInstr, cast<Value>(Builder.getInt64(k)));
}
} else {
ResVect = BitCastInstr;
}
}
BlockReadToRemove->replaceAllUsesWith(ResVect);
} else if (BlockReadToOptType->getScalarSizeInBits() > BlockReadToRemoveType->getScalarSizeInBits()) {
unsigned step = BlockReadToOptType->getScalarSizeInBits() / BlockReadToRemoveType->getScalarSizeInBits();
unsigned ToRemoveNumElem = getNumElements(BlockReadToRemoveType);
Type *IElemType = getVectorElementType(BlockReadToRemoveType);
unsigned tmp = step;
int pw = 0;
while (tmp >>= 1)
++pw;
Value *SgidDivStep = Builder.CreateLShr(SgId, Builder.getInt32(pw));
Value *SimdDivStep = Builder.CreateLShr(Builder.getInt32(sg_size), Builder.getInt32(pw));
unsigned LimitElem = ToRemoveNumElem / step;
if (ToRemoveNumElem % step) {
LimitElem++;
}
std::vector<Instruction *> ExtractElemInstrVector;
// Extracting elements from BlockReadToOptimize to use them in shuffles
for (unsigned i = 0; i < LimitElem; i++) {
if (BlockReadToOptType->isVectorTy()) {
Instruction *ExtrElemInstr =
cast<Instruction>(Builder.CreateExtractElement(BlockReadToOptimize, Builder.getInt32(i)));
ExtractElemInstrVector.push_back(ExtrElemInstr);
} else {
ExtractElemInstrVector.push_back(BlockReadToOptimize);
}
}
std::vector<Instruction *> ShuffleInstrVector;
unsigned LimitId = step;
if (ToRemoveNumElem < step) {
LimitId = ToRemoveNumElem;
}
// Generating set of shuffles and collecting them in vector
for (unsigned k = 0; k < LimitElem; k++) {
for (unsigned i = 0; i < LimitId; i++) {
Value *SgIdShfl = Builder.CreateAdd(SgidDivStep, Builder.CreateMul(SimdDivStep, Builder.getInt32(i)));
Value *shuffle = getShuffle(SgIdShfl, ExtractElemInstrVector[k], SgId, Builder, ToOptSize);
ShuffleInstrVector.push_back(cast<Instruction>(shuffle));
}
}
unsigned ShufflesNum = LimitElem * LimitId;
Type *TypeVectForBitCast = VectorType::get(IElemType, step, false);
Value *ResVect = nullptr;
// Reconstruct the result of the blockread being removed
for (unsigned ShfflCnt = 0; ShfflCnt < ShufflesNum; ShfflCnt++) {
Value *VectBitcast = Builder.CreateBitCast(ShuffleInstrVector[ShfflCnt], TypeVectForBitCast);
Value *Index = Builder.CreateAnd(SgId, Builder.CreateSub(Builder.getInt32(step), Builder.getInt32(1)));
Value *Elem = Builder.CreateExtractElement(VectBitcast, Index);
if (BlockReadToRemoveType->isVectorTy()) {
if (ShfflCnt == 0) {
ResVect = Builder.CreateInsertElement(UndefValue::get(BlockReadToRemoveType), Elem, Builder.getInt32(0));
} else {
ResVect = Builder.CreateInsertElement(ResVect, Elem, Builder.getInt32(ShfflCnt));
}
} else {
ResVect = Elem;
}
}
BlockReadToRemove->replaceAllUsesWith(ResVect);
} else {
BlockReadToRemove->replaceAllUsesWith(BlockReadToOptimize);
}
}
// This function returns a shuffle instruction (if BlockReadToOptimize's size is < 64 bits)
// or a value that is the concatenation of two shuffle instructions.
Value *MemOpt::getShuffle(Value *ShflId, Instruction *BlockReadToOptimize, Value *SgId, llvm::IRBuilder<> &Builder,
unsigned &ToOptSize) {
Value *shuffle = nullptr;
Type *BlockReadToOptType = BlockReadToOptimize->getType();
if (ToOptSize < 64) {
Type *shufflefntype = getVectorElementType(BlockReadToOptType);
Function *shufflefn = GenISAIntrinsic::getDeclaration(BlockReadToOptimize->getModule(),
GenISAIntrinsic::GenISA_WaveShuffleIndex, shufflefntype);
SmallVector<Value *, 3> Args;
Args.push_back(cast<Value>(BlockReadToOptimize));
Args.push_back(ShflId);
Args.push_back(Builder.getInt32(0));
shuffle = Builder.CreateCall(shufflefn, Args);
} else if (ToOptSize == 64) {
Type *NewType = VectorType::get(Builder.getInt32Ty(), 2, false);
Instruction *BitCastInstr = cast<Instruction>(Builder.CreateBitCast(BlockReadToOptimize, cast<Type>(NewType)));
Instruction *ExtractElemInstr0 = cast<Instruction>(Builder.CreateExtractElement(BitCastInstr, Builder.getInt32(0)));
Instruction *ExtractElemInstr1 = cast<Instruction>(Builder.CreateExtractElement(BitCastInstr, Builder.getInt32(1)));
Function *shufflefn = GenISAIntrinsic::getDeclaration(
BlockReadToOptimize->getModule(), GenISAIntrinsic::GenISA_WaveShuffleIndex, Builder.getInt32Ty());
SmallVector<Value *, 3> Args0;
Args0.push_back(cast<Value>(ExtractElemInstr0));
Args0.push_back(ShflId);
Args0.push_back(Builder.getInt32(0));
Value *shuffle0 = Builder.CreateCall(shufflefn, Args0);
SmallVector<Value *, 3> Args1;
Args1.push_back(cast<Value>(ExtractElemInstr1));
Args1.push_back(ShflId);
Args1.push_back(Builder.getInt32(0));
Value *shuffle1 = Builder.CreateCall(shufflefn, Args1);
Value *ins_elem0 =
Builder.CreateInsertElement(UndefValue::get(NewType), shuffle0, cast<Value>(Builder.getInt64(0)));
Value *ins_elem1 = Builder.CreateInsertElement(ins_elem0, shuffle1, Builder.getInt64(1));
shuffle = Builder.CreateBitCast(ins_elem1, BlockReadToOptType);
}
return shuffle;
}
// The following function "chainedSelectAndPhis" is designed to avoid going into SCEV in special circumstances
// when the shader has a large set of chained phi nodes and selects. One downside of SCEV is that it is a
// recursive approach and can cause a stack overflow when tracing back instructions.
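// For illustration only: a pointer such as
//   %p = select i1 %c, i32 addrspace(1)* %phi.a, i32 addrspace(1)* %phi.b
// where %phi.a/%phi.b are themselves phis of further selects, repeated a few
// hundred levels deep, would make SE->getSCEV(%p) recurse too deeply; this
// pre-check lets mergeLoad() bail out before calling into SCEV.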
std::optional<unsigned> MemOpt::chainedSelectAndPhis(Instruction *Inst, unsigned depth,
llvm::DenseMap<Instruction *, unsigned> &depthTracking) {
// Max depth set to 300
if (depth >= 300) {
return std::nullopt;
}
if (auto I = depthTracking.find(Inst); I != depthTracking.end()) {
if ((depth + I->second) >= 300)
return std::nullopt;
return I->second;
}
unsigned MaxRemDepth = 0;
for (auto &operand : Inst->operands()) {
if (auto *op_inst = dyn_cast<Instruction>(operand)) {
if (isa<PHINode>(op_inst) || isa<SelectInst>(op_inst)) {
std::optional<unsigned> RemDepth = chainedSelectAndPhis(op_inst, depth + 1, depthTracking);
if (!RemDepth)
return std::nullopt;
MaxRemDepth = std::max(MaxRemDepth, *RemDepth + 1);
}
}
}
depthTracking[Inst] = MaxRemDepth;
return MaxRemDepth;
}
bool MemOpt::mergeLoad(ALoadInst &LeadingLoad, MemRefListTy::iterator aMI, MemRefListTy &MemRefs,
TrivialMemRefListTy &ToOpt) {
MemRefListTy::iterator MI = aMI;
// For cases like the following:
// ix0 = sext i32 a0 to i64
// addr0 = gep base, i64 ix0
//
// ix1 = sext i32 a1 to i64
// addr1 = gep base, i64 ix1
// Since SCEV does not do well with sext/zext/longer expressions when
// comparing addr0 with addr1, this function compares a0 with a1 instead.
// In doing so, it skips sext/zext and compares only the last index (thus a
// shorter expression). The condition for doing so is that all indices are
// identical except the last one.
//
// Return value: byte offset to LeadLastIdx. Return 0 if unknown.
auto getGEPIdxDiffIfAppliable = [this](const SCEV *&LeadLastIdx, ALoadInst &LeadLd, ALoadInst &NextLd) {
// Only handle single-index GEP for now.
auto LeadGEP = dyn_cast<GetElementPtrInst>(LeadLd.getPointerOperand());
auto NextGEP = dyn_cast<GetElementPtrInst>(NextLd.getPointerOperand());
if (LeadGEP && NextGEP && LeadGEP->getPointerOperand() == NextGEP->getPointerOperand() &&
LeadGEP->getNumIndices() == NextGEP->getNumIndices() && LeadLd.getType() == NextLd.getType() &&
LeadGEP->getResultElementType() == NextGEP->getResultElementType() && LeadGEP->getNumIndices() > 0) {
const int N = LeadGEP->getNumIndices();
for (int i = 1; i < N; ++i) {
// GEP 0:base, 1:1st_index, 2:2nd_index, ..., N:Nth_index
Value *ix0 = LeadGEP->getOperand(i);
Value *ix1 = NextGEP->getOperand(i);
if (ix0 == ix1)
continue;
ConstantInt *Cix0 = dyn_cast<ConstantInt>(ix0);
ConstantInt *Cix1 = dyn_cast<ConstantInt>(ix1);
if (Cix0 && Cix1 && Cix0->getSExtValue() == Cix1->getSExtValue())
continue;
// don't handle, skip
return (int64_t)0;
}
// Make sure the last index indexes into an array (the indexed type is the
// array element type).
// For N = 1, the type is an implicit array of the pointee type
// of the GEP's pointer operand. But for N > 1, we need to check, as the
// last index might be into a struct.
if (N > 1) {
// get type of the second index from the last.
SmallVector<Value *, 4> Indices(LeadGEP->idx_begin(), std::prev(LeadGEP->idx_end()));
Type *srcEltTy = LeadGEP->getSourceElementType();
Type *Idx2Ty = GetElementPtrInst::getIndexedType(srcEltTy, Indices);
if (!Idx2Ty || !Idx2Ty->isArrayTy())
return (int64_t)0;
}
CastInst *lastIx0 = dyn_cast<CastInst>(LeadGEP->getOperand(N));
CastInst *lastIx1 = dyn_cast<CastInst>(NextGEP->getOperand(N));
if (lastIx0 && lastIx1 && lastIx0->getOpcode() == lastIx1->getOpcode() &&
(isa<SExtInst>(lastIx0) || isa<ZExtInst>(lastIx0)) && lastIx0->getType() == lastIx1->getType() &&
lastIx0->getSrcTy() == lastIx1->getSrcTy()) {
if (!LeadLastIdx)
LeadLastIdx = SE->getSCEV(lastIx0->getOperand(0));
const SCEV *NextIdx = SE->getSCEV(lastIx1->getOperand(0));
auto Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV(NextIdx, LeadLastIdx));
if (Diff) {
// This returns 16 for <3 x i32>, not 12!
uint32_t EltBytes = (uint32_t)DL->getTypeAllocSize(NextGEP->getResultElementType());
int64_t eltDiff = Diff->getValue()->getSExtValue();
return (int64_t)(eltDiff * EltBytes);
}
}
}
return (int64_t)0;
};
// Push the leading load into the list to be optimized (after
// canonicalization.) It will be swapped with the new one if it's merged.
ToOpt.push_back(LeadingLoad.inst());
if (!LeadingLoad.isSimple())
return false;
Type *LeadingLoadType = LeadingLoad.getType();
if (LeadingLoadType->isPointerTy()) {
unsigned int AS = LeadingLoadType->getPointerAddressSpace();
if (CGC->getRegisterPointerSizeInBits(AS) != DL->getPointerSizeInBits(AS)) {
// we cannot coalesce pointers which have been reduced as they are
// bigger in memory than in register
return false;
}
}
Type *LeadingLoadScalarType = LeadingLoadType->getScalarType();
unsigned TypeSizeInBits = unsigned(DL->getTypeSizeInBits(LeadingLoadScalarType));
if (!ProfitVectorLengths.count(TypeSizeInBits))
return false;
SmallVector<unsigned, 8> profitVec;
// FIXME: Enable for OCL shader only as other clients have regressions but
// there's no way to trace down.
bool isUniformLoad = (CGC->type == ShaderType::OPENCL_SHADER) && (WI->isUniform(LeadingLoad.inst()));
if (isUniformLoad) {
unsigned C = IGC_GET_FLAG_VALUE(UniformMemOpt4OW);
C = (C == 1) ? 512 : 256;
C /= TypeSizeInBits;
for (; C >= 2; --C)
profitVec.push_back(C);
} else {
SmallVector<unsigned, 4> &Vec = ProfitVectorLengths[TypeSizeInBits];
profitVec.append(Vec.begin(), Vec.end());
}
unsigned LdSize = unsigned(DL->getTypeStoreSize(LeadingLoadType));
unsigned LdScalarSize = unsigned(DL->getTypeStoreSize(LeadingLoadScalarType));
// NumElts: num of elts if all candidates are actually merged.
unsigned NumElts = getNumElements(LeadingLoadType);
if (NumElts > profitVec[0])
return false;
if (auto *Ptr = dyn_cast<Instruction>(LeadingLoad.getPointerOperand())) {
llvm::DenseMap<Instruction *, unsigned> depthTracking;
if (!chainedSelectAndPhis(Ptr, 0, depthTracking)) {
return false;
}
}
const SCEV *LeadingPtr = SE->getSCEV(LeadingLoad.getPointerOperand());
if (isa<SCEVCouldNotCompute>(LeadingPtr))
return false;
const SCEV *LeadingLastIdx = nullptr; // set on-demand
bool DoCmpOnLastIdx = false;
if (!EnableCanonicalizeGEP()) {
auto aGEP = dyn_cast<GetElementPtrInst>(LeadingLoad.getPointerOperand());
if (aGEP && aGEP->hasIndices()) {
// index starts from 1
Value *ix = aGEP->getOperand(aGEP->getNumIndices());
DoCmpOnLastIdx = (isa<SExtInst>(ix) || isa<ZExtInst>(ix));
}
}
// ALoadInst, Offset, MemRefListTy::iterator
MergeVector LoadsToMerge;
LoadsToMerge.push_back(std::make_tuple(LeadingLoad.inst(), 0, MI));
// Loads to be merged are scanned in the program order and will be merged into
// the leading load. So two edges of that consecutive region are checked
// against the leading load, i.e.
// - the left-side edge, the leading load to the first load (mergable load
// with the minimal offset)
// - the right-side edge, the last load (mergable load with the maximal
// offset) to the leading load.
//
// A check list is maintained from the leading load to the current
// instruction as the list of instructions which may read or write memory but
// are not able to be merged into that leading load. Since we merge
// consecutive loads into the leading load, that check list is accumulated
// and each consecutive load needs to check against that accumulated check
// list.
// Two edges of the region where loads are merged into.
int64_t HighestOffset = LdSize;
int64_t LowestOffset = 0;
// List of instructions need dependency check.
SmallVector<Instruction *, 8> CheckList;
const unsigned Limit = IGC_GET_FLAG_VALUE(MemOptWindowSize);
// Given that the start position of the window is MI->second,
// the end position of the window is "Limit + window start".
const unsigned windowEnd = Limit + MI->second;
auto ME = MemRefs.end();
for (++MI; MI != ME && MI->second <= windowEnd; ++MI) {
Instruction *NextMemRef = MI->first;
// Skip already merged one.
if (!NextMemRef)
continue;
CheckList.push_back(NextMemRef);
auto NextLoad = ALoadInst::get(NextMemRef);
// Skip non-load instruction.
if (!NextLoad.has_value())
continue;
// Bail out if that load is not a simple one.
if (!NextLoad->isSimple())
break;
// Skip if that load is from different address spaces.
if (NextLoad->getPointerAddressSpace() != LeadingLoad.getPointerAddressSpace())
continue;
// Skip if predicates are different (for non-predicated load, predicate
// is nullptr, so this check also filters out combination of predicated
// and non-predicated loads)
if (NextLoad->getPredicate() != LeadingLoad.getPredicate())
continue;
Type *NextLoadType = NextLoad->getType();
// Skip if they have different sizes.
if (!hasSameSize(NextLoadType->getScalarType(), LeadingLoadScalarType))
continue;
const SCEV *NextPtr = SE->getSCEV(NextLoad->getPointerOperand());
if (isa<SCEVCouldNotCompute>(NextPtr))
continue;
int64_t Off = 0;
const SCEVConstant *Offset = dyn_cast<SCEVConstant>(SE->getMinusSCEV(NextPtr, LeadingPtr));
// If addr cmp fails, try whether index cmp can be applied.
if (DoCmpOnLastIdx && Offset == nullptr)
Off = getGEPIdxDiffIfAppliable(LeadingLastIdx, LeadingLoad, NextLoad.value());
// Skip load with non-constant distance.
// If Off != 0, it is already a constant via index cmp
if (Off == 0) {
if (!Offset) {
SymbolicPointer LeadingSymPtr;
SymbolicPointer NextSymPtr;
if (SymbolicPointer::decomposePointer(LeadingLoad.getPointerOperand(), LeadingSymPtr, CGC) ||
SymbolicPointer::decomposePointer(NextLoad->getPointerOperand(), NextSymPtr, CGC) ||
NextSymPtr.getConstantOffset(LeadingSymPtr, Off)) {
continue;
} else {
if (!AllowNegativeSymPtrsForLoad && LeadingSymPtr.Offset < 0)
continue;
}
} else {
Off = Offset->getValue()->getSExtValue();
}
}
unsigned NextLoadSize = unsigned(DL->getTypeStoreSize(NextLoadType));
// By assuming dead load elimination always works correctly, if the load on
// the same location is observed again, that is probably because there is
// an instruction with global effect between them. Bail out directly.
if (Off == 0 && LdSize == NextLoadSize)
break;
int64_t newHighestOffset = std::max(Off + NextLoadSize, HighestOffset);
int64_t newLowestOffset = std::min(Off, LowestOffset);
uint64_t newNumElts = uint64_t((newHighestOffset - newLowestOffset) / LdScalarSize);
// Ensure that the total size read evenly divides the element type.
// For example, we could have a packed struct <{i64, i32, i64}> that
// would compute a size of 20 but, without this guard, would set
// 'NumElts' to 2 as if the i32 wasn't present.
if (uint64_t(newHighestOffset - newLowestOffset) % LdScalarSize != 0)
continue;
// Bail out if the resulting vector load is already not profitable.
if (newNumElts > profitVec[0])
continue;
HighestOffset = newHighestOffset;
LowestOffset = newLowestOffset;
NumElts = static_cast<unsigned>(newNumElts);
// This load is to be merged. Remove it from check list.
CheckList.pop_back();
// If the candidate load cannot be safely merged, merge mergable loads
// currently found.
if (!isSafeToMergeLoad(NextLoad.value(), CheckList))
break;
LoadsToMerge.push_back(std::make_tuple(NextLoad->inst(), Off, MI));
}
unsigned s = LoadsToMerge.size();
if (s < 2)
return false;
IGCIRBuilder<> Builder(LeadingLoad.inst());
// Start to merge loads.
IGC_ASSERT_MESSAGE(1 < NumElts, "It's expected to merge into at least 2-element vector!");
// Sort loads based on their offsets (to the leading load) from the smallest to the largest.
// And then try to find the profitable vector length first.
std::sort(LoadsToMerge.begin(), LoadsToMerge.end(), less_tuple<1>());
unsigned MaxElts = profitVec[0];
for (unsigned k = 1, e = profitVec.size(); NumElts != MaxElts && k != e && s != 1;) {
// Try next legal vector length.
while (NumElts < MaxElts && k != e) {
MaxElts = profitVec[k++];
}
if (EnableCanonicalizeGEP()) {
// Guard under the key to distinguish new code (GEPCanon is off) from the old.
// Note: not sure about the reason for the following check.
if (NumElts == 3 && (LeadingLoadScalarType->isIntegerTy(16) || LeadingLoadScalarType->isHalfTy())) {
return false;
}
}
// Try remove loads to be merged.
while (NumElts > MaxElts && s != 1) {
Type *Ty = std::get<0>(LoadsToMerge[--s])->getType();
NumElts -= getNumElements(Ty);
}
}
if (NumElts != MaxElts || s < 2)
return false;
LoadsToMerge.resize(s);
// Loads to be merged will be merged into the leading load. However, the
// pointer from the first load (with the minimal offset) will be used as the
// new pointer.
ALoadInst FirstLoad = ALoadInst::get(std::get<0>(LoadsToMerge.front())).value();
int64_t FirstOffset = std::get<1>(LoadsToMerge.front());
IGC_ASSERT_MESSAGE(FirstOffset <= 0, "The 1st load should be either the leading load or load with smaller offset!");
// Next we need to check alignment
if (!checkAlignmentBeforeMerge(FirstLoad, LoadsToMerge, NumElts))
return false;
if (!DebugCounter::shouldExecute(MergeLoadCounter))
return false;
// Calculate the new pointer. If the leading load is not the first load,
// re-calculate it from the leading pointer.
// Alternatively, we could schedule instructions calculating the first
// pointer ahead of the leading load. But it's much simpler to re-calculate
// it due to the constant offset.
Value *Ptr = LeadingLoad.getPointerOperand();
if (FirstOffset < 0) {
// If the first load is not the leading load, re-calculate the pointer
// from the pointer of the leading load.
IGC_ASSERT(LdScalarSize);
IGC_ASSERT_MESSAGE(FirstOffset % LdScalarSize == 0, "Remainder is expected to be 0!");
Value *Idx = Builder.getInt64(FirstOffset / LdScalarSize);
Type *Ty = PointerType::get(LeadingLoadScalarType, LeadingLoad.getPointerAddressSpace());
Ptr = Builder.CreateBitCast(Ptr, Ty);
GEPOperator *FirstGEP = dyn_cast<GEPOperator>(FirstLoad.getPointerOperand());
if (FirstGEP && FirstGEP->isInBounds())
Ptr = Builder.CreateInBoundsGEP(LeadingLoadScalarType, Ptr, Idx);
else
Ptr = Builder.CreateGEP(LeadingLoadScalarType, Ptr, Idx);
}
Type *NewLoadType = IGCLLVM::FixedVectorType::get(LeadingLoadScalarType, NumElts);
Type *NewPointerType = PointerType::get(NewLoadType, LeadingLoad.getPointerAddressSpace());
Value *NewPointer = Builder.CreateBitCast(Ptr, NewPointerType);
// Prepare Merge Value if needed:
Value *NewMergeValue = CreateNewMergeValue(Builder, NewLoadType, LoadsToMerge, LdScalarSize, NumElts);
Instruction *NewLoad = FirstLoad.CreateAlignedLoad(Builder, NewLoadType, NewPointer, NewMergeValue);
NewLoad->setDebugLoc(LeadingLoad.inst()->getDebugLoc());
// Unpack the merged load value to its uses. For original vector loads, extracting
// and inserting is necessary to avoid tracking uses of each element in the
// original vector load value.
unsigned Pos = 0;
MDNode *mdLoadInv = nullptr;
bool allInvariantLoads = true;
MDNode *nonTempMD = LeadingLoad.inst()->getMetadata("nontemporal");
for (auto &I : LoadsToMerge) {
Type *Ty = std::get<0>(I)->getType();
Type *ScalarTy = Ty->getScalarType();
IGC_ASSERT(hasSameSize(ScalarTy, LeadingLoadScalarType));
mdLoadInv = std::get<0>(I)->getMetadata(LLVMContext::MD_invariant_load);
if (!mdLoadInv) {
allInvariantLoads = false;
}
nonTempMD = MDNode::concatenate(std::get<0>(I)->getMetadata("nontemporal"), nonTempMD);
Pos = unsigned((std::get<1>(I) - FirstOffset) / LdScalarSize);
if (Ty->isVectorTy()) {
if (Pos + cast<IGCLLVM::FixedVectorType>(Ty)->getNumElements() > NumElts) {
// This implies we're trying to extract an element from our new load
// with an index beyond the size of the new load. If this happens,
// we still generate correct code, since we don't remove the
// original load for this element.
continue;
}
Value *Val = UndefValue::get(Ty);
for (unsigned i = 0, e = (unsigned)cast<IGCLLVM::FixedVectorType>(Ty)->getNumElements(); i != e; ++i) {
Value *Ex = Builder.CreateExtractElement(NewLoad, Builder.getInt32(Pos + i));
Ex = createBitOrPointerCast(Ex, ScalarTy, Builder);
Val = Builder.CreateInsertElement(Val, Ex, Builder.getInt32(i));
}
std::get<0>(I)->replaceAllUsesWith(Val);
} else {
if (Pos + 1 > NumElts) {
continue;
}
Value *Val = Builder.CreateExtractElement(NewLoad, Builder.getInt32(Pos));
Val = createBitOrPointerCast(Val, ScalarTy, Builder);
std::get<0>(I)->replaceAllUsesWith(Val);
}
}
if (allInvariantLoads) {
NewLoad->setMetadata(LLVMContext::MD_invariant_load, mdLoadInv);
}
// Transfer !nontemporal metadata to the new load
if (nonTempMD) {
NewLoad->setMetadata("nontemporal", nonTempMD);
}
// Replace the list to be optimized with the new load.
Instruction *NewOne = NewLoad;
std::swap(ToOpt.back(), NewOne);
for (auto &I : LoadsToMerge) {
ALoadInst LD = ALoadInst::get(std::get<0>(I)).value();
Value *Ptr = LD.getPointerOperand();
// make sure the load was merged before actually removing it
if (LD.inst()->use_empty()) {
LD.inst()->eraseFromParent();
}
RecursivelyDeleteTriviallyDeadInstructions(Ptr);
// Mark it as already merged.
// Also, skip updating distance as the Window size is just a heuristic.
std::get<2>(I)->first = nullptr;
}
// Add merged load into the leading load position in MemRefListTy
// so that MemRefList is still valid and can be reused.
aMI->first = NewOne;
return true;
}
bool MemOpt::mergeStore(AStoreInst &LeadingStore, MemRefListTy::iterator MI, MemRefListTy &MemRefs,
TrivialMemRefListTy &ToOpt) {
// Push the leading store into the list to be optimized (after
// canonicalization.) It will be swapped with the new one if it's merged.
ToOpt.push_back(LeadingStore.inst());
if (!LeadingStore.isSimple())
return false;
if (LeadingStore.getValueOperand()->getType()->isPointerTy()) {
unsigned AS = LeadingStore.getValueOperand()->getType()->getPointerAddressSpace();
if (CGC->getRegisterPointerSizeInBits(AS) != DL->getPointerSizeInBits(AS)) {
// we cannot coalesce pointers which have been reduced as they are
// bigger in memory than in register
return false;
}
}
unsigned NumElts = 0;
Value *LeadingStoreVal = LeadingStore.getValueOperand();
Type *LeadingStoreType = LeadingStoreVal->getType();
Type *LeadingStoreScalarType = LeadingStoreType->getScalarType();
unsigned StSize = unsigned(DL->getTypeStoreSize(LeadingStoreType));
unsigned typeSizeInBits = unsigned(DL->getTypeSizeInBits(LeadingStoreScalarType));
if (!ProfitVectorLengths.count(typeSizeInBits))
return false;
SmallVector<unsigned, 4> &profitVec = ProfitVectorLengths[typeSizeInBits];
NumElts += getNumElements(LeadingStoreType);
if (NumElts >= profitVec[0])
return false;
const SCEV *LeadingPtr = SE->getSCEV(LeadingStore.getPointerOperand());
if (isa<SCEVCouldNotCompute>(LeadingPtr))
return false;
// AStoreInst, Offset, MemRefListTy::iterator
SmallVector<std::tuple<Instruction *, int64_t, MemRefListTy::iterator>, 8> StoresToMerge;
StoresToMerge.push_back(std::make_tuple(LeadingStore.inst(), 0, MI));
// Stores to be merged are scanned in the program order from the leading store
// but need to be merged into the tailing store. So two edges of that
// consecutive region are checked against the leading store, i.e.
// - the left-side edge, the leading store to the first store (mergable store
// with the minimal offset)
// - the right-side edge, the last store (mergable store with the maximal
// offset) to the leading store.
//
// A check list is maintained from a previous tailing mergable store to the
// new tailing store instruction because all those stores will be merged into
// the new tailing store. That is, we need to check all mergable stores each
// time a "new" tailing store is found. However, that check list need not
// accumulate as we have already checked that all stores to be merged are safe to
// be merged into the "previous" tailing store.
// Two edges of the region where stores are merged into.
int64_t LastToLeading = StSize, LastToLeading4Transpose = 0;
int64_t LeadingToFirst = 0;
// List of instructions need dependency check.
SmallVector<Instruction *, 8> CheckList;
const unsigned Limit = IGC_GET_FLAG_VALUE(MemOptWindowSize);
// Given that the start position of the window is MI->second,
// the end position of the window is "Limit + window start".
const unsigned windowEnd = Limit + MI->second;
auto ME = MemRefs.end();
for (++MI; MI != ME && MI->second <= windowEnd; ++MI) {
Instruction *NextMemRef = MI->first;
// Skip already merged one.
if (!NextMemRef)
continue;
CheckList.push_back(NextMemRef);
std::optional<AStoreInst> NextStore = AStoreInst::get(NextMemRef);
// Skip non-store instruction.
if (!NextStore.has_value())
continue;
// Bail out if that store is not a simple one.
if (!NextStore->isSimple())
break;
// Skip if that store is from different address spaces.
if (NextStore->getPointerAddressSpace() != LeadingStore.getPointerAddressSpace())
continue;
// Skip if it is a predicated store and predicates are different
// (for non-predicated store, predicate is nullptr, so this check also
// filters out combination of predicated and non-predicated stores)
if (NextStore->getPredicate() != LeadingStore.getPredicate())
continue;
Value *NextStoreVal = NextStore->getValueOperand();
Type *NextStoreType = NextStoreVal->getType();
// Skip if they have different sizes.
if (!hasSameSize(NextStoreType->getScalarType(), LeadingStoreScalarType))
continue;
const SCEV *NextPtr = SE->getSCEV(NextStore->getPointerOperand());
if (isa<SCEVCouldNotCompute>(NextPtr))
continue;
int64_t Off = 0;
const SCEVConstant *Offset = dyn_cast<SCEVConstant>(SE->getMinusSCEV(NextPtr, LeadingPtr));
// Skip store with non-constant distance.
if (!Offset) {
SymbolicPointer LeadingSymPtr;
SymbolicPointer NextSymPtr;
if (SymbolicPointer::decomposePointer(LeadingStore.getPointerOperand(), LeadingSymPtr, CGC) ||
SymbolicPointer::decomposePointer(NextStore->getPointerOperand(), NextSymPtr, CGC) ||
NextSymPtr.getConstantOffset(LeadingSymPtr, Off))
continue;
} else
Off = Offset->getValue()->getSExtValue();
// By assuming dead store elimination always works correctly, if the store
// on the same location is observed again, that is probably because there
// is an instruction with global effect between them. Bail out directly.
if (Off == 0)
break;
unsigned NextStoreSize = unsigned(DL->getTypeStoreSize(NextStoreType));
// Check it's consecutive to the current stores to be merged.
if ((Off > 0 && Off != LastToLeading) || (Off < 0 && (-Off) != (LeadingToFirst + NextStoreSize)))
continue;
NumElts += getNumElements(NextStoreType);
// Bail out if the resulting vector store is already not profitable.
if (NumElts > profitVec[0])
break;
// This store is to be merged. Remove it from check list.
CheckList.pop_back();
// If the candidate store cannot be safely merged, merge mergable stores
// currently found.
if (!isSafeToMergeStores(StoresToMerge, CheckList))
break;
// Clear check list.
CheckList.clear();
StoresToMerge.push_back(std::make_tuple(NextStore->inst(), Off, MI));
if (Off > 0) {
LastToLeading = Off + NextStoreSize;
LastToLeading4Transpose = Off;
} else
LeadingToFirst = (-Off);
// Early out if the maximal profitable vector length is reached.
if (NumElts == profitVec[0])
break;
}
unsigned s = StoresToMerge.size();
if (s < 2)
return false;
// Tailing store is always the last one in the program order.
Instruction *TailingStore = std::get<0>(StoresToMerge.back());
IGCIRBuilder<> Builder(TailingStore);
// Start to merge stores.
NumElts = 0;
for (auto &I : StoresToMerge) {
Type *Ty = AStoreInst::get(std::get<0>(I))->getValueOperand()->getType();
NumElts += getNumElements(Ty);
}
IGC_ASSERT_MESSAGE(1 < NumElts, "It's expected to merge into at least 2-element vector!");
// Try to find the profitable vector length first.
unsigned MaxElts = profitVec[0];
for (unsigned k = 1, e = profitVec.size(); NumElts != MaxElts && k != e && s != 1;) {
// Try next legal vector length.
while (NumElts < MaxElts && k != e)
MaxElts = profitVec[k++];
// Try remove stores to be merged.
while (NumElts > MaxElts && s != 1) {
Type *Ty = AStoreInst::get(std::get<0>(StoresToMerge[--s]))->getValueOperand()->getType();
NumElts -= getNumElements(Ty);
}
}
if (NumElts != MaxElts || s < 2)
return false;
// Resize stores to be merged to the profitable length and sort them based on
// their offsets to the leading store.
StoresToMerge.resize(s);
std::sort(StoresToMerge.begin(), StoresToMerge.end(), less_tuple<1>());
// Stores to be merged will be merged into the tailing store. However, the
// pointer from the first store (with the minimal offset) will be used as the
// new pointer.
AStoreInst FirstStore = AStoreInst::get(std::get<0>(StoresToMerge.front())).value();
// Next we need to check alignment
if (!checkAlignmentBeforeMerge(FirstStore, StoresToMerge, NumElts))
return false;
Type *NewStoreType = IGCLLVM::FixedVectorType::get(LeadingStoreScalarType, NumElts);
Value *NewStoreVal = UndefValue::get(NewStoreType);
MDNode *NonTempMD = TailingStore->getMetadata("nontemporal");
// Pack the new store value from the original store values. For original vector
// store values, extracting and inserting is necessary to avoid tracking uses
// of each element in the original vector store value.
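// Illustrative packing (hypothetical values): merging 'store i32 %a' at offset 0
// with 'store <2 x i32> %b' at offset 4 builds a <3 x i32> value:
//   %v0 = inselt undef, %a, 0
//   %e0 = extelt %b, 0 ; %v1 = inselt %v0, %e0, 1
//   %e1 = extelt %b, 1 ; %v2 = inselt %v1, %e1, 2
// and %v2 becomes NewStoreVal.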
unsigned Pos = 0;
for (auto &I : StoresToMerge) {
Value *Val = AStoreInst::get(std::get<0>(I))->getValueOperand();
Type *Ty = Val->getType();
Type *ScalarTy = Ty->getScalarType();
IGC_ASSERT(hasSameSize(ScalarTy, LeadingStoreScalarType));
NonTempMD = MDNode::concatenate(std::get<0>(I)->getMetadata("nontemporal"), NonTempMD);
if (Ty->isVectorTy()) {
for (unsigned i = 0, e = (unsigned)cast<IGCLLVM::FixedVectorType>(Ty)->getNumElements(); i != e; ++i) {
Value *Ex = Builder.CreateExtractElement(Val, Builder.getInt32(i));
Ex = createBitOrPointerCast(Ex, LeadingStoreScalarType, Builder);
NewStoreVal = Builder.CreateInsertElement(NewStoreVal, Ex, Builder.getInt32(Pos++));
}
} else if (Ty->isPointerTy()) {
if (ScalarTy != LeadingStoreScalarType) {
if (LeadingStoreScalarType->isPointerTy()) {
Val = Builder.CreatePointerBitCastOrAddrSpaceCast(Val, LeadingStoreScalarType);
} else {
Val = Builder.CreatePtrToInt(
Val, Type::getIntNTy(Val->getContext(), (unsigned int)LeadingStoreScalarType->getPrimitiveSizeInBits()));
// LeadingStoreScalarType may not be an integer type, bitcast it to
// the appropriate type.
Val = Builder.CreateBitCast(Val, LeadingStoreScalarType);
}
}
NewStoreVal = Builder.CreateInsertElement(NewStoreVal, Val, Builder.getInt32(Pos++));
} else {
Val = createBitOrPointerCast(Val, LeadingStoreScalarType, Builder);
NewStoreVal = Builder.CreateInsertElement(NewStoreVal, Val, Builder.getInt32(Pos++));
}
}
if (!DebugCounter::shouldExecute(MergeStoreCounter))
return false;
// We don't need to recalculate the new pointer as we merge stores to the
// tailing store, which is dominated by all mergable stores' address
// calculations.
Type *NewPointerType = PointerType::get(NewStoreType, LeadingStore.getPointerAddressSpace());
Value *NewPointer = Builder.CreateBitCast(FirstStore.getPointerOperand(), NewPointerType);
Instruction *NewStore = FirstStore.CreateAlignedStore(Builder, NewStoreVal, NewPointer);
NewStore->setDebugLoc(TailingStore->getDebugLoc());
// Transfer !nontemporal metadata to the new store
if (NonTempMD)
NewStore->setMetadata("nontemporal", NonTempMD);
// Clone metadata
llvm::SmallVector<std::pair<unsigned, llvm::MDNode *>, 4> MDs;
TailingStore->getAllMetadata(MDs);
for (llvm::SmallVectorImpl<std::pair<unsigned, llvm::MDNode *>>::iterator MI = MDs.begin(), ME = MDs.end(); MI != ME;
++MI) {
NewStore->setMetadata(MI->first, MI->second);
}
// Replace the list to be optimized with the new store.
Instruction *NewOne = NewStore;
std::swap(ToOpt.back(), NewOne);
for (auto &I : StoresToMerge) {
AStoreInst ST = AStoreInst::get(std::get<0>(I)).value();
Value *Ptr = ST.getPointerOperand();
// Stores merged in the previous iterations can get merged again, so we need
// to update ToOpt vector to avoid null instruction in there
ToOpt.erase(std::remove(ToOpt.begin(), ToOpt.end(), ST.inst()), ToOpt.end());
ST.inst()->eraseFromParent();
RecursivelyDeleteTriviallyDeadInstructions(Ptr);
// Also, skip updating distance as the Window size is just a heuristic.
if (std::get<2>(I)->first == TailingStore)
// Write NewStore to MemRefs so that isSafeToMergeLoad works correctly.
// For example, if MemRefs contains this sequence: S1, S2, S3, L5, L6, L7, S4, L4,
// then after the store merge MemRefs contains: L5, L6, L7, S1234, L4, and the loads
// are merged to L567, so the final instruction sequence is L567, S1234, L4.
// Otherwise the loads could be merged into the sequence L4567, S1234 with
// unordered L4, S4 accesses.
std::get<2>(I)->first = NewStore;
else {
// Mark it as already merged.
std::get<2>(I)->first = nullptr;
}
}
return true;
}
/// isSafeToMergeLoad() - checks whether there is any alias from the specified
/// load to any one in the check list, which may write to that location.
bool MemOpt::isSafeToMergeLoad(const ALoadInst &Ld, const SmallVectorImpl<Instruction *> &CheckList) const {
MemoryLocation A = getLocation(Ld.inst(), TLI);
for (auto *I : CheckList) {
// Skip instructions never writing to memory.
if (!I->mayWriteToMemory())
continue;
MemoryLocation B = getLocation(I, TLI);
if (!A.Ptr || !B.Ptr || AA->alias(A, B))
return false;
}
return true;
}
/// isSafeToMergeStores() - checks whether there is any alias from the
/// specified store set to any one in the check list, which may read/write to
/// that location.
bool MemOpt::isSafeToMergeStores(
const SmallVectorImpl<std::tuple<Instruction *, int64_t, MemRefListTy::iterator>> &Stores,
const SmallVectorImpl<Instruction *> &CheckList) const {
// Arrange CheckList as the outer loop to favor the case where there are
// back-to-back stores only.
for (auto *I : CheckList) {
if (I->getMetadata(LLVMContext::MD_invariant_load))
continue;
MemoryLocation A = getLocation(I, TLI);
for (auto &S : Stores) {
MemoryLocation B = getLocation(std::get<0>(S), TLI);
if (!A.Ptr || !B.Ptr || AA->alias(A, B))
return false;
}
}
return true;
}
class ExtOperator : public Operator {
public:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::SExt || I->getOpcode() == Instruction::ZExt;
}
static inline bool classof(const ConstantExpr *CE) {
return CE->getOpcode() == Instruction::SExt || CE->getOpcode() == Instruction::ZExt;
}
static inline bool classof(const Value *V) {
return (isa<Instruction>(V) && classof(cast<Instruction>(V))) ||
(isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V)));
}
bool isZExt() const { return getOpcode() == Instruction::ZExt; }
bool isSExt() const { return getOpcode() == Instruction::SExt; }
~ExtOperator() = delete;
};
class OverflowingAdditiveOperator : public Operator {
public:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Add || I->getOpcode() == Instruction::Sub;
}
static inline bool classof(const ConstantExpr *CE) {
return CE->getOpcode() == Instruction::Add || CE->getOpcode() == Instruction::Sub;
}
static inline bool classof(const Value *V) {
return (isa<Instruction>(V) && classof(cast<Instruction>(V))) ||
(isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V)));
}
bool hasNoUnsignedWrap() const { return cast<OverflowingBinaryOperator>(this)->hasNoUnsignedWrap(); }
bool hasNoSignedWrap() const { return cast<OverflowingBinaryOperator>(this)->hasNoSignedWrap(); }
~OverflowingAdditiveOperator() = delete;
};
class OrOperator : public ConcreteOperator<BinaryOperator, Instruction::Or> {
OrOperator() = delete;
OrOperator(const OrOperator &) = delete;
OrOperator &operator=(const OrOperator &) = delete;
~OrOperator() = delete;
};
class BitCastOperator : public ConcreteOperator<Operator, Instruction::BitCast> {
~BitCastOperator() = delete;
};
bool MemOpt::canonicalizeGEP64(Instruction *I) const {
Value *Ptr = nullptr;
if (auto ALI = ALoadInst::get(I); ALI.has_value())
Ptr = ALI->getPointerOperand();
else if (auto ASI = AStoreInst::get(I); ASI.has_value())
Ptr = ASI->getPointerOperand();
// Skip non 64-bit or non GEP-based pointers if any.
if (auto Cast = dyn_cast_or_null<llvm::BitCastOperator>(Ptr))
Ptr = Cast->getOperand(0);
GEPOperator *GEPOp = dyn_cast_or_null<GEPOperator>(Ptr);
if (!GEPOp)
return false;
if (CGC->getRegisterPointerSizeInBits(GEPOp->getPointerAddressSpace()) != 64)
return false;
bool Changed = false;
for (auto U = GEPOp->idx_begin(), E = GEPOp->idx_end(); U != E; ++U) {
Value *Idx = U->get();
Type *IdxTy = Idx->getType();
IRBuilder<> Builder(isa<Instruction>(GEPOp) ? cast<Instruction>(GEPOp) : I);
if (!IdxTy->isIntegerTy(64))
continue;
auto ExtOp = dyn_cast<ExtOperator>(Idx);
if (!ExtOp)
continue;
auto CastOpcode = Instruction::CastOps(ExtOp->getOpcode());
// Distribute `ext` over binary operator with corresponding `nsw`/`nuw`
// flags.
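// For example (illustrative IR): with a zext'ed nuw add used as a 64-bit GEP index,
//   %s = add nuw i32 %a, %b
//   %i = zext i32 %s to i64
// the index is rewritten into
//   %a64 = zext i32 %a to i64
//   %b64 = zext i32 %b to i64
//   %i64 = add nuw i64 %a64, %b64
// (the `or` case below is handled similarly by treating it as an add).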
auto BinOp = dyn_cast<OverflowingAdditiveOperator>(ExtOp->getOperand(0));
if (!BinOp) {
auto OrOp = dyn_cast<OrOperator>(ExtOp->getOperand(0));
if (!OrOp)
continue;
Value *LHS = OrOp->getOperand(0);
Value *RHS = OrOp->getOperand(1);
ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS);
if (!RHSC || !MaskedValueIsZero(LHS, RHSC->getValue(), *DL))
continue;
// Treat `or` as `add.nsw` or `add.nuw`.
LHS = Builder.CreateCast(CastOpcode, LHS, IdxTy);
RHS = Builder.CreateCast(CastOpcode, RHS, IdxTy);
bool HasNUW = ExtOp->isZExt();
bool HasNSW = ExtOp->isSExt();
U->set(Builder.CreateAdd(LHS, RHS, ".or", HasNUW, HasNSW));
RecursivelyDeleteTriviallyDeadInstructions(ExtOp);
Changed = true;
} else if ((ExtOp->isSExt() && BinOp->hasNoSignedWrap()) || (ExtOp->isZExt() && BinOp->hasNoUnsignedWrap())) {
Value *BinOpVal = cast<Value>(BinOp);
// We want to check if we should create a separate BinOp instruction for this gep instruction.
bool NeedToChangeBinOp = BinOpVal->hasOneUse();
if (NeedToChangeBinOp)
Builder.SetInsertPoint(cast<Instruction>(ExtOp));
auto BinOpcode = BinaryOperator::BinaryOps(BinOp->getOpcode());
Value *LHS = BinOp->getOperand(0);
Value *RHS = BinOp->getOperand(1);
LHS = Builder.CreateCast(CastOpcode, LHS, IdxTy);
RHS = Builder.CreateCast(CastOpcode, RHS, IdxTy);
auto BO = Builder.CreateBinOp(BinOpcode, LHS, RHS);
// BO can be a constant if both sides are constants
if (auto BOP = dyn_cast<BinaryOperator>(BO)) {
if (BinOp->hasNoUnsignedWrap())
BOP->setHasNoUnsignedWrap();
if (BinOp->hasNoSignedWrap())
BOP->setHasNoSignedWrap();
}
if (NeedToChangeBinOp)
ExtOp->replaceAllUsesWith(BO);
U->set(BO);
RecursivelyDeleteTriviallyDeadInstructions(ExtOp);
Changed = true;
}
}
return Changed;
}
bool MemOpt::optimizeGEP64(Instruction *I) const {
Value *Ptr = nullptr;
if (auto ALI = ALoadInst::get(I); ALI.has_value())
Ptr = ALI->getPointerOperand();
else if (auto ASI = AStoreInst::get(I); ASI.has_value())
Ptr = ASI->getPointerOperand();
// Skip non 64-bit or non GEP-based pointers if any.
if (auto Cast = dyn_cast_or_null<llvm::BitCastOperator>(Ptr))
Ptr = Cast->getOperand(0);
GEPOperator *GEPOp = dyn_cast_or_null<GEPOperator>(Ptr);
if (!GEPOp)
return false;
if (CGC->getRegisterPointerSizeInBits(GEPOp->getPointerAddressSpace()) != 64)
return false;
IRBuilder<> Builder(isa<Instruction>(GEPOp) ? cast<Instruction>(GEPOp) : I);
bool Changed = false;
for (auto U = GEPOp->idx_begin(), E = GEPOp->idx_end(); U != E; ++U) {
Value *Idx = U->get();
Type *IdxTy = Idx->getType();
if (!IdxTy->isIntegerTy(64))
continue;
// Factor out `ext` through binary operator with corresponding `nsw`/`nuw`
// flags.
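// For example (illustrative IR): a 64-bit GEP index of the form
//   %i64 = add nsw i64 (sext i32 %a to i64), (sext i32 %b to i64)
// is rewritten into
//   %s   = add nsw i32 %a, %b
//   %i64 = sext i32 %s to i64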
auto BinOp = dyn_cast<OverflowingAdditiveOperator>(Idx);
if (!BinOp)
continue;
auto BinOpcode = BinaryOperator::BinaryOps(BinOp->getOpcode());
Value *LHS = BinOp->getOperand(0);
Value *RHS = BinOp->getOperand(1);
auto ExtOp0 = dyn_cast<ExtOperator>(LHS);
if (!ExtOp0)
continue;
auto CastOpcode = Instruction::CastOps(ExtOp0->getOpcode());
auto ExtOp1 = dyn_cast<ExtOperator>(RHS);
if (ExtOp1 && ExtOp0->getOpcode() == ExtOp1->getOpcode() &&
((ExtOp0->isZExt() && BinOp->hasNoUnsignedWrap()) || (ExtOp0->isSExt() && BinOp->hasNoSignedWrap()))) {
LHS = ExtOp0->getOperand(0);
RHS = ExtOp1->getOperand(0);
unsigned LHSBitWidth = LHS->getType()->getIntegerBitWidth();
unsigned RHSBitWidth = RHS->getType()->getIntegerBitWidth();
unsigned BitWidth = std::max(LHSBitWidth, RHSBitWidth);
// Either LHS or RHS may have smaller integer, extend them before
// creating `binop` over them.
if (LHSBitWidth < BitWidth) {
Type *Ty = Builder.getIntNTy(BitWidth);
LHS = Builder.CreateCast(CastOpcode, LHS, Ty);
}
if (RHSBitWidth < BitWidth) {
Type *Ty = Builder.getIntNTy(BitWidth);
RHS = Builder.CreateCast(CastOpcode, RHS, Ty);
}
} else if (isa<ConstantInt>(RHS)) {
LHS = ExtOp0->getOperand(0);
unsigned BitWidth = LHS->getType()->getIntegerBitWidth();
APInt Val = cast<ConstantInt>(RHS)->getValue();
if (!((ExtOp0->isZExt() && Val.isIntN(BitWidth)) || (ExtOp0->isSExt() && Val.isSignedIntN(BitWidth))))
continue;
if (!((ExtOp0->isZExt() && BinOp->hasNoUnsignedWrap()) || (ExtOp0->isSExt() && BinOp->hasNoSignedWrap())))
continue;
LHS = ExtOp0->getOperand(0);
RHS = Builder.CreateTrunc(RHS, LHS->getType());
} else
continue;
auto BO = cast<BinaryOperator>(Builder.CreateBinOp(BinOpcode, LHS, RHS));
if (BinOp->hasNoUnsignedWrap())
BO->setHasNoUnsignedWrap();
if (BinOp->hasNoSignedWrap())
BO->setHasNoSignedWrap();
U->set(Builder.CreateCast(CastOpcode, BO, IdxTy));
RecursivelyDeleteTriviallyDeadInstructions(BinOp);
Changed = true;
}
return Changed;
}
// getConstantOffset - Compute the constant offset between two memory
// locations into Off. Returns false on success and true if the offset
// cannot be determined.
bool SymbolicPointer::getConstantOffset(SymbolicPointer &Other, int64_t &Off) {
Term *DiffTerm = nullptr;
Term *DiffOtherTerm = nullptr;
// Find how many differences there are between the two vectors of terms.
auto findDifferences = [&](SmallVector<Term, 8> &Terms1, SmallVector<Term, 8> &Terms2) -> int {
int DiffCount = 0;
for (unsigned i = 0, e = Terms1.size(); i != e; ++i) {
bool Found = false;
for (unsigned j = 0, f = Terms2.size(); !Found && j != f; ++j)
if (Terms1[i] == Terms2[j])
Found = true;
if (!Found) {
DiffCount++;
if (DiffCount > 1)
break;
DiffTerm = &Terms1[i];
}
}
// If there are no differences, no need to check further.
if (DiffCount == 0)
return DiffCount;
for (unsigned i = 0, e = Terms2.size(); i != e; ++i) {
bool Found = false;
for (unsigned j = 0, f = Terms1.size(); !Found && j != f; ++j)
if (Terms2[i] == Terms1[j])
Found = true;
if (!Found) {
DiffOtherTerm = &Terms2[i];
break;
}
}
return DiffCount;
};
if (!BasePtr || !Other.BasePtr)
return true;
if (BasePtr != Other.BasePtr && (!isa<ConstantPointerNull>(BasePtr) || !isa<ConstantPointerNull>(Other.BasePtr)))
return true;
if (Terms.size() != Other.Terms.size())
return true;
int DiffCount = findDifferences(Terms, Other.Terms);
if (DiffCount > 1)
return true;
Off = Offset - Other.Offset;
if (DiffCount == 0)
return false;
if (checkTerms(DiffTerm, DiffOtherTerm, Off))
return true;
return false;
}
// Try to match the pattern that can't be processed by the current decomposePointer algorithm.
// First chain:
// %145 = add nsw i32 %102, 1
// %146 = sub nsw i32 %145, %const_reg_dword18
//
// Second chain:
// %176 = add nsw i32 %102, 2
// %177 = sub nsw i32 %176, %const_reg_dword18
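// Illustratively, for the two chains above the only differing term is the
// inner add constant (1 vs 2); checkTerms folds the difference of those
// constants, scaled by the term's Scale, into Off, while the remaining
// terms cancel out.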
bool SymbolicPointer::checkTerms(const Term *T, const Term *OtherT, int64_t &Off) const {
bool IsPositive = true;
size_t OpNum = 0;
// Check that the instructions are add or sub with nsw flag.
auto checkInstructions = [&](const BinaryOperator *Inst0, const BinaryOperator *Inst1) -> bool {
if (!Inst0 || !Inst1)
return true;
if (Inst1->getOpcode() != Inst0->getOpcode())
return true;
if (Inst0->getOpcode() != Instruction::Add && Inst0->getOpcode() != Instruction::Sub)
return true;
if (!Inst0->hasNoSignedWrap() || !Inst1->hasNoSignedWrap())
return true;
if (Inst0->getOperand(0) != Inst1->getOperand(0) && Inst0->getOperand(1) != Inst1->getOperand(1))
return true;
if (Inst0->getOperand(0) == Inst1->getOperand(0) && Inst0->getOperand(1) == Inst1->getOperand(1)) {
OpNum = 3;
return false;
}
if (Inst0->getOperand(0) == Inst1->getOperand(0)) {
OpNum = 1;
} else {
OpNum = 0;
}
if (Inst0->getOpcode() == Instruction::Sub)
if (Inst0->getOperand(0) == Inst1->getOperand(0))
IsPositive = !IsPositive;
return false;
};
if (!T || !OtherT)
return true;
auto *Inst = dyn_cast<BinaryOperator>(T->Idx.getPointer());
auto *OtherInst = dyn_cast<BinaryOperator>(OtherT->Idx.getPointer());
if (checkInstructions(Inst, OtherInst))
return true;
auto InstOp0 = dyn_cast<BinaryOperator>(Inst->getOperand(OpNum));
auto OtherInstOp0 = dyn_cast<BinaryOperator>(OtherInst->getOperand(OpNum));
if (checkInstructions(InstOp0, OtherInstOp0))
return true;
if (OpNum == 3)
return false;
auto ConstInt = dyn_cast<ConstantInt>(InstOp0->getOperand(OpNum));
auto OtherConstInt = dyn_cast<ConstantInt>(OtherInstOp0->getOperand(OpNum));
if (!ConstInt || !OtherConstInt)
return true;
int64_t NewScale = T->Scale;
int64_t NewOtherScale = OtherT->Scale;
if (!IsPositive) {
NewScale = -NewScale;
NewOtherScale = -NewOtherScale;
}
Off += ConstInt->getSExtValue() * NewScale - OtherConstInt->getSExtValue() * NewOtherScale;
return false;
}
// Save Term in the vector of terms.
void SymbolicPointer::saveTerm(Value *Src, int64_t IndexScale, uint64_t Scale, int64_t IndexOffset,
ExtensionKind Extension, unsigned int ptrSize) {
this->Offset += IndexOffset * Scale;
Scale *= IndexScale;
SymbolicIndex Idx(Src, Extension);
// If we already had an occurrence of this index variable, merge this
// scale into it. For example, we want to handle:
// A[x][x] -> x*16 + x*4 -> x*20
// This also ensures that 'x' only appears in the index list once.
for (unsigned i = 0, e = this->Terms.size(); i != e; ++i) {
if (this->Terms[i].Idx == Idx) {
Scale += this->Terms[i].Scale;
this->Terms.erase(this->Terms.begin() + i);
break;
}
}
// Make sure that we have a scale that makes sense for this target's
// pointer size.
if (unsigned ShiftBits = 64 - ptrSize) {
Scale <<= ShiftBits;
Scale = (int64_t)Scale >> ShiftBits;
}
if (Scale) {
Term Entry = {Idx, int64_t(Scale)};
this->Terms.push_back(Entry);
}
}
Value *SymbolicPointer::getLinearExpression(Value *V, APInt &Scale, APInt &Offset, ExtensionKind &Extension,
unsigned Depth, const DataLayout *DL) {
IGC_ASSERT(nullptr != V);
IGC_ASSERT(nullptr != V->getType());
IGC_ASSERT_MESSAGE(V->getType()->isIntegerTy(), "Not an integer value");
// Limit our recursion depth.
if (Depth == 16) {
Scale = 1;
Offset = 0;
return V;
}
if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) {
if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
switch (BOp->getOpcode()) {
default:
break;
case Instruction::Or:
// X|C == X+C if all the bits in C are unset in X. Otherwise we can't
// analyze it.
if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), *DL))
break;
// FALL THROUGH.
case Instruction::Add:
V = getLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, Depth + 1, DL);
Offset += RHSC->getValue();
return V;
case Instruction::Mul:
V = getLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, Depth + 1, DL);
Offset *= RHSC->getValue();
Scale *= RHSC->getValue();
return V;
case Instruction::Shl:
V = getLinearExpression(BOp->getOperand(0), Scale, Offset, Extension, Depth + 1, DL);
Offset <<= unsigned(RHSC->getValue().getLimitedValue());
Scale <<= unsigned(RHSC->getValue().getLimitedValue());
return V;
}
}
}
// Since GEP indices are sign extended anyway, we don't care about the high
// bits of a sign or zero extended value - just scales and offsets. The
// extensions have to be consistent though.
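// Illustrative example (hypothetical IR), starting from EK_NotExtended:
//   %m = mul i32 %x, 4
//   %i = sext i32 %m to i64
// getLinearExpression(%i, ...) returns %x with Scale = 4, Offset = 0 and
// Extension = EK_SignExt.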
if ((isa<SExtInst>(V) && Extension != EK_ZeroExt) || (isa<ZExtInst>(V) && Extension != EK_SignExt)) {
Value *CastOp = cast<CastInst>(V)->getOperand(0);
unsigned OldWidth = Scale.getBitWidth();
unsigned SmallWidth = (unsigned int)CastOp->getType()->getPrimitiveSizeInBits();
Scale = Scale.trunc(SmallWidth);
Offset = Offset.trunc(SmallWidth);
Extension = isa<SExtInst>(V) ? EK_SignExt : EK_ZeroExt;
Value *Result = getLinearExpression(CastOp, Scale, Offset, Extension, Depth + 1, DL);
Scale = Scale.zext(OldWidth);
if (Extension == EK_SignExt)
Offset = Offset.sext(OldWidth);
else
Offset = Offset.zext(OldWidth);
return Result;
}
Scale = 1;
Offset = 0;
return V;
}
class IntToPtrOperator : public ConcreteOperator<Operator, Instruction::IntToPtr> {
~IntToPtrOperator() = delete;
};
bool SymbolicPointer::decomposePointer(const Value *Ptr, SymbolicPointer &SymPtr, CodeGenContext *pContext) {
unsigned MaxLookup = MaxLookupSearchDepth;
const DataLayout *DL = &pContext->getModule()->getDataLayout();
SymPtr.Offset = 0;
SymPtr.BasePtr = nullptr;
do {
const Operator *Op = dyn_cast<Operator>(Ptr);
if (!Op) {
// The only non-operator case we can handle are GlobalAliases.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
if (!GA->isInterposable()) {
Ptr = GA->getAliasee();
continue;
}
}
SymPtr.BasePtr = Ptr;
return false;
}
if (Op->getOpcode() == Instruction::BitCast || Op->getOpcode() == Instruction::AddrSpaceCast) {
Ptr = Op->getOperand(0);
continue;
}
const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op);
if (!GEPOp) {
// If it's not a GEP, hand it off to simplifyInstruction to see if it
// can come up with something. This matches what GetUnderlyingObject does.
if (const Instruction *I = dyn_cast<Instruction>(Ptr))
// TODO: Get a DominatorTree and use it here.
if (const Value *Simplified = IGCLLVM::simplifyInstruction(const_cast<Instruction *>(I), *DL)) {
Ptr = Simplified;
continue;
}
// IntToPtr is treated like gep(i8* 0, Src).
// TODO: Unify the common handling of IntToPtr & GEP into a single
// routine.
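// Illustrative examples (hypothetical IR):
//   inttoptr (i64 42)           -> null base, constant offset 42
//   inttoptr (add i64 %p, %i)   -> base %p plus a symbolic term for %i
//   inttoptr (add i64 %p, 16)   -> null base, constant offset 16 plus a
//                                  symbolic term for %p (the RHS is constant,
//                                  so the LHS is not taken as the base).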
if (const IntToPtrOperator *I2POp = dyn_cast<IntToPtrOperator>(Op)) {
PointerType *PtrTy = cast<PointerType>(I2POp->getType());
unsigned int ptrSize = pContext->getRegisterPointerSizeInBits(PtrTy->getAddressSpace());
Value *Src = I2POp->getOperand(0);
Value *BasePtr = ConstantPointerNull::get(PtrTy);
// Constant pointer.
if (ConstantInt *CI = dyn_cast<ConstantInt>(Src)) {
SymPtr.Offset += CI->getSExtValue();
SymPtr.BasePtr = BasePtr;
return false;
}
// Treat that like (inttoptr (add (base offset)))
if (AddOperator *Add = dyn_cast<AddOperator>(Src)) {
// Note that we always assume LHS as the base and RHS as the offset.
// That's why GEP is invented in LLVM IR as the pointer arithmetic in
// C is always in form of (base + offset). By designating the base
// pointer, we won't run into the case where both operands are
// symmetric in `add` instruction.
if (!isa<ConstantInt>(Add->getOperand(1))) {
BasePtr = Add->getOperand(0);
Src = Add->getOperand(1);
}
}
uint64_t Scale = 1;
ExtensionKind Extension = EK_NotExtended;
unsigned Width = Src->getType()->getIntegerBitWidth();
if (ptrSize > Width)
Extension = EK_SignExt;
APInt IndexScale(Width, 0), IndexOffset(Width, 0);
Src = getLinearExpression(Src, IndexScale, IndexOffset, Extension, 0U, DL);
SymPtr.saveTerm(Src, IndexScale.getSExtValue(), Scale, IndexOffset.getSExtValue(), Extension, ptrSize);
Ptr = BasePtr;
}
SymPtr.BasePtr = Ptr;
return false;
}
// Don't attempt to analyze GEPs over unsized objects.
if (!GEPOp->getSourceElementType()->isSized()) {
SymPtr.BasePtr = Ptr;
return false;
}
// If we are lacking DataLayout information, we can't compute the offsets of
// elements computed by GEPs. However, we can handle bitcast equivalent
// GEPs.
if (!DL) {
if (!GEPOp->hasAllZeroIndices()) {
SymPtr.BasePtr = Ptr;
return false;
}
Ptr = GEPOp->getOperand(0);
continue;
}
unsigned int ptrSize = pContext->getRegisterPointerSizeInBits(GEPOp->getPointerAddressSpace());
// Walk the indices of the GEP, accumulating them into BaseOff/VarIndices.
gep_type_iterator GTI = gep_type_begin(GEPOp);
for (User::const_op_iterator I = GEPOp->op_begin() + 1, E = GEPOp->op_end(); I != E; ++I, ++GTI) {
Value *Index = *I;
// Compute the (potentially symbolic) offset in bytes for this index.
if (StructType *STy = GTI.getStructTypeOrNull()) {
// For a struct, add the member offset.
unsigned FieldNo = unsigned(cast<ConstantInt>(Index)->getZExtValue());
if (FieldNo == 0)
continue;
SymPtr.Offset += DL->getStructLayout(STy)->getElementOffset(FieldNo);
continue;
}
// For an array/pointer, add the element offset, explicitly scaled.
if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) {
if (CIdx->isZero())
continue;
SymPtr.Offset += DL->getTypeAllocSize(GTI.getIndexedType()) * CIdx->getSExtValue();
continue;
}
// In some cases the GEP might have indices that don't directly carry a base
// offset; we need to dig deeper to find them.
std::vector<Value *> terms = {Index};
if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(Index)) {
if (!(dyn_cast<ConstantInt>(BOp->getOperand(1))) && BOp->getOpcode() == Instruction::Add) {
terms.clear();
terms.push_back(BOp->getOperand(0));
terms.push_back(BOp->getOperand(1));
}
}
for (auto Ind : terms) {
uint64_t Scale = DL->getTypeAllocSize(GTI.getIndexedType());
ExtensionKind Extension = EK_NotExtended;
// If the integer type is smaller than the pointer size, it is implicitly
// sign extended to pointer size.
unsigned Width = Index->getType()->getIntegerBitWidth();
if (ptrSize > Width)
Extension = EK_SignExt;
// Use getLinearExpression to decompose the index into a C1*V+C2 form.
APInt IndexScale(Width, 0), IndexOffset(Width, 0);
Value *new_Ind = getLinearExpression(Ind, IndexScale, IndexOffset, Extension, 0U, DL);
// The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
// This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
SymPtr.saveTerm(new_Ind, IndexScale.getSExtValue(), Scale, IndexOffset.getSExtValue(), Extension, ptrSize);
}
}
// Analyze the base pointer next.
Ptr = GEPOp->getOperand(0);
} while (--MaxLookup);
return true;
}
// Debugging
// #define _LDST_DEBUG 1
#undef _LDST_DEBUG
#if defined(_LDST_DEBUG)
static int _bundleid = 0;
#endif
namespace {
enum class AddressModel { BTS, A32, SLM, A64 };
class LdStInfo {
// Load (or load intrinsic) for loadCombine().
// store (or store intrinsic) for storeCombine.
Instruction *Inst;
// Byte offset of 'Inst'->getPointerOperand() relative to
// that of the leading load/store inst.
int64_t ByteOffset;
bool IsStore;
public:
LdStInfo(Instruction *aI, int64_t aBO) : Inst(aI), ByteOffset(aBO) {
auto ASI = AStoreInst::get(aI);
IsStore = ASI.has_value();
}
Type *getLdStType() const;
uint32_t getAlignment() const;
AddressModel getAddressModel(CodeGenContext *Ctx) const;
Value *getValueOperand() const;
bool isStore() const { return IsStore; }
int64_t getByteOffset() const { return ByteOffset; }
Instruction *getInst() const { return Inst; }
};
typedef SmallVector<LdStInfo, 8> InstAndOffsetPairs;
// A bundle: a group of consecutive loads or a group of consecutive stores.
// Each bundle maps to a single GEN load or store.
struct BundleInfo {
InstAndOffsetPairs LoadStores;
int bundle_eltBytes; // 1, 4, 8
int bundle_numElts;
// Valid for bundle_eltBytes = 1. It indicates whether D64 or
// D32 (including D8U32 and D16U32) is used as the data size.
bool useD64;
void print(raw_ostream &O, int BundleID = 0) const;
void dump() const;
};
typedef SmallVector<uint32_t, 8> BundleSize_t;
enum class LdStKind { IS_STORE, IS_LOAD };
// BundleConfig:
// Tells which vector sizes are legit. It may need the GEN platform as input.
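// For example (per the tables below): on an LSC-enabled platform, a
// non-uniform 4-byte-aligned bundle uses D32 with legal vector sizes
// {2, 3, 4, 8}, further capped by getMaxLoadBytes()/getMaxStoreBytes().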
class BundleConfig {
public:
enum {
STORE_DEFAULT_BYTES_PER_LANE = 16, // 4 DW for non-uniform
LOAD_DEFAULT_BYTES_PER_LANE = 16 // 4 DW for non-uniform
};
BundleConfig(LdStKind K, int ByteAlign, bool Uniform, const AddressModel AddrModel, CodeGenContext *Ctx) {
uint32_t maxBytes = 0;
if (K == LdStKind::IS_STORE)
maxBytes = getMaxStoreBytes(Ctx);
else
maxBytes = getMaxLoadBytes(Ctx);
auto calculateSize = [this, maxBytes](bool Uniform) {
int sz = (int)m_currVecSizeVar->size();
if (Uniform) {
return (uint32_t)sz;
}
int ix = 0;
for (; ix < sz; ++ix) {
uint32_t currBytes = (*m_currVecSizeVar)[ix] * m_eltSizeInBytes;
if (currBytes > maxBytes) {
break;
}
}
return (uint32_t)(ix > 0 ? ix : 1);
};
if (Ctx->platform.LSCEnabled()) {
if (ByteAlign >= 8) {
m_currVecSizeVar = Uniform ? &m_d64VecSizes_u : &m_d64VecSizes;
m_eltSizeInBytes = 8;
m_actualSize = calculateSize(Uniform);
} else if (ByteAlign == 4) {
m_currVecSizeVar = Uniform ? &m_d32VecSizes_u : &m_d32VecSizes;
m_eltSizeInBytes = 4;
m_actualSize = calculateSize(Uniform);
} else {
m_currVecSizeVar = Uniform ? &m_d8VecSizes_u : &m_d8VecSizes;
m_eltSizeInBytes = 1;
m_actualSize = (uint32_t)m_currVecSizeVar->size();
}
} else {
m_currVecSizeVar = &m_vecSizeVar;
if (Uniform) {
// Limit to simd8 (reasonable?), scattered read/write
if (ByteAlign >= 4) {
m_vecSizeVar = {2, 4, 8};
m_eltSizeInBytes = (ByteAlign >= 8 ? 8 : 4);
} else {
m_vecSizeVar = {2, 4, 8, 16, 32};
m_eltSizeInBytes = 1;
}
m_actualSize = (uint32_t)m_vecSizeVar.size();
} else {
if (ByteAlign >= 8 && AddrModel == AddressModel::A64) {
m_vecSizeVar = {2, 4}; // QW scattered read/write
m_eltSizeInBytes = 8;
m_actualSize = calculateSize(Uniform);
} else if (ByteAlign < 4) {
m_vecSizeVar = {2, 4}; // Byte scattered read/write
m_eltSizeInBytes = 1;
m_actualSize = m_vecSizeVar.size();
} else {
m_vecSizeVar = {2, 3, 4}; // untyped read/write
m_eltSizeInBytes = 4;
m_actualSize = calculateSize(Uniform);
}
}
}
m_currIndex = 0;
}
uint32_t getAndUpdateVecSizeInBytes(uint32_t Bytes) {
const BundleSize_t &Var = *m_currVecSizeVar;
int sz = (int)getSize();
int i;
uint32_t total = 0;
for (i = m_currIndex; i < sz; ++i) {
uint32_t vecsize = Var[i];
total = vecsize * m_eltSizeInBytes;
if (total >= Bytes) {
break;
}
}
if (i >= sz) {
m_currIndex = 0;
return 0;
}
// update index
m_currIndex = i;
return total;
}
uint32_t getMaxVecSizeInBytes() const {
const BundleSize_t &Var = *m_currVecSizeVar;
return Var[getSize() - 1] * m_eltSizeInBytes;
}
uint32_t getCurrVecSize() const {
const BundleSize_t &Var = *m_currVecSizeVar;
IGC_ASSERT(0 <= m_currIndex && (int)getSize() > m_currIndex);
return Var[m_currIndex];
}
uint32_t getSize() const { return m_actualSize; }
//
// Legal vector sizes for load/store
//
// 64bit aligned, 64bit element (D64)
static const BundleSize_t m_d64VecSizes;
// 32bit aligned, 32bit element (D32)
static const BundleSize_t m_d32VecSizes;
// 8bit aligned, 8bit element (D16U32, D32, D64)
static const BundleSize_t m_d8VecSizes;
//
// uniform
//
// 64bit aligned, 64bit element (D64)
static const BundleSize_t m_d64VecSizes_u;
// 32bit aligned, 32bit element (D32)
static const BundleSize_t m_d32VecSizes_u;
// 8bit aligned, 8bit element (D16U32, D32, D64)
static const BundleSize_t m_d8VecSizes_u;
private:
// Special vecSize, used for pre-LSC platform.
BundleSize_t m_vecSizeVar;
const BundleSize_t *m_currVecSizeVar;
uint32_t m_eltSizeInBytes;
// m_currIndex is initialized to zero.
// m_actualSize is the actual size of BundleSize variable to use
// and it is no larger than the variable's capacity.
int m_currIndex;
int m_actualSize;
};
//
// Load and Store combine pass:
// combines consecutive loads/stores into a single load/store.
// It is based on a simple integer symbolic evaluation.
// 1. It can combine loads/stores of different element sizes; and
// 2. It cleans up dead code after combining, so there is
// no need to run DCE after this pass.
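// Illustrative example (hypothetical IR): two adjacent dword stores
//   store i32 %a, p
//   store i32 %b, p+4
// can be combined into a single 8-byte store of a coalesced value at p; the
// exact coalesced type and message are decided by createBundles() and
// createCombinedStores().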
class LdStCombine : public FunctionPass {
const DataLayout *m_DL;
AliasAnalysis *m_AA;
WIAnalysis *m_WI;
CodeGenContext *m_CGC;
Function *m_F;
TargetLibraryInfo *m_TLI;
public:
static char ID;
LdStCombine()
: FunctionPass(ID), m_DL(nullptr), m_AA(nullptr), m_WI(nullptr), m_CGC(nullptr), m_F(nullptr), m_TLI(nullptr),
m_hasLoadCombined(false), m_hasStoreCombined(false) {
initializeLdStCombinePass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<CodeGenContextWrapper>();
AU.addRequired<MetaDataUtilsWrapper>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<WIAnalysis>();
}
StringRef getPassName() const override { return "LdStCombine"; }
void releaseMemory() override { clear(); }
private:
SymbolicEvaluation m_symEval;
bool m_hasLoadCombined;
bool m_hasStoreCombined;
//
// Caching
//
// If true, IGC needs to emulate I64.
bool m_hasI64Emu = false;
//
// Temporary reused for each BB.
//
// Inst order within a BB.
DenseMap<const Instruction *, int> m_instOrder;
// Per-BB: all insts that have been combined and will be deleted.
DenseMap<const Instruction *, int> m_combinedInsts;
// All root instructions (ie their uses are empty, including stores)
// that are to be deleted at the end of each BB.
SmallVector<Instruction *, 16> m_toBeDeleted;
void appendToBeDeleted(Instruction *I) {
if (I != nullptr)
m_toBeDeleted.push_back(I);
}
// Control the way that a load/store is handled.
// [more for future improvement]
DenseMap<const Instruction *, int> m_visited;
// A bundle: a group of loads or a group of stores.
// Each bundle will be combined into a single load or single store.
std::list<BundleInfo> m_bundles;
void init(BasicBlock *BB) {
m_visited.clear();
m_instOrder.clear();
m_combinedInsts.clear();
}
void setInstOrder(BasicBlock *BB);
void setVisited(Instruction *I) { m_visited[I] = 1; }
bool isVisited(const Instruction *I) const { return m_visited.count(I) > 0; }
// store combining top function
void combineStores();
// load combining top function
void combineLoads();
void createBundles(BasicBlock *BB, InstAndOffsetPairs &Stores);
// Actually combining stores.
void createCombinedStores(BasicBlock *BB);
// Actually combining loads.
void createCombinedLoads(BasicBlock *BB);
// If V is a vector, get all its elements (this may generate extractElement
// insts); if V is not a vector, just V itself.
void getOrCreateElements(Value *V, SmallVector<Value *, 16> &EltV, Instruction *InsertBefore);
// Return true if V is vector and splitting is beneficial.
bool splitVectorType(Value *V, LdStKind K) const;
bool splitVectorTypeForGather(Value *V) const { return splitVectorType(V, LdStKind::IS_STORE); }
bool splitVectorTypeForScatter(Value *V) const { return splitVectorType(V, LdStKind::IS_LOAD); }
void AllowDummyLoadCoalescing(const InstAndOffsetPairs &Loads);
// GatherCopy:
// copy multiple values (arg: Vals) into a single Dst (return value)
// (It's a packed copy, thus size(all Vals) = size(Dst).
Value *gatherCopy(const uint32_t DstEltBytes, int NElts, SmallVector<Value *, 16> &Vals, Instruction *InsertBefore);
// scatterCopy:
// copy components of a single value (arg: CompositeVal) into
// multiple values (arg: Vals)
void scatterCopy(SmallVector<Value *, 16> &Vals, int DstEltBytes, int NElts, Value *CompositeVal,
Instruction *InsertBefore);
// StructToVec:
// convert a struct type to a vector type.
// structVal -> <nelts x eltBytes>
Value *structToVec(IGCIRBuilder<> *irBuilder, BasicBlock *BB, Value *structVal, unsigned eltBytes, unsigned nelts);
// Helper functions
bool hasAlias(AliasSetTracker &AST, MemoryLocation &MemLoc);
// Create unique identified struct type
StructType *getOrCreateUniqueIdentifiedStructType(ArrayRef<Type *> EltTys, bool IsSOA, bool IsPacked = true);
uint32_t getNumElements(Type *Ty) const {
return Ty->isVectorTy() ? (unsigned)cast<IGCLLVM::FixedVectorType>(Ty)->getNumElements() : 1;
}
Type *generateLoadType(SmallVector<Value *, 16> &Vals, uint32_t ValEltBytes, uint32_t ValNElts);
// For a layout struct (at most 2 levels), given the current member
// position specified by Indices, advance Indices to the next member.
// Return value:
// false : if the current member is already the last;
// true : otherwise.
bool advanceStructIndices(SmallVector<uint32_t, 2> &Indices, StructType *StTy);
// Skip counting those insts as no code shall be emitted for them.
bool skipCounting(Instruction *I) {
if (auto *IntrinsicI = dyn_cast<llvm::IntrinsicInst>(I)) {
if (IntrinsicI->getIntrinsicID() == Intrinsic::assume)
return true;
}
return isDbgIntrinsic(I) || isa<BitCastInst>(I);
}
// For generating better code
bool getVecEltIfConstExtract(Value *V, SmallVector<Value *, 8> &EltV);
void mergeConstElements(SmallVector<Value *, 4> &EltVals, uint32_t MaxMergeBytes);
void eraseDeadInsts();
void clear() {
m_symEval.clear();
m_hasLoadCombined = false;
m_hasStoreCombined = false;
m_visited.clear();
m_instOrder.clear();
m_bundles.clear();
}
};
} // namespace
const BundleSize_t BundleConfig::m_d64VecSizes = {2, 3, 4};
const BundleSize_t BundleConfig::m_d32VecSizes = {2, 3, 4, 8};
const BundleSize_t BundleConfig::m_d8VecSizes = {2, 4, 8};
const BundleSize_t BundleConfig::m_d64VecSizes_u = {2, 3, 4, 8, 16, 32, 64};
const BundleSize_t BundleConfig::m_d32VecSizes_u = {2, 3, 4, 8, 16, 32, 64};
const BundleSize_t BundleConfig::m_d8VecSizes_u = {2, 4, 8, 16, 32};
bool IGC::doLdStCombine(const CodeGenContext *CGC) {
if (CGC->type == ShaderType::OPENCL_SHADER) {
auto oclCtx = (const OpenCLProgramContext *)CGC;
// internal flag overrides IGC key
switch (oclCtx->m_InternalOptions.LdStCombine) {
default:
break;
case 0:
return false;
case 1:
return CGC->platform.LSCEnabled();
case 2:
return true;
}
}
uint32_t keyval = IGC_GET_FLAG_VALUE(EnableLdStCombine);
if ((keyval & 0x3) == 1 && !CGC->platform.LSCEnabled())
return false;
return ((keyval & 0x3) || (keyval & 0x4));
}
uint32_t IGC::getMaxStoreBytes(const CodeGenContext *CGC) {
if (CGC->type == ShaderType::OPENCL_SHADER) {
auto oclCtx = (const OpenCLProgramContext *)CGC;
// internal flag overrides IGC key
if (oclCtx->m_InternalOptions.MaxStoreBytes != 0)
return oclCtx->m_InternalOptions.MaxStoreBytes;
}
uint32_t bytes = IGC_GET_FLAG_VALUE(MaxStoreVectorSizeInBytes);
if (bytes == 0 &&
(IGC_IS_FLAG_ENABLED(EnableVector8LoadStore) || CGC->type == ShaderType::RAYTRACING_SHADER ||
CGC->hasSyncRTCalls()) &&
CGC->platform.supports8DWLSCMessage()) {
// MaxStoreVectorSizeInBytes isn't set, this is RT (or EnableVector8LoadStore
// from memopt is enabled), and 8 DW LSC messages are supported.
bytes = 32; // 8 DW
} else if (!(bytes >= 4 && bytes <= 32 && isPowerOf2_32(bytes))) {
// Use default if bytes from the key is not set or invalid
bytes = BundleConfig::STORE_DEFAULT_BYTES_PER_LANE;
}
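// At this point 'bytes' is a power of two in [4, 32], e.g. the default of
// 16 bytes (4 DW) per lane.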
return bytes;
}
uint32_t IGC::getMaxLoadBytes(const CodeGenContext *CGC) {
if (CGC->type == ShaderType::OPENCL_SHADER) {
auto oclCtx = (const OpenCLProgramContext *)CGC;
// internal flag overrides IGC key
if (oclCtx->m_InternalOptions.MaxLoadBytes != 0)
return oclCtx->m_InternalOptions.MaxLoadBytes;
}
uint32_t bytes = IGC_GET_FLAG_VALUE(MaxLoadVectorSizeInBytes);
if (bytes == 0 &&
(IGC_IS_FLAG_ENABLED(EnableVector8LoadStore) || CGC->type == ShaderType::RAYTRACING_SHADER ||
CGC->hasSyncRTCalls()) &&
CGC->platform.supports8DWLSCMessage()) {
// MaxLoadVectorSizeInBytes isn't set, this is RT (or EnableVector8LoadStore
// from memopt is enabled), and 8 DW LSC messages are supported.
bytes = 32; // 8 DW
}
else if (!(bytes >= 4 && bytes <= 32 && isPowerOf2_32(bytes))) {
// Use default if bytes from the key is not set or invalid
bytes = BundleConfig::LOAD_DEFAULT_BYTES_PER_LANE;
}
return bytes;
}
FunctionPass *IGC::createLdStCombinePass() { return new LdStCombine(); }
#undef PASS_FLAG
#undef PASS_DESC
#undef PASS_CFG_ONLY
#undef PASS_ANALYSIS
#undef DEBUG_TYPE
#define DEBUG_TYPE "LdStCombine"
#define PASS_FLAG "igc-ldstcombine"
#define PASS_DESC "IGC load/store combine"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(LdStCombine, PASS_FLAG, PASS_DESC, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(WIAnalysis)
IGC_INITIALIZE_PASS_END(LdStCombine, PASS_FLAG, PASS_DESC, PASS_CFG_ONLY, PASS_ANALYSIS)
char LdStCombine::ID = 0;
Type *LdStInfo::getLdStType() const {
if (!isStore())
return ALoadInst::get(Inst)->getType();
return AStoreInst::get(Inst)->getValueOperand()->getType();
}
uint32_t LdStInfo::getAlignment() const {
if (!isStore())
return (uint32_t)ALoadInst::get(Inst)->getAlignmentValue();
return (uint32_t)AStoreInst::get(Inst)->getAlignmentValue();
}
Value *LdStInfo::getValueOperand() const {
if (!isStore())
return Inst;
return AStoreInst::get(Inst)->getValueOperand();
}
AddressModel LdStInfo::getAddressModel(CodeGenContext *Ctx) const {
Value *Ptr = nullptr;
if (!isStore())
Ptr = ALoadInst::get(Inst)->getPointerOperand();
else
Ptr = AStoreInst::get(Inst)->getPointerOperand();
PointerType *PTy = cast<PointerType>(Ptr->getType());
const uint32_t AS = PTy->getPointerAddressSpace();
uint bufferIndex = 0;
bool directIndexing = false;
BufferType BTy = DecodeAS4GFXResource(AS, directIndexing, bufferIndex);
AddressModel addrModel;
if (BTy == SLM) {
addrModel = AddressModel::SLM;
} else if (BTy == ESURFACE_STATELESS) {
const bool isA32 = !IGC::isA64Ptr(PTy, Ctx);
addrModel = (isA32 ? AddressModel::A32 : AddressModel::A64);
} else {
addrModel = AddressModel::BTS;
}
return addrModel;
}
bool LdStCombine::hasAlias(AliasSetTracker &AST, MemoryLocation &MemLoc) {
for (auto &AS : AST) {
if (AS.isForwardingAliasSet())
continue;
AliasResult aresult = AS.aliasesPointer(MemLoc.Ptr, MemLoc.Size, MemLoc.AATags, AST.getAliasAnalysis());
if (aresult != AliasResult::NoAlias) {
return true;
}
}
return false;
}
void LdStCombine::setInstOrder(BasicBlock *BB) {
// Lazy initialization. Skip if it's been initialized.
if (m_instOrder.size() > 0)
return;
int i = -1;
for (auto II = BB->begin(), IE = BB->end(); II != IE; ++II) {
Instruction *I = &*II;
m_instOrder[I] = (++i);
}
}
bool LdStCombine::advanceStructIndices(SmallVector<uint32_t, 2> &Indices, StructType *StTy) {
IGC_ASSERT_MESSAGE(Indices[0] < StTy->getNumElements(), "Indices should be valid on entry to this function!");
Type *Ty1 = StTy->getElementType(Indices[0]);
if (Ty1->isStructTy()) {
StructType *subStTy = cast<StructType>(Ty1);
uint32_t nextIdx = Indices[1] + 1;
if (nextIdx == subStTy->getNumElements()) {
nextIdx = 0;
Indices[0] += 1;
}
Indices[1] = nextIdx;
} else {
Indices[0] += 1;
}
return Indices[0] < StTy->getNumElements();
}
bool LdStCombine::runOnFunction(Function &F) {
m_CGC = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
m_TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
// If EnableLdStCombine = 2, do it for both lsc and legacy messages.
// The plan is to do it for LSC message only, ie, EnableLdStCombine=1.
uint32_t keyval = IGC_GET_FLAG_VALUE(EnableLdStCombine);
if (F.hasOptNone() || ((keyval & 0x1) == 1 && !m_CGC->platform.LSCEnabled()))
return false;
m_DL = &F.getParent()->getDataLayout();
m_AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
if (IGC_IS_FLAG_ENABLED(DisableUniformAnalysis)) {
m_WI = nullptr;
} else {
m_WI = &getAnalysis<WIAnalysis>();
}
m_F = &F;
// Initialize symbolic evaluation
m_symEval.setDataLayout(m_DL);
// i64Emu: mimic Emu64Ops's enabling condition. Seems conservative
// but can be improved in the future if needed.
m_hasI64Emu = (m_CGC->platform.need64BitEmulation() && (IGC_GET_FLAG_VALUE(Enable64BitEmulation) ||
IGC_GET_FLAG_VALUE(Enable64BitEmulationOnSelectedPlatform)));
combineStores();
combineLoads();
bool changed = (m_hasLoadCombined || m_hasStoreCombined);
return changed;
}
// getOrCreateElements():
// Return all valid elements of a given vector V.
// It may need to insert ExtractElementInst.
void LdStCombine::getOrCreateElements(Value *V, SmallVector<Value *, 16> &EltV, Instruction *InsertBefore) {
Type *Ty = V->getType();
VectorType *VTy = dyn_cast<VectorType>(Ty);
if (!VTy) {
EltV.push_back(V);
return;
}
const int32_t nelts = getNumElements(VTy);
EltV.resize(nelts, UndefValue::get(VTy->getScalarType()));
Value *ChainVal = V;
while (!isa<Constant>(ChainVal)) {
InsertElementInst *IEI = dyn_cast<InsertElementInst>(ChainVal);
if (!IEI || !isa<ConstantInt>(IEI->getOperand(2))) {
break;
}
ConstantInt *CInt = cast<ConstantInt>(IEI->getOperand(2));
uint32_t idx = (uint32_t)CInt->getZExtValue();
// Make sure the last IEI will be recorded if an element is
// inserted multiple times.
if (isa<UndefValue>(EltV[idx])) {
EltV[idx] = IEI->getOperand(1);
}
ChainVal = IEI->getOperand(0);
}
if (isa<UndefValue>(ChainVal)) {
// All valid elements known. For example,
// v0 = inselt undef, s0, 0
// v1 = inselt v0, s1, 1
// v2 = inselt v1, s2, 2
// V = inselt v2, s3, 3
// EltV[] = { s0, s1, s2, s3 }
return;
}
if (ConstantVector *CV = dyn_cast<ConstantVector>(ChainVal)) {
// Get valid elements from the final constant vector, for example.
// v0 = inselt {1, 2, 3, 4}, s0, 0
// V = inselt v0, s2, 2
// EltV[] = { s0, 2, s2, 4 }
for (int i = 0; i < nelts; ++i) {
Value *v = CV->getOperand(i);
if (isa<UndefValue>(EltV[i]) && !isa<UndefValue>(v)) {
EltV[i] = v;
}
}
return;
}
// Not all elements known, get remaining unknown elements
// LV = load
// v0 = inselt LV, s0, 0
// V = inselt v0, s1, 1
// EltV[] = { s0, s1, 'extelt LV, 2', 'extelt LV, 3' }
IRBuilder<> builder(InsertBefore);
for (int i = 0; i < nelts; ++i) {
if (isa<UndefValue>(EltV[i])) {
Value *v = builder.CreateExtractElement(V, builder.getInt32(i));
EltV[i] = v;
}
}
}
// Return value:
// true:
// if V is a vector and it is only used in ExtractElement with const index.
// 'EltV' has all its elements.
// false: otherwise. 'EltV' has 'V' only.
// Note: unused elements are returned as UndefValue.
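// For example: if V is a <4 x i32> whose only uses are
//   %e0 = extelt V, 0 and %e2 = extelt V, 2
// then EltV = { %e0, undef, %e2, undef } and the function returns true.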
bool LdStCombine::getVecEltIfConstExtract(Value *V, SmallVector<Value *, 8> &EltV) {
auto useOrigVector = [&EltV, V]() {
EltV.clear();
EltV.push_back(V);
};
Type *Ty = V->getType();
VectorType *VTy = dyn_cast<VectorType>(Ty);
if (!VTy) {
useOrigVector();
return false;
}
uint32_t N = getNumElements(VTy);
Value *undef = UndefValue::get(Ty->getScalarType());
EltV.assign(N, undef);
for (auto UI : V->users()) {
ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(UI);
if (!EEI) {
useOrigVector();
return false;
}
ConstantInt *CI = dyn_cast<ConstantInt>(EEI->getIndexOperand());
if (!CI) {
useOrigVector();
return false;
}
uint32_t ix = (uint32_t)CI->getZExtValue();
if (!isa<UndefValue>(EltV[ix])) {
useOrigVector();
return false;
}
EltV[ix] = EEI;
}
return true;
}
void LdStCombine::combineStores() {
// All store candidates with addr = common-base + const-offset
// All stores have the same common-base but different const-offset.
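// For example (hypothetical IR): 'store i32 %x, p', 'store i32 %y, p+8' and
// 'store i32 %z, p+4' all share the common base p with constant offsets
// 0, 8 and 4.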
InstAndOffsetPairs Stores;
auto isStoreCandidate = [&](Instruction *I) {
if (std::optional<AStoreInst> SI = AStoreInst::get(I); SI.has_value()) {
// Sanity check
Type *eTy = SI->getValueOperand()->getType()->getScalarType();
if (!isPowerOf2_32((uint32_t)m_DL->getTypeStoreSize(eTy))) {
return false;
}
// Only original, not-yet-visited stores can be candidates.
// (The element size was already verified to be a power of two above.)
const bool isOrigSt = (m_instOrder.size() == 0 || m_instOrder.count(I) > 0);
return (isOrigSt && !isVisited(I) && SI->isSimple());
}
return false;
};
// If all Stores can move down across I, return true;
// otherwise, return false.
auto canCombineStoresAcross = [this](AliasSetTracker &aAST, Instruction *I) {
// Can't combine for non-debug fence like instructions
if (I->isFenceLike() && !IsDebugInst(I))
return false;
if (ALoadInst::get(I).has_value() || AStoreInst::get(I).has_value() || I->mayReadOrWriteMemory()) {
MemoryLocation memloc = getLocation(I, m_TLI);
return !hasAlias(aAST, memloc);
}
return true;
};
// If 'aI' with offset 'aStart' overlaps with any store in aStores,
// return true; otherwise, return false.
// Note: once we know the offset is constant, this checking is precise
// and better than using alias analysis (basicaa).
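// For example (hypothetical offsets): an i32 store covering bytes [4, 8)
// overlaps an i64 store covering bytes [0, 8), but not an i32 store
// covering bytes [8, 12).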
auto hasOverlap = [this](InstAndOffsetPairs &aStores, Instruction *aI, int64_t aStart) {
std::optional<AStoreInst> aSI = AStoreInst::get(aI);
if (!aSI.has_value())
return true;
Type *Ty = aSI->getValueOperand()->getType();
uint32_t TyBytes = (uint32_t)m_DL->getTypeStoreSize(Ty);
int64_t aEnd = aStart + TyBytes;
// 'aSI' byte range [aStart, aEnd)
for (auto &lsinfo : aStores) {
IGC_ASSERT(lsinfo.isStore());
Type *aTy = lsinfo.getLdStType();
uint32_t aTyBytes = (uint32_t)m_DL->getTypeStoreSize(aTy);
// 'lsinfo' byte range: [thisStart, thisEnd)
int64_t thisStart = lsinfo.getByteOffset();
int64_t thisEnd = thisStart + aTyBytes;
if ((aStart >= thisStart && aStart < thisEnd) || (thisStart >= aStart && thisStart < aEnd))
return true;
}
return false;
};
// Only handle stores within the given instruction window.
constexpr uint32_t WINDOWSIZE = 150;
m_hasStoreCombined = false;
for (auto &BB : *m_F) {
init(&BB);
auto IE = BB.end();
for (auto II = BB.begin(); II != IE; ++II) {
Instruction *base = &*II;
if (!isStoreCandidate(base)) {
continue;
}
uint32_t numInsts = 1;
Stores.push_back(LdStInfo(base, 0));
// Keep store candidates for checking alias to see if those
// stores can be moved to the place of the last store.
auto batchAARes = IGCLLVM::AliasAnalysis::createAAresults(m_AA);
AliasSetTracker AST = IGCLLVM::createAliasSetTracker(batchAARes);
AST.add(base);
for (auto JI = std::next(II); JI != IE; ++JI) {
Instruction *I = &*JI;
if (!skipCounting(I))
++numInsts;
if (numInsts > WINDOWSIZE)
break;
// Check if any store in AST may be aliased to I
bool mayAlias = (!canCombineStoresAcross(AST, I));
int64_t offset;
if (isStoreCandidate(I) && getAddressDiffIfConstant(base, I, offset, m_symEval)) {
// If both mayAlias and hasOverlap are true, stop
if (mayAlias && hasOverlap(Stores, I, offset))
break;
// if predicates are different - stop
if (AStoreInst::get(base)->getPredicate() != AStoreInst::get(I)->getPredicate())
break;
Stores.push_back(LdStInfo(I, offset));
AST.add(I);
} else if (mayAlias) {
break;
}
}
// Create bundles from those stores.
// Note: createBundles() will mark all stores as visited when
// it returns, meaning each store is considered only
// once. For example,
// store a
// store b
// store c // alias to store a
// store d
// As 'store c' aliases to 'store a', the candidate 'Stores' stops
// growing at 'store c', giving the first set {a, b} for
// combining. Even if {a, b} cannot be combined but {b, c, d}
// can, it will go on with the next candidate set {c, d}, not
// {b, c, d}, missing the opportunity to combine {b, c, d}.
// So far, this is fine as this case isn't important.
createBundles(&BB, Stores);
}
// Actually combining them.
createCombinedStores(&BB);
}
}
void LdStCombine::combineLoads() {
// this check's for testing, and to be removed when stable.
if ((IGC_GET_FLAG_VALUE(EnableLdStCombine) & 0x4) == 0)
return;
if (m_CGC->type != ShaderType::OPENCL_SHADER) {
if (!m_CGC->getModuleMetaData()->compOpt.EnableLdStCombineforLoad) {
return;
}
}
// All load candidates with addr = common-base + const-offset
InstAndOffsetPairs Loads;
auto isLoadCandidate = [&](Instruction *I) {
if (auto LI = ALoadInst::get(I); LI.has_value()) {
// Sanity check
Type *eTy = LI->getType()->getScalarType();
if (!isPowerOf2_32((uint32_t)m_DL->getTypeStoreSize(eTy))) {
return false;
}
// Only original, not-yet-visited loads can be candidates.
bool isOrigLd = (m_instOrder.size() == 0 || m_instOrder.count(I) > 0);
return (isOrigLd && !isVisited(I) && LI->isSimple());
}
return false;
};
// If 'I' can be moved up across all insts in aAST, return true.
auto canMoveUp = [this](AliasSetTracker &aAST, Instruction *I) {
if (ALoadInst::get(I).has_value()) {
MemoryLocation memloc = getLocation(I, m_TLI);
return !hasAlias(aAST, memloc);
}
return true;
};
// Only handle loads within the given instruction window.
constexpr uint32_t LDWINDOWSIZE = 150;
m_hasLoadCombined = false;
for (auto &BB : *m_F) {
LLVM_DEBUG(dbgs() << "Process BB: " << BB.getName() << "\n");
init(&BB);
auto IE = BB.end();
for (auto II = BB.begin(); II != IE; ++II) {
Instruction *base = &*II;
LLVM_DEBUG(dbgs() << "- Process base inst: " << *base << "\n");
if (!isLoadCandidate(base)) {
continue;
}
uint32_t numInsts = 1;
Loads.push_back(LdStInfo(base, 0));
LLVM_DEBUG(dbgs() << "- Added to Loads\n");
// Keep store/maywritemem/fence insts for checking alias to see if those
// stores block load candidates from moving to the first (leading) load.
auto batchAARes = IGCLLVM::AliasAnalysis::createAAresults(m_AA);
AliasSetTracker AST = IGCLLVM::createAliasSetTracker(batchAARes);
for (auto JI = std::next(II); JI != IE; ++JI) {
Instruction *I = &*JI;
LLVM_DEBUG(dbgs() << "- - Process inst: " << *I << "\n");
if (!skipCounting(I))
++numInsts;
// cannot merge beyond fence or window limit
if ((I->isFenceLike() && !isa<PredicatedLoadIntrinsic>(I) && !isa<PredicatedStoreIntrinsic>(I)) ||
numInsts > LDWINDOWSIZE) {
LLVM_DEBUG(dbgs() << "- - Stop at fence or window limit\n");
break;
}
if (AStoreInst::get(I).has_value() || I->mayWriteToMemory()) {
AST.add(I);
LLVM_DEBUG(dbgs() << "- - Added to AST. Continue to next instruction\n");
continue;
}
if (isLoadCandidate(I)) {
LLVM_DEBUG(dbgs() << "- - It is load candidate\n");
int64_t offset;
if (getAddressDiffIfConstant(base, I, offset, m_symEval)) {
LLVM_DEBUG(dbgs() << "- - Found offset: " << offset << "\n");
if (canMoveUp(AST, I)) {
LLVM_DEBUG(dbgs() << "- - Can move up\n");
// if predicates are different - stop
if (ALoadInst::get(base)->getPredicate() != ALoadInst::get(I)->getPredicate()) {
LLVM_DEBUG(dbgs() << "- - Predicates are different. Stop\n");
break;
}
Loads.push_back(LdStInfo(I, offset));
LLVM_DEBUG(dbgs() << "- - Added to Loads\n");
} else {
// If it cannot be moved up, either keep going or
// stopping. Choose stop for now.
LLVM_DEBUG(dbgs() << "- - Cannot move up. Stop\n");
break;
}
}
}
}
// Experiment: if it is the last element of the load and does not fit the DWORD alignment,
// create a dummy load with the same alignment type as the previous load.
if (m_CGC->type != ShaderType::OPENCL_SHADER) {
if (m_CGC->getModuleMetaData()->compOpt.EnableLdStCombinewithDummyLoad) {
LLVM_DEBUG(dbgs() << "- - Allow dummy load coalescing\n");
AllowDummyLoadCoalescing(Loads);
}
}
// Note: For now, each load is considered once. For example,
// load a
// store x : alias to load c
// load b
// load c
// load d
// As 'load c' aliases to 'store x', the candidate 'Loads' stops
// growing at 'load b', giving the first set {a, b}. Even
// if {a, b} cannot be combined, 'load b' will not be
// reconsidered for a potential merging of {b, c, d}.
//
// This is controlled by setting visited. A better way of setting
// visited can overcome the above issue.
createBundles(&BB, Loads);
}
// Actually combining them.
createCombinedLoads(&BB);
}
}
void LdStCombine::createBundles(BasicBlock *BB, InstAndOffsetPairs &LoadStores) {
LLVM_DEBUG(dbgs() << "Create bundles for " << LoadStores.size() << " instructions\n");
//
// SelectD32OrD64:
// a utility class to select whether to use data element D32 or D64 when
// the alignment is 8 bytes or 1 byte. Not used when the alignment is 4.
// (Here, data element refers to data element in load/store messages.)
// 0) Don't use D64 if i64 is not natively supported (no Q mov).
// 1) use D32 if any store in the bundle has byte-element access (either
// scalar or element type of a vector), and the store is non-uniform,
// as D64 might require stride=8, which is not legit, to merge byte
// elements; or
// 2) use D64 if there are more D64 elements than D32 elements (thus
// less move instructions); or
// 3) use D64 if VecSize = 3 and there is at least one D64 store
// (note that V3D64 has no equivalent D32 messages).
// 4) otherwise, either D32 or D64 based on uniformity and size
// (details in useD64()).
//
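// For example (rule 2 above, illustrative): a bundle with two i64 elements
// and one i32 element has more D64 than D32 elements, so D64 is preferred
// when alignment allows.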
class SelectD32OrD64 {
uint32_t LastNumD64, LastNumD32;
uint32_t currNumD64, currNumD32;
// If byte element is present, save its index.
int32_t lastStoreIdxWithByteElt;
// Whether this store is uniform or not.
const bool isUniform;
// Do tracking only for 8-byte-aligned (D64) or 1-byte-aligned bundles.
const bool doTracking;
const CodeGenContext *Ctx;
const DataLayout *DL;
public:
SelectD32OrD64(const CodeGenContext *aCtx, const DataLayout *aDL, bool aUniform, uint32_t aAlign)
: Ctx(aCtx), DL(aDL), LastNumD64(0), LastNumD32(0), currNumD64(0), currNumD32(0), lastStoreIdxWithByteElt(-1),
isUniform(aUniform), doTracking(aAlign == 8 || aAlign == 1) {}
// LSI: the store to be tracked.
// LSIIdx: this store's index in the bundle.
// ByteOffset: starting offset of this LSI in the coalesced var.
void track(const LdStInfo *LSI, int32_t LSIIdx, uint32_t ByteOffset) {
if (!doTracking)
return;
Type *Ty = LSI->getLdStType();
Type *eTy = Ty->getScalarType();
// sanity check
if (!(eTy->isIntOrPtrTy() || eTy->isFloatingPointTy()))
return;
uint32_t eBytes = (uint32_t)DL->getTypeStoreSize(eTy);
uint32_t nElts = 1;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
auto fVTy = cast<IGCLLVM::FixedVectorType>(VTy);
nElts = (uint32_t)fVTy->getNumElements();
}
// If ByteOffset is odd, a byte mov is needed to pack the coalesced var
// (packed struct). So, treat this the same as byte-element access.
if (eBytes == 1 || (ByteOffset & 1) != 0) {
lastStoreIdxWithByteElt = LSIIdx;
} else if (eBytes == 4) {
currNumD32 += nElts;
} else if (eBytes == 8) {
currNumD64 += nElts;
}
}
void save() {
if (!doTracking)
return;
LastNumD32 = currNumD32;
LastNumD64 = currNumD64;
}
bool useD64(uint32_t VecEltBytes, uint32_t VecSizeInElt) {
if (!doTracking)
return false;
if (VecEltBytes == 1) {
if (hasByteElement()) {
if (!isUniform) {
IGC_ASSERT(VecSizeInElt <= 4);
}
return false;
}
if (VecSizeInElt == 8 && !isUniform) {
return true;
}
// Currently, emit uses d32/d64 scatter for uniform store/load
// and is limited to simd8.
// Use LastNumD64 as the bundle has been found
if (isUniform && (VecSizeInElt > (4 * 8) || (LastNumD64 > 0 && (2 * LastNumD64 > LastNumD32)))) {
return true;
}
}
return false;
}
bool hasByteElement() const { return lastStoreIdxWithByteElt >= 0; }
bool skip(uint32_t VecEltBytes, uint32_t VecSizeInElt) const {
if (!doTracking)
return false;
if (VecEltBytes == 8 || (VecEltBytes == 1 && VecSizeInElt == 8)) {
if (hasByteElement() && !isUniform) {
// case 1: a byte element is present in a non-uniform bundle; skip D64.
return true;
}
}
if (VecEltBytes == 8) {
// use currNumD64 while finding the bundle
if (currNumD64 > 0 && VecSizeInElt == 3) {
// case 3: VecSize == 3 with at least one D64 element; don't skip D64.
return false;
}
if (currNumD64 > 0 && (2 * currNumD64 > currNumD32)) {
// case 2: more D64 than D32 elements; don't skip D64.
return false;
}
// otherwise, skip 8byte-aligned D64
return true;
}
// VecEltBytes == 1; either D32 or D64 is okay, thus no skip.
// useD64() will select which one to use.
return false;
}
};
auto markVisited = [this](InstAndOffsetPairs &LoadStores) {
int32_t SZ = (int)LoadStores.size();
for (int i = 0; i < SZ; ++i) {
const LdStInfo *lsi = &LoadStores[i];
setVisited(lsi->getInst());
}
LoadStores.clear();
};
int32_t SZ = (int)LoadStores.size();
if (SZ <= 1) {
markVisited(LoadStores);
return;
}
auto isBundled = [](const LdStInfo *LSI, DenseMap<const Instruction *, int> &L) {
return (L.count(LSI->getInst()) > 0);
};
auto setBundled = [&isBundled](LdStInfo *LSI, DenseMap<const Instruction *, int> &L) {
if (!isBundled(LSI, L)) {
L[LSI->getInst()] = 1;
}
};
setInstOrder(BB);
// Sort loads/stores in the order of increasing ByteOffset
std::sort(LoadStores.begin(), LoadStores.end(),
[](const LdStInfo &A, const LdStInfo &B) { return A.getByteOffset() < B.getByteOffset(); });
const LdStInfo *lsi0 = &LoadStores[0];
auto LI = ALoadInst::get(lsi0->getInst());
auto SI = AStoreInst::get(lsi0->getInst());
LdStKind Kind = LI.has_value() ? LdStKind::IS_LOAD : LdStKind::IS_STORE;
bool isUniform = false;
if (m_WI) {
isUniform = m_WI->isUniform(LI.has_value() ? LI->getPointerOperand() : SI->getPointerOperand());
}
const AddressModel AddrModel = lsi0->getAddressModel(m_CGC);
// Starting from the largest alignment (favor larger alignment)
const uint32_t bundleAlign[] = {8, 4, 1};
const uint32_t aligns = (int)(sizeof(bundleAlign) / sizeof(bundleAlign[0]));
// keep track of the number of unmerged loads/stores
uint32_t numRemainingLdSt = SZ;
for (int ix = 0; ix < aligns && numRemainingLdSt > 1; ++ix) {
const uint32_t theAlign = bundleAlign[ix];
// If i64 insts are not supported, don't do D64 as it might
// require i64 mov in codegen emit (I64 Emu only handles 1-level
// insertvalue and extractvalue so far).
if (m_hasI64Emu && theAlign > 4)
continue;
// Use alignment as element size, which maps to gen load/store element
// size as follows:
// 1) For byte-aligned, use vecEltBytes = 1 with different
// number of vector elements to map D16U32, D32, D64. The final
// store's type would be <2xi8> or i16 for D16U32, i32 for D32,
// and i64 for D64. For uniform, multiple of D32/D64 can be
// merged and store's type would be <n x i32> or <n x i64>.
// 2) 4-byte aligned D32. vecEltBytes = 4.
// The final store's type is <n x i32>
// 3) 8-byte aligned D64. vecEltBytes = 8.
// The final store's type is <n x i64>
const uint32_t vecEltBytes = theAlign;
int32_t i = 0;
while (i < SZ) {
// 1. The first one is the leading store.
const LdStInfo *leadLSI = &LoadStores[i];
LLVM_DEBUG(llvm::dbgs() << "Try leading LdSt: " << *leadLSI->getInst() << "\n");
if (isBundled(leadLSI, m_combinedInsts) || (i + 1) == SZ) /* skip for last one */ {
++i;
continue;
}
if (m_WI && isUniform && !m_WI->isUniform(leadLSI->getValueOperand())) {
LLVM_DEBUG(llvm::dbgs() << "No combining for *uniform-ptr = non-uniform value\n");
++i;
continue;
}
Type *leadTy = leadLSI->getLdStType();
Type *eltTy = leadTy->getScalarType();
uint32_t eltBytes = (uint32_t)(m_DL->getTypeStoreSize(eltTy));
const uint32_t align = leadLSI->getAlignment();
// Skip if align is less than the current alignment. Also, avoid
// merging non-uniform stores whose size >= 4 bytes when checking
// byte-aligned bundling.
if (align < theAlign || (theAlign == 1 && eltBytes >= 4 && !isUniform)) {
++i;
continue;
}
BundleConfig BC(Kind, theAlign, isUniform, AddrModel, m_CGC);
const uint32_t maxBytes = BC.getMaxVecSizeInBytes();
uint32_t totalBytes = (uint32_t)m_DL->getTypeStoreSize(leadTy);
SelectD32OrD64 D32OrD64(m_CGC, m_DL, isUniform, theAlign);
D32OrD64.track(leadLSI, i, 0);
if (totalBytes >= maxBytes) {
++i;
continue;
}
// 2. grow this bundle as much as possible
// [i, e]: the range of loads/stores that forms a legit bundle (e > i).
int e = -1;
uint32_t vecSize = -1;
for (int j = i + 1; j < SZ; ++j) {
const LdStInfo *LSI = &LoadStores[j];
LLVM_DEBUG(llvm::dbgs() << "Try to make bundle with: " << *LSI->getInst() << "\n");
if (isBundled(LSI, m_combinedInsts) || (leadLSI->getByteOffset() + totalBytes) != LSI->getByteOffset()) {
// stop as remaining stores are not contiguous
break;
}
if (m_WI && isUniform && !m_WI->isUniform(LSI->getValueOperand())) {
LLVM_DEBUG(llvm::dbgs() << "No combining for *uniform-ptr = non-uniform value\n");
break;
}
Type *aTy = LSI->getLdStType();
uint32_t currByteOffset = totalBytes;
totalBytes += (uint32_t)m_DL->getTypeStoreSize(aTy);
if (totalBytes > maxBytes) {
break;
}
D32OrD64.track(LSI, j, currByteOffset);
int nextBytes = BC.getAndUpdateVecSizeInBytes(totalBytes);
if (m_hasI64Emu && vecEltBytes == 1 && nextBytes == 8) {
// If I64 emu is on, skip D64 as I64 emu would result
// in inefficient code.
continue;
}
if (totalBytes == nextBytes && !D32OrD64.skip(vecEltBytes, BC.getCurrVecSize())) {
e = j;
vecSize = BC.getCurrVecSize();
D32OrD64.save();
}
}
// If any ldst has byte element, skip D64 to avoid byte mov
// with dst-stride = 8.
if (vecEltBytes == 8 && D32OrD64.hasByteElement()) {
// go to next iteration with D32.
break;
}
const int bundle_nelts = e - i + 1;
if (e >= 0 && bundle_nelts > 1) {
// Have a bundle, save it.
m_bundles.emplace_back(BundleInfo());
BundleInfo &newBundle = m_bundles.back();
newBundle.bundle_eltBytes = vecEltBytes;
newBundle.bundle_numElts = vecSize;
newBundle.useD64 = (theAlign == 1) ? D32OrD64.useD64(vecEltBytes, vecSize) : false;
for (int k = i; k <= e; ++k) {
LdStInfo &tlsi = LoadStores[k];
newBundle.LoadStores.push_back(tlsi);
setBundled(&tlsi, m_combinedInsts);
if (tlsi.isStore()) {
appendToBeDeleted(tlsi.getInst());
}
setVisited(tlsi.getInst());
}
i = e + 1;
numRemainingLdSt -= bundle_nelts;
if (numRemainingLdSt < 2) {
// Not enough loads/stores to merge
break;
}
} else {
++i;
}
}
}
markVisited(LoadStores);
}
void LdStCombine::AllowDummyLoadCoalescing(const InstAndOffsetPairs &Loads) {
// Currently supports only this pattern.
// % 164 = add i32 % 114, 1020
// % 165 = and i32 % 164, 1020
// % 166 = getelementptr[1024 x half], [1024 x half] addrspace(3) * null, i32 0, i32 % 165
// %167 = load half, half addrspace(3) * %166, align 8
// % 168 = or i32 % 165, 1
// % 169 = getelementptr[1024 x half], [1024 x half] addrspace(3) * null, i32 0, i32 % 168
// % 170 = load half, half addrspace(3) * %169, align 2
// % 171 = or i32 % 165, 2
// % 172 = getelementptr[1024 x half], [1024 x half] addrspace(3) * null, i32 0, i32 % 171
// % 173 = load half, half addrspace(3) * %172, align 4
// to
// % 164 = add i32 % 114, 1020
// % 165 = and i32 % 164, 1020
// % 166 = getelementptr[1024 x half], [1024 x half] addrspace(3) * null, i32 0, i32 % 165
// %167 = load half, half addrspace(3) * %166, align 8
// % 168 = or i32 % 165, 1
// % 169 = getelementptr[1024 x half], [1024 x half] addrspace(3) * null, i32 0, i32 % 168
// % 170 = load half, half addrspace(3) * %169, align 2
// % 171 = or i32 % 165, 2
// % 172 = getelementptr[1024 x half], [1024 x half] addrspace(3) * null, i32 0, i32 % 171
// % 173 = load half, half addrspace(3) * %172, align 4
// % 174 = add i32 % 165, 3
// % 175 = getelementptr[1024 x half], [1024 x half] addrspace(3) * null, i32 0, i32 % 174
// % 176 = load half, half addrspace(3) * %175, align 2
int size = Loads.size();
LdStInfo LastLoad = Loads[size - 1];
uint32_t LastLoadSize = (uint32_t)m_DL->getTypeStoreSize(LastLoad.getInst()->getType());
uint32_t currLoadSize = LastLoadSize + LastLoad.getByteOffset();
if (currLoadSize % 4) {
// Replicating the last load to make it DWORD aligned
uint32_t newLoadSize = LastLoadSize;
if (!((currLoadSize + newLoadSize) % 4)) {
ALoadInst lead = ALoadInst::get(LastLoad.getInst()).value();
Value *ldPtr = lead.getPointerOperand();
if (auto gep = dyn_cast<GetElementPtrInst>(ldPtr)) {
if ((gep->getNumOperands() == 3) && (isa<ConstantPointerNull>(gep->getPointerOperand()))) {
IGCIRBuilder<> irBuilder(LastLoad.getInst());
Value *AddInst = irBuilder.CreateAdd(gep->getOperand(2), irBuilder.getInt32(1));
Value *gepArg[] = {gep->getOperand(1), AddInst};
Value *Addr = irBuilder.CreateInBoundsGEP(gep->getSourceElementType(), gep->getOperand(0), gepArg);
// Create a dummy merge value:
Type *Ty = cast<GetElementPtrInst>(Addr)->getResultElementType();
Value *mergeValue = nullptr;
if (IGCLLVM::FixedVectorType *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty))
mergeValue = ConstantAggregateZero::get(Ty);
else
mergeValue = Constant::getNullValue(Ty);
lead.CreateLoad(irBuilder, Ty, Addr, mergeValue);
}
}
}
}
return;
}
// A member of a layout struct can be a vector type. This function decides
// whether the vector type itself or a sequence of its elements' types shall
// be used as the layout struct's member types. If splitting a vector type
// into a sequence of its elements' types is beneficial (i.e., likely results
// in fewer mov instructions), return true; otherwise, return false.
//
// For example:
//
// Not split <2xi32> split <2xi32>
// ----------------- -------------
// struct SOA { struct SOA {
// <2 x i32> x; i32 x0;
// i32 x1;
// float y; float y;
// struct AOS { struct AOS {
// i16 a, i16 b} z; i16 a, i16 b} z;
// } }
//
// args:
// V : value to be checked
// K : indicates whether V is a stored value or a loaded value.
// (special case: return false if V is null or V is scalar.)
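// Illustrative sketch (hypothetical IR, store case): for
// store <2 x i32> <i32 1, i32 2>, <2 x i32>* %p
// the stored value bottoms out at a constant, so splitVectorType() returns
// true; splitting the constant vector likely lets mergeConstElements() later
// fold the constants with neighboring constants. For a stored vector built
// by insertelement with a non-constant index, it returns false and the
// vector is kept whole.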
bool LdStCombine::splitVectorType(Value *V, LdStKind K) const {
if (V == nullptr) {
return false;
}
Type *Ty = V->getType();
// Not a vector; always return false.
if (!Ty->isVectorTy()) {
return false;
}
// If vector size isn't packed (store size != alloc size), must split to
// avoid holes in the layout struct.
// For example, alloc size(<3 x i32>) = 16B, not 12B
// struct { <3xi32>, float } : size = 20 Bytes
// struct { i32, i32, i32, float} : size = 16 bytes.
if (!isa<Constant>(V) && m_DL->getTypeStoreSize(Ty) != m_DL->getTypeAllocSize(Ty)) {
return true;
}
Value *val = V;
if (K == LdStKind::IS_STORE) {
while (auto IEI = dyn_cast<InsertElementInst>(val)) {
if (!isa<Constant>(IEI->getOperand(2))) {
return false;
}
val = IEI->getOperand(0);
}
if (isa<Constant>(val)) {
return true;
}
} else {
for (auto U : val->users()) {
Value *user = U;
if (auto EEI = dyn_cast<ExtractElementInst>(user)) {
if (isa<Constant>(EEI->getIndexOperand())) {
continue;
}
}
return false;
}
return true;
}
return false;
}
// mergeConstElements
// If EltVals has consecutive constant elements, merge them if possible.
// The merged constant's size is no more than MaxMergeBytes.
void LdStCombine::mergeConstElements(SmallVector<Value *, 4> &EltVals, uint32_t MaxMergeBytes) {
// If all sub values are constants, coalesce them into a single
// constant of type DstEltTy.
//
// Merge goes with 2 bytes, 4 bytes, up to EltBytes (DstEltTy).
// For example: DstEltTy = i64
// {i8 1, i8 2, i16 0x77, i8 3, i8 4, i8 5, i8 %y}
// b = 2:
// { i16 0x201, i16 0x77, i16 0x403, i8 5, i8 %y}
// b = 4:
// { i32 0x770201, i16 0x403, i8 5, i8 %y}
// b = 8 :
// no change.
auto isValidConst = [](Value *v) { return isa<ConstantInt>(v) || isa<ConstantFP>(v) || isa<ConstantPointerNull>(v); };
// Check if it has two consecutive constants; skip if not.
// This is a quick check that skips the majority of cases.
bool isCandidate = false;
for (int i = 0, sz = (int)EltVals.size() - 1; i < sz; ++i) {
if (isValidConst(EltVals[i]) && isValidConst(EltVals[i + 1])) {
isCandidate = true;
break;
}
}
if (!isCandidate) {
return;
}
// If there is a vector constant, expand it with its components
bool hasMerged = false;
std::list<Value *> mergedElts(EltVals.begin(), EltVals.end());
// b : the number of bytes of the merged value.
for (uint32_t b = 2; b <= MaxMergeBytes; b *= 2) {
int currOff = 0;
auto NI = mergedElts.begin();
for (auto II = NI; II != mergedElts.end(); II = NI) {
++NI;
if (NI == mergedElts.end()) {
break;
}
// Try to merge (II, NI)
Value *elt0 = *II;
Type *ty0 = elt0->getType();
const uint32_t sz0 = (uint32_t)m_DL->getTypeStoreSize(ty0);
// Merged value shall be naturally aligned.
if ((currOff % b) != 0 || sz0 >= b) {
currOff += sz0;
continue;
}
Value *elt1 = *NI;
Type *ty1 = elt1->getType();
const uint32_t sz1 = (uint32_t)m_DL->getTypeStoreSize(ty1);
Constant *C0 = dyn_cast<Constant>(elt0);
Constant *C1 = dyn_cast<Constant>(elt1);
if (!C0 || !C1 || (sz0 + sz1) != b || !isValidConst(C0) || !isValidConst(C1)) {
currOff += sz0;
continue;
}
IGC_ASSERT_MESSAGE(!C0->getType()->isVectorTy() && !C1->getType()->isVectorTy(),
"Vector Constant not supported!");
uint64_t imm0 = GetImmediateVal(C0);
uint64_t imm1 = GetImmediateVal(C1);
imm0 &= maxUIntN(sz0 * 8);
imm1 &= maxUIntN(sz1 * 8);
uint64_t imm = ((imm1 << (sz0 * 8)) | imm0);
Type *ty = IntegerType::get(ty0->getContext(), (sz0 + sz1) * 8);
Constant *nC = ConstantInt::get(ty, imm, false);
mergedElts.insert(II, nC);
auto tII = NI;
++NI;
mergedElts.erase(II);
mergedElts.erase(tII);
hasMerged = true;
}
}
if (!hasMerged) {
return;
}
EltVals.clear();
EltVals.insert(EltVals.end(), mergedElts.begin(), mergedElts.end());
}
// This is to make sure layout types are reused. Two identified structs have
// the same layout if
// 1. both are SOA or both are AOS; and
// 2. both are packed; and
// 3. element types match in order.
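// Illustrative sketch (hypothetical types): if two different bundles both
// need a packed SOA struct with element types { i32, i32, <{ i16, i16 }> },
// the first call creates the identified type and records it in
// getLayoutStructTypes(); the second call finds the existing type (same
// SOA/AOS kind, both packed, element types matched in order) and returns it
// instead of creating a duplicate.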
StructType *LdStCombine::getOrCreateUniqueIdentifiedStructType(ArrayRef<Type *> EltTys, bool IsSOA, bool IsPacked) {
auto &layoutStructTypes = m_CGC->getLayoutStructTypes();
for (auto II : layoutStructTypes) {
StructType *stTy = II;
if (IsPacked == stTy->isPacked() && IsSOA == isLayoutStructTypeSOA(stTy) && stTy->elements() == EltTys)
return stTy;
}
// Create one
StructType *StTy =
StructType::create(EltTys, IsSOA ? getStructNameForSOALayout() : getStructNameForAOSLayout(), IsPacked);
layoutStructTypes.push_back(StTy);
return StTy;
}
// gatherCopy():
// Generate the final value by coalescing given values. The final value is
// of either struct type or vector type.
// Arguments:
// DstEltBytes: size of vector element if the final value is a vector.
// If the final value is a struct, its struct member size
// must be the same as DstEltBytes.
// DstNElts: the num of elements if the final value is a vector or
// the num of direct members if the final value is a struct.
// Vals: a list of values to be coalesced into the final value.
// InsertBefore: inserting pos for new instructions.
//
// DstEltTy: not an argument, but used often in this function and comments.
// It is the element type if the final value is a vector; or int type if
// the final value is a struct. For a struct, it could be
// 1. int64: DstEltBytes == 8; or // D64
// 2. int32: DstEltBytes == 4, or // D32
// 3. int16: DstEltBytes == 2. // D16U32
//
// Examples:
// 1. vector type;
// given DstEltBytes=4 and DstNElts=4
// Vals = { i32 a, i64 b, i32 c }
//
// 'b' is split into two i32, the final value is a vector and DstEltTy
// is i32.
//
// <4xi32> returnVal = {
// a,
// extractElement bitcast (i64 b to <2xi32>), 0
// extractElement bitcast (i64 b to <2xi32>), 1
// c
// };
// 2. struct type (multiple of 4 bytes)
// given DstNElts x DstEltBytes = 8x4B.
// Vals = { 4xi8 a, i64 b, 4xfloat c, i16 d, i8 e, i8 f}
//
// This function generates a val of struct type:
//
// struct {
// struct { // indexes
// i8 d0; // (0, 0): extElt <4xi8> a, 0
// i8 d1; // (0, 1): extElt <4xi8> a, 1
// i8 d2; // (0, 2): extElt <4xi8> a, 2
// i8 d3; // (0, 3): extElt <4xi8> a, 3
// } E0;
// i32 E1; // (1): extElt bitcast (i64 b to <2xi32>), 0
// i32 E2; // (2): extElt bitcast (i64 b to <2xi32>), 1
// float E3; // (3): extElt <4xfloat> c, 0
// float E4; // (4): extElt <4xfloat> c, 1
// float E5; // (5): extElt <4xfloat> c, 2
// float E6; // (6): extElt <4xfloat> c, 3
// struct {
// i16 d0; // (7, 0): d
// i8 d1; // (7, 1): e
// i8 d2; // (7, 2): f
// } E7;
// } returnVal;
//
// As DstEltBytes == 4, DstEltTy is i32.
//
// The struct layout:
// The direct members are in SOA. If its direct members are struct, those
// struct members (their size is either 32bit or 64bit) are in AOS. This
// is consistent with viewing struct as a vector < DstNElts x DstEltTy >
// from layout point of view.
//
// To distinguish the structs generated here from other structs, they are
// identified with reserved names, returned by
// getStructNameForSOALayout() or getStructNameForAOSLayout().
//
Value *LdStCombine::gatherCopy(const uint32_t DstEltBytes, int DstNElts, SmallVector<Value *, 16> &Vals,
Instruction *InsertBefore) {
// AllEltVals:
// each entry is one direct member of the struct or vector. If an entry has
// more than one element, it is either D32 or D64 in size, and is likely a
// struct-typed member.
// The final value is either a struct or a vector. Its total size and its
// GRF layout is the same as vector type <DstNElts x DstEltTy>.
SmallVector<SmallVector<Value *, 4>, 16> allEltVals;
// eltVals:
// Pending values that are going to form a single element in allEltVals.
// Once the pending set is complete, save it into allEltVals.
SmallVector<Value *, 4> eltVals;
// worklist:
// initialized to all input values in this bundle. Its values are
// gradually moved to AllEltVals one by one until it is empty.
std::list<Value *> worklist(Vals.begin(), Vals.end());
IRBuilder<> irBuilder(InsertBefore);
// remainingBytes:
// initialize to be the size of DstEltTy. It is the size of each
// member of the struct or vector.
uint remainingBytes = DstEltBytes;
while (!worklist.empty()) {
Value *v = worklist.front();
worklist.pop_front();
if (v->getType()->isVectorTy()) {
IGC_ASSERT((v->getType()->getScalarSizeInBits() % 8) == 0);
uint32_t eBytes = (v->getType()->getScalarSizeInBits() / 8);
uint32_t n = getNumElements(v->getType());
// true if v is a legal vector at level 1
bool isLvl1 = (remainingBytes == DstEltBytes && eBytes == DstEltBytes);
// true if v is a legal vector at level 2
bool isLvl2 = (remainingBytes >= (eBytes * n));
bool keepVector = !splitVectorTypeForGather(v);
if (isLvl1 && keepVector) { // case 1
// 1st level vector member
eltVals.push_back(v);
allEltVals.push_back(eltVals);
eltVals.clear();
} else if (isLvl2 && keepVector) { // case 2
// 2nd level vector member
eltVals.push_back(v);
remainingBytes -= (eBytes * n);
if (remainingBytes == 0) {
mergeConstElements(eltVals, DstEltBytes);
allEltVals.push_back(eltVals);
// Initialization for the next element
eltVals.clear();
remainingBytes = DstEltBytes;
}
} else { // case 3
SmallVector<Value *, 16> elts;
getOrCreateElements(v, elts, InsertBefore);
worklist.insert(worklist.begin(), elts.begin(), elts.end());
}
continue;
}
Type *eTy = v->getType();
const uint32_t eBytes = (uint32_t)m_DL->getTypeStoreSize(eTy);
if (eTy->isPointerTy()) {
// need ptrtoint cast as bitcast does not work
IGC_ASSERT(eBytes == 8 || eBytes == 4 || eBytes == 2);
eTy = IntegerType::get(eTy->getContext(), eBytes * 8);
v = irBuilder.CreateCast(Instruction::PtrToInt, v, eTy);
}
// If v isn't element-size aligned in GRF at this offset, cannot
// generate a mov instruction. v must be split into small chunks
// that are aligned for mov to work.
uint32_t currAlign = (uint32_t)MinAlign(DstEltBytes, DstEltBytes - remainingBytes);
if (currAlign < eBytes) {
// Two cases:
// 1. DstEltBytes = 4
// store i32 p
// store i32 p+4
// store i64 p+8 <- v : i64
// Need to split i64 by casting i64 --> 2xi32
// 2. DstEltBytes = 4, packed struct
// store i8 p
// store i16 p+1 <- v : i16
// store i8 p+2
// Need to split i16 into 2xi8
IGC_ASSERT((eBytes % currAlign) == 0);
int n = eBytes / currAlign;
Type *newETy = IntegerType::get(m_F->getContext(), currAlign * 8);
VectorType *nVTy = VectorType::get(newETy, n, false);
Value *new_v = irBuilder.CreateCast(Instruction::BitCast, v, nVTy);
auto insPos = worklist.begin();
for (int i = 0; i < n; ++i) {
Value *v = irBuilder.CreateExtractElement(new_v, irBuilder.getInt32(i));
worklist.insert(insPos, v);
}
continue;
}
// v should fit into the remainingBytes as v is element-size aligned.
IGC_ASSERT(remainingBytes >= eBytes);
eltVals.push_back(v);
remainingBytes -= eBytes;
if (remainingBytes == 0) {
// Found one element of size DstEltBytes.
mergeConstElements(eltVals, DstEltBytes);
allEltVals.push_back(eltVals);
// Initialization for the next element
eltVals.clear();
remainingBytes = DstEltBytes;
}
}
IGC_ASSERT(eltVals.empty());
Type *DstEltTy = nullptr;
// A new coalesced value could be one of two types:
// 1. a vector type < DstNElts x DstEltTy >
// if all elements are of the same type (which is DstEltTy); it
// could be a float or an integer type.
// 2. a struct type
// An integer type whose size is DstEltBytes is used as DstEltTy.
// All its members, including struct members, must be this same size.
// The struct nesting is at most 2 levels.
//
// More examples:
// 1) vector type (i64 as element)
// store i64 a, p; store i64 b, p+8; store i64 c, p+16
// -->
// store <3xi64> <a, b, c>, p
//
// Another example,
// store float a, p; store float b, p+4
// -->
// store <2xfloat> <a,b>, p
// 2) struct type (i32 as element type)
// store i32 a, p; store i32 b, p+4
// store i8 c0, p+8; store i8 c1, p+9;
// store i8 c2, p+10; store i8 c3, p+11
// store i32 d, p+12
// -->
// struct __StructSOALayout_ {
// i32, i32, struct {i8, i8, i16}, i32}
// }
// store __StructSOALayout__ <{a, b, <{c0, c1, c2, c3}>, d}>, p
//
// Instead of store on struct type, a vector store is used to take
// advantage of the existing vector store of codegen emit.
// let stVal = __StructSOALayout__ <{a, b, <{c0, c1, c2, c3}>, d}>
//
// val = call <4xi32> bitcastfromstruct( __StructSOALayout__ %stVal)
// store <4xi32> %val, p
//
// The "bitcastfromstruct" is no-op intrinsic (by dessa).
//
// Another example:
// store float a, p; store i32 b, p+4
// -->
// store __StructSOALayout__ <{float a, int b}>, p
// Note in this case, we can do
// store <2xi32> <bitcast(float a to i32), b>, p
// but this would introduce an additional bitcast, so the struct
// form is preferred.
//
auto isLvl2Vecmember = [this, DstEltBytes](Type *ty) {
uint32_t n = (uint32_t)m_DL->getTypeStoreSize(ty->getScalarType());
return ty->isVectorTy() && n < DstEltBytes;
};
bool hasStructMember = false;
bool hasVecMember = false;
const int32_t sz = (int)allEltVals.size();
SmallVector<Type *, 16> StructTys;
for (int i = 0; i < sz; ++i) {
SmallVector<Value *, 4> &subElts = allEltVals[i];
int nelts = (int)subElts.size();
Type *ty = subElts[0]->getType();
uint32_t eBytes = (uint32_t)m_DL->getTypeStoreSize(ty->getScalarType());
if (nelts == 1 && !isLvl2Vecmember(ty)) {
IGC_ASSERT(eBytes == DstEltBytes);
StructTys.push_back(ty);
hasVecMember = (hasVecMember || ty->isVectorTy());
} else {
SmallVector<Type *, 4> subEltTys;
for (auto II : subElts) {
Value *elt = II;
subEltTys.push_back(elt->getType());
hasVecMember = (hasVecMember || elt->getType()->isVectorTy());
}
// create a member of a packed and identified struct type
// whose size = DstEltBytes. Use AOS layout.
Type *eltStTy = getOrCreateUniqueIdentifiedStructType(subEltTys, false, true);
StructTys.push_back(eltStTy);
hasStructMember = true;
}
}
// Check if a vector is preferred for the final value.
// (For reducing the number of struct types created, and also vector
// is better supported in codegen.)
if (!hasStructMember && !hasVecMember) {
// Set initial value for DstEltTy.
// Skip any const as it can be taken as either float or int.
int i = 0;
for (; i < sz; ++i) {
SmallVector<Value *, 4> &subElts = allEltVals[i];
int nelts = (int)subElts.size();
IGC_ASSERT(nelts == 1);
if (!isa<Constant>(subElts[0])) {
DstEltTy = subElts[0]->getType();
break;
}
}
if (DstEltTy != nullptr) {
for (++i; i < sz; ++i) {
SmallVector<Value *, 4> &subElts = allEltVals[i];
int nelts = (int)subElts.size();
IGC_ASSERT(nelts == 1);
Type *ty = subElts[0]->getType();
const bool isConst = isa<Constant>(subElts[0]);
if (!isConst && DstEltTy != ty) {
// Using a struct is better
DstEltTy = nullptr;
break;
}
}
} else {
DstEltTy = Type::getIntNTy(m_F->getContext(), DstEltBytes * 8);
}
}
// If DstEltTy != null, use a vector; otherwise, use a struct as
// the struct will likely need fewer mov instructions.
Type *structTy;
Value *retVal;
if (DstEltTy != nullptr) { // case 1
if (DstNElts == 1) {
// Constant store values are combined into a single constant
// for D16U32, D32, D64
SmallVector<Value *, 4> &eltVals = allEltVals[0];
IGC_ASSERT(eltVals.size() == 1);
retVal = eltVals[0];
} else {
// normal vector
VectorType *newTy = VectorType::get(DstEltTy, DstNElts, false);
retVal = UndefValue::get(newTy);
for (int i = 0; i < sz; ++i) {
SmallVector<Value *, 4> &eltVals = allEltVals[i];
Value *tV = irBuilder.CreateBitCast(eltVals[0], DstEltTy);
retVal = irBuilder.CreateInsertElement(retVal, tV, irBuilder.getInt32(i));
}
}
} else { // case 2
// Packed and named identified struct. The "__" prefix makes sure it won't
// collide with any user types. Use SOA layout.
structTy = getOrCreateUniqueIdentifiedStructType(StructTys, true, true);
// Create a value
retVal = UndefValue::get(structTy);
for (int i = 0; i < sz; ++i) {
SmallVector<Value *, 4> &eltVals = allEltVals[i];
const int sz1 = (int)eltVals.size();
Type *ty = eltVals[0]->getType();
if (sz1 == 1 && !isLvl2Vecmember(ty)) {
retVal = irBuilder.CreateInsertValue(retVal, eltVals[0], i);
} else {
for (int j = 0; j < sz1; ++j) {
uint32_t idxs[2] = {(unsigned)i, (unsigned)j};
retVal = irBuilder.CreateInsertValue(retVal, eltVals[j], idxs);
}
}
}
}
return retVal;
}
// Given a list of values in order (arg: Vals), return a new packed type
// that is composed of Vals' types. This new type is one of the following:
// 0. if all Vals have the same element size, the new type will be
// a vector type with that element size. This is to take
// advantage of the extensive vector optimizations in IGC; or
// 1. a vector type with element size = ValEltBytes and the number of
// elements = ValNElts; or
// 2. a struct type whose direct members are all the same size and are
// equal to ValEltBytes and the number of direct members = ValNElts.
// Note: this is for load combining as a type is needed before generating
// component values (store combining does not use this as component
// values are known before the type).
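// Illustrative sketch (hypothetical inputs): with ValEltBytes = 4 and
// ValNElts = 2,
// Vals = { i32 %a, float %b } -> all elements are 4 bytes, so case 0
// returns the vector type <2 x i32>;
// Vals = { i16 %a, i16 %b, i32 %c } -> the two i16 values fill one 4-byte
// member as a packed AOS struct, giving the SOA layout struct
// { <{ i16, i16 }>, i32 }.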
Type *LdStCombine::generateLoadType(SmallVector<Value *, 16> &Vals, uint32_t ValEltBytes, uint32_t ValNElts) {
// case 0: Optimization
// For now, use vector if elements of all Vals are the same size.
// Prefer using vector as vector has been well optimized.
const bool OptimPreferVec = true;
if (OptimPreferVec && Vals.size() > 1) {
Type *ETy = Vals[0]->getType()->getScalarType();
int eBytes = (int)m_DL->getTypeStoreSize(ETy);
bool isSameEltSize = true;
for (int i = 1, sz = (int)Vals.size(); i < sz; ++i) {
Type *ty = Vals[i]->getType()->getScalarType();
int tBytes = (int)m_DL->getTypeStoreSize(ty);
if (eBytes != tBytes) {
isSameEltSize = false;
break;
}
}
if (isSameEltSize) {
Type *newETy = Type::getIntNTy(m_F->getContext(), eBytes * 8);
uint32_t nElts = (ValNElts * ValEltBytes) / eBytes;
Type *retTy = VectorType::get(newETy, nElts, false);
return retTy;
}
}
// case 1 and 2
bool isStructTy = false;
SmallVector<Type *, 16> tys;
SmallVector<Type *, 16> subEltTys;
uint32_t remainingBytes = ValEltBytes;
std::list<Type *> worklist;
for (int i = 0, sz = (int)Vals.size(); i < sz; ++i) {
Value *V = Vals[i];
Type *Ty = V->getType();
worklist.push_back(Ty);
while (!worklist.empty()) {
Type *Ty = worklist.front();
worklist.pop_front();
Type *eTy = Ty->getScalarType();
uint32_t nElts = getNumElements(Ty);
uint32_t eBytes = (uint32_t)m_DL->getTypeStoreSize(eTy);
// true if v is either a vector or a scalar at level 1
bool isLvl1 = (remainingBytes == ValEltBytes && eBytes == ValEltBytes);
// true if v is a vector or scalar at level 2
bool isLvl2 = (remainingBytes >= (eBytes * nElts));
// It's ok not to split if V == nullptr (not original one from Vals)
// or if V is one from Vals and splitVectorTypeForScatter() returns
// false.
const bool noSplitOK = !splitVectorTypeForScatter(V);
if (noSplitOK && isLvl1) {
tys.push_back(Ty);
} else if (noSplitOK && isLvl2) {
subEltTys.push_back(Ty);
remainingBytes -= (eBytes * nElts);
if (remainingBytes == 0) {
// struct member.
Type *eltStTy = getOrCreateUniqueIdentifiedStructType(subEltTys, false, true);
tys.push_back(eltStTy);
subEltTys.clear();
isStructTy = true;
remainingBytes = ValEltBytes;
}
} else {
// Split Ty into smaller types if:
// 1. eBytes > ValEltBytes; or
// 2. eTy isn't aligned at this offset (cannot generate mov inst)
// Ty must be split into a list of smaller types that are aligned.
// Element size is assumed to be minimum alignment for a type.
uint32_t currAlign = (uint32_t)MinAlign(ValEltBytes, ValEltBytes - remainingBytes);
if (currAlign < eBytes) {
IGC_ASSERT((eBytes % currAlign) == 0);
int n = (eBytes / currAlign) * nElts;
Type *newETy = IntegerType::get(m_F->getContext(), currAlign * 8);
worklist.insert(worklist.begin(), n, newETy);
} else {
worklist.insert(worklist.begin(), nElts, eTy);
}
// For next iteration of while, it is for sub-part of V,
// so set V to nullptr.
V = nullptr;
}
}
}
IGC_ASSERT(remainingBytes == ValEltBytes);
Type *retTy;
if (isStructTy) {
retTy = getOrCreateUniqueIdentifiedStructType(tys, true, true);
} else {
Type *newEltTy = IntegerType::get(m_F->getContext(), ValEltBytes * 8);
retTy = VectorType::get(newEltTy, ValNElts, false);
}
return retTy;
}
// Given a list of original load values (arg: Vals) and the value produced by
// the combined load (arg: LoadedVecVal), create a composite type (either a
// struct type or a vector type) for the loaded value, then decompose it and
// replace each original value in Vals with its corresponding components.
//
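// Illustrative sketch (hypothetical IR): if "load i32, p" and "load float,
// p+4" were combined into %v = load <2 x i32>, the original values are
// recreated roughly as
// %a = extractelement <2 x i32> %v, i32 0
// %e1 = extractelement <2 x i32> %v, i32 1
// %f = bitcast i32 %e1 to float
// and the original loads' uses are replaced with %a and %f.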
void LdStCombine::scatterCopy(SmallVector<Value *, 16> &Vals, int LoadedValEBytes, int LoadedValNElts,
Value *LoadedVecVal, Instruction *InsertBefore) {
// To split loadedVal, figure out its type first.
// 1. Try to use a vector type, if not possible, use a struct type.
// 2. For each V in Vals, its replacement value is created by mapping
// the corresponding components of LoadedVal to it.
IRBuilder<> irBuilder(InsertBefore);
Type *LoadedValTy = generateLoadType(Vals, LoadedValEBytes, LoadedValNElts);
{
int newTyBytes = (int)m_DL->getTypeStoreSize(LoadedValTy);
IGC_ASSERT(newTyBytes == (LoadedValNElts * LoadedValEBytes));
}
Value *LoadedVal = LoadedVecVal;
if (LoadedValTy->isStructTy()) {
// Set loadedVal's name to "StructV" so that both load/store
// will have names starting with "StructV" for the layout struct.
LoadedVal->setName("StructV");
Type *ITys[2] = {LoadedValTy, LoadedVal->getType()};
Function *IntrDcl =
GenISAIntrinsic::getDeclaration(m_F->getParent(), GenISAIntrinsic::ID::GenISA_bitcasttostruct, ITys);
LoadedVal = irBuilder.CreateCall(IntrDcl, LoadedVal);
} else if (LoadedValTy != LoadedVal->getType()) {
LoadedVal = irBuilder.CreateBitCast(LoadedVal, LoadedValTy);
}
auto createValueFromElements = [this, &irBuilder](SmallVector<Value *, 8> &Elts, Type *ValueTy) {
IGC_ASSERT(!Elts.empty());
Value *V0 = Elts[0];
Type *eTy = V0->getType();
uint32_t n = (uint32_t)Elts.size();
#if defined(_DEBUG)
{
IGC_ASSERT(!Elts.empty());
Value *V0 = Elts[0];
for (uint32_t i = 1; i < n; ++i) {
Value *V = Elts[i];
if (V0->getType() != V->getType()) {
IGC_ASSERT(false);
}
}
uint32_t EltsBytes = (uint32_t)m_DL->getTypeStoreSize(V0->getType());
EltsBytes *= n;
IGC_ASSERT(m_DL->getTypeStoreSize(ValueTy) == EltsBytes);
}
#endif
Value *retVal;
if (n == 1) {
retVal = Elts[0];
if (eTy != ValueTy) {
retVal = irBuilder.CreateBitCast(retVal, ValueTy);
}
} else {
VectorType *nTy = VectorType::get(eTy, n, false);
Value *nV = UndefValue::get(nTy);
for (uint32_t i = 0; i < n; ++i) {
nV = irBuilder.CreateInsertElement(nV, Elts[i], i);
}
retVal = irBuilder.CreateBitCast(nV, ValueTy);
}
return retVal;
};
// Copy component values from LoadedVal to the original values.
if (LoadedValTy->isStructTy()) {
StructType *StTy = cast<StructType>(LoadedValTy);
SmallVector<uint32_t, 2> Idx = {0, 0};
auto getCurrMemberTy = [StTy, &Idx]() {
Type *Ty0 = StTy->getElementType(Idx[0]);
if (StructType *stTy0 = dyn_cast<StructType>(Ty0))
return stTy0->getElementType(Idx[1]);
return Ty0;
};
auto getValueFromStruct = [&](Type *Ty) {
uint32_t TyBytes = (uint32_t)m_DL->getTypeStoreSize(Ty);
Type *Ty0 = StTy->getElementType(Idx[0]);
StructType *stTy0 = dyn_cast<StructType>(Ty0);
Type *Ty1 = stTy0 ? stTy0->getElementType(Idx[1]) : nullptr;
if (!stTy0 && (Ty0 == Ty || m_DL->getTypeStoreSize(Ty0) == TyBytes)) {
IGC_ASSERT(Idx[1] == 0);
Value *V = irBuilder.CreateExtractValue(LoadedVal, Idx[0]);
if (Ty0 != Ty) {
V = irBuilder.CreateBitCast(V, Ty);
}
(void)advanceStructIndices(Idx, StTy);
return V;
}
if (stTy0 && (Ty1 == Ty || m_DL->getTypeStoreSize(Ty1) == TyBytes)) {
Value *V = irBuilder.CreateExtractValue(LoadedVal, Idx);
if (Ty1 != Ty) {
V = irBuilder.CreateBitCast(V, Ty);
}
(void)advanceStructIndices(Idx, StTy);
return V;
}
// Original scalar type (if the original is a vector, it's its
// element type) could be split into smaller same-typed scalars.
Type *eTy = Ty->getScalarType();
uint32_t nelts = getNumElements(Ty);
uint32_t ebytes = (uint32_t)m_DL->getTypeStoreSize(eTy);
SmallVector<Value *, 8> vecElts;
for (uint32_t i = 0; i < nelts; ++i) {
int eltRemainingBytes = (int)ebytes;
SmallVector<Value *, 8> subElts;
do {
// Ty0 is type at Idx[0]
// stTy0 is dyn_cast<StructType>(Ty0).
Value *V;
uint32_t currBytes;
// type of matching struct member
Type *mTy;
if (stTy0) {
V = irBuilder.CreateExtractValue(LoadedVal, Idx);
mTy = stTy0->getElementType(Idx[1]);
} else {
V = irBuilder.CreateExtractValue(LoadedVal, Idx[0]);
mTy = Ty0;
}
currBytes = (uint32_t)m_DL->getTypeStoreSize(mTy);
IGC_ASSERT_MESSAGE(currBytes <= ebytes, "member shouldn't be larger than the element size of load!");
eltRemainingBytes -= (int)currBytes;
subElts.push_back(V);
if (eltRemainingBytes < 0) {
IGC_ASSERT_UNREACHABLE();
break;
}
if (!advanceStructIndices(Idx, StTy)) {
// already last element
break;
}
// update Ty0/stTy0
Ty0 = StTy->getElementType(Idx[0]);
stTy0 = dyn_cast<StructType>(Ty0);
} while (eltRemainingBytes > 0);
IGC_ASSERT(eltRemainingBytes == 0);
Value *V = createValueFromElements(subElts, eTy);
vecElts.push_back(V);
}
Value *retVal = createValueFromElements(vecElts, Ty);
return retVal;
};
// Given mTy = type of the next member in the layout struct, and Ty =
// the type of one of the merged loads that are combined into this layout
// struct, the algorithm guarantees:
// 1. if mTy is a vector, Ty must be the same vector,
// 2. if mTy is a scalar, Ty can be either a vector or scalar, and
// size(mTy) <= size(Ty's element type)
for (auto &V : Vals) {
Type *memTy = getCurrMemberTy();
SmallVector<Value *, 8> allUses;
if (memTy->isVectorTy()) {
IGC_ASSERT(memTy == V->getType());
allUses.push_back(V);
} else {
// Optimization: If V's elements are available, use them.
getVecEltIfConstExtract(V, allUses);
}
for (auto &nV : allUses) {
Type *aTy = nV->getType();
Value *newV = getValueFromStruct(aTy);
if (isa<UndefValue>(nV)) {
appendToBeDeleted(dyn_cast<Instruction>(newV));
} else {
nV->replaceAllUsesWith(newV);
appendToBeDeleted(dyn_cast<Instruction>(nV));
}
}
}
} else {
// vector type or scalar type
uint32_t Idx = 0;
Type *LoadedEltTy = LoadedValTy->getScalarType();
uint32_t LoadedEltBytes = (uint32_t)m_DL->getTypeStoreSize(LoadedEltTy);
// Return a value of type Ty at the given Idx and advance Idx.
// If Ty is larger than the element type of LoadedVal, it means to
// form a value of Ty by merging several values of LoadedVal
// starting at Idx, and those merged values are guaranteed to be
// same-typed values.
auto collectValueFromVector = [&](Type *Ty) {
uint32_t TyBytes = (uint32_t)m_DL->getTypeStoreSize(Ty);
IGC_ASSERT(TyBytes >= LoadedEltBytes);
int n = TyBytes / LoadedEltBytes;
IGC_ASSERT((TyBytes % LoadedEltBytes) == 0);
Value *retVal;
if (n == 1) {
retVal = irBuilder.CreateExtractElement(LoadedVal, Idx);
if (LoadedEltTy != Ty) {
retVal = irBuilder.CreateBitCast(retVal, Ty);
}
++Idx;
} else {
VectorType *vTy = VectorType::get(LoadedEltTy, n, false);
Value *nV = UndefValue::get(vTy);
for (int i = 0; i < n; ++i) {
Value *V = irBuilder.CreateExtractElement(LoadedVal, Idx);
nV = irBuilder.CreateInsertElement(nV, V, i);
++Idx;
}
retVal = irBuilder.CreateBitCast(nV, Ty);
}
return retVal;
};
// Given ty = V's type, the algorithm guarantees that the size of ty's
// element is no smaller than LoadedValEBytes.
for (auto &V : Vals) {
SmallVector<Value *, 8> allUses;
getVecEltIfConstExtract(V, allUses);
for (auto &nV : allUses) {
Type *aTy = nV->getType();
Type *eTy = aTy->getScalarType();
uint32_t nelts = getNumElements(aTy);
IGC_ASSERT(m_DL->getTypeStoreSize(eTy) >= LoadedEltBytes);
SmallVector<Value *, 8> vecElts;
for (uint32_t i = 0; i < nelts; ++i) {
Value *V = collectValueFromVector(eTy);
vecElts.push_back(V);
}
Value *newV = createValueFromElements(vecElts, aTy);
if (isa<UndefValue>(nV)) {
appendToBeDeleted(dyn_cast<Instruction>(newV));
} else {
nV->replaceAllUsesWith(newV);
appendToBeDeleted(dyn_cast<Instruction>(nV));
}
}
}
}
}
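// structToVec:
// Convert a value of a layout struct type into a plain integer (vector)
// value via the no-op GenISA_bitcastfromstruct intrinsic so that the
// existing vector store/load emission can be reused.
// Illustrative sketch (hypothetical sizes): eltBytes = 4, nelts = 3 gives a
// <3 x i32> result; a 2-byte total (e.g. <{ i8, i8 }>) is widened to a
// single i16 instead of <2 x i8>.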
Value *LdStCombine::structToVec(IGCIRBuilder<> *irBuilder, BasicBlock *BB, Value *structVal, unsigned eltBytes,
unsigned nelts) {
uint32_t totalBytes = eltBytes * nelts;
Type *eltTy;
// Use special bitcast from struct to int vec to use vector emit.
if (totalBytes < 4)
eltTy = Type::getIntNTy(BB->getContext(), totalBytes * 8); // <{i8, i8}>, use i16, not 2xi8
else
eltTy = Type::getIntNTy(BB->getContext(), eltBytes * 8);
// Use an int vector type as VTy
Type *VTy = (nelts == 1 || totalBytes < 4) ? eltTy : VectorType::get(eltTy, nelts, false);
Type *ITys[2] = {VTy, structVal->getType()};
Function *IntrDcl = GenISAIntrinsic::getDeclaration(BB->getParent()->getParent(),
GenISAIntrinsic::ID::GenISA_bitcastfromstruct, ITys);
return irBuilder->CreateCall(IntrDcl, structVal);
}
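// createCombinedStores:
// For each store bundle collected earlier, build the coalesced value with
// gatherCopy(), insert a single store at the anchor (last-in-program-order)
// store, and mark the original stores for deletion.
// Illustrative sketch (hypothetical IR):
// store i32 %a, p ; store i32 %b, p+4
// -> store <2 x i32> <%a, %b>, bitcast p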
void LdStCombine::createCombinedStores(BasicBlock *BB) {
for (auto &bundle : m_bundles) {
InstAndOffsetPairs &Stores = bundle.LoadStores;
IGC_ASSERT(bundle.LoadStores.size() >= 2);
// The new store will be inserted at the place of the last store in the
// bundle, called the anchor store. The lead store is the first
// store in the bundle.
// (The lead store, among all stores in the bundle, does not necessarily
// appear first in the BB; and the last store does not necessarily
// have the largest offset in the bundle.)
AStoreInst leadStore = AStoreInst::get(Stores[0].getInst()).value();
SmallVector<Value *, 16> storedValues;
storedValues.push_back(leadStore.getValueOperand());
Instruction *anchorStore = leadStore.inst();
int n = m_instOrder[anchorStore];
// insts are assigned order numbers starting from 0. The anchor store is
// the one with the largest inst order number.
for (int i = 1, sz = (int)bundle.LoadStores.size(); i < sz; ++i) {
AStoreInst SI = AStoreInst::get(Stores[i].getInst()).value();
int SI_no = m_instOrder[SI.inst()];
if (SI_no > n) {
n = SI_no;
anchorStore = SI.inst();
}
storedValues.push_back(SI.getValueOperand());
}
int eltBytes = bundle.bundle_eltBytes;
int nelts = bundle.bundle_numElts;
if (eltBytes == 1) { // byte-aligned
// D64, D32, D16U32
if ((nelts % 4) == 0) {
if (bundle.useD64) {
// D64
IGC_ASSERT((nelts % 8) == 0);
eltBytes = 8;
nelts = nelts / 8;
} else {
// D32
eltBytes = 4;
nelts = nelts / 4;
}
} else if (nelts == 2) {
// <2xi8>, D16U32
eltBytes = 2;
nelts = 1;
} else {
IGC_ASSERT(false);
}
}
// Generate the coalesced value.
Value *nV = gatherCopy(eltBytes, nelts, storedValues, anchorStore);
Type *VTy = nV->getType();
IGCIRBuilder<> irBuilder(anchorStore);
if (VTy->isStructTy()) {
nV = structToVec(&irBuilder, BB, nV, eltBytes, nelts);
VTy = nV->getType();
}
Value *Addr = leadStore.getPointerOperand();
PointerType *PTy = cast<PointerType>(Addr->getType());
PointerType *nPTy = PointerType::get(VTy, PTy->getAddressSpace());
Value *nAddr = irBuilder.CreateBitCast(Addr, nPTy);
Instruction *finalStore = leadStore.CreateAlignedStore(irBuilder, nV, nAddr, leadStore.isVolatile());
finalStore->setDebugLoc(anchorStore->getDebugLoc());
// Only keep metadata from leadStore.
// (If each store has a different metadata, should they be merged
// in the first place?)
//
// Special case:
// 1. set nontemporal if any merged store has it (make sense?)
SmallVector<std::pair<unsigned, llvm::MDNode *>, 4> MDs;
leadStore.inst()->getAllMetadata(MDs);
for (const auto &MII : MDs) {
finalStore->setMetadata(MII.first, MII.second);
}
if (finalStore->getMetadata("nontemporal") == nullptr) {
for (int i = 1, sz = (int)bundle.LoadStores.size(); i < sz; ++i) {
if (MDNode *N = Stores[i].getInst()->getMetadata("nontemporal")) {
finalStore->setMetadata("nontemporal", N);
break;
}
}
}
}
// Delete stores that have been combined.
eraseDeadInsts();
m_hasStoreCombined = (!m_bundles.empty());
m_bundles.clear();
}
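// createCombinedLoads:
// For each load bundle, insert one combined load at the anchor
// (first-in-program-order) load and then use scatterCopy() to replace each
// original load with the corresponding piece of the combined value.
// Illustrative sketch (hypothetical IR):
// %a = load i32, p ; %b = load i32, p+4
// -> %v = load <2 x i32>, bitcast p
// %a = extractelement %v, 0 ; %b = extractelement %v, 1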
void LdStCombine::createCombinedLoads(BasicBlock *BB) {
LLVM_DEBUG(dbgs() << "LdStCombine::createCombinedLoads for BB: " << BB->getName() << "\n");
for (auto &bundle : m_bundles) {
InstAndOffsetPairs &Loads = bundle.LoadStores;
IGC_ASSERT(bundle.LoadStores.size() >= 2);
#if defined(_LDST_DEBUG)
{
BundleInfo *pBundle = &bundle;
pBundle->print(dbgs(), _bundleid);
++_bundleid;
}
#endif
// The new load will be inserted at the place of the first load in the
// program order in this bundle, called the anchor load. The lead load
// is the load with the smallest offset in the bundle.
ALoadInst leadLoad = ALoadInst::get(Loads[0].getInst()).value();
SmallVector<Value *, 16> loadedValues;
loadedValues.push_back(leadLoad.inst());
// find anchor load.
Instruction *anchorLoad = leadLoad.inst();
const int leadLoadNum = m_instOrder[leadLoad.inst()];
const int leadOffset = (int)Loads[0].getByteOffset();
int anchorOffset = leadOffset;
int n = leadLoadNum;
// insts are assigned order numbers starting from 0. The anchor load is
// the one with the smallest inst order number.
for (int i = 1, sz = (int)bundle.LoadStores.size(); i < sz; ++i) {
Instruction *LI = Loads[i].getInst();
int LI_no = m_instOrder[LI];
if (LI_no < n) {
n = LI_no;
anchorLoad = LI;
anchorOffset = (int)Loads[i].getByteOffset();
}
loadedValues.push_back(LI);
}
const int anchorLoadNum = n;
int eltBytes = bundle.bundle_eltBytes;
int nelts = bundle.bundle_numElts;
if (eltBytes == 1) { // byte-aligned
// D64, D32, D16U32
if ((nelts % 4) == 0) {
if (bundle.useD64) {
// D64
IGC_ASSERT((nelts % 8) == 0);
eltBytes = 8;
nelts = nelts / 8;
} else {
// D32
eltBytes = 4;
nelts = nelts / 4;
}
} else {
// <2xi8>, D16U32
IGC_ASSERT(nelts == 2);
}
}
// Create the new vector type for these combined loads.
Type *eltTy = Type::getIntNTy(BB->getContext(), eltBytes * 8);
Type *VTy = (nelts == 1 ? eltTy : VectorType::get(eltTy, nelts, false));
IGCIRBuilder<> irBuilder(anchorLoad);
Value *Addr = leadLoad.getPointerOperand();
// If leadLoad is different from anchorLoad and leadLoad's address is
// an instruction that comes after anchorLoad, the address of leadLoad
// must be re-generated at the anchorLoad position.
if (anchorLoad != leadLoad.inst() && isa<Instruction>(Addr)) {
Instruction *aI = cast<Instruction>(Addr);
auto MI = m_instOrder.find(aI);
if (MI != m_instOrder.end() && MI->second > anchorLoadNum) {
Value *anchorAddr = ALoadInst::get(anchorLoad)->getPointerOperand();
Type *bTy = Type::getInt8Ty(leadLoad.inst()->getContext());
Type *nTy = PointerType::get(bTy, leadLoad.getPointerAddressSpace());
Value *nAddr = irBuilder.CreateBitCast(anchorAddr, nTy);
Value *aIdx = irBuilder.getInt64(leadOffset - anchorOffset);
GEPOperator *aGEP = dyn_cast<GEPOperator>(anchorAddr);
if (aGEP && aGEP->isInBounds()) {
Addr = irBuilder.CreateInBoundsGEP(bTy, nAddr, aIdx, "anchorLoad");
} else {
Addr = irBuilder.CreateGEP(bTy, nAddr, aIdx, "anchorLoad");
}
}
}
PointerType *PTy = cast<PointerType>(Addr->getType());
PointerType *nPTy = PointerType::get(VTy, PTy->getAddressSpace());
Value *nAddr = irBuilder.CreateBitCast(Addr, nPTy);
// Merge "merge values" of each predicated load in loadedValues to use in a new load.
SmallVector<Value *, 16> mergeValues;
for (auto load : loadedValues) {
PredicatedLoadIntrinsic *PLI = ALoadInst::get(load)->getPredicatedLoadIntrinsic();
if (!PLI)
break; // not a predicated load, no merge values
mergeValues.push_back(PLI->getMergeValue());
}
Value *mergeVal = mergeValues.empty() ? nullptr : gatherCopy(eltBytes, nelts, mergeValues, anchorLoad);
if (mergeVal && mergeVal->getType()->isStructTy())
mergeVal = structToVec(&irBuilder, BB, mergeVal, eltBytes, nelts);
Instruction *finalLoad = leadLoad.CreateAlignedLoad(irBuilder, VTy, nAddr, mergeVal, leadLoad.isVolatile());
finalLoad->setDebugLoc(anchorLoad->getDebugLoc());
// Split loaded value and replace original loads with them.
scatterCopy(loadedValues, eltBytes, nelts, finalLoad, anchorLoad);
// Keep metadata
auto STII = std::find_if_not(bundle.LoadStores.begin(), bundle.LoadStores.end(), [](LdStInfo &LSI) {
auto md = LSI.getInst()->getMetadata(LLVMContext::MD_invariant_load);
return md != nullptr;
});
if (STII == bundle.LoadStores.end()) {
MDNode *md = anchorLoad->getMetadata(LLVMContext::MD_invariant_load);
IGC_ASSERT(md != nullptr);
finalLoad->setMetadata(LLVMContext::MD_invariant_load, md);
}
MDNode *nonTempMD = nullptr;
std::for_each(bundle.LoadStores.begin(), bundle.LoadStores.end(), [&nonTempMD](LdStInfo &LSI) {
if (auto md = LSI.getInst()->getMetadata("nontemporal"))
nonTempMD = MDNode::concatenate(md, nonTempMD);
});
if (nonTempMD) {
finalLoad->setMetadata("nontemporal", nonTempMD);
}
}
// Delete loads that have been combined.
eraseDeadInsts();
m_hasLoadCombined = (!m_bundles.empty());
m_bundles.clear();
}
void LdStCombine::eraseDeadInsts() {
RecursivelyDeleteDeadInstructions(m_toBeDeleted);
m_toBeDeleted.clear();
}
void BundleInfo::print(raw_ostream &O, int BundleID) const {
O << "\nBundle Info " << BundleID << "\n"
<< " Element bytes = " << bundle_eltBytes << " "
<< "num of elements = " << bundle_numElts << " "
<< "useD64 = " << (useD64 ? "true" : "false") << "\n\n";
for (const auto &II : LoadStores) {
const LdStInfo &LSI = II;
O << " (" << format_decimal(LSI.getByteOffset(), 3) << ") ";
O << *LSI.getInst() << "\n";
}
O << "\n";
}
void BundleInfo::dump() const { print(dbgs()); }
namespace IGC {
bool isLayoutStructType(const StructType *StTy) {
if (!StTy || StTy->isLiteral() || !StTy->hasName() || !StTy->isPacked())
return false;
StringRef stId = StTy->getName();
return (stId.startswith(getStructNameForSOALayout()) || stId.startswith(getStructNameForAOSLayout()));
}
bool isLayoutStructTypeAOS(const StructType *StTy) {
if (!StTy || StTy->isLiteral() || !StTy->hasName() || !StTy->isPacked())
return false;
StringRef stId = StTy->getName();
return stId.startswith(getStructNameForAOSLayout());
}
bool isLayoutStructTypeSOA(const StructType *StTy) { return isLayoutStructType(StTy) && !isLayoutStructTypeAOS(StTy); }
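// bitcastToUI64:
// Pack a constant (scalar, null pointer, undef, or an AOS layout struct of
// total size <= 64 bits) into a raw 64-bit immediate, placing each member at
// its bit offset within the struct layout.
// Illustrative sketch (hypothetical constant): for an AOS member
// <{ i16 0x1234, i8 0x56, i8 0x78 }> the result is 0x78561234 (i16 at bit 0,
// first i8 at bit 16, second i8 at bit 24).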
uint64_t bitcastToUI64(Constant *C, const DataLayout *DL) {
Type *ty = C->getType();
IGC_ASSERT(DL->getTypeStoreSizeInBits(ty) <= 64);
IGC_ASSERT(ty->isStructTy() || (ty->isSingleValueType() && !ty->isVectorTy()));
uint64_t imm = 0;
if (StructType *sTy = dyn_cast<StructType>(C->getType())) {
IGC_ASSERT(DL->getTypeStoreSizeInBits(sTy) <= 64);
IGC_ASSERT(isLayoutStructTypeAOS(sTy));
const StructLayout *SL = DL->getStructLayout(sTy);
int N = (int)sTy->getNumElements();
for (int i = 0; i < N; ++i) {
Constant *C_i = C->getAggregateElement(i);
if (isa<UndefValue>(C_i)) {
continue;
}
Type *ty_i = sTy->getElementType(i);
uint32_t offbits = (uint32_t)SL->getElementOffsetInBits(i);
if (auto iVTy = dyn_cast<IGCLLVM::FixedVectorType>(ty_i)) {
// C_I is vector
int32_t nelts = (int32_t)iVTy->getNumElements();
Type *eTy_i = ty_i->getScalarType();
IGC_ASSERT(eTy_i->isFloatingPointTy() || eTy_i->isIntegerTy());
uint32_t nbits = (uint32_t)DL->getTypeStoreSizeInBits(eTy_i);
for (int j = 0; j < nelts; ++j) {
Constant *c_ij = C_i->getAggregateElement(j);
uint64_t tImm = GetImmediateVal(c_ij);
tImm &= maxUIntN(nbits);
imm = imm | (tImm << (offbits + j * nbits));
}
} else {
// C_i is scalar of int, fp or null pointer
IGC_ASSERT(isa<ConstantInt>(C_i) || isa<ConstantFP>(C_i) || isa<ConstantPointerNull>(C_i));
uint32_t nbits = (uint32_t)DL->getTypeStoreSizeInBits(ty_i);
uint64_t tImm = GetImmediateVal(C_i);
tImm &= maxUIntN(nbits);
imm = imm | (tImm << offbits);
}
}
return imm;
}
if (isa<ConstantFP>(C) || isa<ConstantInt>(C)) {
return GetImmediateVal(C);
}
if (isa<UndefValue>(C) || isa<ConstantPointerNull>(C)) {
return 0;
}
IGC_ASSERT_MESSAGE(0, "unsupported Constant!");
return 0;
}
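// getStructMemberByteOffsetAndType_1:
// Given a struct type and a single-level member index, return that member's
// type and its byte offset within the struct (nested layout structs are
// handled by getStructMemberOffsetAndType_2 below).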
void getStructMemberByteOffsetAndType_1(const DataLayout *DL, StructType *StTy, const ArrayRef<unsigned> &Indices,
Type *&Ty, uint32_t &ByteOffset) {
IGC_ASSERT_MESSAGE(Indices.size() == 1, "ICE: nested struct not supported!");
const StructLayout *aSL = DL->getStructLayout(StTy);
uint32_t ix = Indices.front();
ByteOffset = (uint32_t)aSL->getElementOffset(ix);
Ty = StTy->getElementType(ix);
return;
};
void getStructMemberOffsetAndType_2(const DataLayout *DL, StructType *StTy, const ArrayRef<unsigned> &Indices,
Type *&Ty0, uint32_t &ByteOffset0, Type *&Ty1, uint32_t &ByteOffset1) {
uint32_t ix = Indices[0];
const StructLayout *SL0 = DL->getStructLayout(StTy);
ByteOffset0 = (uint32_t)SL0->getElementOffset(ix);
Ty0 = StTy->getElementType(ix);
ByteOffset1 = 0;
Ty1 = nullptr;
if (Indices.size() == 1) {
return;
}
IGC_ASSERT(isLayoutStructType(StTy));
IGC_ASSERT_MESSAGE(Indices.size() <= 2, "struct with nesting level > 2 not supported!");
IGC_ASSERT_MESSAGE((Ty0->isStructTy() && isLayoutStructTypeAOS(cast<StructType>(Ty0))),
"Only a special AOS layout struct is supported as a member");
uint32_t ix1 = Indices[1];
StructType *stTy0 = cast<StructType>(Ty0);
const StructLayout *SL1 = DL->getStructLayout(stTy0);
ByteOffset1 = (uint32_t)SL1->getElementOffset(ix1);
Ty1 = stTy0->getElementType(ix1);
return;
}
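// searchForDefinedMembers:
// Recursively walk a constant aggregate and collect the index paths of all
// non-undef leaf members into fieldsTBC; undef members are skipped.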
static void searchForDefinedMembers(const ConstantAggregate *S, const std::vector<unsigned> &currentIndices,
SmallVectorImpl<std::vector<unsigned>> &fieldsTBC) {
for (unsigned i = 0; i < S->getNumOperands(); i++) {
auto indices = currentIndices;
indices.push_back(i);
auto *E = S->getAggregateElement(i);
if (isa<UndefValue>(E))
continue;
if (auto *SE = dyn_cast<ConstantAggregate>(E)) {
searchForDefinedMembers(SE, indices, fieldsTBC);
} else {
fieldsTBC.push_back(indices);
}
}
}
void getAllDefinedMembers(const Value *IVI, SmallVectorImpl<std::vector<unsigned>> &fieldsTBC) {
IGC_ASSERT(IVI != nullptr);
const Value *V = IVI;
while (isa<InsertValueInst>(V)) {
const InsertValueInst *I = cast<const InsertValueInst>(V);
fieldsTBC.push_back(I->getIndices().vec());
V = I->getOperand(0);
}
// at the end we may have a constant struct like this:
// % 28 = insertvalue % __StructSOALayout_ < { i32 194816, i32 undef, i32 undef, <1 x float> undef } > , i32 % 17, 1
// we should traverse it and find the indices pointing to the constant values
if (auto *S = dyn_cast<ConstantAggregate>(V)) {
std::vector<unsigned> indices = {};
searchForDefinedMembers(S, indices, fieldsTBC);
}
// reverse the vector to get the ascending order of indices
std::reverse(fieldsTBC.begin(), fieldsTBC.end());
}
} // namespace IGC