Files
intel-graphics-compiler/IGC/Compiler/CISACodeGen/VectorPreProcess.cpp
Paige, Alexander 420b632df9 Update IGC code format
Update IGC code format
2025-07-20 06:20:11 +02:00

1805 lines
72 KiB
C++

/*========================== begin_copyright_notice ============================
Copyright (C) 2017 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "Compiler/CISACodeGen/VectorProcess.hpp"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/CISACodeGen/EmitVISAPass.hpp"
#include "Compiler/CodeGenPublic.h"
#include "Compiler/IGCPassSupport.h"
#include "common/LLVMWarningsPush.hpp"
#include <llvm/IR/DataLayout.h>
#include <llvm/IR/Instructions.h>
#include <llvmWrapper/IR/IRBuilder.h>
#include <llvm/IR/InstIterator.h>
#include <llvm/Support/MathExtras.h>
#include <llvm/Transforms/Utils/Local.h>
#include <llvmWrapper/IR/DerivedTypes.h>
#include <llvmWrapper/Support/Alignment.h>
#include <optional>
#include "common/LLVMWarningsPop.hpp"
#include "Probe/Assertion.h"
#include "common/debug/Debug.hpp"
#include <utility> // std::pair, std::make_pair
#include <sstream> // std::string, std::stringstream
#include <fstream> // std::ofstream
using namespace llvm;
using namespace IGC;
using IGCLLVM::FixedVectorType;
//
// Description of VectorPreProcess Pass
// The purpose is both to legalize vector types and to reduce register
// presure. Once this pass is done, there is no 3-element vector whose
// element size < 4 bytes, that is, no <3 x i8>, no <3 x i16>. (But
// we will have <3xi32> and <3xi64>.)
//
// 1. Split a vector load/stores with a large vector into ones with
// smaller vectors or scalars; and make sure that the sub-vectors
// are either multiple of DW, vector3, or their size is less than
// 4 bytes (see details in code). Vector3 will be specially
// handled later.
// For example,
// <16xi64> ---> four <4xi64>
// <15xi32> ---> <8xi32>, <7xi32>
// <13xi32> ---> <8xi32>, <5xi32>
// <31xi16> ---> <16xi16>, <12xi16>, <3xi16>
// <19xi16> ---> <16xi16>, <3xi16>
// <39xi8> ---> <32xi8>, <4xi8>, <3xi8>
// Note that splitting keeps the vector element's type without
// changing it.
//
// Note that as 6/2020,
// if size of vector element >= DW, the number of elements of the new vector
// should be power of 2 except for vector3. Thus, we should not see 5xi32,
// 7xi32, etc. This makes code emit easier.
//
// 2. Special processing of 3-element vectors
// If (vector element's size < 4 bytes)
// {
// 3-element vector load --> 2-element vector load + scalar load
// 3-element vector store --> 2-element vector store + scalar store.
// }
// Note that 3-element load could be optimized to 4-element load (check
// details in the code).
//
// for example,
// (1) %1 = load <3 x i8> *p
// converted into
// %pv = bitcast p to <2 x i8>* // %pv is type <2 x i8>*
// %ps = (i8*)p + 2; // %ps is type i8*
// %2 = load <2 x i8> *pv
// %3 = load i8 *ps
// original vector %1 == (%2, %3)
//
// (2) store <3 x i16> %1, <3 x i16> *p
// converted into
// %pv = bitcast p to <2 x i16>* // %pv is type <2 x i16>*
// %ps = (i16*)p + 2; // %ps is type i16*
// %new_v = (%1.x, %1.y)
// store <2 x i16> %new_v, <2 x i16> *pv
// store i16 %1.z i8 *ps
//
namespace {
// AbstractLoadInst and AbstractStoreInst abstract away the differences
// between ldraw, Load and PredicatedLoad and between storeraw, Store and PredicatedStore.
// Note on usage: The Value* passed as the ptr paramter to the Create method
// should be either the result of the getPointerOperand() method or the
// CreateConstScalarGEP() method. Do not attempt to do arithmetic
// (or pointer arithmetic) on these values.
class AbstractLoadInst {
Instruction *const m_inst;
const DataLayout &DL;
AbstractLoadInst(LoadInst *LI, const DataLayout &DL) : m_inst(LI), DL(DL) {}
AbstractLoadInst(LdRawIntrinsic *LdRI, const DataLayout &DL) : m_inst(LdRI), DL(DL) {}
AbstractLoadInst(PredicatedLoadIntrinsic *PLI, const DataLayout &DL) : m_inst(PLI), DL(DL) {}
LoadInst *getLoad() const { return cast<LoadInst>(m_inst); }
LdRawIntrinsic *getLdRaw() const { return cast<LdRawIntrinsic>(m_inst); }
PredicatedLoadIntrinsic *getPredicatedLoad() const { return cast<PredicatedLoadIntrinsic>(m_inst); }
public:
Instruction *getInst() const { return m_inst; }
alignment_t getAlignment() const {
if (isa<LoadInst>(m_inst))
return IGCLLVM::getAlignmentValue(getLoad());
if (isa<LdRawIntrinsic>(m_inst))
return getLdRaw()->getAlignment();
return getPredicatedLoad()->getAlignment();
}
void setAlignment(alignment_t alignment) {
if (isa<LoadInst>(m_inst)) {
getLoad()->setAlignment(IGCLLVM::getCorrectAlign(alignment));
} else if (isa<LdRawIntrinsic>(m_inst)) {
getLdRaw()->setAlignment(alignment);
} else {
getPredicatedLoad()->setAlignment(alignment);
}
}
Value *getPointerOperand() const {
if (isa<LoadInst>(m_inst))
return getLoad()->getPointerOperand();
if (isa<LdRawIntrinsic>(m_inst))
return getLdRaw()->getResourceValue();
return getPredicatedLoad()->getPointerOperand();
}
bool getIsVolatile() const {
if (isa<LoadInst>(m_inst))
return getLoad()->isVolatile();
if (isa<LdRawIntrinsic>(m_inst))
return getLdRaw()->isVolatile();
return getPredicatedLoad()->isVolatile();
}
unsigned getPointerAddressSpace() const { return getPointerOperand()->getType()->getPointerAddressSpace(); }
Value *getMergeValue() const {
if (isa<PredicatedLoadIntrinsic>(m_inst))
return getPredicatedLoad()->getMergeValue();
return nullptr;
}
Instruction *Create(Type *returnType, Value *mergeValue = nullptr) {
return Create(returnType, getPointerOperand(), getAlignment(), getIsVolatile(), mergeValue);
}
Instruction *Create(Type *returnType, Value *ptr, alignment_t alignment, bool isVolatile,
Value *mergeValue = nullptr) {
IGCLLVM::IRBuilder<> builder(m_inst);
if (isa<LoadInst>(m_inst)) {
Type *newPtrType = PointerType::get(returnType, ptr->getType()->getPointerAddressSpace());
ptr = builder.CreateBitCast(ptr, newPtrType);
LoadInst *newLI = builder.CreateAlignedLoad(returnType, ptr, IGCLLVM::getAlign(alignment), isVolatile);
if (MDNode *lscMetadata = m_inst->getMetadata("lsc.cache.ctrl")) {
newLI->setMetadata("lsc.cache.ctrl", lscMetadata);
}
return newLI;
}
if (isa<LdRawIntrinsic>(m_inst)) {
LdRawIntrinsic *ldraw = getLdRaw();
bool hasComputedOffset = ptr != ldraw->getResourceValue();
Value *offsetVal = hasComputedOffset ? ptr : ldraw->getOffsetValue();
ptr = ldraw->getResourceValue();
Type *types[2] = {returnType, ptr->getType()};
Value *args[4] = {ptr, offsetVal, builder.getInt32((uint32_t)alignment), builder.getInt1(isVolatile)};
Function *newLdRawFunction = GenISAIntrinsic::getDeclaration(ldraw->getModule(), ldraw->getIntrinsicID(), types);
return builder.CreateCall(newLdRawFunction, args);
}
IGC_ASSERT(isa<PredicatedLoadIntrinsic>(m_inst));
IGC_ASSERT(mergeValue);
IGC_ASSERT(mergeValue->getType() == returnType);
PredicatedLoadIntrinsic *PLI = getPredicatedLoad();
Type *newPtrType = PointerType::get(returnType, ptr->getType()->getPointerAddressSpace());
ptr = builder.CreateBitCast(ptr, newPtrType);
Type *types[3] = {returnType, ptr->getType(), returnType};
Function *predLoadFunc = GenISAIntrinsic::getDeclaration(m_inst->getModule(), PLI->getIntrinsicID(), types);
Value *args[4] = {ptr, builder.getInt64((uint64_t)alignment), PLI->getPredicate(), mergeValue};
Instruction *PredLoad = builder.CreateCall(predLoadFunc, args);
if (MDNode *lscMetadata = m_inst->getMetadata("lsc.cache.ctrl"))
PredLoad->setMetadata("lsc.cache.ctrl", lscMetadata);
return PredLoad;
}
// Emulates a GEP on a pointer of the scalar type of returnType.
Value *CreateConstScalarGEP(Type *returnType, Value *ptr, uint32_t offset) {
IGCLLVM::IRBuilder<> builder(m_inst);
if (isa<LoadInst>(m_inst) || isa<PredicatedLoadIntrinsic>(m_inst)) {
Type *ePtrType = PointerType::get(returnType->getScalarType(), ptr->getType()->getPointerAddressSpace());
ptr = builder.CreateBitCast(ptr, ePtrType);
return builder.CreateConstGEP1_32(returnType->getScalarType(), ptr, offset);
} else {
uint32_t sizeInBytes = int_cast<uint32_t>(DL.getTypeSizeInBits(returnType->getScalarType()) / 8);
Value *offsetInBytes = builder.getInt32(offset * sizeInBytes);
return builder.CreateAdd(offsetInBytes, getLdRaw()->getOffsetValue());
}
}
static std::optional<AbstractLoadInst> get(llvm::Value *value, const DataLayout &DL) {
if (LoadInst *LI = dyn_cast<LoadInst>(value)) {
return AbstractLoadInst{LI, DL};
} else if (LdRawIntrinsic *LdRI = dyn_cast<LdRawIntrinsic>(value)) {
return AbstractLoadInst{LdRI, DL};
} else if (PredicatedLoadIntrinsic *PLI = dyn_cast<PredicatedLoadIntrinsic>(value)) {
return AbstractLoadInst{PLI, DL};
} else {
return std::nullopt;
}
}
};
static bool isAbstractLoadInst(llvm::Value *value) {
return isa<LoadInst>(value) || isa<LdRawIntrinsic>(value) || isa<PredicatedLoadIntrinsic>(value);
}
class AbstractStoreInst {
Instruction *const m_inst;
const DataLayout &DL;
AbstractStoreInst(StoreInst *SI, const DataLayout &DL) : m_inst(SI), DL(DL) {}
AbstractStoreInst(StoreRawIntrinsic *SRI, const DataLayout &DL) : m_inst(SRI), DL(DL) {}
AbstractStoreInst(PredicatedStoreIntrinsic *PSI, const DataLayout &DL) : m_inst(PSI), DL(DL) {}
StoreInst *getStore() const { return cast<StoreInst>(m_inst); }
StoreRawIntrinsic *getStoreRaw() const { return cast<StoreRawIntrinsic>(m_inst); }
PredicatedStoreIntrinsic *getPredicatedStore() const { return cast<PredicatedStoreIntrinsic>(m_inst); }
public:
Instruction *getInst() const { return m_inst; }
alignment_t getAlignment() const {
if (isa<StoreInst>(m_inst))
return IGCLLVM::getAlignmentValue(getStore());
if (isa<StoreRawIntrinsic>(m_inst))
return getStoreRaw()->getAlignment();
return getPredicatedStore()->getAlignment();
}
void setAlignment(alignment_t alignment) {
if (isa<StoreInst>(m_inst)) {
getStore()->setAlignment(IGCLLVM::getCorrectAlign(alignment));
} else if (isa<PredicatedStoreIntrinsic>(m_inst)) {
getPredicatedStore()->setAlignment(alignment);
}
}
Value *getValueOperand() const {
if (isa<StoreInst>(m_inst))
return getStore()->getValueOperand();
if (isa<StoreRawIntrinsic>(m_inst))
return getStoreRaw()->getArgOperand(2);
return getPredicatedStore()->getValueOperand();
}
Value *getPointerOperand() const {
if (isa<StoreInst>(m_inst))
return getStore()->getPointerOperand();
if (isa<StoreRawIntrinsic>(m_inst))
return getStoreRaw()->getArgOperand(0);
return getPredicatedStore()->getPointerOperand();
}
bool getIsVolatile() const {
if (isa<StoreInst>(m_inst))
return getStore()->isVolatile();
if (isa<PredicatedLoadIntrinsic>(m_inst))
return getPredicatedStore()->isVolatile();
return false;
}
unsigned getPointerAddressSpace() const { return getPointerOperand()->getType()->getPointerAddressSpace(); }
Instruction *Create(Value *storedValue, Value *ptr, alignment_t alignment, bool isVolatile) {
IRBuilder<> builder(m_inst);
Type *newType = storedValue->getType();
if (isa<StoreInst>(m_inst)) {
Type *newPtrType = PointerType::get(newType, ptr->getType()->getPointerAddressSpace());
ptr = builder.CreateBitCast(ptr, newPtrType);
return alignment ? builder.CreateAlignedStore(storedValue, ptr, IGCLLVM::getAlign(alignment), isVolatile)
: builder.CreateStore(storedValue, ptr, isVolatile);
}
if (isa<StoreRawIntrinsic>(m_inst)) {
bool hasComputedOffset = ptr != getPointerOperand();
Value *offset = hasComputedOffset ? ptr : getStoreRaw()->getArgOperand(1);
ptr = getPointerOperand();
Type *types[2] = {ptr->getType(), newType};
Value *args[5] = {ptr, offset, storedValue, builder.getInt32((uint32_t)alignment), builder.getInt1(isVolatile)};
Function *newStoreRawFunction =
GenISAIntrinsic::getDeclaration(getStoreRaw()->getModule(), getStoreRaw()->getIntrinsicID(), types);
return builder.CreateCall(newStoreRawFunction, args);
}
Type *newPtrType = PointerType::get(newType, ptr->getType()->getPointerAddressSpace());
ptr = builder.CreateBitCast(ptr, newPtrType);
Type *types[2] = {ptr->getType(), newType};
Function *predStoreFunc = GenISAIntrinsic::getDeclaration(getPredicatedStore()->getModule(),
getPredicatedStore()->getIntrinsicID(), types);
Value *args[4] = {ptr, storedValue, builder.getInt64((uint64_t)alignment), getPredicatedStore()->getPredicate()};
return builder.CreateCall(predStoreFunc, args);
}
Instruction *Create(Value *storedValue) {
return Create(storedValue, getPointerOperand(), getAlignment(), getIsVolatile());
}
// Emulates a GEP on a pointer of the scalar type of storedType.
Value *CreateConstScalarGEP(Type *storedType, Value *ptr, uint32_t offset) {
IGCLLVM::IRBuilder<> builder(m_inst);
if (isa<StoreInst>(m_inst) || isa<PredicatedStoreIntrinsic>(m_inst)) {
Type *ePtrType = PointerType::get(storedType->getScalarType(), ptr->getType()->getPointerAddressSpace());
ptr = builder.CreateBitCast(ptr, ePtrType);
return builder.CreateConstGEP1_32(storedType->getScalarType(), ptr, offset);
} else {
uint32_t sizeInBytes = int_cast<uint32_t>(DL.getTypeSizeInBits(storedType->getScalarType()) / 8);
Value *offsetInBytes = builder.getInt32(offset * sizeInBytes);
return builder.CreateAdd(offsetInBytes, getStoreRaw()->getArgOperand(1));
}
}
static std::optional<AbstractStoreInst> get(llvm::Value *value, const DataLayout &DL) {
if (StoreInst *SI = dyn_cast<StoreInst>(value)) {
return AbstractStoreInst{SI, DL};
}
if (StoreRawIntrinsic *SRI = dyn_cast<StoreRawIntrinsic>(value)) {
return AbstractStoreInst{SRI, DL};
}
if (PredicatedStoreIntrinsic *PSI = dyn_cast<PredicatedStoreIntrinsic>(value)) {
return AbstractStoreInst{PSI, DL};
}
return std::nullopt;
}
};
static bool isAbstractStoreInst(llvm::Value *value) {
GenIntrinsicInst *II = dyn_cast<GenIntrinsicInst>(value);
return isa<StoreInst>(value) || (II && (II->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed ||
II->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore));
}
class VectorPreProcess : public FunctionPass {
public:
typedef SmallVector<Instruction *, 32> InstWorkVector;
typedef SmallVector<Value *, 16> ValVector;
// vector value -> (split size in bytes -> vector's component values)
typedef DenseMap<Value *, DenseMap<uint32_t, ValVector>> V2SMap;
enum class VPConst {
// If a vector's size is bigger than SPLIT_SIZE, split it into multiple
// of SPLIT_SIZE (plus smaller sub-vectors or scalar if any).
// With SPLIT_SIZE=32, we have the max vectors as below after this pass:
// <32 x i8>, 16xi16, 8xi32, or 4xi64!
SPLIT_SIZE = 32, // default, 32 bytes
LSC_D64_UNIFORM_SPLIT_SIZE = 512, // LSC transpose 64 x D64
LSC_D32_UNIFORM_SPLIT_SIZE = 256, // LSC transpose 64 x D32
RAW_SPLIT_SIZE = 16
};
static char ID; // Pass identification, replacement for typeid
VectorPreProcess() : FunctionPass(ID), m_DL(nullptr), m_C(nullptr), m_WorkList(), m_Temps(), m_CGCtx(nullptr) {
initializeVectorPreProcessPass(*PassRegistry::getPassRegistry());
}
StringRef getPassName() const override { return "VectorPreProcess"; }
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<CodeGenContextWrapper>();
AU.addRequired<MetaDataUtilsWrapper>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<PostDominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
}
private:
void getOrGenScalarValues(Function &F, Value *VecVal, ValVector &scalars, Instruction *&availBeforeInst);
void replaceAllVectorUsesWithScalars(Instruction *VI, ValVector &SVals);
// Return true if V is created by InsertElementInst with const index.
bool isValueCreatedOnlyByIEI(Value *V, InsertElementInst **IEInsts);
// Return true if V is only used by ExtractElement with const index.
bool isValueUsedOnlyByEEI(Value *V, ExtractElementInst **EEInsts);
// Split load/store that cannot be re-layout or is too big.
uint32_t getSplitByteSize(Instruction *I, WIAnalysisRunner &WI) const;
bool splitLoadStore(Instruction *Inst, V2SMap &vecToSubVec, WIAnalysisRunner &WI);
bool splitLoad(AbstractLoadInst &LI, V2SMap &vecToSubVec, WIAnalysisRunner &WI);
bool splitStore(AbstractStoreInst &SI, V2SMap &vecToSubVec, WIAnalysisRunner &WI);
bool splitVector3LoadStore(Instruction *Inst);
// Simplify load/store instructions if possible. Return itself if no
// simplification is performed.
Instruction *simplifyLoadStore(Instruction *LI);
void createSplitVectorTypes(Type *ETy, uint32_t NElts, uint32_t SplitSize,
SmallVector<std::pair<Type *, uint32_t>, 8> &SplitInfo);
// If predicated loads are split, we also need to split merge values
void createSplitMergeValues(Instruction *Inst, Value *OrigMergeVal,
const SmallVector<std::pair<Type *, uint32_t>, 8> &SplitInfo,
ValVector &NewMergeVals) const;
bool processScalarLoadStore(Function &F);
private:
const DataLayout *m_DL;
LLVMContext *m_C;
InstWorkVector m_WorkList;
ValVector m_Temps;
InstWorkVector m_Vector3List; // used for keep all 3-element vectors.
IGC::CodeGenContext *m_CGCtx;
};
} // namespace
// Register pass to igc-opt
#define PASS_FLAG "igc-vectorpreprocess"
#define PASS_DESCRIPTION "Split loads/stores of big (or 3-element) vectors into smaller ones."
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(VectorPreProcess, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_END(VectorPreProcess, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
char VectorPreProcess::ID = 0;
FunctionPass *IGC::createVectorPreProcessPass() { return new VectorPreProcess(); }
bool VectorPreProcess::isValueCreatedOnlyByIEI(Value *V, InsertElementInst **IEInsts) {
Value *ChainVal = V;
while (!isa<UndefValue>(ChainVal)) {
InsertElementInst *IEI = dyn_cast<InsertElementInst>(ChainVal);
if (!IEI || !isa<ConstantInt>(IEI->getOperand(2))) {
return false;
}
ConstantInt *CInt = cast<ConstantInt>(IEI->getOperand(2));
uint32_t idx = (uint32_t)CInt->getZExtValue();
// Make sure the last IEI will be recorded if an element is
// inserted multiple times.
if (IEInsts[idx] == nullptr) {
IEInsts[idx] = IEI;
}
ChainVal = IEI->getOperand(0);
}
return true;
}
bool VectorPreProcess::isValueUsedOnlyByEEI(Value *V, ExtractElementInst **EEInsts) {
for (Value::user_iterator UI = V->user_begin(), UE = V->user_end(); UI != UE; ++UI) {
ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(*UI);
if (!EEI || (EEI->getOperand(0) != V) || !isa<ConstantInt>(EEI->getOperand(1))) {
return false;
}
ConstantInt *CInt = cast<ConstantInt>(EEI->getOperand(1));
uint32_t idx = (uint32_t)CInt->getZExtValue();
// Quit if there are multiple extract from the same index.
if (EEInsts[idx] != nullptr) {
return false;
}
EEInsts[idx] = EEI;
}
return true;
}
// SVals[0:NumElements] has all scalar elements of vector VI. This function
// tries to replace all uses of VI with SVals[...] if possible, If not
// possible, re-generate the vector from SVals at the BB of VI.
//
// This function also erase VI.
void VectorPreProcess::replaceAllVectorUsesWithScalars(Instruction *VI, ValVector &SVals) {
SmallVector<Instruction *, 8> ToBeDeleted;
bool genVec = false;
for (Value::user_iterator UI = VI->user_begin(), UE = VI->user_end(); UI != UE; ++UI) {
ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(*UI);
if (!EEI) {
genVec = true;
continue;
}
ConstantInt *CI = dyn_cast<ConstantInt>(EEI->getOperand(1));
if (!CI) {
genVec = true;
continue;
}
uint32_t ix = (uint32_t)CI->getZExtValue();
EEI->replaceAllUsesWith(SVals[ix]);
ToBeDeleted.push_back(EEI);
}
if (genVec) {
Instruction *I;
if (!isa<PHINode>(VI)) {
I = VI;
} else {
I = VI->getParent()->getFirstNonPHI();
}
IRBuilder<> Builder(I);
IGCLLVM::FixedVectorType *VTy = cast<IGCLLVM::FixedVectorType>(VI->getType());
Value *newVec = UndefValue::get(VTy);
for (uint32_t i = 0, e = int_cast<uint32_t>(VTy->getNumElements()); i < e; ++i) {
newVec = Builder.CreateInsertElement(newVec, SVals[i], Builder.getInt32(i), "scalarize");
}
// Replace old instruction with new one
VI->replaceAllUsesWith(newVec);
}
for (uint32_t i = 0; i < ToBeDeleted.size(); ++i) {
ToBeDeleted[i]->eraseFromParent();
}
}
void VectorPreProcess::createSplitVectorTypes(Type *ETy, uint32_t NElts, uint32_t SplitSize,
SmallVector<std::pair<Type *, uint32_t>, 8> &SplitInfo) {
uint32_t ebytes = int_cast<uint32_t>(m_DL->getTypeSizeInBits(ETy) / 8);
// todo: generalize splitting for cases whose element size is bigger than splitsize!
if (IGC_IS_FLAG_ENABLED(EnableSplitUnalignedVector)) {
if (ebytes > SplitSize) {
IGC_ASSERT(SplitSize);
uint32_t M = NElts * ebytes / SplitSize;
Type *Ty = IntegerType::get(ETy->getContext(), SplitSize * 8);
SplitInfo.push_back(std::make_pair(Ty, M));
return;
}
}
// Both SplitSize and ebytes shall be a power of 2
IGC_ASSERT(ebytes);
IGC_ASSERT_MESSAGE((SplitSize % ebytes) == 0, "Internal Error: Wrong split size!");
uint32_t E = SplitSize / ebytes; // split size in elements
uint32_t N = NElts; // the number of elements to be split
IGC_ASSERT(E);
// 1. Make sure splitting it by SplitSize as required
uint32_t M = N / E; // the number of subvectors for split size E
if (M > 0) {
Type *Ty = (E == 1) ? ETy : FixedVectorType::get(ETy, E);
SplitInfo.push_back(std::make_pair(Ty, M));
}
N = N % E;
E = E / 2; // next split size
// 2. The remaining elts are splitted if not power of 2 until N <= 4.
while (N > 4) {
IGC_ASSERT(E);
M = N / E; // the number of subvectors for split size E
if (M > 0) {
SplitInfo.push_back(std::make_pair(FixedVectorType::get(ETy, E), M));
}
// The remaining elts are to be split for next iteration.
N = N % E;
E = E / 2; // next split size
}
// 3. A vector of 1|2|3|4 elements. No further splitting!
if (N > 0) {
Type *Ty = (N == 1) ? ETy : FixedVectorType::get(ETy, N);
SplitInfo.push_back(std::make_pair(Ty, 1));
}
}
void VectorPreProcess::createSplitMergeValues(Instruction *Inst, Value *OrigMergeVal,
const SmallVector<std::pair<Type *, uint32_t>, 8> &SplitInfo,
ValVector &NewMergeVals) const {
// if OrigMergeVal is a zeroinitializer, undef, or poison value, we just need to fill
// NewMergeVals with the same based on SplitInfo and return.
if (isa<ConstantAggregateZero>(OrigMergeVal) || isa<UndefValue>(OrigMergeVal) || isa<PoisonValue>(OrigMergeVal)) {
for (auto &SI : SplitInfo) {
Type *Ty = SI.first;
IGCLLVM::FixedVectorType *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
uint32_t N = SI.second;
for (uint32_t i = 0; i < N; ++i) {
Value *NewMergeVal = nullptr;
if (isa<ConstantAggregateZero>(OrigMergeVal)) {
if (VTy)
NewMergeVal = ConstantAggregateZero::get(VTy);
else
NewMergeVal = Constant::getNullValue(Ty);
} else if (isa<PoisonValue>(OrigMergeVal)) {
NewMergeVal = PoisonValue::get(SI.first);
} else {
NewMergeVal = UndefValue::get(SI.first);
}
NewMergeVals.push_back(NewMergeVal);
}
}
return;
}
IRBuilder<> Builder(Inst);
// Case when we split vector merge value into subvectors. Element type is the same.
// Just one big vector is being split into subvectors.
if (IGCLLVM::FixedVectorType *OrigVTy = dyn_cast<IGCLLVM::FixedVectorType>(OrigMergeVal->getType())) {
unsigned OrigVTyNEl = OrigVTy->getNumElements();
uint32_t idx = 0; // index counting elements of the the original vector merge value
// Split the merge value into subvectors
for (auto &SI : SplitInfo) {
Type *Ty = SI.first;
IGCLLVM::FixedVectorType *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
uint32_t N = SI.second;
for (uint32_t i = 0; i < N; ++i) {
Value *NewMergeVal = UndefValue::get(Ty);
if (VTy) {
for (uint32_t j = 0, e = int_cast<uint32_t>(VTy->getNumElements()); j < e; ++j) {
Value *Elt = (idx < OrigVTyNEl) ? Builder.CreateExtractElement(OrigMergeVal, Builder.getInt32(idx++))
: Constant::getNullValue(VTy->getElementType());
NewMergeVal = Builder.CreateInsertElement(NewMergeVal, Elt, Builder.getInt32(j));
}
} else {
NewMergeVal = Builder.CreateExtractElement(OrigMergeVal, Builder.getInt32(idx++));
}
NewMergeVals.push_back(NewMergeVal);
}
}
return;
}
// Case when we change scalar value into vector with smaller element type.
IGC_ASSERT_MESSAGE(SplitInfo.size() == 1, "Unexpected split info!");
IGC_ASSERT_MESSAGE(SplitInfo[0].second == 1, "Unexpected split info!");
Value *NewMergeVal = Builder.CreateBitCast(OrigMergeVal, SplitInfo[0].first);
NewMergeVals.push_back(NewMergeVal);
}
uint32_t VectorPreProcess::getSplitByteSize(Instruction *I, WIAnalysisRunner &WI) const {
uint32_t bytes = 0;
std::optional<AbstractLoadInst> ALI = AbstractLoadInst::get(I, *m_DL);
std::optional<AbstractStoreInst> ASI = AbstractStoreInst::get(I, *m_DL);
if (isa<LoadInst>(I) || isa<PredicatedLoadIntrinsic>(I)) {
IGC_ASSERT(ALI.has_value());
bytes = (uint32_t)VPConst::SPLIT_SIZE;
if (WI.isUniform(ALI->getPointerOperand()) &&
(m_CGCtx->platform.LSCEnabled() || IGC_GET_FLAG_VALUE(UniformMemOpt4OW))) {
if (ALI->getAlignment() >= 8)
bytes = (uint32_t)VPConst::LSC_D64_UNIFORM_SPLIT_SIZE;
else if (ALI->getAlignment() >= 4)
bytes = (uint32_t)VPConst::LSC_D32_UNIFORM_SPLIT_SIZE;
}
} else if (isa<StoreInst>(I) || isa<PredicatedStoreIntrinsic>(I)) {
IGC_ASSERT(ASI.has_value());
bytes = (uint32_t)VPConst::SPLIT_SIZE;
Value *Addr = ASI->getPointerOperand();
Value *Data = ASI->getValueOperand();
if (m_CGCtx->platform.LSCEnabled() && WI.isUniform(Addr) && WI.isUniform(Data)) {
if (ASI->getAlignment() >= 8)
bytes = (uint32_t)VPConst::LSC_D64_UNIFORM_SPLIT_SIZE;
else if (ASI->getAlignment() >= 4)
bytes = (uint32_t)VPConst::LSC_D32_UNIFORM_SPLIT_SIZE;
}
} else if (isa<LdRawIntrinsic>(I) || isa<StoreRawIntrinsic>(I)) {
uint32_t alignment =
isa<LdRawIntrinsic>(I) ? cast<LdRawIntrinsic>(I)->getAlignment() : cast<StoreRawIntrinsic>(I)->getAlignment();
Value *bufferAddr = isa<LdRawIntrinsic>(I) ? cast<LdRawIntrinsic>(I)->getResourceValue()
: cast<StoreRawIntrinsic>(I)->getResourceValue();
Value *offset = isa<LdRawIntrinsic>(I) ? cast<LdRawIntrinsic>(I)->getOffsetValue()
: cast<StoreRawIntrinsic>(I)->getOffsetValue();
Value *data = isa<LdRawIntrinsic>(I) ? nullptr : cast<StoreRawIntrinsic>(I)->getStoreValue();
bytes = (uint32_t)VPConst::RAW_SPLIT_SIZE;
if (EmitPass::shouldGenerateLSCQuery(*m_CGCtx, I) == Tristate::True) {
if (WI.isUniform(bufferAddr) && WI.isUniform(offset) && (data == nullptr || WI.isUniform(data))) {
if (alignment >= 8) {
bytes = (uint32_t)VPConst::LSC_D64_UNIFORM_SPLIT_SIZE;
} else if (alignment >= 4) {
bytes = (uint32_t)VPConst::LSC_D32_UNIFORM_SPLIT_SIZE;
}
} else {
bytes = (uint32_t)VPConst::SPLIT_SIZE;
}
} else {
Type *ValueTy = nullptr;
if (StoreRawIntrinsic *SRI = dyn_cast<StoreRawIntrinsic>(I)) {
ValueTy = SRI->getStoreValue()->getType();
} else {
ValueTy = I->getType();
}
IGCLLVM::FixedVectorType *vecType = dyn_cast_or_null<IGCLLVM::FixedVectorType>(ValueTy);
if (vecType && m_DL->getTypeSizeInBits(vecType->getScalarType()) == 64) {
bytes = 8; // use QW load/store
}
}
} else {
bytes = (uint32_t)VPConst::SPLIT_SIZE;
}
if ((isa<LoadInst>(I) || isa<StoreInst>(I) || isa<PredicatedLoadIntrinsic>(I) || isa<PredicatedStoreIntrinsic>(I)) &&
WI.isUniform(I)) {
auto Alignment = ALI.has_value() ? ALI->getAlignment() : ASI->getAlignment();
if (Alignment >= 16) {
Type *ETy = ALI.has_value() ? cast<VectorType>(I->getType())->getElementType()
: cast<VectorType>(ASI->getValueOperand()->getType())->getElementType();
Value *Ptr = ALI.has_value() ? ALI->getPointerOperand() : ASI->getPointerOperand();
bool SLM = Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_LOCAL;
uint32_t ebytes = int_cast<uint32_t>(m_DL->getTypeSizeInBits(ETy) / 8);
// Limit to DW and QW element types to avoid generating vectors that
// are too large (ideally, should be <= 32 elements currently).
if (ebytes == 4 || ebytes == 8) {
bytes = std::max(bytes, m_CGCtx->platform.getMaxBlockMsgSize(SLM));
}
}
}
return bytes;
}
bool VectorPreProcess::splitStore(AbstractStoreInst &ASI, V2SMap &vecToSubVec, WIAnalysisRunner &WI) {
Instruction *SI = ASI.getInst();
Value *StoredVal = ASI.getValueOperand();
IGCLLVM::FixedVectorType *VTy = cast<IGCLLVM::FixedVectorType>(StoredVal->getType());
Type *ETy = VTy->getElementType();
uint32_t nelts = int_cast<uint32_t>(VTy->getNumElements());
// splitInfo: Keep track of all pairs of (sub-vec type, #sub-vec).
SmallVector<std::pair<Type *, uint32_t>, 8> splitInfo;
bool isStoreInst = isa<StoreInst>(SI) || isa<PredicatedStoreIntrinsic>(SI);
uint32_t splitSize = getSplitByteSize(SI, WI);
if (IGC_IS_FLAG_ENABLED(EnableSplitUnalignedVector)) {
// byte and word-aligned stores can only store a dword at a time.
auto alignment = ASI.getAlignment();
if (isStoreInst && alignment < 4) {
alignment_t newAlign = (alignment_t)IGCLLVM::getAlignmentValue(getKnownAlignment(ASI.getPointerOperand(), *m_DL));
if (newAlign > alignment) {
// For the same reason as Load, use DW-aligned for OCL stateful.
if (newAlign > 4 && isStatefulAddrSpace(ASI.getPointerAddressSpace())) {
newAlign = 4;
}
ASI.setAlignment(newAlign);
}
}
bool needsDWordSplit =
(!isStoreInst || m_CGCtx->m_DriverInfo.splitUnalignedVectors() || !WI.isUniform(ASI.getInst())) &&
ASI.getAlignment() < 4;
if (needsDWordSplit) {
splitSize = 4;
}
}
createSplitVectorTypes(ETy, nelts, splitSize, splitInfo);
// return if no split
uint32_t len = splitInfo.size();
if (len == 1 && splitInfo[0].second == 1) {
return false;
}
// Create a new value in the map for store
ValVector &svals = vecToSubVec[SI][splitSize];
if (svals.size() == 0) {
// Need to create splitted values.
Instruction *insertBeforeInst = nullptr;
ValVector scalars(nelts, nullptr);
getOrGenScalarValues(*SI->getParent()->getParent(), StoredVal, scalars, insertBeforeInst);
insertBeforeInst = insertBeforeInst ? insertBeforeInst : SI;
IRBuilder<> aBuilder(insertBeforeInst);
Type *Ty1 = splitInfo[0].first;
if (IGC_IS_FLAG_ENABLED(EnableSplitUnalignedVector)) {
if (m_DL->getTypeSizeInBits(ETy) > m_DL->getTypeSizeInBits(Ty1->getScalarType())) {
std::vector<Value *> splitScalars;
IGC_ASSERT(m_DL->getTypeSizeInBits(Ty1->getScalarType()));
const uint32_t vectorSize =
(unsigned int)m_DL->getTypeSizeInBits(ETy) / (unsigned int)m_DL->getTypeSizeInBits(Ty1->getScalarType());
Type *splitType = FixedVectorType::get(Ty1, vectorSize);
for (uint32_t i = 0; i < nelts; i++) {
Value *splitInst = aBuilder.CreateBitCast(scalars[i], splitType);
for (uint32_t j = 0; j < vectorSize; j++) {
splitScalars.push_back(aBuilder.CreateExtractElement(splitInst, j));
}
}
scalars.resize(splitScalars.size());
for (uint32_t i = 0; i < splitScalars.size(); i++) {
scalars[i] = splitScalars[i];
}
}
}
// Now generate svals
for (uint32_t i = 0, Idx = 0; i < len; ++i) {
Type *Ty1 = splitInfo[i].first;
uint32_t len1 = splitInfo[i].second;
IGCLLVM::FixedVectorType *VTy1 = dyn_cast<IGCLLVM::FixedVectorType>(Ty1);
for (uint32_t j = 0; j < len1; ++j) {
Value *subVec;
if (!VTy1) {
subVec = scalars[Idx];
++Idx;
} else {
subVec = UndefValue::get(Ty1);
uint32_t n1 = int_cast<uint32_t>(VTy1->getNumElements());
for (uint32_t k = 0; k < n1; ++k) {
subVec = aBuilder.CreateInsertElement(subVec, scalars[Idx], aBuilder.getInt32(k));
++Idx;
}
}
svals.push_back(subVec);
}
}
}
Value *Addr = ASI.getPointerOperand();
auto Align = ASI.getAlignment();
bool IsVolatile = ASI.getIsVolatile();
uint32_t eOffset = 0;
uint32_t EBytes = int_cast<unsigned int>(m_DL->getTypeAllocSize(ETy));
for (uint32_t i = 0, subIdx = 0; i < len; ++i) {
Type *Ty1 = splitInfo[i].first;
uint32_t len1 = splitInfo[i].second;
IGCLLVM::FixedVectorType *VTy1 = dyn_cast<IGCLLVM::FixedVectorType>(Ty1);
for (uint32_t j = 0; j < len1; ++j) {
alignment_t vAlign = (alignment_t)MinAlign(Align, (alignment_t)eOffset * EBytes);
Value *offsetAddr = ASI.CreateConstScalarGEP(svals[subIdx]->getType(), Addr, eOffset);
Instruction *newST = ASI.Create(svals[subIdx], offsetAddr, vAlign, IsVolatile);
eOffset += (VTy1 ? int_cast<uint32_t>(VTy1->getNumElements()) : 1);
++subIdx;
// If this is a new 3-element vector, add it into m_Vector3List
if (VTy1 && VTy1->getNumElements() == 3) {
m_Vector3List.push_back(newST);
}
}
}
// Stores don't require post processing, so remove it as soon as we finish splitting
vecToSubVec.erase(SI);
SI->eraseFromParent();
// Since Load is processed later, stop optimizing if inst is Load.
Instruction *inst = dyn_cast<Instruction>(StoredVal);
bool keepLI = inst && isAbstractLoadInst(inst) && (vecToSubVec.find(inst) != vecToSubVec.end());
while (inst && !keepLI && inst->use_empty()) {
Instruction *next = nullptr;
if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(inst)) {
next = dyn_cast<Instruction>(IEI->getOperand(0));
}
inst->eraseFromParent();
inst = next;
keepLI = inst && isAbstractLoadInst(inst) && (vecToSubVec.find(inst) != vecToSubVec.end());
}
return true;
}
bool VectorPreProcess::splitLoad(AbstractLoadInst &ALI, V2SMap &vecToSubVec, WIAnalysisRunner &WI) {
Instruction *LI = ALI.getInst();
bool isLdRaw = isa<LdRawIntrinsic>(LI);
bool isPredLd = isa<PredicatedLoadIntrinsic>(LI);
IGCLLVM::FixedVectorType *VTy = cast<IGCLLVM::FixedVectorType>(LI->getType());
Type *ETy = VTy->getElementType();
uint32_t nelts = int_cast<uint32_t>(VTy->getNumElements());
// Split a vector type into multiple sub-types:
// 'len0' number of sub-vectors of type 'vecTy0'
// 'len1' number of sub-vectors of type 'vecTy1'
// ...
// SplitInfo : all pairs, each of which is (sub-vector's type, #sub-vectors).
SmallVector<std::pair<Type *, uint32_t>, 8> splitInfo;
uint32_t splitSize = getSplitByteSize(LI, WI);
if (IGC_IS_FLAG_ENABLED(EnableSplitUnalignedVector)) {
// byte and word-aligned loads can only load a dword at a time.
auto alignment = ALI.getAlignment();
if (!isLdRaw && alignment < 4) {
alignment_t newAlign = (alignment_t)IGCLLVM::getAlignmentValue(getKnownAlignment(ALI.getPointerOperand(), *m_DL));
if (newAlign > alignment) {
// For OCL stateful, the base can be as little as DW-aligned. To be safe,
// need to use DW-aligned. For example,
// % 0 = add i32 0, 16
// % 4 = inttoptr i32 % 0 to <8 x i16> addrspace(131073) *
// %5 = load <8 x i16>, <8 x i16> addrspace(131073) * %4, align 2
// newAlign from getKnownAlignment() is 16. But we can only set align to 4 as
// the base of this stateful could be just DW-aligned.
if (newAlign > 4 && isStatefulAddrSpace(ALI.getPointerAddressSpace())) {
newAlign = 4;
}
ALI.setAlignment(newAlign);
}
}
if ((isLdRaw || !WI.isUniform(ALI.getInst())) && ALI.getAlignment() < 4)
splitSize = 4;
}
createSplitVectorTypes(ETy, nelts, splitSize, splitInfo);
// return if no split
uint32_t len = splitInfo.size();
if (len == 1 && splitInfo[0].second == 1) {
return false;
}
ValVector splitMergeValues;
if (isPredLd)
createSplitMergeValues(LI, cast<PredicatedLoadIntrinsic>(LI)->getMergeValue(), splitInfo, splitMergeValues);
Value *Addr = ALI.getPointerOperand();
auto Align = ALI.getAlignment();
bool IsVolatile = ALI.getIsVolatile();
uint32_t eOffset = 0;
uint32_t EBytes = int_cast<unsigned int>(m_DL->getTypeAllocSize(ETy));
uint32_t mergeValueIdx = 0;
// Create a map entry for LI
ValVector &svals = vecToSubVec[LI][splitSize];
for (uint32_t i = 0; i < len; ++i) {
Type *Ty1 = splitInfo[i].first;
uint32_t len1 = splitInfo[i].second;
IGCLLVM::FixedVectorType *VTy1 = dyn_cast<IGCLLVM::FixedVectorType>(Ty1);
for (uint32_t j = 0; j < len1; ++j) {
alignment_t vAlign = (alignment_t)MinAlign(Align, (alignment_t)eOffset * EBytes);
Value *offsetAddr = ALI.CreateConstScalarGEP(Ty1, Addr, eOffset);
Value *MergeV = isPredLd ? splitMergeValues[mergeValueIdx++] : nullptr;
Instruction *I = ALI.Create(Ty1, offsetAddr, vAlign, IsVolatile, MergeV);
eOffset += (VTy1 ? int_cast<uint32_t>(VTy1->getNumElements()) : 1);
svals.push_back(I);
// If this is a new 3-element vector, add it into m_Vector3List
if (VTy1 && VTy1->getNumElements() == 3) {
m_Vector3List.push_back(I);
}
}
}
if (IGC_IS_FLAG_ENABLED(EnableSplitUnalignedVector)) {
if (m_DL->getTypeSizeInBits(svals[0]->getType()) < m_DL->getTypeSizeInBits(ETy)) {
const unsigned int denominator = (unsigned int)m_DL->getTypeSizeInBits(svals[0]->getType());
IGC_ASSERT(0 < denominator);
const uint32_t scalarsPerElement = (unsigned int)m_DL->getTypeSizeInBits(ETy) / denominator;
IGC_ASSERT(1 < scalarsPerElement);
IGC_ASSERT((svals.size() % scalarsPerElement) == 0);
ValVector mergedScalars;
IRBuilder<> builder(LI->getParent());
Instruction *nextInst = LI->getNextNode();
if (nextInst) {
builder.SetInsertPoint(nextInst);
}
Value *undef = UndefValue::get(FixedVectorType::get(svals[0]->getType(), scalarsPerElement));
for (uint32_t i = 0; i < svals.size() / scalarsPerElement; i++) {
Value *newElement = undef;
for (uint32_t j = 0; j < scalarsPerElement; j++) {
newElement = builder.CreateInsertElement(newElement, svals[i * scalarsPerElement + j], j);
}
mergedScalars.push_back(builder.CreateBitCast(newElement, ETy));
}
svals.clear();
svals.append(mergedScalars.begin(), mergedScalars.end());
}
}
// Put LI in m_Temps for post-processing.
//
// LI may be used only in store. If so, no need to re-generate the original
// vector as load and store will use the same set of sub-vectors. So, we delay
// generating the original vector until all stores are processed. Doing so,
// we re-generate the original vector only if it is necessary and thus avoid
// unnecesary insert/extract instructions.
m_Temps.push_back(LI);
return true;
}
bool VectorPreProcess::splitLoadStore(Instruction *Inst, V2SMap &vecToSubVec, WIAnalysisRunner &WI) {
std::optional<AbstractLoadInst> ALI = AbstractLoadInst::get(Inst, *m_DL);
std::optional<AbstractStoreInst> ASI = AbstractStoreInst::get(Inst, *m_DL);
IGC_ASSERT_MESSAGE((ALI || ASI), "Inst should be either load or store");
Type *Ty = ALI ? ALI->getInst()->getType() : ASI->getValueOperand()->getType();
IGCLLVM::FixedVectorType *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
if (!VTy) {
return false;
}
if (VTy->getNumElements() == 3) {
// Handle 3-element vector later.
m_Vector3List.push_back(Inst);
return false;
}
Value *V = ALI ? ALI->getInst() : ASI->getInst();
auto InMap = [&vecToSubVec](Value *V) { return vecToSubVec.find(V) != vecToSubVec.end(); };
// Only LI could be processed already.
bool processed = ALI && InMap(V);
if (processed) {
return false;
}
// Do splitting
// If it is a store and its stored value is from a load that
// has not been splitted yet, then splitting the load first
// so that the stored value will be directly from loaded values
// without adding insert/extract instructions.
std::optional<AbstractLoadInst> aALI =
ASI && !InMap(ASI->getValueOperand()) ? AbstractLoadInst::get(ASI->getValueOperand(), *m_DL) : std::move(ALI);
if (aALI) {
auto aALIValue = aALI.value();
splitLoad(aALIValue, vecToSubVec, WI);
}
if (ASI) {
auto ASIValue = ASI.value();
splitStore(ASIValue, vecToSubVec, WI);
}
return true;
}
// For a vector3 whose element size < 4 bytes, will split them into one whose
// size is multiple of DW and one whose size is less than DW; If the size is
// less than DW, make sure it is either 1 Byte or 2 bytes. After this, for
// vector size < 4, it must be either 1 byte or 2 bytes, never 3 bytes.
// This function also splits vector3s with an element size of 8 bytes if
// ldraw or storeraw is being used since neither of those messages support
// payloads larger than 4 DW.
bool VectorPreProcess::splitVector3LoadStore(Instruction *Inst) {
std::optional<AbstractLoadInst> optionalALI = AbstractLoadInst::get(Inst, *m_DL);
AbstractLoadInst *ALI = optionalALI.has_value() ? &optionalALI.value() : nullptr;
std::optional<AbstractStoreInst> optionalASI = AbstractStoreInst::get(Inst, *m_DL);
AbstractStoreInst *ASI = optionalASI.has_value() ? &optionalASI.value() : nullptr;
std::optional<int> a;
IGC_ASSERT_MESSAGE((optionalALI || optionalASI), "Inst should be either load or store");
Type *Ty = ALI ? ALI->getInst()->getType() : ASI->getValueOperand()->getType();
IGCLLVM::FixedVectorType *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
IGC_ASSERT_MESSAGE(nullptr != VTy, "Inst should be a 3-element vector load/store!");
IGC_ASSERT_MESSAGE(VTy->getNumElements() == 3, "Inst should be a 3-element vector load/store!");
Type *eTy = VTy->getElementType();
uint32_t etyBytes = int_cast<unsigned int>(m_DL->getTypeAllocSize(eTy));
// total size of vector in bytes;
// uint32_t sz = VTy->getNumElements() * etyBytes;
GenIntrinsicInst *II = dyn_cast<GenIntrinsicInst>(Inst);
bool isStoreRaw = II && (II->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
II->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed);
bool isPredLoad = isa<PredicatedLoadIntrinsic>(Inst);
if (etyBytes == 1 || etyBytes == 2 || (etyBytes == 8 && (isa<LdRawIntrinsic>(Inst) || isStoreRaw))) {
IRBuilder<> Builder(Inst);
if (optionalALI) {
Value *Elt0 = NULL;
Value *Elt1 = NULL;
Value *Elt2 = NULL;
bool UseLegacyLdRawMessage =
isa<LdRawIntrinsic>(Inst) && EmitPass::shouldGenerateLSCQuery(*m_CGCtx, Inst) != Tristate::True;
// If alignment is the same as 4-element vector's, it's likely safe
// to make it 4-element load. (always safe ?)
if (ALI->getAlignment() >= 4 * etyBytes &&
// Legacy ldraw message doesn't support 32-byte payloads
!(UseLegacyLdRawMessage && etyBytes == 8)) {
// Make it 4-element load
Type *newVTy = FixedVectorType::get(eTy, 4);
ValVector splitMergeValues;
if (isPredLoad)
createSplitMergeValues(Inst, cast<PredicatedLoadIntrinsic>(Inst)->getMergeValue(), {{newVTy, 1}},
splitMergeValues);
Value *V = ALI->Create(newVTy, isPredLoad ? splitMergeValues[0] : nullptr);
Elt0 = Builder.CreateExtractElement(V, Builder.getInt32(0), "elt0");
Elt1 = Builder.CreateExtractElement(V, Builder.getInt32(1), "elt1");
Elt2 = Builder.CreateExtractElement(V, Builder.getInt32(2), "elt2");
} else {
// One 2-element vector load + one scalar load
Type *newVTy = FixedVectorType::get(eTy, 2);
Value *offsetAddr = ALI->CreateConstScalarGEP(eTy, ALI->getPointerOperand(), 2);
ValVector splitMergeValues;
if (isPredLoad)
createSplitMergeValues(Inst, cast<PredicatedLoadIntrinsic>(Inst)->getMergeValue(), {{newVTy, 1}, {eTy, 1}},
splitMergeValues);
Value *V2 = ALI->Create(newVTy, isPredLoad ? splitMergeValues[0] : nullptr);
Elt0 = Builder.CreateExtractElement(V2, Builder.getInt32(0), "elt0");
Elt1 = Builder.CreateExtractElement(V2, Builder.getInt32(1), "elt1");
uint32_t newAlign = (uint32_t)MinAlign(ALI->getAlignment(), 2 * etyBytes);
Elt2 = ALI->Create(eTy, offsetAddr, newAlign, ALI->getIsVolatile(), isPredLoad ? splitMergeValues[1] : nullptr);
}
// A little optimization here
ExtractElementInst *EEInsts[3];
for (int i = 0; i < 3; ++i) {
EEInsts[i] = nullptr;
}
if (isValueUsedOnlyByEEI(ALI->getInst(), EEInsts)) {
if (EEInsts[0] != nullptr) {
EEInsts[0]->replaceAllUsesWith(Elt0);
EEInsts[0]->eraseFromParent();
}
if (EEInsts[1] != nullptr) {
EEInsts[1]->replaceAllUsesWith(Elt1);
EEInsts[1]->eraseFromParent();
}
if (EEInsts[2] != nullptr) {
EEInsts[2]->replaceAllUsesWith(Elt2);
EEInsts[2]->eraseFromParent();
}
} else {
Value *V = Builder.CreateInsertElement(UndefValue::get(VTy), Elt0, Builder.getInt32(0));
V = Builder.CreateInsertElement(V, Elt1, Builder.getInt32(1));
V = Builder.CreateInsertElement(V, Elt2, Builder.getInt32(2));
ALI->getInst()->replaceAllUsesWith(V);
}
ALI->getInst()->eraseFromParent();
} else {
Value *Ptr = ASI->getPointerOperand();
// Split 3-element into 2-element + 1 scalar
Type *newVTy = FixedVectorType::get(eTy, 2);
Value *StoredVal = ASI->getValueOperand();
Value *offsetAddr = ASI->CreateConstScalarGEP(StoredVal->getType(), Ptr, 2);
InsertElementInst *IEInsts[3];
for (int i = 0; i < 3; ++i) {
IEInsts[i] = nullptr;
}
// vec3 = vec2 + scalar, newAlign is an alignment for scalar store.
uint32_t newAlign = (uint32_t)MinAlign(ASI->getAlignment(), 2 * etyBytes);
Value *UDVal = UndefValue::get(eTy);
if (isValueCreatedOnlyByIEI(ASI->getInst(), IEInsts)) {
// This case should be most frequent, and want
// to generate a better code by removing dead
// InsertElementInst.
// Be ware of partial vector store.
Value *V = UndefValue::get(newVTy);
V = Builder.CreateInsertElement(V, (IEInsts[0] != nullptr) ? IEInsts[0]->getOperand(1) : UDVal,
Builder.getInt32(0));
V = Builder.CreateInsertElement(V, (IEInsts[1] != nullptr) ? IEInsts[1]->getOperand(1) : UDVal,
Builder.getInt32(1));
V = ASI->Create(V);
// If IEInsts[2] is undefined, skip scalar store.
if (IEInsts[2] != nullptr) {
(void)ASI->Create(IEInsts[2]->getOperand(1), offsetAddr, newAlign, ASI->getIsVolatile());
}
ASI->getInst()->eraseFromParent();
// Remove all InsertElementInst if possible
bool change = true;
while (change) {
change = false;
for (int i = 0; i < 3; ++i) {
if (IEInsts[i] && IEInsts[i]->use_empty()) {
IEInsts[i]->eraseFromParent();
IEInsts[i] = nullptr;
change = true;
}
}
}
} else {
// Get a 2-element vector and a scalar from the
// 3-element vector and store them respectively.
// Shuffle isn't handled in Emit, use extract/insert instead
Value *Elt0 = Builder.CreateExtractElement(StoredVal, Builder.getInt32(0), "Elt0");
Value *Elt1 = Builder.CreateExtractElement(StoredVal, Builder.getInt32(1), "Elt1");
Value *Elt2 = Builder.CreateExtractElement(StoredVal, Builder.getInt32(2), "Elt2");
Value *V = Builder.CreateInsertElement(UndefValue::get(newVTy), Elt0, Builder.getInt32(0));
V = Builder.CreateInsertElement(V, Elt1, Builder.getInt32(1));
ASI->Create(V);
ASI->Create(Elt2, offsetAddr, newAlign, ASI->getIsVolatile());
ASI->getInst()->eraseFromParent();
}
}
return true;
}
return false;
}
// availBeforeInst:
// Indicate that all scalar values of VecVal are available right before
// instruction 'availBeforeInst'. If availBeforeInst is null, it means
// all scalar values are constants.
void VectorPreProcess::getOrGenScalarValues(Function &F, Value *VecVal, ValVector &scalars,
Instruction *&availBeforeInst) {
availBeforeInst = nullptr;
IGCLLVM::FixedVectorType *VTy = cast<IGCLLVM::FixedVectorType>(VecVal->getType());
if (!VTy) {
scalars[0] = VecVal;
return;
}
uint32_t nelts = int_cast<uint32_t>(VTy->getNumElements());
Type *ETy = VTy->getElementType();
if (isa<UndefValue>(VecVal)) {
Value *udv = UndefValue::get(ETy);
for (uint32_t i = 0; i < nelts; ++i) {
scalars[i] = udv;
}
} else if (ConstantVector *CV = dyn_cast<ConstantVector>(VecVal)) {
for (uint32_t i = 0; i < nelts; ++i) {
scalars[i] = CV->getOperand(i);
}
} else if (ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(VecVal)) {
for (uint32_t i = 0; i < nelts; ++i) {
scalars[i] = CDV->getElementAsConstant(i);
}
} else if (ConstantAggregateZero *CAZ = dyn_cast<ConstantAggregateZero>(VecVal)) {
for (uint32_t i = 0; i < nelts; ++i) {
scalars[i] = CAZ->getSequentialElement();
}
} else {
bool genExtract = false;
Value *V = VecVal;
IGC_ASSERT(scalars.size() == nelts);
for (uint32_t i = 0; i < nelts; ++i) {
scalars[i] = nullptr;
}
uint32_t numEltsFound = 0;
while (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
Value *ixVal = IEI->getOperand(2);
ConstantInt *CI = dyn_cast<ConstantInt>(ixVal);
if (!CI) {
genExtract = true;
break;
}
uint32_t ix = int_cast<unsigned int>(CI->getZExtValue());
if (scalars[ix] == nullptr) {
scalars[ix] = IEI->getOperand(1);
++numEltsFound;
}
if (numEltsFound == nelts) {
break;
}
V = IEI->getOperand(0);
}
// Generate extractelement instructions if not all elements were found.
if (!isa<UndefValue>(V) && numEltsFound != nelts) {
genExtract = true;
}
BasicBlock::iterator inst_b;
if (Instruction *I = dyn_cast<Instruction>(VecVal)) {
if (auto phi = dyn_cast<PHINode>(I)) {
// avoid inserting between phis
inst_b = phi->getParent()->getFirstInsertionPt();
} else {
inst_b = BasicBlock::iterator(I);
++inst_b;
}
} else {
// VecVal is an argument or constant
inst_b = F.begin()->getFirstInsertionPt();
}
IRBuilder<> Builder(&(*inst_b));
for (uint32_t i = 0; i < nelts; ++i) {
if (scalars[i] == nullptr) {
Value *S;
if (genExtract) {
S = Builder.CreateExtractElement(V, Builder.getInt32(i));
} else {
S = UndefValue::get(ETy);
}
scalars[i] = S;
}
}
availBeforeInst = &(*inst_b);
}
}
// Perform LoadInst/StoreInst simplification. E.g. The following vector load is
// only used by three extractelements with constants indices, so we can narrow
// the load width to 3.
//
// %34 = load <4 x float> addrspace(1)* %33, align 16
// %scalar35 = extractelement <4 x float> %34, i32 0
// %scalar36 = extractelement <4 x float> %34, i32 1
// %scalar47 = extractelement <4 x float> %34, i32 2
//
// %40 = bitcast <4 x float> addrspace(1)* %33 to <3 x float> addrspace(1)*
// %41 = load <3 x float> addrspace(1)* %40, align 16 (keep alignment!)
// %scalar42 = extractelement <3 x float> %41, i32 0
// %scalar43 = extractelement <3 x float> %41, i32 1
// %scalar44 = extractelement <3 x float> %41, i32 2
//
Instruction *VectorPreProcess::simplifyLoadStore(Instruction *Inst) {
if (std::optional<AbstractLoadInst> optionalALI = AbstractLoadInst::get(Inst, *m_DL)) {
bool optReportEnabled = IGC_IS_FLAG_ENABLED(EnableOptReportLoadNarrowing);
auto emitOptReport = [&](std::string report, Instruction *from, Instruction *to) {
std::string strFrom;
llvm::raw_string_ostream rsoFrom(strFrom);
from->print(rsoFrom);
std::string strTo;
llvm::raw_string_ostream rsoTo(strTo);
to->print(rsoTo);
std::stringstream optReportFile;
optReportFile << IGC::Debug::GetShaderOutputFolder() << "LoadNarrowing.opt";
std::ofstream optReportStream;
optReportStream.open(optReportFile.str(), std::ios::app);
optReportStream << IGC::Debug::GetShaderOutputName() << ": " << report << std::endl
<< rsoFrom.str() << " ->" << std::endl
<< rsoTo.str() << std::endl;
};
auto ALI = optionalALI.value();
if (!Inst->getType()->isVectorTy() || ALI.getAlignment() < 4)
return Inst;
unsigned NBits = int_cast<unsigned>(m_DL->getTypeSizeInBits(Inst->getType()->getScalarType()));
if (NBits < 32)
return Inst;
BitCastInst *BC = nullptr;
Type *DstEltTy = nullptr;
// Handle bitcasts patterns like:
//
// %41 = call <4 x i32> @llvm.genx.GenISA.ldrawvector.indexed.v4i32.p1v4f32(...)
// %bc = bitcast <4 x i32> %41 to <4 x float>
// %42 = extractelement <4 x float> %bc, i32 0
if (Inst->hasOneUse()) {
BC = dyn_cast<BitCastInst>(*Inst->users().begin());
if (BC) {
IGCLLVM::FixedVectorType *DstVTy = dyn_cast<IGCLLVM::FixedVectorType>(BC->getType());
IGCLLVM::FixedVectorType *SrcVTy = dyn_cast<IGCLLVM::FixedVectorType>(BC->getOperand(0)->getType());
if (IGC_IS_FLAG_DISABLED(EnableBitcastedLoadNarrowing) || !DstVTy || !SrcVTy ||
DstVTy->getNumElements() != SrcVTy->getNumElements()) {
BC = nullptr;
} else {
DstEltTy = DstVTy->getElementType();
}
}
}
SmallVector<ExtractElementInst *, 8> ConstEEIUses;
unsigned MaxIndex = 0;
for (auto U : (BC ? BC : Inst)->users()) {
auto EEI = dyn_cast<ExtractElementInst>(U);
if (!EEI || !isa<ConstantInt>(EEI->getIndexOperand()))
return Inst;
auto CI = cast<ConstantInt>(EEI->getIndexOperand());
ConstEEIUses.push_back(EEI);
MaxIndex = std::max(MaxIndex, int_cast<unsigned>(CI->getZExtValue()));
}
// All uses are constant EEI.
IGC_ASSERT_MESSAGE((BC ? BC : Inst)->hasNUses(ConstEEIUses.size()), "out of sync");
// FIXME: this is to WA an issue that splitLoadStore does not split
// vectors of size 5, 6, 7.
if (MaxIndex + 1 > 4)
return Inst;
// If MaxIndex is smaller than <vector_size - 1>, then narrow the size
// of this vector load to reduce unnecessary memory load.
//
// TODO: further optimize this load into a message with channel masks
// for cases in which use indices are sparse like {0, 2}.
unsigned N = (unsigned)cast<IGCLLVM::FixedVectorType>(Inst->getType())->getNumElements();
if (N == MaxIndex + 1)
return Inst;
// Check if we can turn a ldrawvector into a ldraw
Instruction *NewLI = nullptr;
IRBuilder<> Builder(Inst);
auto ldrawvec = dyn_cast<LdRawIntrinsic>(Inst);
bool canSimplifyOneUse =
ldrawvec && isa<VectorType>(ldrawvec->getType()) && (BC ? BC : Inst)->hasOneUse() && !ConstEEIUses.empty();
bool canSimplifyOneUseZeroIndex =
canSimplifyOneUse && cast<ConstantInt>(ConstEEIUses.front()->getIndexOperand())->getZExtValue() == 0;
// There is a known case where narrowing bitcasted ldrawvector to ldraw
// leads to a corruption. We can still simplify a vector load to
// a narrow one (e.g. <4 x i32> to <2 x i32> when only 0th elt is used
// as a float).
// TODO: remove WA when issue is resolved.
bool skipSimplifyBitcastedOneUse =
canSimplifyOneUse && BC && IGC_IS_FLAG_DISABLED(EnableBitcastedLoadNarrowingToScalar);
auto simplifyLDVecToLDRaw = [&](bool calc_offset) {
auto EE_user = ConstEEIUses.front();
auto return_type = cast<VectorType>(ldrawvec->getType())->getElementType();
auto buffer_ptr = ldrawvec->getResourceValue();
Value *OffsetVal = ldrawvec->getOffsetValue();
auto alloc_size = (unsigned)m_DL->getTypeAllocSize(return_type);
if (calc_offset) {
auto EE_index = (unsigned)cast<ConstantInt>(EE_user->getIndexOperand())->getZExtValue();
if (isa<ConstantInt>(OffsetVal)) {
// Calculate static offset
auto offset = (unsigned)cast<ConstantInt>(OffsetVal)->getZExtValue();
auto new_offset = offset + (EE_index * alloc_size);
OffsetVal = Builder.getInt32(new_offset);
} else {
// Calculate runtime offset
OffsetVal = Builder.CreateAdd(OffsetVal, Builder.getInt32(EE_index * alloc_size));
}
}
Type *types[2] = {return_type, buffer_ptr->getType()};
Value *args[4] = {buffer_ptr, OffsetVal, Builder.getInt32(alloc_size), Builder.getInt1(ldrawvec->isVolatile())};
Function *newLdRawFunction =
GenISAIntrinsic::getDeclaration(ldrawvec->getModule(), GenISAIntrinsic::GenISA_ldraw_indexed, types);
NewLI = Builder.CreateCall(newLdRawFunction, args);
NewLI->setDebugLoc(EE_user->getDebugLoc());
if (optReportEnabled) {
std::string type;
llvm::raw_string_ostream rsoType(type);
Inst->getType()->print(rsoType);
std::stringstream report;
report << (BC ? "Bitcasted vector" : "Vector") << " load of " << rsoType.str()
<< " is transformed to scalar load";
if (calc_offset) {
report << (isa<ConstantInt>(ldrawvec->getOffsetValue()) ? ", static offset added" : ", runtime offset added");
}
report << ":";
emitOptReport(report.str(), Inst, NewLI);
}
Value *NewBC = nullptr;
if (BC) {
NewBC = Builder.CreateBitCast(NewLI, DstEltTy);
}
EE_user->replaceAllUsesWith(BC ? NewBC : NewLI);
EE_user->eraseFromParent();
if (BC) {
BC->eraseFromParent();
}
};
if (canSimplifyOneUseZeroIndex && !skipSimplifyBitcastedOneUse) {
simplifyLDVecToLDRaw(false);
return NewLI;
} else if (canSimplifyOneUse && !skipSimplifyBitcastedOneUse) {
simplifyLDVecToLDRaw(true);
return NewLI;
} else {
// WA: Do not narrow a bitcasted vector load to 1 elt vector load,
// choose at least 2 elts vector.
if (canSimplifyOneUseZeroIndex && skipSimplifyBitcastedOneUse) {
MaxIndex = 1;
}
Type *NewVecTy = FixedVectorType::get(cast<VectorType>(Inst->getType())->getElementType(), MaxIndex + 1);
bool isPredLoad = isa<PredicatedLoadIntrinsic>(Inst);
ValVector splitMergeValues;
if (isPredLoad)
createSplitMergeValues(Inst, cast<PredicatedLoadIntrinsic>(Inst)->getMergeValue(), {{NewVecTy, 1}},
splitMergeValues);
NewLI = ALI.Create(NewVecTy, isPredLoad ? splitMergeValues[0] : nullptr);
if (optReportEnabled) {
std::string type, narrowedType;
llvm::raw_string_ostream rsoType(type), rsoNarrowedType(narrowedType);
Inst->getType()->print(rsoType);
NewVecTy->print(rsoNarrowedType);
std::stringstream report;
report << (BC ? "Bitcasted vector" : "Vector") << " load of " << rsoType.str()
<< " is narrowed to vector load of " << rsoNarrowedType.str();
if (canSimplifyOneUseZeroIndex && skipSimplifyBitcastedOneUse) {
report << " (narrowing to scalar load is disabled by WA)";
}
report << ":";
emitOptReport(report.str(), Inst, NewLI);
}
// Loop and replace all uses.
SmallVector<Value *, 8> NewEEI(MaxIndex + 1, nullptr);
SmallVector<Value *, 8> NewBC(MaxIndex + 1, nullptr);
for (auto EEI : ConstEEIUses) {
auto CI = cast<ConstantInt>(EEI->getIndexOperand());
unsigned Idx = int_cast<unsigned>(CI->getZExtValue());
if (NewEEI[Idx] == nullptr) {
NewEEI[Idx] = Builder.CreateExtractElement(NewLI, CI);
if (BC) {
NewBC[Idx] = Builder.CreateBitCast(NewEEI[Idx], DstEltTy);
cast<BitCastInst>(NewBC[Idx])->setDebugLoc(BC->getDebugLoc());
}
}
cast<ExtractElementInst>(NewEEI[Idx])->setDebugLoc(EEI->getDebugLoc());
EEI->replaceAllUsesWith(BC ? NewBC[Idx] : NewEEI[Idx]);
EEI->eraseFromParent();
}
if (BC) {
BC->eraseFromParent();
}
IGC_ASSERT_MESSAGE(Inst->use_empty(), "out of sync");
Inst->eraseFromParent();
return NewLI;
}
}
// %2 = insertelement <4 x float> undef, float 1.000000e+00, i32 0
// %3 = insertelement <4 x float> %2, float 1.000000e+00, i32 1
// %4 = insertelement <4 x float> %3, float 1.000000e+00, i32 2
// store <4 x float> %4, <4 x float>* %1, align 16
//
// becomes
//
// %5 = bitcast <4 x float>* %1 to <3 x float>*
// %6 = insertelement <3 x float> undef, float 1.000000e+00, i32 0
// %7 = insertelement <3 x float> %2, float 1.000000e+00, i32 1
// %8 = insertelement <3 x float> %3, float 1.000000e+00, i32 2
// store <3 x float> %8, <3 x float>* %5, align 16
//
IGC_ASSERT(isAbstractStoreInst(Inst));
std::optional<AbstractStoreInst> optionalASI = AbstractStoreInst::get(Inst, *m_DL);
auto ASI = optionalASI.value();
Value *Val = ASI.getValueOperand();
if (isa<UndefValue>(Val)) {
Inst->eraseFromParent();
return nullptr;
}
if (!Val->getType()->isVectorTy() || ASI.getAlignment() < 4)
return Inst;
unsigned NBits = int_cast<unsigned>(m_DL->getTypeSizeInBits(Val->getType()->getScalarType()));
if (NBits < 32)
return Inst;
unsigned N = (unsigned)cast<IGCLLVM::FixedVectorType>(Val->getType())->getNumElements();
if (auto CV = dyn_cast<ConstantVector>(Val)) {
unsigned MaxIndex = 0;
for (unsigned i = N - 1; i != 0; --i) {
Constant *Item = CV->getAggregateElement(i);
if (!isa<UndefValue>(Item)) {
MaxIndex = i;
break;
}
}
if (MaxIndex + 1 == N)
return Inst;
SmallVector<Constant *, 8> Data(MaxIndex + 1, nullptr);
for (unsigned i = 0; i <= MaxIndex; ++i) {
Data[i] = CV->getAggregateElement(i);
}
auto SVal = ConstantVector::get(Data);
Instruction *NewSI = ASI.Create(SVal);
ASI.getInst()->eraseFromParent();
return NewSI;
}
SmallVector<InsertElementInst *, 8> ConstIEIs(N, nullptr);
Value *ChainVal = Val;
int MaxIndex = -1;
while (auto IEI = dyn_cast<InsertElementInst>(ChainVal)) {
if (MaxIndex + 1 == (int)N || !isa<ConstantInt>(IEI->getOperand(2))) {
return Inst;
}
// Make sure the last IEI will be recorded if an element is
// inserted multiple times.
auto CI = cast<ConstantInt>(IEI->getOperand(2));
int Idx = (int)CI->getZExtValue();
if (ConstIEIs[Idx] == nullptr) {
ConstIEIs[Idx] = IEI;
}
MaxIndex = std::max(MaxIndex, Idx);
ChainVal = IEI->getOperand(0);
}
// FIXME: this is to WA an issue that splitLoadStore does not split
// vectors of size 5, 6, 7.
if (MaxIndex + 1 > 4)
return Inst;
// Inserted less than N values into Undef.
if (MaxIndex >= 0 && MaxIndex + 1 < (int)N && isa<UndefValue>(ChainVal)) {
IRBuilder<> Builder(ASI.getInst());
Type *NewVecTy = FixedVectorType::get(cast<VectorType>(Val->getType())->getElementType(), MaxIndex + 1);
Value *SVal = UndefValue::get(NewVecTy);
for (int i = 0; i <= MaxIndex; ++i) {
if (ConstIEIs[i] != nullptr) {
SVal = Builder.CreateInsertElement(SVal, ConstIEIs[i]->getOperand(1), ConstIEIs[i]->getOperand(2));
}
}
Instruction *NewSI = ASI.Create(SVal);
ASI.getInst()->eraseFromParent();
return NewSI;
}
return Inst;
}
// Replace store instructions like
// store i24 %1, i24 addrspace(3)* %2, align 4
// or
// store i48 %1, i48 addrspace(3)* %2, align 4
//
// with
// store <3 x i8> %3, <3 x i8> addrspace(3)* %4, align 4
// or
// store <3 x i16> %16, <3 x i16> addrspace(3)* %4, align 4
//
// to be split later in this pass.
// Otherwise later TypeLegalizwe pass replaces these instructions with 3-element store.
// The same is for i24 and i48 load instructions.
//
bool VectorPreProcess::processScalarLoadStore(Function &F) {
InstWorkVector list_delete;
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
Instruction *inst = &*I;
if (isa<StoreInst>(inst) || isa<PredicatedStoreIntrinsic>(inst)) {
std::optional<AbstractStoreInst> optionalASI = AbstractStoreInst::get(inst, *m_DL);
auto ASI = optionalASI.value();
Type *Ty = ASI.getValueOperand()->getType();
if (Ty->isVectorTy())
continue;
unsigned bitSize = int_cast<unsigned>(m_DL->getTypeSizeInBits(Ty->getScalarType()));
if (bitSize != 24 && bitSize != 48)
continue;
IRBuilder<> Builder(inst);
Type *newScalTy = bitSize == 24 ? Type::getInt8Ty(inst->getContext()) : Type::getInt16Ty(inst->getContext());
Type *newVecTy = IGCLLVM::FixedVectorType::get(newScalTy, 3);
ASI.Create(Builder.CreateBitCast(ASI.getValueOperand(), newVecTy));
list_delete.push_back(inst);
} else if (isa<LoadInst>(inst) || isa<PredicatedLoadIntrinsic>(inst)) {
std::optional<AbstractLoadInst> optionalALI = AbstractLoadInst::get(inst, *m_DL);
auto ALI = optionalALI.value();
Type *Ty = inst->getType();
if (Ty->isVectorTy())
continue;
unsigned bitSize = int_cast<unsigned>(m_DL->getTypeSizeInBits(Ty->getScalarType()));
if (bitSize != 24 && bitSize != 48)
continue;
IRBuilder<> Builder(inst);
Type *newScalTy = bitSize == 24 ? Type::getInt8Ty(inst->getContext()) : Type::getInt16Ty(inst->getContext());
Type *newVecTy = IGCLLVM::FixedVectorType::get(newScalTy, 3);
bool isPredLd = isa<PredicatedLoadIntrinsic>(inst);
ValVector splitMergeValues;
if (isPredLd)
createSplitMergeValues(inst, cast<PredicatedLoadIntrinsic>(inst)->getMergeValue(), {{newVecTy, 1}},
splitMergeValues);
Value *MergeVal = isPredLd ? splitMergeValues[0] : nullptr;
Value *newVecVal = ALI.Create(newVecTy, MergeVal);
Value *newVal = Builder.CreateBitCast(newVecVal, Ty);
inst->replaceAllUsesWith(newVal);
list_delete.push_back(inst);
}
}
if (list_delete.empty())
return false;
for (auto i : list_delete) {
i->eraseFromParent();
}
return true;
}
bool VectorPreProcess::runOnFunction(Function &F) {
bool changed = false;
m_DL = &F.getParent()->getDataLayout();
m_C = &F.getContext();
m_CGCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
changed = processScalarLoadStore(F);
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
Instruction *inst = &*I;
if (isAbstractStoreInst(inst) || isAbstractLoadInst(inst)) {
m_WorkList.push_back(inst);
}
}
// Simplify loads/stores.
bool Simplified = false;
for (unsigned i = 0, n = m_WorkList.size(); i < n; ++i) {
Instruction *Inst = m_WorkList[i];
Instruction *NewInst = simplifyLoadStore(Inst);
if (NewInst != Inst) {
m_WorkList[i] = NewInst;
Simplified = true;
}
}
// Cleanup work items, only keep load and store instructions.
if (Simplified) {
changed = true;
auto new_end = std::remove_if(m_WorkList.begin(), m_WorkList.end(),
[](Value *V) { return !V || (!isAbstractStoreInst(V) && !isAbstractLoadInst(V)); });
m_WorkList.erase(new_end, m_WorkList.end());
}
// Split vectors
if (m_WorkList.size() > 0) {
V2SMap vecToSubVec;
// m_Temps is used to keep loads that needs post-processing.
m_Temps.clear();
{
auto *MDUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto *ModMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
TranslationTable TT;
TT.run(F);
WIAnalysisRunner WI(&F, LI, DT, PDT, MDUtils, m_CGCtx, ModMD, &TT);
WI.run();
for (uint32_t i = 0; i < m_WorkList.size(); ++i) {
if (splitLoadStore(m_WorkList[i], vecToSubVec, WI)) {
changed = true;
}
}
}
// Now, do post-processing for the splitted loads
for (uint32_t i = 0; i < m_Temps.size(); ++i) {
Value *V = m_Temps[i];
std::optional<AbstractLoadInst> ALI = AbstractLoadInst::get(V, *m_DL);
if (!ALI) {
continue;
}
Instruction *LI = ALI.value().getInst();
for (auto &it : vecToSubVec[LI]) {
ValVector &svals = it.second;
if (!LI->use_empty()) {
ValVector Scalars;
IRBuilder<> Builder(LI);
for (uint32_t j = 0; j < svals.size(); ++j) {
Type *Ty1 = svals[j]->getType();
IGCLLVM::FixedVectorType *VTy1 = dyn_cast<IGCLLVM::FixedVectorType>(Ty1);
if (VTy1) {
for (uint32_t k = 0; k < VTy1->getNumElements(); ++k) {
Value *S = Builder.CreateExtractElement(svals[j], Builder.getInt32(k), "split");
Scalars.push_back(S);
}
} else {
Scalars.push_back(svals[j]);
// svals[j] will be no long needed, set it to null
// to prevent double-deleting later
svals[j] = nullptr;
}
}
// Replace LI
replaceAllVectorUsesWithScalars(LI, Scalars);
// Remove any dead scalars
for (uint32_t j = 0; j < Scalars.size(); ++j) {
if (Scalars[j]->use_empty()) {
Instruction *tInst = cast<Instruction>(Scalars[j]);
tInst->eraseFromParent();
}
}
}
// Remove any dead sub vectors
for (uint32_t j = 0; j < svals.size(); ++j) {
if (svals[j] == nullptr) {
continue;
}
Instruction *tInst = cast<Instruction>(svals[j]);
if (tInst->use_empty()) {
// If this is a 3-element vector load, remove it
// from m_Vector3List as well.
if (isAbstractLoadInst(tInst) && tInst->getType()->isVectorTy() &&
cast<IGCLLVM::FixedVectorType>(tInst->getType())->getNumElements() == 3) {
InstWorkVector::iterator tI = m_Vector3List.begin(), tE = m_Vector3List.end();
for (; tI != tE; ++tI) {
Instruction *tmp = *tI;
if (tmp == tInst) {
break;
}
}
if (tI != m_Vector3List.end()) {
m_Vector3List.erase(tI);
}
}
tInst->eraseFromParent();
}
}
}
// Done with load splits, remove the original load inst
if (LI->use_empty()) {
vecToSubVec.erase(LI);
LI->eraseFromParent();
}
}
// Last, split 3-element vector if necessary
for (uint32_t i = 0; i < m_Vector3List.size(); ++i) {
if (splitVector3LoadStore(m_Vector3List[i])) {
changed = true;
}
}
vecToSubVec.clear();
m_Vector3List.clear();
m_WorkList.clear();
}
return changed;
}