mirror of
				https://github.com/intel/intel-graphics-compiler.git
				synced 2025-10-30 08:18:26 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			1805 lines
		
	
	
		
			72 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			1805 lines
		
	
	
		
			72 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*========================== begin_copyright_notice ============================
 | |
| 
 | |
| Copyright (C) 2017 Intel Corporation
 | |
| 
 | |
| SPDX-License-Identifier: MIT
 | |
| 
 | |
| ============================= end_copyright_notice ===========================*/
 | |
| 
 | |
| #include "Compiler/CISACodeGen/VectorProcess.hpp"
 | |
| #include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
 | |
| #include "Compiler/CISACodeGen/EmitVISAPass.hpp"
 | |
| #include "Compiler/CodeGenPublic.h"
 | |
| #include "Compiler/IGCPassSupport.h"
 | |
| #include "common/LLVMWarningsPush.hpp"
 | |
| #include <llvm/IR/DataLayout.h>
 | |
| #include <llvm/IR/Instructions.h>
 | |
| #include <llvmWrapper/IR/IRBuilder.h>
 | |
| #include <llvm/IR/InstIterator.h>
 | |
| #include <llvm/Support/MathExtras.h>
 | |
| #include <llvm/Transforms/Utils/Local.h>
 | |
| #include <llvmWrapper/IR/DerivedTypes.h>
 | |
| #include <llvmWrapper/Support/Alignment.h>
 | |
| #include <optional>
 | |
| #include "common/LLVMWarningsPop.hpp"
 | |
| #include "Probe/Assertion.h"
 | |
| 
 | |
| #include "common/debug/Debug.hpp"
 | |
| 
 | |
| #include <utility> // std::pair, std::make_pair
 | |
| #include <sstream> // std::string, std::stringstream
 | |
| #include <fstream> // std::ofstream
 | |
| 
 | |
| using namespace llvm;
 | |
| using namespace IGC;
 | |
| using IGCLLVM::FixedVectorType;
 | |
| 
 | |
| //
 | |
| // Description of VectorPreProcess Pass
 | |
| //   The purpose is both to legalize vector types and to reduce register
 | |
| //   pressure. Once this pass is done, there is no 3-element vector whose
 | |
| //   element size < 4 bytes, that is, no <3 x i8>, no <3 x i16>. (But
 | |
| //   we will have <3xi32> and <3xi64>.)
 | |
| //
 | |
| // 1. Split a vector load/stores with a large vector into ones with
 | |
| //    smaller vectors or scalars; and make sure that the sub-vectors
 | |
| //    are either multiple of DW, vector3, or their size is less than
 | |
| //    4 bytes (see details in code).  Vector3 will be specially
 | |
| //    handled later.
 | |
| //    For example,
 | |
| //        <16xi64> ---> four <4xi64>
 | |
| //        <15xi32> ---> <8xi32>, <7xi32>
 | |
| //        <13xi32> ---> <8xi32>, <5xi32>
 | |
| //        <31xi16> ---> <16xi16>, <12xi16>, <3xi16>
 | |
| //        <19xi16> ---> <16xi16>, <3xi16>
 | |
| //        <39xi8>  ---> <32xi8>, <4xi8>, <3xi8>
 | |
| //    Note that splitting keeps the vector element's type without
 | |
| //    changing it.
 | |
| //
 | |
| //    Note that as 6/2020,
 | |
| //      if size of vector element >= DW, the number of elements of the new vector
 | |
| //      should be power of 2 except for vector3.  Thus, we should not see 5xi32,
 | |
| //      7xi32, etc.  This makes code emit easier.
 | |
| //
 | |
| // 2. Special processing of 3-element vectors
 | |
| //    If (vector element's size < 4 bytes)
 | |
| //    {
 | |
| //       3-element vector load  --> 2-element vector load + scalar load
 | |
| //       3-element vector store --> 2-element vector store + scalar store.
 | |
| //    }
 | |
| //    Note that 3-element load could be optimized to 4-element load (check
 | |
| //    details in the code).
 | |
| //
 | |
| //    for example,
 | |
| //      (1) %1 = load <3 x i8> *p
 | |
| //          converted into
 | |
| //             %pv = bitcast p to <2 x i8>*  // %pv is type <2 x i8>*
 | |
| //             %ps = (i8*)p + 2;             // %ps is type i8*
 | |
| //             %2 = load <2 x i8> *pv
 | |
| //             %3 = load i8 *ps
 | |
| //          original vector %1 == (%2, %3)
 | |
| //
 | |
| //      (2) store <3 x i16> %1, <3 x i16> *p
 | |
| //          converted into
 | |
| //             %pv = bitcast p to <2 x i16>*  // %pv is type <2 x i16>*
 | |
| //             %ps = (i16*)p + 2;             // %ps is type i16*
 | |
| //             %new_v = (%1.x, %1.y)
 | |
| //             store <2 x i16> %new_v, <2 x i16> *pv
 | |
| //             store i16 %1.z, i16 *ps
 | |
| //
 | |
| namespace {
 | |
| // AbstractLoadInst and AbstractStoreInst abstract away the differences
 | |
| // between ldraw, Load and PredicatedLoad and between storeraw, Store and PredicatedStore.
 | |
| // Note on usage: The Value* passed as the ptr parameter to the Create method
 | |
| // should be either the result of the getPointerOperand() method or the
 | |
| // CreateConstScalarGEP() method. Do not attempt to do arithmetic
 | |
| // (or pointer arithmetic) on these values.
 | |
// Uniform wrapper over the three load flavors this pass splits:
// plain LoadInst, the GenISA ldraw* intrinsics, and PredicatedLoadIntrinsic.
// Construction goes through the static get() factory; the private ctors
// only record which concrete instruction is wrapped.
class AbstractLoadInst {
  Instruction *const m_inst; // the wrapped load; kind is re-queried via isa<> on each call
  const DataLayout &DL;      // used to size elements for the emulated GEP below
  AbstractLoadInst(LoadInst *LI, const DataLayout &DL) : m_inst(LI), DL(DL) {}
  AbstractLoadInst(LdRawIntrinsic *LdRI, const DataLayout &DL) : m_inst(LdRI), DL(DL) {}
  AbstractLoadInst(PredicatedLoadIntrinsic *PLI, const DataLayout &DL) : m_inst(PLI), DL(DL) {}

  // Checked downcasts; each is only valid when m_inst is of that kind.
  LoadInst *getLoad() const { return cast<LoadInst>(m_inst); }
  LdRawIntrinsic *getLdRaw() const { return cast<LdRawIntrinsic>(m_inst); }
  PredicatedLoadIntrinsic *getPredicatedLoad() const { return cast<PredicatedLoadIntrinsic>(m_inst); }

public:
  Instruction *getInst() const { return m_inst; }
  // Alignment in bytes of the wrapped load.
  alignment_t getAlignment() const {
    if (isa<LoadInst>(m_inst))
      return IGCLLVM::getAlignmentValue(getLoad());
    if (isa<LdRawIntrinsic>(m_inst))
      return getLdRaw()->getAlignment();
    return getPredicatedLoad()->getAlignment();
  }
  void setAlignment(alignment_t alignment) {
    if (isa<LoadInst>(m_inst)) {
      getLoad()->setAlignment(IGCLLVM::getCorrectAlign(alignment));
    } else if (isa<LdRawIntrinsic>(m_inst)) {
      getLdRaw()->setAlignment(alignment);
    } else {
      getPredicatedLoad()->setAlignment(alignment);
    }
  }
  // For ldraw this returns the buffer/resource value, not a real pointer;
  // callers must treat it as an opaque anchor (see the usage note above).
  Value *getPointerOperand() const {
    if (isa<LoadInst>(m_inst))
      return getLoad()->getPointerOperand();
    if (isa<LdRawIntrinsic>(m_inst))
      return getLdRaw()->getResourceValue();
    return getPredicatedLoad()->getPointerOperand();
  }
  bool getIsVolatile() const {
    if (isa<LoadInst>(m_inst))
      return getLoad()->isVolatile();
    if (isa<LdRawIntrinsic>(m_inst))
      return getLdRaw()->isVolatile();
    return getPredicatedLoad()->isVolatile();
  }
  unsigned getPointerAddressSpace() const { return getPointerOperand()->getType()->getPointerAddressSpace(); }
  // Merge value only exists on predicated loads; nullptr otherwise.
  Value *getMergeValue() const {
    if (isa<PredicatedLoadIntrinsic>(m_inst))
      return getPredicatedLoad()->getMergeValue();
    return nullptr;
  }
  // Convenience overload: clone at the original address/alignment/volatility.
  Instruction *Create(Type *returnType, Value *mergeValue = nullptr) {
    return Create(returnType, getPointerOperand(), getAlignment(), getIsVolatile(), mergeValue);
  }
  // Emit a new load of the same flavor as the wrapped one, with the given
  // result type. `ptr` must come from getPointerOperand() or
  // CreateConstScalarGEP(); for ldraw a differing `ptr` is interpreted as a
  // computed byte offset. mergeValue is required for predicated loads.
  Instruction *Create(Type *returnType, Value *ptr, alignment_t alignment, bool isVolatile,
                      Value *mergeValue = nullptr) {
    IGCLLVM::IRBuilder<> builder(m_inst);
    if (isa<LoadInst>(m_inst)) {
      Type *newPtrType = PointerType::get(returnType, ptr->getType()->getPointerAddressSpace());
      ptr = builder.CreateBitCast(ptr, newPtrType);
      LoadInst *newLI = builder.CreateAlignedLoad(returnType, ptr, IGCLLVM::getAlign(alignment), isVolatile);
      // Preserve LSC cache-control metadata across the split.
      if (MDNode *lscMetadata = m_inst->getMetadata("lsc.cache.ctrl")) {
        newLI->setMetadata("lsc.cache.ctrl", lscMetadata);
      }
      return newLI;
    }
    if (isa<LdRawIntrinsic>(m_inst)) {
      LdRawIntrinsic *ldraw = getLdRaw();
      // If the caller handed us something other than the resource value, it
      // is an offset produced by CreateConstScalarGEP() below.
      bool hasComputedOffset = ptr != ldraw->getResourceValue();
      Value *offsetVal = hasComputedOffset ? ptr : ldraw->getOffsetValue();
      ptr = ldraw->getResourceValue();
      Type *types[2] = {returnType, ptr->getType()};
      Value *args[4] = {ptr, offsetVal, builder.getInt32((uint32_t)alignment), builder.getInt1(isVolatile)};
      Function *newLdRawFunction = GenISAIntrinsic::getDeclaration(ldraw->getModule(), ldraw->getIntrinsicID(), types);
      return builder.CreateCall(newLdRawFunction, args);
    }

    IGC_ASSERT(isa<PredicatedLoadIntrinsic>(m_inst));
    IGC_ASSERT(mergeValue);
    IGC_ASSERT(mergeValue->getType() == returnType);

    PredicatedLoadIntrinsic *PLI = getPredicatedLoad();
    Type *newPtrType = PointerType::get(returnType, ptr->getType()->getPointerAddressSpace());
    ptr = builder.CreateBitCast(ptr, newPtrType);
    Type *types[3] = {returnType, ptr->getType(), returnType};
    Function *predLoadFunc = GenISAIntrinsic::getDeclaration(m_inst->getModule(), PLI->getIntrinsicID(), types);
    Value *args[4] = {ptr, builder.getInt64((uint64_t)alignment), PLI->getPredicate(), mergeValue};
    Instruction *PredLoad = builder.CreateCall(predLoadFunc, args);
    if (MDNode *lscMetadata = m_inst->getMetadata("lsc.cache.ctrl"))
      PredLoad->setMetadata("lsc.cache.ctrl", lscMetadata);
    return PredLoad;
  }
  // Emulates a GEP on a pointer of the scalar type of returnType.
  // For pointer-based loads this produces a real GEP; for ldraw it produces a
  // new byte offset (offset * element size) added to the intrinsic's offset.
  Value *CreateConstScalarGEP(Type *returnType, Value *ptr, uint32_t offset) {
    IGCLLVM::IRBuilder<> builder(m_inst);
    if (isa<LoadInst>(m_inst) || isa<PredicatedLoadIntrinsic>(m_inst)) {
      Type *ePtrType = PointerType::get(returnType->getScalarType(), ptr->getType()->getPointerAddressSpace());
      ptr = builder.CreateBitCast(ptr, ePtrType);
      return builder.CreateConstGEP1_32(returnType->getScalarType(), ptr, offset);
    } else {
      uint32_t sizeInBytes = int_cast<uint32_t>(DL.getTypeSizeInBits(returnType->getScalarType()) / 8);
      Value *offsetInBytes = builder.getInt32(offset * sizeInBytes);
      return builder.CreateAdd(offsetInBytes, getLdRaw()->getOffsetValue());
    }
  }
  // Factory: wrap `value` if it is one of the supported load flavors.
  static std::optional<AbstractLoadInst> get(llvm::Value *value, const DataLayout &DL) {
    if (LoadInst *LI = dyn_cast<LoadInst>(value)) {
      return AbstractLoadInst{LI, DL};
    } else if (LdRawIntrinsic *LdRI = dyn_cast<LdRawIntrinsic>(value)) {
      return AbstractLoadInst{LdRI, DL};
    } else if (PredicatedLoadIntrinsic *PLI = dyn_cast<PredicatedLoadIntrinsic>(value)) {
      return AbstractLoadInst{PLI, DL};
    } else {
      return std::nullopt;
    }
  }
};
 | |
| static bool isAbstractLoadInst(llvm::Value *value) {
 | |
|   return isa<LoadInst>(value) || isa<LdRawIntrinsic>(value) || isa<PredicatedLoadIntrinsic>(value);
 | |
| }
 | |
| 
 | |
| class AbstractStoreInst {
 | |
|   Instruction *const m_inst;
 | |
|   const DataLayout &DL;
 | |
|   AbstractStoreInst(StoreInst *SI, const DataLayout &DL) : m_inst(SI), DL(DL) {}
 | |
|   AbstractStoreInst(StoreRawIntrinsic *SRI, const DataLayout &DL) : m_inst(SRI), DL(DL) {}
 | |
|   AbstractStoreInst(PredicatedStoreIntrinsic *PSI, const DataLayout &DL) : m_inst(PSI), DL(DL) {}
 | |
| 
 | |
|   StoreInst *getStore() const { return cast<StoreInst>(m_inst); }
 | |
|   StoreRawIntrinsic *getStoreRaw() const { return cast<StoreRawIntrinsic>(m_inst); }
 | |
|   PredicatedStoreIntrinsic *getPredicatedStore() const { return cast<PredicatedStoreIntrinsic>(m_inst); }
 | |
| 
 | |
| public:
 | |
|   Instruction *getInst() const { return m_inst; }
 | |
|   alignment_t getAlignment() const {
 | |
|     if (isa<StoreInst>(m_inst))
 | |
|       return IGCLLVM::getAlignmentValue(getStore());
 | |
|     if (isa<StoreRawIntrinsic>(m_inst))
 | |
|       return getStoreRaw()->getAlignment();
 | |
|     return getPredicatedStore()->getAlignment();
 | |
|   }
 | |
|   void setAlignment(alignment_t alignment) {
 | |
|     if (isa<StoreInst>(m_inst)) {
 | |
|       getStore()->setAlignment(IGCLLVM::getCorrectAlign(alignment));
 | |
|     } else if (isa<PredicatedStoreIntrinsic>(m_inst)) {
 | |
|       getPredicatedStore()->setAlignment(alignment);
 | |
|     }
 | |
|   }
 | |
|   Value *getValueOperand() const {
 | |
|     if (isa<StoreInst>(m_inst))
 | |
|       return getStore()->getValueOperand();
 | |
|     if (isa<StoreRawIntrinsic>(m_inst))
 | |
|       return getStoreRaw()->getArgOperand(2);
 | |
|     return getPredicatedStore()->getValueOperand();
 | |
|   }
 | |
|   Value *getPointerOperand() const {
 | |
|     if (isa<StoreInst>(m_inst))
 | |
|       return getStore()->getPointerOperand();
 | |
|     if (isa<StoreRawIntrinsic>(m_inst))
 | |
|       return getStoreRaw()->getArgOperand(0);
 | |
|     return getPredicatedStore()->getPointerOperand();
 | |
|   }
 | |
|   bool getIsVolatile() const {
 | |
|     if (isa<StoreInst>(m_inst))
 | |
|       return getStore()->isVolatile();
 | |
|     if (isa<PredicatedLoadIntrinsic>(m_inst))
 | |
|       return getPredicatedStore()->isVolatile();
 | |
|     return false;
 | |
|   }
 | |
|   unsigned getPointerAddressSpace() const { return getPointerOperand()->getType()->getPointerAddressSpace(); }
 | |
|   Instruction *Create(Value *storedValue, Value *ptr, alignment_t alignment, bool isVolatile) {
 | |
|     IRBuilder<> builder(m_inst);
 | |
|     Type *newType = storedValue->getType();
 | |
|     if (isa<StoreInst>(m_inst)) {
 | |
|       Type *newPtrType = PointerType::get(newType, ptr->getType()->getPointerAddressSpace());
 | |
|       ptr = builder.CreateBitCast(ptr, newPtrType);
 | |
|       return alignment ? builder.CreateAlignedStore(storedValue, ptr, IGCLLVM::getAlign(alignment), isVolatile)
 | |
|                        : builder.CreateStore(storedValue, ptr, isVolatile);
 | |
|     }
 | |
|     if (isa<StoreRawIntrinsic>(m_inst)) {
 | |
|       bool hasComputedOffset = ptr != getPointerOperand();
 | |
|       Value *offset = hasComputedOffset ? ptr : getStoreRaw()->getArgOperand(1);
 | |
|       ptr = getPointerOperand();
 | |
|       Type *types[2] = {ptr->getType(), newType};
 | |
|       Value *args[5] = {ptr, offset, storedValue, builder.getInt32((uint32_t)alignment), builder.getInt1(isVolatile)};
 | |
|       Function *newStoreRawFunction =
 | |
|           GenISAIntrinsic::getDeclaration(getStoreRaw()->getModule(), getStoreRaw()->getIntrinsicID(), types);
 | |
|       return builder.CreateCall(newStoreRawFunction, args);
 | |
|     }
 | |
| 
 | |
|     Type *newPtrType = PointerType::get(newType, ptr->getType()->getPointerAddressSpace());
 | |
|     ptr = builder.CreateBitCast(ptr, newPtrType);
 | |
|     Type *types[2] = {ptr->getType(), newType};
 | |
|     Function *predStoreFunc = GenISAIntrinsic::getDeclaration(getPredicatedStore()->getModule(),
 | |
|                                                               getPredicatedStore()->getIntrinsicID(), types);
 | |
|     Value *args[4] = {ptr, storedValue, builder.getInt64((uint64_t)alignment), getPredicatedStore()->getPredicate()};
 | |
|     return builder.CreateCall(predStoreFunc, args);
 | |
|   }
 | |
|   Instruction *Create(Value *storedValue) {
 | |
|     return Create(storedValue, getPointerOperand(), getAlignment(), getIsVolatile());
 | |
|   }
 | |
|   // Emulates a GEP on a pointer of the scalar type of storedType.
 | |
|   Value *CreateConstScalarGEP(Type *storedType, Value *ptr, uint32_t offset) {
 | |
|     IGCLLVM::IRBuilder<> builder(m_inst);
 | |
|     if (isa<StoreInst>(m_inst) || isa<PredicatedStoreIntrinsic>(m_inst)) {
 | |
|       Type *ePtrType = PointerType::get(storedType->getScalarType(), ptr->getType()->getPointerAddressSpace());
 | |
|       ptr = builder.CreateBitCast(ptr, ePtrType);
 | |
|       return builder.CreateConstGEP1_32(storedType->getScalarType(), ptr, offset);
 | |
|     } else {
 | |
|       uint32_t sizeInBytes = int_cast<uint32_t>(DL.getTypeSizeInBits(storedType->getScalarType()) / 8);
 | |
|       Value *offsetInBytes = builder.getInt32(offset * sizeInBytes);
 | |
|       return builder.CreateAdd(offsetInBytes, getStoreRaw()->getArgOperand(1));
 | |
|     }
 | |
|   }
 | |
|   static std::optional<AbstractStoreInst> get(llvm::Value *value, const DataLayout &DL) {
 | |
|     if (StoreInst *SI = dyn_cast<StoreInst>(value)) {
 | |
|       return AbstractStoreInst{SI, DL};
 | |
|     }
 | |
|     if (StoreRawIntrinsic *SRI = dyn_cast<StoreRawIntrinsic>(value)) {
 | |
|       return AbstractStoreInst{SRI, DL};
 | |
|     }
 | |
|     if (PredicatedStoreIntrinsic *PSI = dyn_cast<PredicatedStoreIntrinsic>(value)) {
 | |
|       return AbstractStoreInst{PSI, DL};
 | |
|     }
 | |
|     return std::nullopt;
 | |
|   }
 | |
| };
 | |
| 
 | |
| static bool isAbstractStoreInst(llvm::Value *value) {
 | |
|   GenIntrinsicInst *II = dyn_cast<GenIntrinsicInst>(value);
 | |
|   return isa<StoreInst>(value) || (II && (II->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed ||
 | |
|                                           II->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
 | |
|                                           II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore));
 | |
| }
 | |
| 
 | |
// Function pass that legalizes vector loads/stores: splits oversized vectors
// into SPLIT_SIZE-byte chunks and specially handles 3-element vectors whose
// element size is below a DW (see the file-header comment for the full
// contract and examples).
class VectorPreProcess : public FunctionPass {
public:
  typedef SmallVector<Instruction *, 32> InstWorkVector;
  typedef SmallVector<Value *, 16> ValVector;

  // vector value -> (split size in bytes -> vector's component values)
  typedef DenseMap<Value *, DenseMap<uint32_t, ValVector>> V2SMap;

  // Split-size thresholds in bytes; the larger LSC_* sizes apply to uniform
  // accesses on LSC-capable platforms (see getSplitByteSize).
  enum class VPConst {
    // If a vector's size is bigger than SPLIT_SIZE, split it into multiple
    // of SPLIT_SIZE (plus smaller sub-vectors or scalar if any).
    // With SPLIT_SIZE=32, we have the max vectors as below after this pass:
    //     <32 x i8>, 16xi16, 8xi32, or 4xi64!
    SPLIT_SIZE = 32,                  // default, 32 bytes
    LSC_D64_UNIFORM_SPLIT_SIZE = 512, // LSC transpose 64 x D64
    LSC_D32_UNIFORM_SPLIT_SIZE = 256, // LSC transpose 64 x D32
    RAW_SPLIT_SIZE = 16
  };

  static char ID; // Pass identification, replacement for typeid
  VectorPreProcess() : FunctionPass(ID), m_DL(nullptr), m_C(nullptr), m_WorkList(), m_Temps(), m_CGCtx(nullptr) {
    initializeVectorPreProcessPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "VectorPreProcess"; }
  bool runOnFunction(Function &F) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<CodeGenContextWrapper>();
    AU.addRequired<MetaDataUtilsWrapper>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<PostDominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
  }

private:
  void getOrGenScalarValues(Function &F, Value *VecVal, ValVector &scalars, Instruction *&availBeforeInst);
  void replaceAllVectorUsesWithScalars(Instruction *VI, ValVector &SVals);

  // Return true if V is created by InsertElementInst with const index.
  bool isValueCreatedOnlyByIEI(Value *V, InsertElementInst **IEInsts);
  // Return true if V is only used by ExtractElement with const index.
  bool isValueUsedOnlyByEEI(Value *V, ExtractElementInst **EEInsts);

  // Split load/store that cannot be re-layout or is too big.
  uint32_t getSplitByteSize(Instruction *I, WIAnalysisRunner &WI) const;
  bool splitLoadStore(Instruction *Inst, V2SMap &vecToSubVec, WIAnalysisRunner &WI);
  bool splitLoad(AbstractLoadInst &LI, V2SMap &vecToSubVec, WIAnalysisRunner &WI);
  bool splitStore(AbstractStoreInst &SI, V2SMap &vecToSubVec, WIAnalysisRunner &WI);
  bool splitVector3LoadStore(Instruction *Inst);
  // Simplify load/store instructions if possible. Return itself if no
  // simplification is performed.
  Instruction *simplifyLoadStore(Instruction *LI);
  void createSplitVectorTypes(Type *ETy, uint32_t NElts, uint32_t SplitSize,
                              SmallVector<std::pair<Type *, uint32_t>, 8> &SplitInfo);
  // If predicated loads are split, we also need to split merge values
  void createSplitMergeValues(Instruction *Inst, Value *OrigMergeVal,
                              const SmallVector<std::pair<Type *, uint32_t>, 8> &SplitInfo,
                              ValVector &NewMergeVals) const;
  bool processScalarLoadStore(Function &F);

private:
  const DataLayout *m_DL;       // layout of the module being processed; set in runOnFunction
  LLVMContext *m_C;             // context of the function being processed
  InstWorkVector m_WorkList;    // candidate load/store instructions to split
  ValVector m_Temps;            // scratch value storage reused across helpers
  InstWorkVector m_Vector3List; // used for keep all 3-element vectors.
  IGC::CodeGenContext *m_CGCtx; // platform/flags context driving split-size choices
};
 | |
| } // namespace
 | |
| 
 | |
// Register pass to igc-opt
#define PASS_FLAG "igc-vectorpreprocess"
#define PASS_DESCRIPTION "Split loads/stores of big (or 3-element) vectors into smaller ones."
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
// NOTE(review): getAnalysisUsage() also requires LoopInfoWrapperPass, but no
// IGC_INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) line appears below —
// confirm whether that dependency initialization was intentionally omitted.
IGC_INITIALIZE_PASS_BEGIN(VectorPreProcess, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
IGC_INITIALIZE_PASS_END(VectorPreProcess, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

// Unique address used by the legacy pass manager to identify this pass.
char VectorPreProcess::ID = 0;

// Public factory used by the pass-pipeline builder.
FunctionPass *IGC::createVectorPreProcessPass() { return new VectorPreProcess(); }
 | |
| 
 | |
| bool VectorPreProcess::isValueCreatedOnlyByIEI(Value *V, InsertElementInst **IEInsts) {
 | |
|   Value *ChainVal = V;
 | |
|   while (!isa<UndefValue>(ChainVal)) {
 | |
|     InsertElementInst *IEI = dyn_cast<InsertElementInst>(ChainVal);
 | |
|     if (!IEI || !isa<ConstantInt>(IEI->getOperand(2))) {
 | |
|       return false;
 | |
|     }
 | |
|     ConstantInt *CInt = cast<ConstantInt>(IEI->getOperand(2));
 | |
|     uint32_t idx = (uint32_t)CInt->getZExtValue();
 | |
| 
 | |
|     // Make sure the last IEI will be recorded if an element is
 | |
|     // inserted multiple times.
 | |
|     if (IEInsts[idx] == nullptr) {
 | |
|       IEInsts[idx] = IEI;
 | |
|     }
 | |
| 
 | |
|     ChainVal = IEI->getOperand(0);
 | |
|   }
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| bool VectorPreProcess::isValueUsedOnlyByEEI(Value *V, ExtractElementInst **EEInsts) {
 | |
|   for (Value::user_iterator UI = V->user_begin(), UE = V->user_end(); UI != UE; ++UI) {
 | |
|     ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(*UI);
 | |
|     if (!EEI || (EEI->getOperand(0) != V) || !isa<ConstantInt>(EEI->getOperand(1))) {
 | |
|       return false;
 | |
|     }
 | |
|     ConstantInt *CInt = cast<ConstantInt>(EEI->getOperand(1));
 | |
|     uint32_t idx = (uint32_t)CInt->getZExtValue();
 | |
| 
 | |
|     // Quit if there are multiple extract from the same index.
 | |
|     if (EEInsts[idx] != nullptr) {
 | |
|       return false;
 | |
|     }
 | |
|     EEInsts[idx] = EEI;
 | |
|   }
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| // SVals[0:NumElements] has all scalar elements of vector VI. This function
 | |
| // tries to replace all uses of VI with SVals[...] if possible, If not
 | |
| // possible, re-generate the vector from SVals at the BB of VI.
 | |
| //
 | |
| // This function also erase VI.
 | |
| void VectorPreProcess::replaceAllVectorUsesWithScalars(Instruction *VI, ValVector &SVals) {
 | |
|   SmallVector<Instruction *, 8> ToBeDeleted;
 | |
|   bool genVec = false;
 | |
|   for (Value::user_iterator UI = VI->user_begin(), UE = VI->user_end(); UI != UE; ++UI) {
 | |
|     ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(*UI);
 | |
|     if (!EEI) {
 | |
|       genVec = true;
 | |
|       continue;
 | |
|     }
 | |
|     ConstantInt *CI = dyn_cast<ConstantInt>(EEI->getOperand(1));
 | |
|     if (!CI) {
 | |
|       genVec = true;
 | |
|       continue;
 | |
|     }
 | |
|     uint32_t ix = (uint32_t)CI->getZExtValue();
 | |
|     EEI->replaceAllUsesWith(SVals[ix]);
 | |
|     ToBeDeleted.push_back(EEI);
 | |
|   }
 | |
|   if (genVec) {
 | |
|     Instruction *I;
 | |
|     if (!isa<PHINode>(VI)) {
 | |
|       I = VI;
 | |
|     } else {
 | |
|       I = VI->getParent()->getFirstNonPHI();
 | |
|     }
 | |
|     IRBuilder<> Builder(I);
 | |
|     IGCLLVM::FixedVectorType *VTy = cast<IGCLLVM::FixedVectorType>(VI->getType());
 | |
|     Value *newVec = UndefValue::get(VTy);
 | |
|     for (uint32_t i = 0, e = int_cast<uint32_t>(VTy->getNumElements()); i < e; ++i) {
 | |
|       newVec = Builder.CreateInsertElement(newVec, SVals[i], Builder.getInt32(i), "scalarize");
 | |
|     }
 | |
|     // Replace old instruction with new one
 | |
|     VI->replaceAllUsesWith(newVec);
 | |
|   }
 | |
|   for (uint32_t i = 0; i < ToBeDeleted.size(); ++i) {
 | |
|     ToBeDeleted[i]->eraseFromParent();
 | |
|   }
 | |
| }
 | |
| 
 | |
| void VectorPreProcess::createSplitVectorTypes(Type *ETy, uint32_t NElts, uint32_t SplitSize,
 | |
|                                               SmallVector<std::pair<Type *, uint32_t>, 8> &SplitInfo) {
 | |
|   uint32_t ebytes = int_cast<uint32_t>(m_DL->getTypeSizeInBits(ETy) / 8);
 | |
| 
 | |
|   // todo: generalize splitting for cases whose element size is bigger than splitsize!
 | |
|   if (IGC_IS_FLAG_ENABLED(EnableSplitUnalignedVector)) {
 | |
|     if (ebytes > SplitSize) {
 | |
|       IGC_ASSERT(SplitSize);
 | |
|       uint32_t M = NElts * ebytes / SplitSize;
 | |
|       Type *Ty = IntegerType::get(ETy->getContext(), SplitSize * 8);
 | |
|       SplitInfo.push_back(std::make_pair(Ty, M));
 | |
|       return;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   // Both SplitSize and ebytes shall be a power of 2
 | |
|   IGC_ASSERT(ebytes);
 | |
|   IGC_ASSERT_MESSAGE((SplitSize % ebytes) == 0, "Internal Error: Wrong split size!");
 | |
| 
 | |
|   uint32_t E = SplitSize / ebytes; // split size in elements
 | |
|   uint32_t N = NElts;              // the number of elements to be split
 | |
| 
 | |
|   IGC_ASSERT(E);
 | |
| 
 | |
|   // 1. Make sure splitting it by SplitSize as required
 | |
|   uint32_t M = N / E; // the number of subvectors for split size E
 | |
|   if (M > 0) {
 | |
|     Type *Ty = (E == 1) ? ETy : FixedVectorType::get(ETy, E);
 | |
|     SplitInfo.push_back(std::make_pair(Ty, M));
 | |
|   }
 | |
|   N = N % E;
 | |
|   E = E / 2; // next split size
 | |
| 
 | |
|   // 2. The remaining elts are splitted if not power of 2 until N <= 4.
 | |
|   while (N > 4) {
 | |
|     IGC_ASSERT(E);
 | |
| 
 | |
|     M = N / E; // the number of subvectors for split size E
 | |
|     if (M > 0) {
 | |
|       SplitInfo.push_back(std::make_pair(FixedVectorType::get(ETy, E), M));
 | |
|     }
 | |
|     // The remaining elts are to be split for next iteration.
 | |
|     N = N % E;
 | |
|     E = E / 2; // next split size
 | |
|   }
 | |
| 
 | |
|   // 3. A vector of 1|2|3|4 elements. No further splitting!
 | |
|   if (N > 0) {
 | |
|     Type *Ty = (N == 1) ? ETy : FixedVectorType::get(ETy, N);
 | |
|     SplitInfo.push_back(std::make_pair(Ty, 1));
 | |
|   }
 | |
| }
 | |
| 
 | |
// Split a predicated load's merge value to match the sub-vector layout given
// by SplitInfo, appending one value per sub-piece to NewMergeVals. Handles
// three shapes of OrigMergeVal: trivial constants (zero/undef/poison), a
// real vector that is cut into subvectors, and a scalar that is bitcast to a
// single smaller-element vector.
void VectorPreProcess::createSplitMergeValues(Instruction *Inst, Value *OrigMergeVal,
                                              const SmallVector<std::pair<Type *, uint32_t>, 8> &SplitInfo,
                                              ValVector &NewMergeVals) const {
  // if OrigMergeVal is a zeroinitializer, undef, or poison value, we just need to fill
  // NewMergeVals with the same based on SplitInfo and return.
  if (isa<ConstantAggregateZero>(OrigMergeVal) || isa<UndefValue>(OrigMergeVal) || isa<PoisonValue>(OrigMergeVal)) {
    for (auto &SI : SplitInfo) {
      Type *Ty = SI.first;
      IGCLLVM::FixedVectorType *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
      uint32_t N = SI.second;
      for (uint32_t i = 0; i < N; ++i) {
        Value *NewMergeVal = nullptr;
        if (isa<ConstantAggregateZero>(OrigMergeVal)) {
          // Zero constant: aggregate-zero for vector pieces, scalar null otherwise.
          if (VTy)
            NewMergeVal = ConstantAggregateZero::get(VTy);
          else
            NewMergeVal = Constant::getNullValue(Ty);
        } else if (isa<PoisonValue>(OrigMergeVal)) {
          NewMergeVal = PoisonValue::get(SI.first);
        } else {
          NewMergeVal = UndefValue::get(SI.first);
        }
        NewMergeVals.push_back(NewMergeVal);
      }
    }

    return;
  }

  // Non-trivial merge value: emit extract/insert IR right before Inst.
  IRBuilder<> Builder(Inst);

  // Case when we split vector merge value into subvectors. Element type is the same.
  // Just one big vector is being split into subvectors.
  if (IGCLLVM::FixedVectorType *OrigVTy = dyn_cast<IGCLLVM::FixedVectorType>(OrigMergeVal->getType())) {
    unsigned OrigVTyNEl = OrigVTy->getNumElements();
    uint32_t idx = 0; // index counting elements of the original vector merge value

    // Split the merge value into subvectors
    for (auto &SI : SplitInfo) {
      Type *Ty = SI.first;
      IGCLLVM::FixedVectorType *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
      uint32_t N = SI.second;
      for (uint32_t i = 0; i < N; ++i) {
        Value *NewMergeVal = UndefValue::get(Ty);
        if (VTy) {
          for (uint32_t j = 0, e = int_cast<uint32_t>(VTy->getNumElements()); j < e; ++j) {
            // Lanes past the original vector's length are padded with zero;
            // idx advances only while lanes are drawn from OrigMergeVal.
            Value *Elt = (idx < OrigVTyNEl) ? Builder.CreateExtractElement(OrigMergeVal, Builder.getInt32(idx++))
                                            : Constant::getNullValue(VTy->getElementType());
            NewMergeVal = Builder.CreateInsertElement(NewMergeVal, Elt, Builder.getInt32(j));
          }
        } else {
          // Scalar piece: take the next lane of the original vector as-is.
          NewMergeVal = Builder.CreateExtractElement(OrigMergeVal, Builder.getInt32(idx++));
        }
        NewMergeVals.push_back(NewMergeVal);
      }
    }

    return;
  }

  // Case when we change scalar value into vector with smaller element type.
  IGC_ASSERT_MESSAGE(SplitInfo.size() == 1, "Unexpected split info!");
  IGC_ASSERT_MESSAGE(SplitInfo[0].second == 1, "Unexpected split info!");
  Value *NewMergeVal = Builder.CreateBitCast(OrigMergeVal, SplitInfo[0].first);
  NewMergeVals.push_back(NewMergeVal);
}
 | |
| 
 | |
// Compute the split granularity, in bytes, used to break the vector
// load/store 'I' into sub-vectors.
//
// The default is VPConst::SPLIT_SIZE (RAW_SPLIT_SIZE for ldraw/storeraw).
// Uniform accesses on LSC-capable platforms (or with UniformMemOpt4OW) get
// larger D32/D64 split sizes based on alignment, and uniform DW/QW-element
// loads/stores aligned to >= 16 may be widened up to the platform's maximum
// block-message size.
//
// I  : the load/store (or predicated/raw variant) being split.
// WI : uniformity analysis used to pick the wider uniform split sizes.
// Returns the split size in bytes.
uint32_t VectorPreProcess::getSplitByteSize(Instruction *I, WIAnalysisRunner &WI) const {
  uint32_t bytes = 0;
  std::optional<AbstractLoadInst> ALI = AbstractLoadInst::get(I, *m_DL);
  std::optional<AbstractStoreInst> ASI = AbstractStoreInst::get(I, *m_DL);

  if (isa<LoadInst>(I) || isa<PredicatedLoadIntrinsic>(I)) {
    IGC_ASSERT(ALI.has_value());
    bytes = (uint32_t)VPConst::SPLIT_SIZE;
    // Uniform loads can use wider LSC transposed messages; pick the split
    // size from the pointer alignment (QW-based if >= 8, DW-based if >= 4).
    if (WI.isUniform(ALI->getPointerOperand()) &&
        (m_CGCtx->platform.LSCEnabled() || IGC_GET_FLAG_VALUE(UniformMemOpt4OW))) {
      if (ALI->getAlignment() >= 8)
        bytes = (uint32_t)VPConst::LSC_D64_UNIFORM_SPLIT_SIZE;
      else if (ALI->getAlignment() >= 4)
        bytes = (uint32_t)VPConst::LSC_D32_UNIFORM_SPLIT_SIZE;
    }
  } else if (isa<StoreInst>(I) || isa<PredicatedStoreIntrinsic>(I)) {
    IGC_ASSERT(ASI.has_value());
    bytes = (uint32_t)VPConst::SPLIT_SIZE;
    Value *Addr = ASI->getPointerOperand();
    Value *Data = ASI->getValueOperand();
    // For stores, both the address and the stored data must be uniform to
    // use the wider LSC split sizes.
    if (m_CGCtx->platform.LSCEnabled() && WI.isUniform(Addr) && WI.isUniform(Data)) {
      if (ASI->getAlignment() >= 8)
        bytes = (uint32_t)VPConst::LSC_D64_UNIFORM_SPLIT_SIZE;
      else if (ASI->getAlignment() >= 4)
        bytes = (uint32_t)VPConst::LSC_D32_UNIFORM_SPLIT_SIZE;
    }
  } else if (isa<LdRawIntrinsic>(I) || isa<StoreRawIntrinsic>(I)) {
    uint32_t alignment =
        isa<LdRawIntrinsic>(I) ? cast<LdRawIntrinsic>(I)->getAlignment() : cast<StoreRawIntrinsic>(I)->getAlignment();
    Value *bufferAddr = isa<LdRawIntrinsic>(I) ? cast<LdRawIntrinsic>(I)->getResourceValue()
                                               : cast<StoreRawIntrinsic>(I)->getResourceValue();
    Value *offset = isa<LdRawIntrinsic>(I) ? cast<LdRawIntrinsic>(I)->getOffsetValue()
                                           : cast<StoreRawIntrinsic>(I)->getOffsetValue();
    Value *data = isa<LdRawIntrinsic>(I) ? nullptr : cast<StoreRawIntrinsic>(I)->getStoreValue();
    bytes = (uint32_t)VPConst::RAW_SPLIT_SIZE;
    if (EmitPass::shouldGenerateLSCQuery(*m_CGCtx, I) == Tristate::True) {
      // LSC path: uniform resource+offset (+data for stores) allows the
      // wider D32/D64 uniform split sizes, chosen by alignment.
      if (WI.isUniform(bufferAddr) && WI.isUniform(offset) && (data == nullptr || WI.isUniform(data))) {
        if (alignment >= 8) {
          bytes = (uint32_t)VPConst::LSC_D64_UNIFORM_SPLIT_SIZE;
        } else if (alignment >= 4) {
          bytes = (uint32_t)VPConst::LSC_D32_UNIFORM_SPLIT_SIZE;
        }
      } else {
        bytes = (uint32_t)VPConst::SPLIT_SIZE;
      }
    } else {
      // Legacy (non-LSC) raw messages: 64-bit element vectors are split at
      // QW granularity.
      Type *ValueTy = nullptr;
      if (StoreRawIntrinsic *SRI = dyn_cast<StoreRawIntrinsic>(I)) {
        ValueTy = SRI->getStoreValue()->getType();
      } else {
        ValueTy = I->getType();
      }
      IGCLLVM::FixedVectorType *vecType = dyn_cast_or_null<IGCLLVM::FixedVectorType>(ValueTy);
      if (vecType && m_DL->getTypeSizeInBits(vecType->getScalarType()) == 64) {
        bytes = 8; // use QW load/store
      }
    }
  } else {
    bytes = (uint32_t)VPConst::SPLIT_SIZE;
  }

  // Uniform, 16-byte-or-better aligned plain loads/stores of DW/QW elements
  // may be widened further, up to the platform's max block-message size.
  if ((isa<LoadInst>(I) || isa<StoreInst>(I) || isa<PredicatedLoadIntrinsic>(I) || isa<PredicatedStoreIntrinsic>(I)) &&
      WI.isUniform(I)) {
    auto Alignment = ALI.has_value() ? ALI->getAlignment() : ASI->getAlignment();
    if (Alignment >= 16) {
      Type *ETy = ALI.has_value() ? cast<VectorType>(I->getType())->getElementType()
                                  : cast<VectorType>(ASI->getValueOperand()->getType())->getElementType();

      Value *Ptr = ALI.has_value() ? ALI->getPointerOperand() : ASI->getPointerOperand();
      bool SLM = Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_LOCAL;
      uint32_t ebytes = int_cast<uint32_t>(m_DL->getTypeSizeInBits(ETy) / 8);
      // Limit to DW and QW element types to avoid generating vectors that
      // are too large (ideally, should be <= 32 elements currently).
      if (ebytes == 4 || ebytes == 8) {
        bytes = std::max(bytes, m_CGCtx->platform.getMaxBlockMsgSize(SLM));
      }
    }
  }
  return bytes;
}
 | |
| 
 | |
// Split the vector store 'ASI' into several smaller stores according to the
// split size computed by getSplitByteSize().
//
// The sub-values to store are cached in vecToSubVec[SI][splitSize] so that a
// store fed directly by an already-split load reuses the load's sub-vectors
// without extra insert/extract instructions. After emitting the sub-stores,
// the original store (and any now-dead insertelement chain feeding it) is
// erased.
//
// Returns false (and emits nothing) when no split is needed, true otherwise.
bool VectorPreProcess::splitStore(AbstractStoreInst &ASI, V2SMap &vecToSubVec, WIAnalysisRunner &WI) {
  Instruction *SI = ASI.getInst();
  Value *StoredVal = ASI.getValueOperand();
  IGCLLVM::FixedVectorType *VTy = cast<IGCLLVM::FixedVectorType>(StoredVal->getType());
  Type *ETy = VTy->getElementType();
  uint32_t nelts = int_cast<uint32_t>(VTy->getNumElements());

  // splitInfo: Keep track of all pairs of (sub-vec type, #sub-vec).
  SmallVector<std::pair<Type *, uint32_t>, 8> splitInfo;
  bool isStoreInst = isa<StoreInst>(SI) || isa<PredicatedStoreIntrinsic>(SI);
  uint32_t splitSize = getSplitByteSize(SI, WI);
  if (IGC_IS_FLAG_ENABLED(EnableSplitUnalignedVector)) {
    // byte and word-aligned stores can only store a dword at a time.
    auto alignment = ASI.getAlignment();
    if (isStoreInst && alignment < 4) {
      // Try to prove a better alignment from the pointer computation.
      alignment_t newAlign = (alignment_t)IGCLLVM::getAlignmentValue(getKnownAlignment(ASI.getPointerOperand(), *m_DL));
      if (newAlign > alignment) {
        // For the same reason as Load, use DW-aligned for OCL stateful.
        if (newAlign > 4 && isStatefulAddrSpace(ASI.getPointerAddressSpace())) {
          newAlign = 4;
        }
        ASI.setAlignment(newAlign);
      }
    }
    bool needsDWordSplit =
        (!isStoreInst || m_CGCtx->m_DriverInfo.splitUnalignedVectors() || !WI.isUniform(ASI.getInst())) &&
        ASI.getAlignment() < 4;
    if (needsDWordSplit) {
      splitSize = 4;
    }
  }
  createSplitVectorTypes(ETy, nelts, splitSize, splitInfo);

  // return if no split
  uint32_t len = splitInfo.size();
  if (len == 1 && splitInfo[0].second == 1) {
    return false;
  }

  // Create a new value in the map for store
  ValVector &svals = vecToSubVec[SI][splitSize];
  if (svals.size() == 0) {
    // Need to create splitted values.
    Instruction *insertBeforeInst = nullptr;
    ValVector scalars(nelts, nullptr);
    getOrGenScalarValues(*SI->getParent()->getParent(), StoredVal, scalars, insertBeforeInst);
    // If all scalars are constants, insert right before the store itself.
    insertBeforeInst = insertBeforeInst ? insertBeforeInst : SI;
    IRBuilder<> aBuilder(insertBeforeInst);

    Type *Ty1 = splitInfo[0].first;
    if (IGC_IS_FLAG_ENABLED(EnableSplitUnalignedVector)) {
      // The split element type is narrower than the original element type:
      // bitcast each scalar into a small vector and store its pieces.
      if (m_DL->getTypeSizeInBits(ETy) > m_DL->getTypeSizeInBits(Ty1->getScalarType())) {
        std::vector<Value *> splitScalars;
        IGC_ASSERT(m_DL->getTypeSizeInBits(Ty1->getScalarType()));
        const uint32_t vectorSize =
            (unsigned int)m_DL->getTypeSizeInBits(ETy) / (unsigned int)m_DL->getTypeSizeInBits(Ty1->getScalarType());
        Type *splitType = FixedVectorType::get(Ty1, vectorSize);
        for (uint32_t i = 0; i < nelts; i++) {
          Value *splitInst = aBuilder.CreateBitCast(scalars[i], splitType);
          for (uint32_t j = 0; j < vectorSize; j++) {
            splitScalars.push_back(aBuilder.CreateExtractElement(splitInst, j));
          }
        }
        scalars.resize(splitScalars.size());
        for (uint32_t i = 0; i < splitScalars.size(); i++) {
          scalars[i] = splitScalars[i];
        }
      }
    }

    // Now generate svals
    for (uint32_t i = 0, Idx = 0; i < len; ++i) {
      Type *Ty1 = splitInfo[i].first;
      uint32_t len1 = splitInfo[i].second;
      IGCLLVM::FixedVectorType *VTy1 = dyn_cast<IGCLLVM::FixedVectorType>(Ty1);
      for (uint32_t j = 0; j < len1; ++j) {
        Value *subVec;
        if (!VTy1) {
          // Scalar sub-value: consume one scalar directly.
          subVec = scalars[Idx];
          ++Idx;
        } else {
          // Vector sub-value: assemble it from consecutive scalars.
          subVec = UndefValue::get(Ty1);
          uint32_t n1 = int_cast<uint32_t>(VTy1->getNumElements());
          for (uint32_t k = 0; k < n1; ++k) {
            subVec = aBuilder.CreateInsertElement(subVec, scalars[Idx], aBuilder.getInt32(k));
            ++Idx;
          }
        }
        svals.push_back(subVec);
      }
    }
  }

  Value *Addr = ASI.getPointerOperand();
  auto Align = ASI.getAlignment();
  bool IsVolatile = ASI.getIsVolatile();
  uint32_t eOffset = 0; // running offset, in elements, into the original vector
  uint32_t EBytes = int_cast<unsigned int>(m_DL->getTypeAllocSize(ETy));

  for (uint32_t i = 0, subIdx = 0; i < len; ++i) {
    Type *Ty1 = splitInfo[i].first;
    uint32_t len1 = splitInfo[i].second;
    IGCLLVM::FixedVectorType *VTy1 = dyn_cast<IGCLLVM::FixedVectorType>(Ty1);
    for (uint32_t j = 0; j < len1; ++j) {
      // Alignment of this sub-store is limited by both the original
      // alignment and the byte offset of the sub-vector.
      alignment_t vAlign = (alignment_t)MinAlign(Align, (alignment_t)eOffset * EBytes);
      Value *offsetAddr = ASI.CreateConstScalarGEP(svals[subIdx]->getType(), Addr, eOffset);
      Instruction *newST = ASI.Create(svals[subIdx], offsetAddr, vAlign, IsVolatile);
      eOffset += (VTy1 ? int_cast<uint32_t>(VTy1->getNumElements()) : 1);
      ++subIdx;

      // If this is a new 3-element vector, add it into m_Vector3List
      if (VTy1 && VTy1->getNumElements() == 3) {
        m_Vector3List.push_back(newST);
      }
    }
  }

  // Stores don't require post processing, so remove it as soon as we finish splitting
  vecToSubVec.erase(SI);
  SI->eraseFromParent();

  // Since Load is processed later, stop optimizing if inst is Load.
  // Otherwise walk the (now possibly dead) insertelement chain that built
  // the stored value and erase every use-free instruction on it.
  Instruction *inst = dyn_cast<Instruction>(StoredVal);
  bool keepLI = inst && isAbstractLoadInst(inst) && (vecToSubVec.find(inst) != vecToSubVec.end());
  while (inst && !keepLI && inst->use_empty()) {
    Instruction *next = nullptr;
    if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(inst)) {
      next = dyn_cast<Instruction>(IEI->getOperand(0));
    }

    inst->eraseFromParent();
    inst = next;
    keepLI = inst && isAbstractLoadInst(inst) && (vecToSubVec.find(inst) != vecToSubVec.end());
  }
  return true;
}
 | |
| 
 | |
// Split the vector load 'ALI' into several smaller loads according to the
// split size computed by getSplitByteSize().
//
// The resulting sub-loads are recorded in vecToSubVec[LI][splitSize]; the
// original load is NOT erased here — it is queued in m_Temps so that the
// original vector value is only re-assembled later if a non-store user
// actually needs it. For predicated loads, the merge value is split to match
// via createSplitMergeValues().
//
// Returns false (and emits nothing) when no split is needed, true otherwise.
bool VectorPreProcess::splitLoad(AbstractLoadInst &ALI, V2SMap &vecToSubVec, WIAnalysisRunner &WI) {
  Instruction *LI = ALI.getInst();
  bool isLdRaw = isa<LdRawIntrinsic>(LI);
  bool isPredLd = isa<PredicatedLoadIntrinsic>(LI);
  IGCLLVM::FixedVectorType *VTy = cast<IGCLLVM::FixedVectorType>(LI->getType());
  Type *ETy = VTy->getElementType();
  uint32_t nelts = int_cast<uint32_t>(VTy->getNumElements());

  // Split a vector type into multiple sub-types:
  //       'len0' number of sub-vectors of type 'vecTy0'
  //       'len1' number of sub-vectors of type 'vecTy1'
  //       ...
  // SplitInfo : all pairs, each of which is (sub-vector's type, #sub-vectors).
  SmallVector<std::pair<Type *, uint32_t>, 8> splitInfo;
  uint32_t splitSize = getSplitByteSize(LI, WI);
  if (IGC_IS_FLAG_ENABLED(EnableSplitUnalignedVector)) {
    // byte and word-aligned loads can only load a dword at a time.
    auto alignment = ALI.getAlignment();
    if (!isLdRaw && alignment < 4) {
      // Try to prove a better alignment from the pointer computation.
      alignment_t newAlign = (alignment_t)IGCLLVM::getAlignmentValue(getKnownAlignment(ALI.getPointerOperand(), *m_DL));
      if (newAlign > alignment) {
        //  For OCL stateful, the base can be as little as DW-aligned. To be safe,
        //  need to use DW-aligned. For example,
        //       % 0 = add i32 0, 16
        //       % 4 = inttoptr i32 % 0 to <8 x i16> addrspace(131073) *
        //       %5 = load <8 x i16>, <8 x i16> addrspace(131073) * %4, align 2
        //  newAlign from getKnownAlignment() is 16. But we can only set align to 4 as
        //  the base of this stateful could be just DW-aligned.
        if (newAlign > 4 && isStatefulAddrSpace(ALI.getPointerAddressSpace())) {
          newAlign = 4;
        }
        ALI.setAlignment(newAlign);
      }
    }

    if ((isLdRaw || !WI.isUniform(ALI.getInst())) && ALI.getAlignment() < 4)
      splitSize = 4;
  }
  createSplitVectorTypes(ETy, nelts, splitSize, splitInfo);

  // return if no split
  uint32_t len = splitInfo.size();
  if (len == 1 && splitInfo[0].second == 1) {
    return false;
  }

  // Predicated loads carry a merge value that must be split the same way.
  ValVector splitMergeValues;
  if (isPredLd)
    createSplitMergeValues(LI, cast<PredicatedLoadIntrinsic>(LI)->getMergeValue(), splitInfo, splitMergeValues);

  Value *Addr = ALI.getPointerOperand();
  auto Align = ALI.getAlignment();
  bool IsVolatile = ALI.getIsVolatile();

  uint32_t eOffset = 0; // running offset, in elements, into the original vector
  uint32_t EBytes = int_cast<unsigned int>(m_DL->getTypeAllocSize(ETy));
  uint32_t mergeValueIdx = 0;

  // Create a map entry for LI
  ValVector &svals = vecToSubVec[LI][splitSize];

  for (uint32_t i = 0; i < len; ++i) {
    Type *Ty1 = splitInfo[i].first;
    uint32_t len1 = splitInfo[i].second;
    IGCLLVM::FixedVectorType *VTy1 = dyn_cast<IGCLLVM::FixedVectorType>(Ty1);
    for (uint32_t j = 0; j < len1; ++j) {
      // Alignment of this sub-load is limited by both the original
      // alignment and the byte offset of the sub-vector.
      alignment_t vAlign = (alignment_t)MinAlign(Align, (alignment_t)eOffset * EBytes);
      Value *offsetAddr = ALI.CreateConstScalarGEP(Ty1, Addr, eOffset);
      Value *MergeV = isPredLd ? splitMergeValues[mergeValueIdx++] : nullptr;
      Instruction *I = ALI.Create(Ty1, offsetAddr, vAlign, IsVolatile, MergeV);
      eOffset += (VTy1 ? int_cast<uint32_t>(VTy1->getNumElements()) : 1);

      svals.push_back(I);

      // If this is a new 3-element vector, add it into m_Vector3List
      if (VTy1 && VTy1->getNumElements() == 3) {
        m_Vector3List.push_back(I);
      }
    }
  }

  if (IGC_IS_FLAG_ENABLED(EnableSplitUnalignedVector)) {
    // The sub-loads were emitted with a narrower element type than the
    // original: re-combine groups of narrow scalars into one original-type
    // element via insertelement + bitcast.
    if (m_DL->getTypeSizeInBits(svals[0]->getType()) < m_DL->getTypeSizeInBits(ETy)) {
      const unsigned int denominator = (unsigned int)m_DL->getTypeSizeInBits(svals[0]->getType());
      IGC_ASSERT(0 < denominator);

      const uint32_t scalarsPerElement = (unsigned int)m_DL->getTypeSizeInBits(ETy) / denominator;
      IGC_ASSERT(1 < scalarsPerElement);
      IGC_ASSERT((svals.size() % scalarsPerElement) == 0);

      ValVector mergedScalars;
      IRBuilder<> builder(LI->getParent());
      Instruction *nextInst = LI->getNextNode();
      if (nextInst) {
        builder.SetInsertPoint(nextInst);
      }
      Value *undef = UndefValue::get(FixedVectorType::get(svals[0]->getType(), scalarsPerElement));
      for (uint32_t i = 0; i < svals.size() / scalarsPerElement; i++) {
        Value *newElement = undef;
        for (uint32_t j = 0; j < scalarsPerElement; j++) {
          newElement = builder.CreateInsertElement(newElement, svals[i * scalarsPerElement + j], j);
        }
        mergedScalars.push_back(builder.CreateBitCast(newElement, ETy));
      }
      svals.clear();
      svals.append(mergedScalars.begin(), mergedScalars.end());
    }
  }
  // Put LI in m_Temps for post-processing.
  //
  // LI may be used only in store. If so, no need to re-generate the original
  // vector as load and store will use the same set of sub-vectors. So, we delay
  // generating the original vector until all stores are processed. Doing so,
  // we re-generate the original vector only if it is necessary and thus avoid
  // unnecessary insert/extract instructions.
  m_Temps.push_back(LI);
  return true;
}
 | |
| 
 | |
| bool VectorPreProcess::splitLoadStore(Instruction *Inst, V2SMap &vecToSubVec, WIAnalysisRunner &WI) {
 | |
|   std::optional<AbstractLoadInst> ALI = AbstractLoadInst::get(Inst, *m_DL);
 | |
|   std::optional<AbstractStoreInst> ASI = AbstractStoreInst::get(Inst, *m_DL);
 | |
|   IGC_ASSERT_MESSAGE((ALI || ASI), "Inst should be either load or store");
 | |
|   Type *Ty = ALI ? ALI->getInst()->getType() : ASI->getValueOperand()->getType();
 | |
|   IGCLLVM::FixedVectorType *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
 | |
|   if (!VTy) {
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   if (VTy->getNumElements() == 3) {
 | |
|     // Handle 3-element vector later.
 | |
|     m_Vector3List.push_back(Inst);
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   Value *V = ALI ? ALI->getInst() : ASI->getInst();
 | |
| 
 | |
|   auto InMap = [&vecToSubVec](Value *V) { return vecToSubVec.find(V) != vecToSubVec.end(); };
 | |
| 
 | |
|   // Only LI could be processed already.
 | |
|   bool processed = ALI && InMap(V);
 | |
|   if (processed) {
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   // Do splitting
 | |
| 
 | |
|   // If it is a store and its stored value is from a load that
 | |
|   // has not been splitted yet, then splitting the load first
 | |
|   // so that the stored value will be directly from loaded values
 | |
|   // without adding insert/extract instructions.
 | |
|   std::optional<AbstractLoadInst> aALI =
 | |
|       ASI && !InMap(ASI->getValueOperand()) ? AbstractLoadInst::get(ASI->getValueOperand(), *m_DL) : std::move(ALI);
 | |
| 
 | |
|   if (aALI) {
 | |
|     auto aALIValue = aALI.value();
 | |
|     splitLoad(aALIValue, vecToSubVec, WI);
 | |
|   }
 | |
| 
 | |
|   if (ASI) {
 | |
|     auto ASIValue = ASI.value();
 | |
|     splitStore(ASIValue, vecToSubVec, WI);
 | |
|   }
 | |
| 
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| // For a vector3 whose element size < 4 bytes, will split them into one whose
 | |
| // size is multiple of DW and one whose size is less than DW; If the size is
 | |
| // less than DW, make sure it is either 1 Byte or 2 bytes.  After this, for
 | |
| // vector size < 4, it must be either 1 byte or 2 bytes, never 3  bytes.
 | |
| // This function also splits vector3s with an element size of 8 bytes if
 | |
| // ldraw or storeraw is being used since neither of those messages support
 | |
| // payloads larger than 4 DW.
 | |
| bool VectorPreProcess::splitVector3LoadStore(Instruction *Inst) {
 | |
|   std::optional<AbstractLoadInst> optionalALI = AbstractLoadInst::get(Inst, *m_DL);
 | |
|   AbstractLoadInst *ALI = optionalALI.has_value() ? &optionalALI.value() : nullptr;
 | |
|   std::optional<AbstractStoreInst> optionalASI = AbstractStoreInst::get(Inst, *m_DL);
 | |
|   AbstractStoreInst *ASI = optionalASI.has_value() ? &optionalASI.value() : nullptr;
 | |
| 
 | |
|   std::optional<int> a;
 | |
|   IGC_ASSERT_MESSAGE((optionalALI || optionalASI), "Inst should be either load or store");
 | |
|   Type *Ty = ALI ? ALI->getInst()->getType() : ASI->getValueOperand()->getType();
 | |
|   IGCLLVM::FixedVectorType *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
 | |
|   IGC_ASSERT_MESSAGE(nullptr != VTy, "Inst should be a 3-element vector load/store!");
 | |
|   IGC_ASSERT_MESSAGE(VTy->getNumElements() == 3, "Inst should be a 3-element vector load/store!");
 | |
| 
 | |
|   Type *eTy = VTy->getElementType();
 | |
|   uint32_t etyBytes = int_cast<unsigned int>(m_DL->getTypeAllocSize(eTy));
 | |
|   // total size of vector in bytes;
 | |
|   // uint32_t sz = VTy->getNumElements() * etyBytes;
 | |
|   GenIntrinsicInst *II = dyn_cast<GenIntrinsicInst>(Inst);
 | |
|   bool isStoreRaw = II && (II->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
 | |
|                            II->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed);
 | |
|   bool isPredLoad = isa<PredicatedLoadIntrinsic>(Inst);
 | |
| 
 | |
|   if (etyBytes == 1 || etyBytes == 2 || (etyBytes == 8 && (isa<LdRawIntrinsic>(Inst) || isStoreRaw))) {
 | |
|     IRBuilder<> Builder(Inst);
 | |
|     if (optionalALI) {
 | |
|       Value *Elt0 = NULL;
 | |
|       Value *Elt1 = NULL;
 | |
|       Value *Elt2 = NULL;
 | |
|       bool UseLegacyLdRawMessage =
 | |
|           isa<LdRawIntrinsic>(Inst) && EmitPass::shouldGenerateLSCQuery(*m_CGCtx, Inst) != Tristate::True;
 | |
|       // If alignment is the same as 4-element vector's, it's likely safe
 | |
|       // to make it 4-element load. (always safe ?)
 | |
|       if (ALI->getAlignment() >= 4 * etyBytes &&
 | |
|           // Legacy ldraw message doesn't support 32-byte payloads
 | |
|           !(UseLegacyLdRawMessage && etyBytes == 8)) {
 | |
|         // Make it 4-element load
 | |
|         Type *newVTy = FixedVectorType::get(eTy, 4);
 | |
| 
 | |
|         ValVector splitMergeValues;
 | |
|         if (isPredLoad)
 | |
|           createSplitMergeValues(Inst, cast<PredicatedLoadIntrinsic>(Inst)->getMergeValue(), {{newVTy, 1}},
 | |
|                                  splitMergeValues);
 | |
| 
 | |
|         Value *V = ALI->Create(newVTy, isPredLoad ? splitMergeValues[0] : nullptr);
 | |
| 
 | |
|         Elt0 = Builder.CreateExtractElement(V, Builder.getInt32(0), "elt0");
 | |
|         Elt1 = Builder.CreateExtractElement(V, Builder.getInt32(1), "elt1");
 | |
|         Elt2 = Builder.CreateExtractElement(V, Builder.getInt32(2), "elt2");
 | |
|       } else {
 | |
|         // One 2-element vector load + one scalar load
 | |
|         Type *newVTy = FixedVectorType::get(eTy, 2);
 | |
|         Value *offsetAddr = ALI->CreateConstScalarGEP(eTy, ALI->getPointerOperand(), 2);
 | |
| 
 | |
|         ValVector splitMergeValues;
 | |
|         if (isPredLoad)
 | |
|           createSplitMergeValues(Inst, cast<PredicatedLoadIntrinsic>(Inst)->getMergeValue(), {{newVTy, 1}, {eTy, 1}},
 | |
|                                  splitMergeValues);
 | |
| 
 | |
|         Value *V2 = ALI->Create(newVTy, isPredLoad ? splitMergeValues[0] : nullptr);
 | |
|         Elt0 = Builder.CreateExtractElement(V2, Builder.getInt32(0), "elt0");
 | |
|         Elt1 = Builder.CreateExtractElement(V2, Builder.getInt32(1), "elt1");
 | |
| 
 | |
|         uint32_t newAlign = (uint32_t)MinAlign(ALI->getAlignment(), 2 * etyBytes);
 | |
|         Elt2 = ALI->Create(eTy, offsetAddr, newAlign, ALI->getIsVolatile(), isPredLoad ? splitMergeValues[1] : nullptr);
 | |
|       }
 | |
| 
 | |
|       // A little optimization here
 | |
|       ExtractElementInst *EEInsts[3];
 | |
|       for (int i = 0; i < 3; ++i) {
 | |
|         EEInsts[i] = nullptr;
 | |
|       }
 | |
|       if (isValueUsedOnlyByEEI(ALI->getInst(), EEInsts)) {
 | |
|         if (EEInsts[0] != nullptr) {
 | |
|           EEInsts[0]->replaceAllUsesWith(Elt0);
 | |
|           EEInsts[0]->eraseFromParent();
 | |
|         }
 | |
|         if (EEInsts[1] != nullptr) {
 | |
|           EEInsts[1]->replaceAllUsesWith(Elt1);
 | |
|           EEInsts[1]->eraseFromParent();
 | |
|         }
 | |
|         if (EEInsts[2] != nullptr) {
 | |
|           EEInsts[2]->replaceAllUsesWith(Elt2);
 | |
|           EEInsts[2]->eraseFromParent();
 | |
|         }
 | |
|       } else {
 | |
|         Value *V = Builder.CreateInsertElement(UndefValue::get(VTy), Elt0, Builder.getInt32(0));
 | |
|         V = Builder.CreateInsertElement(V, Elt1, Builder.getInt32(1));
 | |
|         V = Builder.CreateInsertElement(V, Elt2, Builder.getInt32(2));
 | |
|         ALI->getInst()->replaceAllUsesWith(V);
 | |
|       }
 | |
|       ALI->getInst()->eraseFromParent();
 | |
|     } else {
 | |
|       Value *Ptr = ASI->getPointerOperand();
 | |
|       // Split 3-element into 2-element + 1 scalar
 | |
|       Type *newVTy = FixedVectorType::get(eTy, 2);
 | |
|       Value *StoredVal = ASI->getValueOperand();
 | |
|       Value *offsetAddr = ASI->CreateConstScalarGEP(StoredVal->getType(), Ptr, 2);
 | |
|       InsertElementInst *IEInsts[3];
 | |
|       for (int i = 0; i < 3; ++i) {
 | |
|         IEInsts[i] = nullptr;
 | |
|       }
 | |
| 
 | |
|       // vec3 = vec2 + scalar,  newAlign is an alignment for scalar store.
 | |
|       uint32_t newAlign = (uint32_t)MinAlign(ASI->getAlignment(), 2 * etyBytes);
 | |
|       Value *UDVal = UndefValue::get(eTy);
 | |
|       if (isValueCreatedOnlyByIEI(ASI->getInst(), IEInsts)) {
 | |
|         // This case should be most frequent, and want
 | |
|         // to generate a better code by removing dead
 | |
|         // InsertElementInst.
 | |
| 
 | |
|         // Be ware of partial vector store.
 | |
|         Value *V = UndefValue::get(newVTy);
 | |
|         V = Builder.CreateInsertElement(V, (IEInsts[0] != nullptr) ? IEInsts[0]->getOperand(1) : UDVal,
 | |
|                                         Builder.getInt32(0));
 | |
|         V = Builder.CreateInsertElement(V, (IEInsts[1] != nullptr) ? IEInsts[1]->getOperand(1) : UDVal,
 | |
|                                         Builder.getInt32(1));
 | |
|         V = ASI->Create(V);
 | |
| 
 | |
|         // If IEInsts[2] is undefined, skip scalar store.
 | |
|         if (IEInsts[2] != nullptr) {
 | |
|           (void)ASI->Create(IEInsts[2]->getOperand(1), offsetAddr, newAlign, ASI->getIsVolatile());
 | |
|         }
 | |
|         ASI->getInst()->eraseFromParent();
 | |
| 
 | |
|         // Remove all InsertElementInst if possible
 | |
|         bool change = true;
 | |
|         while (change) {
 | |
|           change = false;
 | |
|           for (int i = 0; i < 3; ++i) {
 | |
|             if (IEInsts[i] && IEInsts[i]->use_empty()) {
 | |
|               IEInsts[i]->eraseFromParent();
 | |
|               IEInsts[i] = nullptr;
 | |
|               change = true;
 | |
|             }
 | |
|           }
 | |
|         }
 | |
|       } else {
 | |
|         // Get a 2-element vector and a scalar from the
 | |
|         // 3-element vector and store them respectively.
 | |
|         // Shuffle isn't handled in Emit, use extract/insert instead
 | |
|         Value *Elt0 = Builder.CreateExtractElement(StoredVal, Builder.getInt32(0), "Elt0");
 | |
|         Value *Elt1 = Builder.CreateExtractElement(StoredVal, Builder.getInt32(1), "Elt1");
 | |
|         Value *Elt2 = Builder.CreateExtractElement(StoredVal, Builder.getInt32(2), "Elt2");
 | |
|         Value *V = Builder.CreateInsertElement(UndefValue::get(newVTy), Elt0, Builder.getInt32(0));
 | |
|         V = Builder.CreateInsertElement(V, Elt1, Builder.getInt32(1));
 | |
|         ASI->Create(V);
 | |
|         ASI->Create(Elt2, offsetAddr, newAlign, ASI->getIsVolatile());
 | |
|         ASI->getInst()->eraseFromParent();
 | |
|       }
 | |
|     }
 | |
|     return true;
 | |
|   }
 | |
|   return false;
 | |
| }
 | |
| 
 | |
| // availBeforeInst:
 | |
| //    Indicate that all scalar values of VecVal are available right before
 | |
| //    instruction 'availBeforeInst'. If availBeforeInst is null, it means
 | |
| //    all scalar values are constants.
 | |
| void VectorPreProcess::getOrGenScalarValues(Function &F, Value *VecVal, ValVector &scalars,
 | |
|                                             Instruction *&availBeforeInst) {
 | |
|   availBeforeInst = nullptr;
 | |
| 
 | |
|   IGCLLVM::FixedVectorType *VTy = cast<IGCLLVM::FixedVectorType>(VecVal->getType());
 | |
|   if (!VTy) {
 | |
|     scalars[0] = VecVal;
 | |
|     return;
 | |
|   }
 | |
| 
 | |
|   uint32_t nelts = int_cast<uint32_t>(VTy->getNumElements());
 | |
|   Type *ETy = VTy->getElementType();
 | |
|   if (isa<UndefValue>(VecVal)) {
 | |
|     Value *udv = UndefValue::get(ETy);
 | |
|     for (uint32_t i = 0; i < nelts; ++i) {
 | |
|       scalars[i] = udv;
 | |
|     }
 | |
|   } else if (ConstantVector *CV = dyn_cast<ConstantVector>(VecVal)) {
 | |
|     for (uint32_t i = 0; i < nelts; ++i) {
 | |
|       scalars[i] = CV->getOperand(i);
 | |
|     }
 | |
|   } else if (ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(VecVal)) {
 | |
|     for (uint32_t i = 0; i < nelts; ++i) {
 | |
|       scalars[i] = CDV->getElementAsConstant(i);
 | |
|     }
 | |
|   } else if (ConstantAggregateZero *CAZ = dyn_cast<ConstantAggregateZero>(VecVal)) {
 | |
|     for (uint32_t i = 0; i < nelts; ++i) {
 | |
|       scalars[i] = CAZ->getSequentialElement();
 | |
|     }
 | |
|   } else {
 | |
|     bool genExtract = false;
 | |
|     Value *V = VecVal;
 | |
|     IGC_ASSERT(scalars.size() == nelts);
 | |
|     for (uint32_t i = 0; i < nelts; ++i) {
 | |
|       scalars[i] = nullptr;
 | |
|     }
 | |
|     uint32_t numEltsFound = 0;
 | |
|     while (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
 | |
|       Value *ixVal = IEI->getOperand(2);
 | |
|       ConstantInt *CI = dyn_cast<ConstantInt>(ixVal);
 | |
|       if (!CI) {
 | |
|         genExtract = true;
 | |
|         break;
 | |
|       }
 | |
|       uint32_t ix = int_cast<unsigned int>(CI->getZExtValue());
 | |
|       if (scalars[ix] == nullptr) {
 | |
|         scalars[ix] = IEI->getOperand(1);
 | |
|         ++numEltsFound;
 | |
|       }
 | |
|       if (numEltsFound == nelts) {
 | |
|         break;
 | |
|       }
 | |
|       V = IEI->getOperand(0);
 | |
|     }
 | |
|     // Generate extractelement instructions if not all elements were found.
 | |
|     if (!isa<UndefValue>(V) && numEltsFound != nelts) {
 | |
|       genExtract = true;
 | |
|     }
 | |
| 
 | |
|     BasicBlock::iterator inst_b;
 | |
|     if (Instruction *I = dyn_cast<Instruction>(VecVal)) {
 | |
|       if (auto phi = dyn_cast<PHINode>(I)) {
 | |
|         // avoid inserting between phis
 | |
|         inst_b = phi->getParent()->getFirstInsertionPt();
 | |
|       } else {
 | |
|         inst_b = BasicBlock::iterator(I);
 | |
|         ++inst_b;
 | |
|       }
 | |
|     } else {
 | |
|       // VecVal is an argument or constant
 | |
|       inst_b = F.begin()->getFirstInsertionPt();
 | |
|     }
 | |
| 
 | |
|     IRBuilder<> Builder(&(*inst_b));
 | |
|     for (uint32_t i = 0; i < nelts; ++i) {
 | |
|       if (scalars[i] == nullptr) {
 | |
|         Value *S;
 | |
|         if (genExtract) {
 | |
|           S = Builder.CreateExtractElement(V, Builder.getInt32(i));
 | |
|         } else {
 | |
|           S = UndefValue::get(ETy);
 | |
|         }
 | |
|         scalars[i] = S;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     availBeforeInst = &(*inst_b);
 | |
|   }
 | |
| }
 | |
| 
 | |
// Perform LoadInst/StoreInst simplification. E.g. The following vector load is
// only used by three extractelements with constant indices, so we can narrow
// the load width to 3.
//
// %34 = load <4 x float> addrspace(1)* %33, align 16
// %scalar35 = extractelement <4 x float> %34, i32 0
// %scalar36 = extractelement <4 x float> %34, i32 1
// %scalar47 = extractelement <4 x float> %34, i32 2
//
// %40 = bitcast <4 x float> addrspace(1)* %33 to <3 x float> addrspace(1)*
// %41 = load <3 x float> addrspace(1)* %40, align 16 (keep alignment!)
// %scalar42 = extractelement <3 x float> %41, i32 0
// %scalar43 = extractelement <3 x float> %41, i32 1
// %scalar44 = extractelement <3 x float> %41, i32 2
//
 | |
// Narrows a vector load to the elements actually used, or narrows/shrinks a
// vector store whose value chain only defines a prefix of the vector.
// Returns the replacement instruction, the original Inst when nothing was
// simplified, or nullptr when a store of an undef value was simply erased.
Instruction *VectorPreProcess::simplifyLoadStore(Instruction *Inst) {
  // ---- Load path: Inst is a (possibly predicated / ldrawvector) load. ----
  if (std::optional<AbstractLoadInst> optionalALI = AbstractLoadInst::get(Inst, *m_DL)) {
    bool optReportEnabled = IGC_IS_FLAG_ENABLED(EnableOptReportLoadNarrowing);
    // Appends a "<report>: <from> -> <to>" record to LoadNarrowing.opt in the
    // shader output folder. Only called when the opt-report flag is enabled.
    auto emitOptReport = [&](std::string report, Instruction *from, Instruction *to) {
      std::string strFrom;
      llvm::raw_string_ostream rsoFrom(strFrom);
      from->print(rsoFrom);

      std::string strTo;
      llvm::raw_string_ostream rsoTo(strTo);
      to->print(rsoTo);

      std::stringstream optReportFile;
      optReportFile << IGC::Debug::GetShaderOutputFolder() << "LoadNarrowing.opt";

      std::ofstream optReportStream;
      optReportStream.open(optReportFile.str(), std::ios::app);
      optReportStream << IGC::Debug::GetShaderOutputName() << ": " << report << std::endl
                      << rsoFrom.str() << " ->" << std::endl
                      << rsoTo.str() << std::endl;
    };

    auto ALI = optionalALI.value();
    // Only vector loads with at least 4-byte alignment are candidates.
    if (!Inst->getType()->isVectorTy() || ALI.getAlignment() < 4)
      return Inst;

    // Only narrow loads with element size of 32 bits or more.
    unsigned NBits = int_cast<unsigned>(m_DL->getTypeSizeInBits(Inst->getType()->getScalarType()));
    if (NBits < 32)
      return Inst;

    // BC/DstEltTy are set when the load's single use is a same-element-count
    // vector bitcast; extractelement uses are then collected from the bitcast.
    BitCastInst *BC = nullptr;
    Type *DstEltTy = nullptr;
    // Handle bitcasts patterns like:
    //
    // %41 = call <4 x i32> @llvm.genx.GenISA.ldrawvector.indexed.v4i32.p1v4f32(...)
    // %bc = bitcast <4 x i32> %41 to <4 x float>
    // %42 = extractelement <4 x float> %bc, i32 0
    if (Inst->hasOneUse()) {
      BC = dyn_cast<BitCastInst>(*Inst->users().begin());
      if (BC) {
        IGCLLVM::FixedVectorType *DstVTy = dyn_cast<IGCLLVM::FixedVectorType>(BC->getType());
        IGCLLVM::FixedVectorType *SrcVTy = dyn_cast<IGCLLVM::FixedVectorType>(BC->getOperand(0)->getType());
        if (IGC_IS_FLAG_DISABLED(EnableBitcastedLoadNarrowing) || !DstVTy || !SrcVTy ||
            DstVTy->getNumElements() != SrcVTy->getNumElements()) {
          BC = nullptr;
        } else {
          DstEltTy = DstVTy->getElementType();
        }
      }
    }

    // Collect all uses; bail out unless every use is an extractelement with a
    // constant index, tracking the highest index touched.
    SmallVector<ExtractElementInst *, 8> ConstEEIUses;
    unsigned MaxIndex = 0;
    for (auto U : (BC ? BC : Inst)->users()) {
      auto EEI = dyn_cast<ExtractElementInst>(U);
      if (!EEI || !isa<ConstantInt>(EEI->getIndexOperand()))
        return Inst;

      auto CI = cast<ConstantInt>(EEI->getIndexOperand());
      ConstEEIUses.push_back(EEI);
      MaxIndex = std::max(MaxIndex, int_cast<unsigned>(CI->getZExtValue()));
    }

    // All uses are constant EEI.
    IGC_ASSERT_MESSAGE((BC ? BC : Inst)->hasNUses(ConstEEIUses.size()), "out of sync");

    // FIXME: this is to WA an issue that splitLoadStore does not split
    // vectors of size 5, 6, 7.
    if (MaxIndex + 1 > 4)
      return Inst;

    // If MaxIndex is smaller than <vector_size - 1>, then narrow the size
    // of this vector load to reduce unnecessary memory load.
    //
    // TODO: further optimize this load into a message with channel masks
    // for cases in which use indices are sparse like {0, 2}.
    unsigned N = (unsigned)cast<IGCLLVM::FixedVectorType>(Inst->getType())->getNumElements();
    if (N == MaxIndex + 1)
      return Inst;

    // Check if we can turn a ldrawvector into a ldraw
    Instruction *NewLI = nullptr;
    IRBuilder<> Builder(Inst);
    auto ldrawvec = dyn_cast<LdRawIntrinsic>(Inst);
    // Scalar ldraw replacement is only possible when there is exactly one
    // constant-index extractelement use.
    bool canSimplifyOneUse =
        ldrawvec && isa<VectorType>(ldrawvec->getType()) && (BC ? BC : Inst)->hasOneUse() && !ConstEEIUses.empty();

    // Element 0 needs no offset adjustment.
    bool canSimplifyOneUseZeroIndex =
        canSimplifyOneUse && cast<ConstantInt>(ConstEEIUses.front()->getIndexOperand())->getZExtValue() == 0;

    // There is a known case where narrowing bitcasted ldrawvector to ldraw
    // leads to a corruption. We can still simplify a vector load to
    // a narrow one (e.g. <4 x i32> to <2 x i32> when only 0th elt is used
    // as a float).
    // TODO: remove WA when issue is resolved.
    bool skipSimplifyBitcastedOneUse =
        canSimplifyOneUse && BC && IGC_IS_FLAG_DISABLED(EnableBitcastedLoadNarrowingToScalar);

    // Replaces the single-use ldrawvector with a scalar GenISA_ldraw_indexed
    // of the used element; when calc_offset is set, the element's byte offset
    // is folded into the (static or runtime) buffer offset.
    auto simplifyLDVecToLDRaw = [&](bool calc_offset) {
      auto EE_user = ConstEEIUses.front();
      auto return_type = cast<VectorType>(ldrawvec->getType())->getElementType();
      auto buffer_ptr = ldrawvec->getResourceValue();
      Value *OffsetVal = ldrawvec->getOffsetValue();
      auto alloc_size = (unsigned)m_DL->getTypeAllocSize(return_type);
      if (calc_offset) {
        auto EE_index = (unsigned)cast<ConstantInt>(EE_user->getIndexOperand())->getZExtValue();
        if (isa<ConstantInt>(OffsetVal)) {
          // Calculate static offset
          auto offset = (unsigned)cast<ConstantInt>(OffsetVal)->getZExtValue();
          auto new_offset = offset + (EE_index * alloc_size);
          OffsetVal = Builder.getInt32(new_offset);
        } else {
          // Calculate runtime offset
          OffsetVal = Builder.CreateAdd(OffsetVal, Builder.getInt32(EE_index * alloc_size));
        }
      }
      Type *types[2] = {return_type, buffer_ptr->getType()};
      Value *args[4] = {buffer_ptr, OffsetVal, Builder.getInt32(alloc_size), Builder.getInt1(ldrawvec->isVolatile())};
      Function *newLdRawFunction =
          GenISAIntrinsic::getDeclaration(ldrawvec->getModule(), GenISAIntrinsic::GenISA_ldraw_indexed, types);
      NewLI = Builder.CreateCall(newLdRawFunction, args);
      NewLI->setDebugLoc(EE_user->getDebugLoc());

      if (optReportEnabled) {
        std::string type;
        llvm::raw_string_ostream rsoType(type);
        Inst->getType()->print(rsoType);

        std::stringstream report;
        report << (BC ? "Bitcasted vector" : "Vector") << " load of " << rsoType.str()
               << " is transformed to scalar load";
        if (calc_offset) {
          report << (isa<ConstantInt>(ldrawvec->getOffsetValue()) ? ", static offset added" : ", runtime offset added");
        }
        report << ":";
        emitOptReport(report.str(), Inst, NewLI);
      }

      // Re-apply the element bitcast (now scalar) if the load was bitcasted.
      Value *NewBC = nullptr;
      if (BC) {
        NewBC = Builder.CreateBitCast(NewLI, DstEltTy);
      }
      EE_user->replaceAllUsesWith(BC ? NewBC : NewLI);
      EE_user->eraseFromParent();
      if (BC) {
        BC->eraseFromParent();
      }
    };

    if (canSimplifyOneUseZeroIndex && !skipSimplifyBitcastedOneUse) {
      simplifyLDVecToLDRaw(false);
      return NewLI;
    } else if (canSimplifyOneUse && !skipSimplifyBitcastedOneUse) {
      simplifyLDVecToLDRaw(true);
      return NewLI;
    } else {
      // General case: narrow the vector load to MaxIndex + 1 elements.
      // WA: Do not narrow a bitcasted vector load to 1 elt vector load,
      // choose at least 2 elts vector.
      if (canSimplifyOneUseZeroIndex && skipSimplifyBitcastedOneUse) {
        MaxIndex = 1;
      }

      Type *NewVecTy = FixedVectorType::get(cast<VectorType>(Inst->getType())->getElementType(), MaxIndex + 1);

      // A predicated load needs a merge value of the narrowed type.
      bool isPredLoad = isa<PredicatedLoadIntrinsic>(Inst);
      ValVector splitMergeValues;
      if (isPredLoad)
        createSplitMergeValues(Inst, cast<PredicatedLoadIntrinsic>(Inst)->getMergeValue(), {{NewVecTy, 1}},
                               splitMergeValues);

      NewLI = ALI.Create(NewVecTy, isPredLoad ? splitMergeValues[0] : nullptr);

      if (optReportEnabled) {
        std::string type, narrowedType;
        llvm::raw_string_ostream rsoType(type), rsoNarrowedType(narrowedType);
        Inst->getType()->print(rsoType);
        NewVecTy->print(rsoNarrowedType);

        std::stringstream report;
        report << (BC ? "Bitcasted vector" : "Vector") << " load of " << rsoType.str()
               << " is narrowed to vector load of " << rsoNarrowedType.str();
        if (canSimplifyOneUseZeroIndex && skipSimplifyBitcastedOneUse) {
          report << " (narrowing to scalar load is disabled by WA)";
        }
        report << ":";
        emitOptReport(report.str(), Inst, NewLI);
      }

      // Loop and replace all uses.
      // NewEEI/NewBC cache one extract (and optional bitcast) per index so
      // duplicate-index uses share the same replacement.
      SmallVector<Value *, 8> NewEEI(MaxIndex + 1, nullptr);
      SmallVector<Value *, 8> NewBC(MaxIndex + 1, nullptr);
      for (auto EEI : ConstEEIUses) {
        auto CI = cast<ConstantInt>(EEI->getIndexOperand());
        unsigned Idx = int_cast<unsigned>(CI->getZExtValue());
        if (NewEEI[Idx] == nullptr) {
          NewEEI[Idx] = Builder.CreateExtractElement(NewLI, CI);
          if (BC) {
            NewBC[Idx] = Builder.CreateBitCast(NewEEI[Idx], DstEltTy);
            cast<BitCastInst>(NewBC[Idx])->setDebugLoc(BC->getDebugLoc());
          }
        }
        cast<ExtractElementInst>(NewEEI[Idx])->setDebugLoc(EEI->getDebugLoc());
        EEI->replaceAllUsesWith(BC ? NewBC[Idx] : NewEEI[Idx]);
        EEI->eraseFromParent();
      }
      if (BC) {
        BC->eraseFromParent();
      }
      IGC_ASSERT_MESSAGE(Inst->use_empty(), "out of sync");
      Inst->eraseFromParent();
      return NewLI;
    }
  }

  // ---- Store path: narrow a vector store to the defined prefix. E.g.: ----
  //
  // %2 = insertelement <4 x float> undef, float 1.000000e+00, i32 0
  // %3 = insertelement <4 x float> %2, float 1.000000e+00, i32 1
  // %4 = insertelement <4 x float> %3, float 1.000000e+00, i32 2
  // store <4 x float> %4, <4 x float>* %1, align 16
  //
  // becomes
  //
  // %5 = bitcast <4 x float>* %1 to <3 x float>*
  // %6 = insertelement <3 x float> undef, float 1.000000e+00, i32 0
  // %7 = insertelement <3 x float> %2, float 1.000000e+00, i32 1
  // %8 = insertelement <3 x float> %3, float 1.000000e+00, i32 2
  // store <3 x float> %8, <3 x float>* %5, align 16
  //
  IGC_ASSERT(isAbstractStoreInst(Inst));
  std::optional<AbstractStoreInst> optionalASI = AbstractStoreInst::get(Inst, *m_DL);
  auto ASI = optionalASI.value();
  Value *Val = ASI.getValueOperand();
  // A store of undef has no effect; drop it entirely.
  if (isa<UndefValue>(Val)) {
    Inst->eraseFromParent();
    return nullptr;
  }

  // Same candidacy rules as the load path: vector, >= 4-byte alignment,
  // element size >= 32 bits.
  if (!Val->getType()->isVectorTy() || ASI.getAlignment() < 4)
    return Inst;

  unsigned NBits = int_cast<unsigned>(m_DL->getTypeSizeInBits(Val->getType()->getScalarType()));
  if (NBits < 32)
    return Inst;

  unsigned N = (unsigned)cast<IGCLLVM::FixedVectorType>(Val->getType())->getNumElements();
  // Constant vector: trim trailing undef elements.
  if (auto CV = dyn_cast<ConstantVector>(Val)) {
    unsigned MaxIndex = 0;
    for (unsigned i = N - 1; i != 0; --i) {
      Constant *Item = CV->getAggregateElement(i);
      if (!isa<UndefValue>(Item)) {
        MaxIndex = i;
        break;
      }
    }

    // Fully defined; nothing to trim.
    if (MaxIndex + 1 == N)
      return Inst;

    SmallVector<Constant *, 8> Data(MaxIndex + 1, nullptr);
    for (unsigned i = 0; i <= MaxIndex; ++i) {
      Data[i] = CV->getAggregateElement(i);
    }
    auto SVal = ConstantVector::get(Data);
    Instruction *NewSI = ASI.Create(SVal);
    ASI.getInst()->eraseFromParent();
    return NewSI;
  }

  // Non-constant value: walk the insertelement chain feeding the store and
  // record the last (i.e. live) insert per constant index.
  SmallVector<InsertElementInst *, 8> ConstIEIs(N, nullptr);
  Value *ChainVal = Val;
  int MaxIndex = -1;
  while (auto IEI = dyn_cast<InsertElementInst>(ChainVal)) {
    // Bail out on non-constant indices or once all N lanes were written.
    if (MaxIndex + 1 == (int)N || !isa<ConstantInt>(IEI->getOperand(2))) {
      return Inst;
    }

    // Make sure the last IEI will be recorded if an element is
    // inserted multiple times.
    auto CI = cast<ConstantInt>(IEI->getOperand(2));
    int Idx = (int)CI->getZExtValue();
    if (ConstIEIs[Idx] == nullptr) {
      ConstIEIs[Idx] = IEI;
    }
    MaxIndex = std::max(MaxIndex, Idx);
    ChainVal = IEI->getOperand(0);
  }

  // FIXME: this is to WA an issue that splitLoadStore does not split
  // vectors of size 5, 6, 7.
  if (MaxIndex + 1 > 4)
    return Inst;

  // Inserted less than N values into Undef.
  if (MaxIndex >= 0 && MaxIndex + 1 < (int)N && isa<UndefValue>(ChainVal)) {
    // Rebuild a shorter insertelement chain and store the narrowed vector.
    IRBuilder<> Builder(ASI.getInst());
    Type *NewVecTy = FixedVectorType::get(cast<VectorType>(Val->getType())->getElementType(), MaxIndex + 1);
    Value *SVal = UndefValue::get(NewVecTy);
    for (int i = 0; i <= MaxIndex; ++i) {
      if (ConstIEIs[i] != nullptr) {
        SVal = Builder.CreateInsertElement(SVal, ConstIEIs[i]->getOperand(1), ConstIEIs[i]->getOperand(2));
      }
    }
    Instruction *NewSI = ASI.Create(SVal);
    ASI.getInst()->eraseFromParent();
    return NewSI;
  }

  return Inst;
}
 | |
| 
 | |
| // Replace store instructions like
 | |
| // store i24 %1, i24 addrspace(3)* %2, align 4
 | |
| // or
 | |
| // store i48 %1, i48 addrspace(3)* %2, align 4
 | |
| //
 | |
| // with
 | |
| // store <3 x i8> %3, <3 x i8> addrspace(3)* %4, align 4
 | |
| // or
 | |
| // store <3 x i16> %16, <3 x i16> addrspace(3)* %4, align 4
 | |
| //
 | |
| // to be split later in this pass.
 | |
// Otherwise the later TypeLegalize pass replaces these instructions with a 3-element store.
 | |
| // The same is for i24 and i48 load instructions.
 | |
| //
 | |
| bool VectorPreProcess::processScalarLoadStore(Function &F) {
 | |
|   InstWorkVector list_delete;
 | |
|   for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
 | |
|     Instruction *inst = &*I;
 | |
|     if (isa<StoreInst>(inst) || isa<PredicatedStoreIntrinsic>(inst)) {
 | |
|       std::optional<AbstractStoreInst> optionalASI = AbstractStoreInst::get(inst, *m_DL);
 | |
|       auto ASI = optionalASI.value();
 | |
| 
 | |
|       Type *Ty = ASI.getValueOperand()->getType();
 | |
|       if (Ty->isVectorTy())
 | |
|         continue;
 | |
|       unsigned bitSize = int_cast<unsigned>(m_DL->getTypeSizeInBits(Ty->getScalarType()));
 | |
|       if (bitSize != 24 && bitSize != 48)
 | |
|         continue;
 | |
|       IRBuilder<> Builder(inst);
 | |
|       Type *newScalTy = bitSize == 24 ? Type::getInt8Ty(inst->getContext()) : Type::getInt16Ty(inst->getContext());
 | |
|       Type *newVecTy = IGCLLVM::FixedVectorType::get(newScalTy, 3);
 | |
|       ASI.Create(Builder.CreateBitCast(ASI.getValueOperand(), newVecTy));
 | |
|       list_delete.push_back(inst);
 | |
|     } else if (isa<LoadInst>(inst) || isa<PredicatedLoadIntrinsic>(inst)) {
 | |
|       std::optional<AbstractLoadInst> optionalALI = AbstractLoadInst::get(inst, *m_DL);
 | |
|       auto ALI = optionalALI.value();
 | |
| 
 | |
|       Type *Ty = inst->getType();
 | |
|       if (Ty->isVectorTy())
 | |
|         continue;
 | |
|       unsigned bitSize = int_cast<unsigned>(m_DL->getTypeSizeInBits(Ty->getScalarType()));
 | |
|       if (bitSize != 24 && bitSize != 48)
 | |
|         continue;
 | |
|       IRBuilder<> Builder(inst);
 | |
|       Type *newScalTy = bitSize == 24 ? Type::getInt8Ty(inst->getContext()) : Type::getInt16Ty(inst->getContext());
 | |
|       Type *newVecTy = IGCLLVM::FixedVectorType::get(newScalTy, 3);
 | |
| 
 | |
|       bool isPredLd = isa<PredicatedLoadIntrinsic>(inst);
 | |
|       ValVector splitMergeValues;
 | |
|       if (isPredLd)
 | |
|         createSplitMergeValues(inst, cast<PredicatedLoadIntrinsic>(inst)->getMergeValue(), {{newVecTy, 1}},
 | |
|                                splitMergeValues);
 | |
|       Value *MergeVal = isPredLd ? splitMergeValues[0] : nullptr;
 | |
| 
 | |
|       Value *newVecVal = ALI.Create(newVecTy, MergeVal);
 | |
|       Value *newVal = Builder.CreateBitCast(newVecVal, Ty);
 | |
|       inst->replaceAllUsesWith(newVal);
 | |
|       list_delete.push_back(inst);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   if (list_delete.empty())
 | |
|     return false;
 | |
| 
 | |
|   for (auto i : list_delete) {
 | |
|     i->eraseFromParent();
 | |
|   }
 | |
|   return true;
 | |
| }
 | |
| 
 | |
// Pass entry point: legalizes i24/i48 scalar accesses, simplifies/narrows
// vector loads and stores, splits large vectors, and finally splits 3-element
// vectors where required. Returns true if the function was modified.
bool VectorPreProcess::runOnFunction(Function &F) {
  bool changed = false;
  m_DL = &F.getParent()->getDataLayout();
  m_C = &F.getContext();
  m_CGCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();

  // Step 1: turn i24/i48 scalar loads/stores into 3-element vector accesses.
  changed = processScalarLoadStore(F);

  // Step 2: gather all abstract load/store instructions as work items.
  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
    Instruction *inst = &*I;
    if (isAbstractStoreInst(inst) || isAbstractLoadInst(inst)) {
      m_WorkList.push_back(inst);
    }
  }

  // Simplify loads/stores.
  bool Simplified = false;
  for (unsigned i = 0, n = m_WorkList.size(); i < n; ++i) {
    Instruction *Inst = m_WorkList[i];
    Instruction *NewInst = simplifyLoadStore(Inst);
    // simplifyLoadStore returns the replacement (or nullptr for an erased
    // undef store); keep it in the worklist slot.
    if (NewInst != Inst) {
      m_WorkList[i] = NewInst;
      Simplified = true;
    }
  }

  // Cleanup work items, only keep load and store instructions.
  if (Simplified) {
    changed = true;
    auto new_end = std::remove_if(m_WorkList.begin(), m_WorkList.end(),
                                  [](Value *V) { return !V || (!isAbstractStoreInst(V) && !isAbstractLoadInst(V)); });
    m_WorkList.erase(new_end, m_WorkList.end());
  }

  // Split vectors
  if (m_WorkList.size() > 0) {
    // Maps each original vector value to its split sub-vector values.
    V2SMap vecToSubVec;

    // m_Temps is used to keep loads that needs post-processing.
    m_Temps.clear();

    {
      // WIAnalysis (uniformity info) is needed by splitLoadStore; run it in a
      // scope so its helpers are released before post-processing.
      auto *MDUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
      auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
      auto *PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
      auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
      auto *ModMD = getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();

      TranslationTable TT;
      TT.run(F);
      WIAnalysisRunner WI(&F, LI, DT, PDT, MDUtils, m_CGCtx, ModMD, &TT);
      WI.run();

      for (uint32_t i = 0; i < m_WorkList.size(); ++i) {
        if (splitLoadStore(m_WorkList[i], vecToSubVec, WI)) {
          changed = true;
        }
      }
    }

    // Now, do post-processing for the split loads: rebuild scalar values from
    // the sub-vectors and replace the original load's uses.
    for (uint32_t i = 0; i < m_Temps.size(); ++i) {
      Value *V = m_Temps[i];
      std::optional<AbstractLoadInst> ALI = AbstractLoadInst::get(V, *m_DL);
      if (!ALI) {
        continue;
      }
      Instruction *LI = ALI.value().getInst();

      for (auto &it : vecToSubVec[LI]) {
        ValVector &svals = it.second;
        if (!LI->use_empty()) {
          // Flatten all sub-vector values into a list of scalars.
          ValVector Scalars;
          IRBuilder<> Builder(LI);
          for (uint32_t j = 0; j < svals.size(); ++j) {
            Type *Ty1 = svals[j]->getType();
            IGCLLVM::FixedVectorType *VTy1 = dyn_cast<IGCLLVM::FixedVectorType>(Ty1);
            if (VTy1) {
              for (uint32_t k = 0; k < VTy1->getNumElements(); ++k) {
                Value *S = Builder.CreateExtractElement(svals[j], Builder.getInt32(k), "split");
                Scalars.push_back(S);
              }
            } else {
              Scalars.push_back(svals[j]);

              // svals[j] will be no longer needed, set it to null
              // to prevent double-deleting later
              svals[j] = nullptr;
            }
          }
          // Replace LI
          replaceAllVectorUsesWithScalars(LI, Scalars);

          // Remove any dead scalars
          for (uint32_t j = 0; j < Scalars.size(); ++j) {
            if (Scalars[j]->use_empty()) {
              Instruction *tInst = cast<Instruction>(Scalars[j]);
              tInst->eraseFromParent();
            }
          }
        }

        // Remove any dead sub vectors
        for (uint32_t j = 0; j < svals.size(); ++j) {
          if (svals[j] == nullptr) {
            continue;
          }
          Instruction *tInst = cast<Instruction>(svals[j]);
          if (tInst->use_empty()) {
            // If this is a 3-element vector load, remove it
            // from m_Vector3List as well.
            if (isAbstractLoadInst(tInst) && tInst->getType()->isVectorTy() &&
                cast<IGCLLVM::FixedVectorType>(tInst->getType())->getNumElements() == 3) {
              InstWorkVector::iterator tI = m_Vector3List.begin(), tE = m_Vector3List.end();
              for (; tI != tE; ++tI) {
                Instruction *tmp = *tI;
                if (tmp == tInst) {
                  break;
                }
              }
              if (tI != m_Vector3List.end()) {
                m_Vector3List.erase(tI);
              }
            }

            tInst->eraseFromParent();
          }
        }
      }

      // Done with load splits, remove the original load inst
      if (LI->use_empty()) {
        vecToSubVec.erase(LI);
        LI->eraseFromParent();
      }
    }

    // Last, split 3-element vector if necessary
    for (uint32_t i = 0; i < m_Vector3List.size(); ++i) {
      if (splitVector3LoadStore(m_Vector3List[i])) {
        changed = true;
      }
    }

    // Reset per-function state for the next invocation of this pass.
    vecToSubVec.clear();
    m_Vector3List.clear();
    m_WorkList.clear();
  }
  return changed;
}
 | 
