/*========================== begin_copyright_notice ============================
Copyright (C) 2017 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "Compiler/CISACodeGen/VectorProcess.hpp"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/CISACodeGen/EmitVISAPass.hpp"
#include "Compiler/IGCPassSupport.h"
#include "common/IGCIRBuilder.h"
#include "common/LLVMWarningsPush.hpp"
#include "llvmWrapper/Support/Alignment.h"
#include "llvmWrapper/IR/DerivedTypes.h"
#include <llvm/IR/DataLayout.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/InstIterator.h>
#include "common/LLVMWarningsPop.hpp"
#include "Probe/Assertion.h"
using namespace llvm;
using namespace IGC;
using IGCLLVM::FixedVectorType;
//
// Description of VectorProcess Pass
// This pass makes the data layout of vectors explicit by inserting bitcasts.
// These bitcasts have a special meaning and must not be deleted. They are
// inserted right before vISA emission so that most codegen passes do not
// need to special-handle them.
//
// We assume that a vector type (in LLVM IR) is in a "packed form": when
// several work items (the LLVM IR describes a single work item) are grouped
// into a single thread, the elements of a vector in LLVM IR are no longer
// consecutive in the GRF. For example, given <n x T> v, its vISA variable
// under SIMD8 (8 work items grouped into a single thread) is laid out as
// follows (for readability, C variables and C's struct layout are used):
//      struct { T c0, c1, c2, c3, c4, c5, c6, c7 } visaVar[n];
// where c0, c1, ..., c7 hold the values for SIMD lanes 0 -- 7,
// respectively. For example, assuming the original work item 0 is at SIMD
// lane 0, its vector v occupies
//      visaVar[0].c0, visaVar[1].c0, visaVar[2].c0, ..., visaVar[n-1].c0,
// which are no longer consecutive in visaVar.
//
// This layout cannot always be produced efficiently by gathers/scatters.
// For example, <16 x i8> can be read with 16 one-byte byte-scattered reads,
// each reading one byte per lane; but <16 x i8> can also be viewed as
// <4 x i32>, and a single gather4 then fetches the entire <4 x i32>. Thus,
// to use an efficient message, the original vector can be re-laid out as a
// different vector type that maps to send messages more efficiently. This
// "re-layout" has a cost: we may have to generate mov instructions
// (possibly many of them), as shown below:
// <16xi8> v
// struct { i8 c0, c1, ..., c7 } visaVar_v[16];
//    Note: this array of structs is what IGC requires (referred to as the
//    packed form).
//
// <4xi32> v_as4xi32
// struct { i32 c0, c1, ..., c7 } visaVar_v_as4xi32[4]; or
// struct { i8 c0[4], c1[4], ..., c7[4] } visaVar_v_as4xi32[4];
//    Note: each element of the array is actually a struct of arrays!
// visaVar_v_as4xi32 = gather4 &v
//
//
// To convert <4xi32> back to <16xi8> (required as packed-form), the
// following is needed:
// for(i=0; i < 4; ++i)
// for(j=0; j < 4; ++j)
// visaVar_v[i*4 + j].c0 = visaVar_v_as4xi32[i].c0[j];
// ......
// visaVar_v[i*4 + j].c7 = visaVar_v_as4xi32[i].c7[j];
// and this takes 4 * 4 * 8 = 128 mov instructions!
//
//
// To generate those mov instructions explicitly, we insert a bitcast between
// the original vector and the one we want to use for the load/store; this
// bitcast essentially emits movs similar to the conversion code shown above.
// We call this bitcast a re-data-layout bitcast. The following is the code
// generated for this explicit bitcast (done by emitVectorBitCast):
// before: %v = load <16xi8>* p
//
// after: %np = bitcast p to <4 x i32>*
// %nv = load <4 x i32>* np
// %v = bitcast nv to <16 x i8> <<--- re-data-layout bitcast
//
// Since this could potentially generate a lot of movs (which may be
// optimized away), these bitcasts are inserted only when needed.
//
// ** Note: at this point, the size of a vector is guaranteed to be 1 byte,
// ** 2 bytes, or a multiple of DW. This is ensured by VectorPreProcess
// ** (a vector such as <3 x i8> cannot be mapped to a single send message
// ** and has to be split; VectorPreProcess splits it so that we don't have
// ** to worry about splitting vectors here).
//
// Given a vector <n x T>, the type used for the load/store is computed
// "conceptually" as follows. Note that if sizeof(T) is 4 or 8, we normally
// do not need any conversion at all (with exceptions when the load/store is
// misaligned). Keep in mind that sizeof(T)*n is 1, 2, or a multiple of DW.
// if (n * sizeof(T) < 4 bytes) {
// <n x T> ---> S; where S is the scalar type whose size == n * sizeof(T);
// } else if ( (sizeof(T) != 4 && Using A32 message ) ||
// (sizeof(T) != 4|8 && Using A64 message) ) {
//
// <n x T> --> <n1 x i64> : sizeof(T) == 8 && A64 messages; or
// <n1 x i32> : otherwise
// }
//
// For example,
// (1) %1 = load <8 x i16> *p
// converted into
// new_p = bitcast p to <4 x i32>*
// %2 = load <4 x i32> *new_p
// %1 = bitcast %2 to <8 x i16>
//
// (2) %1 = load <4 x i64> *p
// Using A32, converted into
// new_p = bitcast p to <8 x i32>*
// %2 = load <8 x i32> *new_p
// %1 = bitcast %2 to <4 x i64>
//
// Using A64, do nothing.
//
namespace {
class VectorProcess : public FunctionPass {
public:
typedef SmallVector<Instruction *, 32> InstWorkVector;
static char ID; // Pass identification, replacement for typeid
VectorProcess()
: FunctionPass(ID), m_DL(nullptr), m_C(nullptr), has_8Byte_A64_BS(true), has_QW_BTS_GS(false), m_WorkList() {
initializeVectorProcessPass(*PassRegistry::getPassRegistry());
}
StringRef getPassName() const override { return "VectorProcess"; }
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<CodeGenContextWrapper>();
}
private:
bool reLayoutLoadStore(Instruction *Inst);
bool optimizeBitCast(BitCastInst *BC);
Value *ProcessMergeValue(Instruction *Inst, Value *V, Type *NewTy, Type *NewIntETy, Type *NewIntTy) const;
private:
const DataLayout *m_DL;
LLVMContext *m_C;
bool has_8Byte_A64_BS; // true if 8-byte A64 Byte scattered is supported
bool has_QW_BTS_GS; // true if qword BTS Gather/Scatter is supported
InstWorkVector m_WorkList;
};
} // namespace
// Register pass to igc-opt
#define PASS_FLAG "igc-vectorprocess"
#define PASS_DESCRIPTION "Process vector loads/stores for explicit vISA variable layout"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(VectorProcess, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_END(VectorProcess, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
char VectorProcess::ID = 0;
FunctionPass *IGC::createVectorProcessPass() { return new VectorProcess(); }
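// reLayoutLoadStore() rewrites a single load/store (a plain LoadInst/StoreInst,
// a raw-indexed GenISA intrinsic, or a predicated load/store intrinsic) so that
// it accesses an equivalent scalar/DW/QW-element type, inserting the
// re-data-layout bitcasts described above. It returns true if the instruction
// was rewritten and false if no conversion is needed.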
bool VectorProcess::reLayoutLoadStore(Instruction *Inst) {
LoadInst *const LI = dyn_cast<LoadInst>(Inst);
StoreInst *const SI = dyn_cast<StoreInst>(Inst);
GenIntrinsicInst *const II = dyn_cast<GenIntrinsicInst>(Inst);
Value *Ptr = nullptr;
Type *Ty = nullptr;
if (nullptr != LI) {
Ptr = LI->getPointerOperand();
Ty = LI->getType();
} else if (nullptr != SI) {
IGC_ASSERT(0 < SI->getNumOperands());
IGC_ASSERT(nullptr != SI->getOperand(0));
Ptr = SI->getPointerOperand();
Ty = SI->getOperand(0)->getType();
} else {
IGC_ASSERT(nullptr != II);
IGC_ASSERT(0 < II->getNumOperands());
IGC_ASSERT(nullptr != II->getOperand(0));
Ptr = II->getOperand(0);
if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldrawvector_indexed ||
II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldraw_indexed ||
II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad) {
Ty = II->getType();
} else if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
II->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed) {
IGC_ASSERT(2 < IGCLLVM::getNumArgOperands(II));
IGC_ASSERT(nullptr != II->getArgOperand(2));
Ty = II->getArgOperand(2)->getType();
} else if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore) {
IGC_ASSERT(1 < IGCLLVM::getNumArgOperands(II));
IGC_ASSERT(nullptr != II->getArgOperand(1));
Ty = II->getArgOperand(1)->getType();
} else {
IGC_ASSERT_MESSAGE(0, "Internal Error: unknown intrinsic");
}
}
IGC_ASSERT(nullptr != Ptr);
IGC_ASSERT(nullptr != Ty);
IGCLLVM::FixedVectorType *const VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
// Treat a scalar as 1-element vector
uint32_t nelts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
Type *eTy = VTy ? VTy->getElementType() : Ty;
uint32_t eTyBits = int_cast<unsigned int>(m_DL->getTypeSizeInBits(eTy));
IGC_ASSERT_MESSAGE((eTyBits == 8 || eTyBits == 16 || eTyBits == 32 || eTyBits == 64),
"the Size of Vector element must be 8/16/32/64 bits.");
uint32_t eTyBytes = (eTyBits >> 3);
uint32_t TBytes = nelts * eTyBytes; // Total size in bytes
//
// Assumption:
//   1. if the size of the vector is < 4 bytes, it must be 1 or 2 bytes (never 3);
//   2. if the size of the vector is >= 4 bytes, it must be a multiple of DW.
// Both assumptions are guaranteed by VectorPreProcess.
//
// So far, we use A32 untyped and byte-scattered messages, and A64 scattered
// and A64 untyped messages.
//
// A32: use DW as the new element type.
// A64: the new element type will be:
//        unaligned load/store: DW if there is no 8-byte A64 byte-scattered
//                              message, QW otherwise;
//        aligned vector of long type: use QW;
//        others: use DW.
// Vectors whose size is smaller than 4 bytes must be converted to a
// 1-element vector (or scalar) so that all elements are read/written with
// a single message.
//
Type *new_eTy;
uint32_t new_nelts;
PointerType *PtrTy = cast<PointerType>(Ptr->getType());
if (TBytes == 1) {
IGC_ASSERT_MESSAGE(nelts == 1, "Internal Error: something wrong");
return false;
} else if (TBytes == 2 || TBytes == 4) {
if (nelts == 1) {
// No conversion needed.
return false;
}
new_nelts = 1;
new_eTy = (TBytes == 2) ? Type::getInt16Ty(*m_C) : Type::getInt32Ty(*m_C);
} else {
// This handles all the other cases
CodeGenContext *cgCtx = nullptr;
cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
bool useA64 = IGC::isA64Ptr(PtrTy, cgCtx);
bool useBSS = IGC::DecodeBufferType(PtrTy->getAddressSpace()) == IGC::BINDLESS;
alignment_t align;
if (LI) {
align = IGCLLVM::getAlignmentValue(LI);
} else if (SI) {
align = IGCLLVM::getAlignmentValue(SI);
} else if (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad) {
align = cast<ConstantInt>(II->getArgOperand(1))->getZExtValue();
} else if (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore) {
align = cast<ConstantInt>(II->getArgOperand(2))->getZExtValue();
} else {
align = 1;
}
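// Decide between QW (i64) and DW (i32) as the new element type:
//   - A64: QW when the total size is a multiple of 8 bytes and either the
//     platform has the 8-byte A64 byte-scattered message (unaligned case)
//     or the elements are aligned qwords;
//   - bindless (BSS): QW only for a scalar aligned qword when the QW BTS
//     gather/scatter message is available;
//   - LSC: QW whenever the element itself is 8 bytes.
// Everything else is re-laid out with DW elements.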
bool useQW = false;
if (useA64) {
useQW = (TBytes % 8 == 0) && ((has_8Byte_A64_BS && align < 4) || (eTyBytes == 8U && align >= 8U));
} else if (useBSS) {
useQW = has_QW_BTS_GS && nelts == 1 && (eTyBytes == 8U && align >= 8U);
}
if (EmitPass::shouldGenerateLSCQuery(*cgCtx, Inst) == Tristate::True) {
// With LSC, want to use QW if element size is 8 bytes.
useQW = (eTyBytes == 8);
}
const uint32_t new_eTyBytes = useQW ? 8 : 4;
if (eTyBytes == new_eTyBytes && !eTy->isAggregateType()) {
// The original vector is already a good one. Skip.
return false;
}
new_eTy = useQW ? Type::getInt64Ty(*m_C) : Type::getInt32Ty(*m_C);
IGC_ASSERT(new_eTyBytes);
IGC_ASSERT_MESSAGE((TBytes % new_eTyBytes) == 0, "Wrong new vector size");
new_nelts = TBytes / new_eTyBytes;
}
IGCIRBuilder<> Builder(Inst);
Type *newVTy;
if (new_nelts == 1) {
newVTy = new_eTy;
} else {
newVTy = FixedVectorType::get(new_eTy, new_nelts);
}
Type *newPtrTy = PointerType::get(newVTy, PtrTy->getPointerAddressSpace());
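// Recreate the pointer with the new pointee type. If the pointer came
// directly from an inttoptr, fold the cast into a new inttoptr instead of
// stacking a pointer bitcast on top of it.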
Value *newPtr;
if (IntToPtrInst *i2p = dyn_cast<IntToPtrInst>(Ptr)) {
newPtr = Builder.CreateIntToPtr(i2p->getOperand(0), newPtrTy, "IntToPtr2");
} else {
newPtr = Builder.CreateBitCast(Ptr, newPtrTy, "vptrcast");
}
// These types are needed when we are dealing with pointers
// and using ptrtoint and inttoptr.
Type *int_eTy = Type::getIntNTy(*m_C, eTyBits);
Type *new_intTy = VTy ? FixedVectorType::get(int_eTy, nelts) : int_eTy;
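// Load path: emit the new load (or GenISA_PredicatedLoad) of newVTy, copy the
// metadata of the old instruction, and convert the result back to the original
// type; pointer elements go through a bitcast to integers followed by
// per-element inttoptr.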
if (LI || (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad)) {
Instruction *oldLoad = LI ? cast<Instruction>(LI) : cast<Instruction>(II);
Instruction *load;
if (LI) {
load = Builder.CreateAlignedLoad(newVTy, newPtr, IGCLLVM::getCorrectAlign(IGCLLVM::getAlignmentValue(LI)),
LI->isVolatile(), "vCastload");
} else {
Type *types[] = {newVTy, newPtrTy, newVTy};
Function *F = GenISAIntrinsic::getDeclaration(II->getParent()->getParent()->getParent(),
GenISAIntrinsic::GenISA_PredicatedLoad, types);
load = Builder.CreateCall4(F, newPtr, II->getOperand(1), II->getOperand(2),
ProcessMergeValue(Inst, II->getOperand(3), newVTy, int_eTy, new_intTy));
}
load->copyMetadata(*oldLoad);
Value *V = load;
if (eTy->isPointerTy()) {
// Cannot bitcast int to ptr; need to use inttoptr.
// First, bitcast the loaded value to a vector type that matches the
// original vector type but has the pointer element type replaced with
// an integer element type.
// Second, inttoptr-cast the result to the original vector type.
V = Builder.CreateBitCast(V, new_intTy);
if (VTy) {
// If we need a vector inttoptr, scalarize it here.
auto *BC = V;
V = UndefValue::get(Ty);
for (unsigned i = 0; i < nelts; i++) {
auto *EE = Builder.CreateExtractElement(BC, i);
auto *ITP = Builder.CreateIntToPtr(EE, eTy);
V = Builder.CreateInsertElement(V, ITP, i);
}
} else {
V = Builder.CreateIntToPtr(V, Ty);
}
} else {
// TODO: if Ty is an aggregate type, this bitcast contradicts the LLVM spec
V = Builder.CreateBitCast(V, Ty);
}
oldLoad->replaceAllUsesWith(V);
oldLoad->eraseFromParent();
} else if (SI || (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore)) {
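// Store path: convert the stored value to newVTy (pointer elements are
// scalarized through ptrtoint first), then emit the new plain, aligned, or
// predicated store.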
Instruction *oldStore = SI ? cast<Instruction>(SI) : cast<Instruction>(II);
Value *StoreVal = SI ? SI->getValueOperand() : II->getArgOperand(1);
Value *V;
if (eTy->isPointerTy()) {
// Similar to the load: first, ptrtoint-cast to a new integer vector,
// then bitcast to the new store type.
Type *int_eTy = Type::getIntNTy(*m_C, eTyBits);
if (VTy) {
// If we need a vector inttoptr, scalarize it here.
V = UndefValue::get(FixedVectorType::get(int_eTy, nelts));
for (unsigned i = 0; i < nelts; i++) {
auto *EE = Builder.CreateExtractElement(StoreVal, i);
auto *ITP = Builder.CreatePtrToInt(EE, int_eTy);
V = Builder.CreateInsertElement(V, ITP, i);
}
} else if (isa<IntToPtrInst>(StoreVal) && cast<IntToPtrInst>(StoreVal)->getOperand(0)->getType() == int_eTy) {
// Detect the case where creating PtrToInt and BitCast instructions
// is not needed: the store value was created from a vector with the
// same type as the target vector type.
//
// e.g. example from a Vulkan shader with variable pointers:
// Before:
// %7 = bitcast <2 x i32> %assembled.vect7 to i64
// %Temp-26.i.VP = inttoptr i64 %7 to i32 addrspace(1179648)*
// store i32 addrspace(1179648)* %Temp-26.i.VP, i32 addrspace(1179648)** %6, align 8
// After:
// store <2 x i32> %assembled.vect7, <2 x i32>* %vptrcast, align 8
V = cast<IntToPtrInst>(StoreVal)->getOperand(0);
} else {
V = Builder.CreatePtrToInt(StoreVal, int_eTy);
}
if (isa<BitCastInst>(V) && (cast<BitCastInst>(V)->getOperand(0)->getType() == newVTy)) {
V = cast<BitCastInst>(V)->getOperand(0);
} else {
V = Builder.CreateBitCast(V, newVTy);
}
} else {
V = Builder.CreateBitCast(StoreVal, newVTy);
}
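// Emit the new store: an unaligned or aligned StoreInst for a plain store, or
// a GenISA_PredicatedStore call that forwards the remaining original operands.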
Instruction *store = nullptr;
if (SI && IGCLLVM::getAlignmentValue(SI) == 0) {
store = Builder.CreateStore(V, newPtr, SI->isVolatile());
} else if (SI) {
store = Builder.CreateAlignedStore(V, newPtr, IGCLLVM::getAlign(*SI), SI->isVolatile());
} else {
Type *types[] = {newPtrTy, newVTy};
Function *F = GenISAIntrinsic::getDeclaration(II->getParent()->getParent()->getParent(),
GenISAIntrinsic::GenISA_PredicatedStore, types);
store = Builder.CreateCall4(F, newPtr, V, II->getOperand(2), II->getOperand(3));
}
store->copyMetadata(*oldStore);
oldStore->eraseFromParent();
} else if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldrawvector_indexed ||
II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldraw_indexed) {
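// Raw-load path: re-emit the access as GenISA_ldrawvector_indexed with the new
// value/pointer types, then cast the result back to the original type.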
Type *types[] = {newVTy, newPtrTy};
Function *F = GenISAIntrinsic::getDeclaration(II->getParent()->getParent()->getParent(),
GenISAIntrinsic::GenISA_ldrawvector_indexed, types);
Value *V = Builder.CreateCall4(F, newPtr, II->getOperand(1), II->getOperand(2), II->getOperand(3));
if (eTy->isPointerTy()) {
Type *intETy = Type::getIntNTy(*m_C, eTyBits);
Type *newIntTy = VTy ? IGCLLVM::FixedVectorType::get(intETy, nelts) : intETy;
V = Builder.CreateBitCast(V, newIntTy);
V = Builder.CreateIntToPtr(V, Ty);
} else {
V = Builder.CreateBitCast(V, Ty);
}
II->replaceAllUsesWith(V);
II->eraseFromParent();
} else {
IGC_ASSERT(II->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
II->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed);
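// Raw-store path: cast the stored value (via ptrtoint for pointer elements) to
// newVTy and re-emit the access as GenISA_storerawvector_indexed.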
Type *types[] = {newPtrTy, newVTy};
Function *F = GenISAIntrinsic::getDeclaration(II->getParent()->getParent()->getParent(),
GenISAIntrinsic::GenISA_storerawvector_indexed, types);
Value *V;
if (eTy->isPointerTy()) {
Type *intETy = Type::getIntNTy(*m_C, eTyBits);
Type *newIntTy = VTy ? IGCLLVM::FixedVectorType::get(intETy, nelts) : intETy;
V = Builder.CreatePtrToInt(II->getOperand(2), newIntTy);
V = Builder.CreateBitCast(V, newVTy);
} else {
V = Builder.CreateBitCast(II->getOperand(2), newVTy);
}
Builder.CreateCall5(F, newPtr, II->getOperand(1), V, II->getOperand(3), II->getOperand(4));
II->eraseFromParent();
}
return true;
}
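// optimizeBitCast() folds bitcast chains left behind by reLayoutLoadStore.
// For instance (types are illustrative only):
//     %b2 = bitcast <4 x i32> %A to <8 x i16>
//     %b1 = bitcast <8 x i16> %b2 to <16 x i8>
// users of %b1 are rewritten to use a single bitcast of %A (or %A itself when
// the types already match). Newly created bitcasts are pushed back onto
// m_WorkList so that they are processed again.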
bool VectorProcess::optimizeBitCast(BitCastInst *BC) {
bool change = false;
Value *Src = BC->getOperand(0);
Type *SrcTy = Src->getType();
Type *Ty = BC->getType();
if (Ty == SrcTy) {
BC->replaceAllUsesWith(Src);
return true;
}
// Only handle non-pointer bitcasts
if (isa<PointerType>(Ty) || isa<PointerType>(SrcTy)) {
return false;
}
for (Value::user_iterator UI = BC->user_begin(), UE = BC->user_end(); UI != UE; ++UI) {
if (BitCastInst *Inst = dyn_cast<BitCastInst>(*UI)) {
IRBuilder<> Builder(Inst);
Type *Ty1 = Inst->getType();
if (SrcTy == Ty1) {
Inst->replaceAllUsesWith(Src);
} else {
BitCastInst *nBC = (BitCastInst *)Builder.CreateBitCast(Src, Ty1);
Inst->replaceAllUsesWith(nBC);
// Add nBC so it will be processed again.
m_WorkList.push_back(nBC);
}
change = true;
}
}
return change;
}
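// runOnFunction() drives the pass: collect every load/store and every
// raw/predicated load/store intrinsic, re-layout each of them, and, if
// anything changed, fold and erase the redundant bitcasts the rewrite
// produced.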
bool VectorProcess::runOnFunction(Function &F) {
CodeGenContext *cgCtx = nullptr;
cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
bool changed = false;
m_DL = &F.getParent()->getDataLayout();
m_C = &F.getContext();
has_8Byte_A64_BS = cgCtx->platform.has8ByteA64ByteScatteredMessage();
has_QW_BTS_GS = cgCtx->platform.hasQWGatherScatterBTSMessage();
// Adjust load/store layout by inserting bitcasts.
// Those bitcasts should not be optimized away.
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
Instruction *inst = &*I;
if (isa<LoadInst>(inst) || isa<StoreInst>(inst)) {
m_WorkList.push_back(inst);
} else if (GenIntrinsicInst *intrin = dyn_cast<GenIntrinsicInst>(inst)) {
if (intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_ldrawvector_indexed ||
intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_ldraw_indexed ||
intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed ||
intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad ||
intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore) {
m_WorkList.push_back(inst);
}
}
}
for (unsigned i = 0; i < m_WorkList.size(); ++i) {
if (reLayoutLoadStore(m_WorkList[i])) {
changed = true;
}
}
m_WorkList.clear();
// Remove unnecessary bitcasts.
if (changed) {
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
Instruction *inst = &*I;
if (isa<BitCastInst>(inst)) {
m_WorkList.push_back(inst);
}
}
bool doclean = false;
for (unsigned i = 0; i < m_WorkList.size(); ++i) {
if (BitCastInst *Inst = dyn_cast<BitCastInst>(m_WorkList[i])) {
if (optimizeBitCast(Inst)) {
doclean = true;
}
}
}
while (doclean) {
// Given   b2 = bitcast A, T2
//         b1 = bitcast b2, T1
// we say b1's level is 1 and b2's level is 2.
//
// In theory, this pass can leave two levels of dead bitcasts, so the
// while loop is expected to take at most three iterations. The work
// list is the set of bitcasts and is not expected to be big.
doclean = false;
for (unsigned i = 0; i < m_WorkList.size(); ++i) {
if (m_WorkList[i] && m_WorkList[i]->use_empty()) {
m_WorkList[i]->eraseFromParent();
m_WorkList[i] = NULL;
doclean = true;
}
}
}
m_WorkList.clear();
}
// DumpLLVMIR(cgCtx, "vectorprocess");
return changed;
}
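// ProcessMergeValue() converts the merge (pass-through) value of a predicated
// load to the new load type NewTy. Zeroinitializer/undef/poison values are
// recreated directly in NewTy; pointer-element values are first converted to
// integers (scalarized per element for vectors) and then bitcast to NewTy.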
Value *VectorProcess::ProcessMergeValue(Instruction *Inst, Value *V, Type *NewTy, Type *NewIntEType,
Type *NewIntTy) const {
// If V is a zeroinitializer, undef, or poison value, we just need to create
// the corresponding value of NewTy.
if (isa<ConstantAggregateZero>(V)) {
if (IGCLLVM::FixedVectorType *NewVTy = dyn_cast<IGCLLVM::FixedVectorType>(NewTy))
return ConstantAggregateZero::get(NewVTy);
else
return Constant::getNullValue(NewTy);
}
if (isa<PoisonValue>(V))
return PoisonValue::get(NewTy);
if (isa<UndefValue>(V))
return UndefValue::get(NewTy);
IRBuilder<> Builder(Inst);
Type *Ty = V->getType();
IGCLLVM::FixedVectorType *const VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
uint32_t nelts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
Type *eTy = VTy ? VTy->getElementType() : Ty;
if (eTy->isPointerTy()) {
// Cannot bitcast ptr to int; first, ptrtoint-cast,
// then bitcast the int (scalar or vector) to the new type.
if (VTy) {
// need a vector ptrtoint, scalarize:
auto *oldV = V;
V = UndefValue::get(NewIntTy);
for (unsigned i = 0; i < nelts; ++i) {
auto *EE = Builder.CreateExtractElement(oldV, i);
auto *PTI = Builder.CreatePtrToInt(EE, NewIntEType);
V = Builder.CreateInsertElement(V, PTI, i);
}
} else {
V = Builder.CreatePtrToInt(V, NewIntTy);
}
}
return Builder.CreateBitCast(V, NewTy);
}
//
// getInfo maps a vector to the right messages. It assumes that a vector
// may be mapped to more than one message, and those messages may differ
// as long as together they return exactly the same "packed form" of the
// vector.
//
// getInfo() initializes the array of structs (insts), which specifies the
// send instructions (or gather/scatter vISA instructions) needed to
// read/write this vector into a vISA variable. Clients access this array
// directly after calling getInfo().
//
// VectorProcess() changes each vector load and store into a new vector
// load and store that maps exactly to these messages. getInfo() has the
// following agreement with VectorProcess():
//   1) If sizeof(Ty) >= 4 bytes, sizeof(Ty) must be a multiple of 4 bytes.
//      Furthermore, the element type of 'Ty' (or 'Ty' itself if it is a
//      scalar type) must be either 4 bytes (DW) or 8 bytes (QW).
//   2) If sizeof(Ty) < 4 bytes, sizeof(Ty) must be either 1 byte or
//      2 bytes; it cannot be 3 bytes!
// (Note that VectorMessage and VectorProcess must stay in sync with regard
// to this agreement.)
//
void VectorMessage::getInfo(Type *Ty, uint64_t Align, bool useA32, bool forceByteScatteredRW) {
VectorType *VTy = dyn_cast<VectorType>(Ty);
Type *eTy = VTy ? cast<VectorType>(VTy)->getElementType() : Ty;
unsigned eltSize = Shader->GetScalarTypeSizeInRegister(eTy);
unsigned nElts = VTy ? (unsigned)cast<IGCLLVM::FixedVectorType>(VTy)->getNumElements() : 1;
// total bytes
const unsigned TBytes = nElts * eltSize;
// Per-channel Max Bytes (MB) that can be read/written by a single send inst
unsigned MB;
SIMDMode SM = Shader->m_SIMDSize;
bool has_8B_A64_BS = Shader->m_Platform->has8ByteA64ByteScatteredMessage();
bool has_8DW_A64_SM = Shader->m_Platform->has8DWA64ScatteredMessage();
//
// Set up default message and the data type of the message
//
MESSAGE_KIND defaultKind;
VISA_Type defaultDataType;
if (Align < 4 || TBytes < 4 || forceByteScatteredRW) {
if (forceByteScatteredRW) {
IGC_ASSERT(useA32);
}
defaultKind = useA32 ? MESSAGE_A32_BYTE_SCATTERED_RW : MESSAGE_A64_SCATTERED_RW;
MB = useA32 ? A32_BYTE_SCATTERED_MAX_BYTES
: ((has_8B_A64_BS && eltSize == 8) ? A64_BYTE_SCATTERED_MAX_BYTES_8B : A64_BYTE_SCATTERED_MAX_BYTES);
defaultDataType = ISA_TYPE_UB;
// To make sure that vector and message match.
IGC_ASSERT_MESSAGE((MB == eltSize || (MB > eltSize && nElts == 1)), "Internal Error: mismatch layout for vector");
} else {
defaultKind = useA32 ? MESSAGE_A32_UNTYPED_SURFACE_RW : MESSAGE_A64_SCATTERED_RW;
MB = useA32 ? A32_UNTYPED_MAX_BYTES
: ((has_8DW_A64_SM && SM == SIMDMode::SIMD8) ? A64_SCATTERED_MAX_BYTES_8DW_SIMD8
: A64_SCATTERED_MAX_BYTES_4DW);
bool allowQWMessage = !useA32 && eltSize == 8 && Align >= 8U;
defaultDataType = (eltSize == 8) ? ISA_TYPE_UQ : ISA_TYPE_UD;
// To make sure that send returns the correct layout for vector.
IGC_ASSERT_MESSAGE((eltSize == 4 /* common */ || allowQWMessage /* A64, QW */),
"Internal Error: mismatch layout for vector");
}
MESSAGE_KIND kind = defaultKind;
VISA_Type dataType = defaultDataType;
unsigned bytes = TBytes;
size_t i = 0;
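// Emit one message entry per full MB-byte chunk; each entry covers
// MB / blkInBytes blocks of the default data type.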
for (; bytes >= MB; ++i, bytes -= MB) {
IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
insts[i].startByte = (uint16_t)(TBytes - bytes);
insts[i].kind = kind;
insts[i].blkType = dataType;
insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
IGC_ASSERT(insts[i].blkInBytes);
insts[i].numBlks = MB / insts[i].blkInBytes;
}
// Process the remaining elements, if any. At most two more sends are
// needed. For example, assume the remaining bytes are those of a
// <7 x i32> accessed with A64 SIMD8 messages and align >= 4; then two
// sends are needed: one for the first <4 x i32> and a second for the
// remaining <3 x i32>.
if (MB == A64_SCATTERED_MAX_BYTES_8DW_SIMD8) { // MB == 32 bytes
unsigned MB2 = A64_SCATTERED_MAX_BYTES_8DW_SIMD8 / 2; // 16 bytes
if (bytes > MB2) {
IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
insts[i].startByte = (uint16_t)(TBytes - bytes);
insts[i].kind = kind;
insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
IGC_ASSERT(insts[i].blkInBytes);
insts[i].numBlks = MB2 / insts[i].blkInBytes;
++i;
bytes -= MB2;
}
}
if (bytes > 0) {
if (Align >= 4) {
if (!useA32 && eltSize == 4 && bytes == 12) {
kind = MESSAGE_A64_UNTYPED_SURFACE_RW;
}
}
IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
insts[i].startByte = (uint16_t)(TBytes - bytes);
insts[i].kind = kind;
insts[i].blkType = dataType;
insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
IGC_ASSERT(insts[i].blkInBytes);
insts[i].numBlks = (uint16_t)bytes / insts[i].blkInBytes;
++i;
}
numInsts = i;
IGC_ASSERT_MESSAGE(numInsts <= VECMESSAGEINFO_MAX_LEN,
"Vector's size is too big, increase MAX_VECMESSAGEINFO_LEN to fix it!");
IGC_ASSERT_MESSAGE(numInsts <= (sizeof(insts) / sizeof(*insts)),
"Vector's size is too big, increase MAX_VECMESSAGEINFO_LEN to fix it!");
}
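// getLSCInfo() is the LSC counterpart of getInfo(): it fills 'insts' with the
// LSC messages needed to read/write 'Ty'. Transposed (block) accesses are
// decomposed into the supported transpose vector sizes (1..64 elements);
// non-transposed accesses are split into chunks of at most MB bytes per lane,
// where MB is derived from the GRF size and SIMD width (and reduced to a
// single element for under-aligned data).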
void VectorMessage::getLSCInfo(llvm::Type *Ty, uint64_t Align, CodeGenContext *ctx, bool useA32, bool transpose) {
IGC_ASSERT(nullptr != ctx);
IGC_ASSERT(nullptr != Shader);
IGCLLVM::FixedVectorType *VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
Type *eTy = VTy ? VTy->getContainedType(0) : Ty;
unsigned eltSize = Shader->GetScalarTypeSizeInRegister(eTy);
unsigned nElts = VTy ? (unsigned)VTy->getNumElements() : 1;
// total bytes
const unsigned TBytes = nElts * eltSize;
char TRANS_VEC_SIZE[8] = {1, 2, 3, 4, 8, 16, 32, 64};
MESSAGE_KIND kind = useA32 ? MESSAGE_A32_LSC_RW : MESSAGE_A64_LSC_RW;
VISA_Type dataType = GetType(Ty, ctx);
uint16_t blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
// Per-channel Max Bytes (MB) that can be read/written by a single send inst
const unsigned int numLanesForSIMDSize = numLanes(Shader->m_SIMDSize);
IGC_ASSERT(numLanesForSIMDSize);
unsigned int MB = (8 * ctx->platform.getGRFSize()) / numLanesForSIMDSize;
if (Align < 4 || (eltSize == 8 && Align < 8)) {
MB = eltSize;
}
size_t i = 0;
if (transpose) {
unsigned bytes = TBytes;
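// Decompose the transposed access greedily, from the largest supported
// vector size (64 elements) down to the smallest (1), emitting one message
// per piece until all bytes are covered.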
for (int j = 0; j < 8; j++) {
const unsigned int denominator = blkInBytes * TRANS_VEC_SIZE[7 - j];
IGC_ASSERT(denominator);
if (bytes % denominator == 0) {
IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
insts[i].startByte = (uint16_t)(TBytes - bytes);
insts[i].kind = kind;
insts[i].blkType = dataType;
insts[i].blkInBytes = blkInBytes;
insts[i].numBlks = TRANS_VEC_SIZE[7 - j];
bytes -= insts[i].numBlks * blkInBytes;
i++;
break;
} else {
if (bytes / denominator != 0) {
IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
insts[i].startByte = (uint16_t)(TBytes - bytes);
insts[i].kind = kind;
insts[i].blkType = dataType;
insts[i].blkInBytes = blkInBytes;
insts[i].numBlks = TRANS_VEC_SIZE[7 - j];
bytes -= insts[i].numBlks * blkInBytes;
i++;
} // else j++;
}
}
IGC_ASSERT(bytes == 0);
} else {
unsigned bytes = TBytes;
for (; bytes >= MB; ++i, bytes -= MB) {
insts[i].startByte = (uint16_t)(TBytes - bytes);
insts[i].kind = kind;
insts[i].blkType = dataType;
insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
IGC_ASSERT(insts[i].blkInBytes);
insts[i].numBlks = MB / insts[i].blkInBytes;
}
if (bytes > 0) {
insts[i].startByte = (uint16_t)(TBytes - bytes);
insts[i].kind = kind;
insts[i].blkType = dataType;
insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
IGC_ASSERT(insts[i].blkInBytes);
insts[i].numBlks = (uint16_t)bytes / insts[i].blkInBytes;
++i;
}
}
numInsts = i;
IGC_ASSERT_MESSAGE(numInsts <= VECMESSAGEINFO_MAX_LEN,
"Vector's size is too big, increase MAX_VECMESSAGEINFO_LEN to fix it!");
IGC_ASSERT_MESSAGE(numInsts <= (sizeof(insts) / sizeof(*insts)),
"Vector's size is too big, increase MAX_VECMESSAGEINFO_LEN to fix it!");
}
VectorMessage::VectorMessage(EmitPass *emitter) : Shader(emitter->m_currShader) { numInsts = 0; }