/*========================== begin_copyright_notice ============================
Copyright (C) 2025 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
#include "Compiler/CISACodeGen/SplitLoads.h"
#include "Compiler/CISACodeGen/IGCLivenessAnalysis.h"
#include "Compiler/CodeGenPublic.h"
#include "Compiler/IGCPassSupport.h"
#include "Compiler/MetaDataUtilsWrapper.h"
// clang-format off
#include "common/LLVMWarningsPush.hpp"
#include "llvmWrapper/IR/DerivedTypes.h"
#include "llvmWrapper/IR/Function.h"
#include "llvmWrapper/IR/Value.h"
#include "llvm/Support/MathExtras.h"
#include "common/LLVMWarningsPop.hpp"
// clang-format on
#include "IGC/common/Types.hpp"
#include <algorithm>
#include <array>
#include <deque>
#include <optional>
#include <string>
using namespace llvm;
using namespace IGC;
using namespace IGC::LS;
#define DEBUG_TYPE "igc-split-loads"
// ============================================================================
// The goal of this feature:
// ============================================================================
//
// The file provides basic tools for splitting 2D LSC block loads of the form:
// -- <N x iX> @llvm.genx.GenISA.LSC2DBlockRead.vNiX(i64, i32, i32, i32, i32,
// i32, i32, i32, i32, i32, i1, i1, i32)
//
// For the load to be eligible for splitting, the loaded vector must be
// subsequently split into smaller chunks. For example, consider the load of a
// 16-element vector
// -- %vec = call <16 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v16i16(i64 %ptr,
// i32 127, i32 63, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 1, i1
// false, i1 false, i32 0)
//
// that is subsequently split into two 8-element vectors:
// -- %pick.0 = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32
// 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// -- %pick.1 = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32
// 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// that are finally fed into some users:
// -- call void @fun_v8i16(<8 x i16> %pick.0)
// -- call void @fun_v8i16(<8 x i16> %pick.1)
//
// This sequence can be replaced by 2 smaller loads that feed directly into the
// users:
// -- %vec.0 = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %ptr,
// i32 127, i32 63, i32 127, i32 0, i32 0, i32 16, i32 16, i32 8, i32 1, i1
// false, i1 false, i32 0)
// -- %vec.1 = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %ptr,
// i32 127, i32 63, i32 127, i32 0, i32 8, i32 16, i32 16, i32 8, i32 1, i1
// false, i1 false, i32 0)
// -- call void @fun_v8i16(<8 x i16> %vec.0)
// -- call void @fun_v8i16(<8 x i16> %vec.1)
//
// Whether this is beneficial or not depends on the register pressure and
// rescheduling possibilities.
//
// ============================================================================
// Outline of the tool:
// ============================================================================
//
// A given load can be split by the instance of `LoadSplitter` created by the
// factory function
// -- static std::unique_ptr<LoadSplitter> Create(Function *inF, CodeGenContext
// *inCGC, IGCLivenessAnalysis *inRPE);
//
// Given a load `GenIntrinsicInst *GII`, all possible split dimensions (see
// below for details) can be obtained by calling
// -- PossibleDims LoadSplitter::possibleDims(GenIntrinsicInst *GII);
//
// The splitting is then carried out by
// -- bool LoadSplitter::split(GenIntrinsicInst *GII, Dims dims);
// where `dims` represent the desired dimensions of the split.
//
// To split all loads in a basic block use:
// -- bool LoadSplitter::splitAllToSmallest(BasicBlock *BB);
//
// Splitting can be carried out automatically by the pass `SplitLoads`.
// To activate the pass, set the IGC flag `LS_enableLoadSplitting=1`.
//
// The splitting procedure consists of the following phases:
// 1. Process the load and its users to figure out the split structure of the
// load.
// 2. Calculate possible split dimensions.
// 3. Split.
//
// ============================================================================
// I. Process the load
// ============================================================================
//
// Parameters of the intrinsic are stored and managed in the class `LoadData`.
// After verifying the validity of the parameters, the users of the load are
// traced and stored by the class `TraceData`. The tracing is carried out in
// `TraceData::tracePicks` and is done as follows:
//
// 1. A subvector of the loaded vector can be picked by either a
// `ShuffleVectorInst` or a sequence of `InsertElementInst` and
// `ExtractElementInst`. If the picking is carried out by a `ShuffleVectorInst`,
// the picks must come entirely from one of its arguments, with the other one
// being an explicit constant (this includes undefs and zeroinitializers). The
// indices must be constant; they can repeat, and undefs are allowed.
//
// If the picking is carried out by the sequence of `InsertElementInst` and
// `ExtractElementInst`, the sequence must start with:
// -- %ext.0 = extractelement <16 x i16> %vec, i32 [from]
// -- %pick.0 = insertelement <8 x i16> undef, i16 %ext.0, i32 [to]
// and continue by the repetition of:
// -- %ext.n = extractelement <16 x i16> %vec, i32 [from]
// -- %pick.n = insertelement <8 x i16> %pick.(n-1), i16 %ext.n, i32 [to]
// The indices [from] and [to] must be constant. Each extractelement must have a
// single user, which is the corresponding insertelement. Each insertelement,
// except for the last, must have a single user, which is the next
// extractelement.
//
// The picks can be stacked. For example,
// -- %pick.1 = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32
// 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// -- %pick.1.1 = shufflevector <8 x i16> %pick.1, <8 x i16> undef, <4 x i32>
// <i32 4, i32 5, i32 6, i32 7>
//
// picks elements {12, 13, 14, 15} of the original %vec.
//
// The conditions above guarantee that the picks form a tree. Furthermore, no
// other values, except explicit constants and undefs, are used in the picks.
// This guarantees that the instructions can be safely erased after the split
// is done.
//
// 2. `BitCastInst`s are allowed to appear in the tree of the picks if the bit
// width of the scalars does not change. Thus, a load of i32s can be cast to
// floats, i16s to halves, etc. If multiple bitcasts appear in the tree, the
// scalar type is recalculated each time.
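// For illustration, a cast of the 16-element load above to half floats could
// look like this (hypothetical IR following the examples above):
// -- %vec.h = bitcast <16 x i16> %vec to <16 x half>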
//
// 3. Once a user that is not a node in the tree of picks is identified, it is
// saved. The users are grouped by the picks they use as well as the types they
// are cast to. Thus, a single pick can have multiple users, possibly with
// different types.
//
// ============================================================================
// II. Calculate possible split dimensions
// ============================================================================
//
// Once the tree of picks (and casts) is created, we want to calculate possible
// split dimensions. This is carried out in Load::possibleDims().
// We require that the picks obey two conditions:
// 1. Each pick must be a multi-block range (`MBRange`).
// 2. All picks must be grid-uniform.
//
// Ad 1. A multi-block range (MBRange) is a sequence of groups, each group
// containing the same number of consecutive integers. The gaps between the
// consecutive groups must also be equal. For example, a pick
// -- {0,1,4,5,8,9,12,13}
// is a valid MBRange as it contains four groups of equal size of consecutive
// integers ( {0,1}, {4,5}, {8,9}, {12,13} ) and the gap
// between the groups is constant. All picks must be valid MBRanges.
//
// Given an MBRange, we say it has dimensions RxC, where R is the size of each
// group and C the number of groups. In the example above, RxC = 2x4. We can
// think about R and C as numbers of rows and columns in the RxC grid.
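// In terms of the `MBRange` struct defined later in this file, the example
// pick {0,1,4,5,8,9,12,13} corresponds to first=0, grSize=2, grPitch=4,
// numOfGr=4.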
//
// If the pick consists of consecutive integers only, e.g. {2,3,4,5}, we prefer
// using C=1, so here we would have RxC = 4x1. However, 2x2 and 1x4 are also
// valid dimensions for this pick.
//
// Ad 2. Possible splits are determined by the size of the loaded vector and the
// number of blocks read by the intrinsic. Let's say the vector has length V
// (such as <V x i16>) and the intrinsic reads B blocks (B is the argument at
// index 9 of the intrinsic, or "vB" in an OpenCL builtin such as
// @__builtin_IB_subgroup_block_read_cacheopts_u32_m8k16v2). We will say that
// the dimensions of the load are GxB, where G=V/B is the size of a group. We
// can think of the vector as a grid with G rows and B columns.
//
// The picks are grid-uniform if they are all MBRanges of the same dimensions
// and they constitute a tiling of the grid: they cannot overlap, repeat
// indices, or miss any indices.
//
// For example, consider a 16-element vector loaded by an intrinsic with 4
// blocks. This constitutes a 4x4 grid. The grid can be tiled by the MBRanges of
// the following dimensions:
// -- 1x1 ( {0}, {1}, ..., {15} )
// -- 2x1 ( {0,1}, {2,3}, {4,5}, {6,7}, {8,9}, {10,11}, {12,13}, {14,15})
// -- 4x1 ( {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, {12,13,14,15})
// -- 8x1 = 4x2 ( {0,1,2,3,4,5,6,7}, {8,9,10,11,12,13,14,15})
// -- 16x1 = 8x2 = 4x4 ( {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15})
// -- 1x2 ( {0,4}, {1,5}, {2,6}, {3,7}, {8,12}, {9,13}, {10,14}, {11,15} )
// -- 2x2 ( {0,1,4,5}, {2,3,6,7}, {8,9,12,13}, {10,11,14,15} )
// -- 1x4 ( {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15} )
// -- 2x4 ( {0,1,4,5,8,9,12,13}, {2,3,6,7,10,11,14,15} )
// Note that 8x1 = 4x2 and 16x1 = 8x2 = 4x4.
//
// In Load::possibleDims() we first check conditions 1 and 2. If they are
// satisfied, we calculate the dimensions of the picks. This gives us the
// smallest possible split. In addition, we calculate all other possible
// dimensions. For example, if the picks form a uniform subgrid of dimension 2x2
// of the grid 4x4, then the loads can be split into:
// -- 2x2, 2x4, 8x1, 16x1.
//
// ============================================================================
// III. Split
// ============================================================================
//
// First, for each MBRange, the new parameters of the load after the split are
// calculated and stored in the new instance of the `Load` class. After this is
// done, the new loads are created and inserted into the IR. If the size of the
// new loads is equal exactly to the size of the picks, the users can be
// connected directly to the new loads (with possible bitcasts). If the new
// loads produce larger vectors than the users consume, a sequence of
// `InsertElementInst` and `ExtractElementInst` is inserted in order to split
// the vectors into smaller chunks.
//
// As a final example, consider a load that has 4 blocks and produces a
// 16-element vector, i.e., it has dimensions 4x4:
// -- %vec = call <16 x i8> @llvm.genx.GenISA.LSC2DBlockRead.v16i8(i64 %ptr,
// i32 127, i32 63, i32 127, i32 0, i32 0, i32 8, i32 16, i32 4, i32 4, i1
// false, i1 false, i32 0)
//
// The loaded vector is split into four 4-element picks, which are grid-uniform
// with dimensions 2x2:
// -- %pick.0 = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32
// 0, i32 1, i32 4, i32 5>
// -- %pick.1 = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32
// 2, i32 3, i32 6, i32 7>
// -- %pick.2 = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32
// 8, i32 9, i32 12, i32 13>
// -- %pick.3 = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32
// 10, i32 11, i32 14, i32 15>
//
// The minimal possible split has dimensions 2x2. However, one of the conditions
// for the splits to work is that the size of the split vector times the SIMD
// width fills up at least 1 GRF. Otherwise the load involves padding, which
// would break the validity of the splits. Thus, the smallest valid split has
// dimensions 4x2 = 8x1. In such a case the new loads produce 8-element vectors
// and have 2 blocks:
// -- %vec.0 = call <8 x i8> @llvm.genx.GenISA.LSC2DBlockRead.v8i8(i64 %ptr,
// i32 127, i32 63, i32 127, i32 0, i32 0, i32 8, i32 16, i32 4, i32 2, i1
// false, i1 false, i32 0)
// -- %vec.1 = call <8 x i8> @llvm.genx.GenISA.LSC2DBlockRead.v8i8(i64 %ptr,
// i32 127, i32 63, i32 127, i32 32, i32 0, i32 8, i32 16, i32 4, i32 2, i1
// false, i1 false, i32 0)
//
// We still have to pick the 4-element subvectors %pick.n, n=0,1,2,3, of the
// 8-element vectors %vec.0 and %vec.1. This is done by inserting a sequence of
// `InsertElementInst` and `ExtractElementInst`. In this particular case the
// picks are {0,1,4,5} and {2,3,6,7} for both vectors. This example (and many
// others) can be found in the test file `isa_flak_k16.ll` as the test function
// `@i8_4x4_to_2x2`.
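//
// Schematically, the first pick {0,1,4,5} is rebuilt from %vec.0 as follows
// (the value names are illustrative):
// -- %ext.0 = extractelement <8 x i8> %vec.0, i32 0
// -- %new.0 = insertelement <4 x i8> undef, i8 %ext.0, i32 0
// -- %ext.1 = extractelement <8 x i8> %vec.0, i32 1
// -- %new.1 = insertelement <4 x i8> %new.0, i8 %ext.1, i32 1
// -- %ext.2 = extractelement <8 x i8> %vec.0, i32 4
// -- %new.2 = insertelement <4 x i8> %new.1, i8 %ext.2, i32 2
// -- %ext.3 = extractelement <8 x i8> %vec.0, i32 5
// -- %new.3 = insertelement <4 x i8> %new.2, i8 %ext.3, i32 3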
//
// ============================================================================
namespace IGC::LS {
Config &config() { return Config::get(); }
} // namespace IGC::LS
namespace {
constexpr unsigned DEF_PICK_SIZE = 64;
constexpr unsigned DEF_NUM_OF_LOADS = 4;
constexpr unsigned DEF_NUM_OF_PICKS_PER_LOAD = 4;
constexpr unsigned DEF_NUM_OF_CASTS_OR_USERS_PER_PICK = 2;
constexpr unsigned DEF_NUM_OF_USERS_PER_PICK = DEF_NUM_OF_CASTS_OR_USERS_PER_PICK * DEF_NUM_OF_CASTS_OR_USERS_PER_PICK;
constexpr unsigned DEF_NUM_OF_OPTS = 4;
struct Pick;
struct MBRange;
/// `Pick` represents a mask with some additional information.
/// It is a vector of integers, where each element is either an index or `-1`
/// (undef).
struct Pick : public SmallVector<int, DEF_PICK_SIZE> {
explicit Pick() : SmallVector<int, DEF_PICK_SIZE>() {}
explicit Pick(unsigned size, int init = -1) : SmallVector<int, DEF_PICK_SIZE>(size, init) {}
Pick(std::initializer_list<int> init) : SmallVector<int, DEF_PICK_SIZE>(init) {}
/// Returns `true` if the pick is `{0,1,2,...,length-1}`.
bool isTrivial(unsigned length) const;
/// Returns `true` if the element is contained in the pick.
bool contains(int x) const { return std::find(begin(), end(), x) != end(); }
/// Uses elements of `this` as indices to pick the elements from `origin`.
/// If the index is out of range, places -1.
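/// For example, `{1, 3}.pickFrom({10, 20, 30, 40})` returns `{20, 40}`.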
Pick pickFrom(const Pick &origin) const;
/// Creates the pick from a given mask. The elements must belong to the range
/// [`begin`, `end`] or be undefs. If successful, returns the `Pick` with
/// `begin` subtracted from all elements. Otherwise std::nullopt is returned.
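/// For example, `fromMask({8, 9, -1, 11}, 8, 15)` returns `{0, 1, -1, 3}`.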
static std::optional<Pick> fromMask(ArrayRef<int> mask, int begin, int end);
/// Maps each element of `this` to its index within `mbr`: an element equal to
/// the `n`-th element of `mbr` is mapped to `n`. Elements absent from `mbr`
/// are mapped to -1.
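/// For example, if `mbr` represents {4,5,6,7}, then `{5, 7, 9}.narrowTo(mbr)`
/// returns `{1, 3, -1}`.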
Pick narrowTo(const MBRange &mbr) const;
/// Creates the pick containing all numbers from `begin` to `begin + size -
/// 1`.
static Pick createIdentityPick(unsigned size, int begin = 0);
};
/// `MBRange` (multi-block range) represents a sequence of groups, each group
/// containing the same number of consecutive integers.
/// - `first` denotes the first element in the `MBRange`.
/// - `grSize` denotes the number of elements in each group.
/// - `grPitch` denotes the distance between the first elements of two
/// consecutive groups.
/// - `numOfGr` denotes the number of groups.
/// If `grPitch <= grSize`, then `numOfGr` must be equal to `1`.
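/// For example, `MBRange(2, 2, 4, 3)` represents the elements
/// {2,3, 6,7, 10,11}.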
struct MBRange {
int first = 0;
unsigned grSize = 0;
int grPitch = 0;
unsigned numOfGr = 0;
explicit MBRange() = default;
MBRange(int first, unsigned grSize, int grPitch, unsigned numOfGr)
: first(first), grSize(grSize), grPitch(grPitch), numOfGr(numOfGr) {}
/// Returns empty range.
static MBRange getEmpty() { return MBRange(); }
/// Returns `true` if the range is empty.
bool empty() const { return !grSize || !numOfGr || !grPitch; }
/// Returns the total number of elements in the range.
int size() const { return grSize * numOfGr; }
/// Returns the last element of the range.
int last() const {
return grPitch * (static_cast<int>(0 < numOfGr ? numOfGr : 1) - 1) + first + static_cast<int>(grSize) - 1;
}
/// Returns the `n`-th element of the range.
int operator[](int n) const;
/// Returns the `elt` element of the `group`-th group.
int operator()(int group, int elt) const;
/// Returns the index corresponding to element `x` of the range. If `x` is not
/// in the range, returns `-1`.
int indexOf(int x) const;
/// Returns `true` if the element is contained in the range.
bool contains(int x) const;
enum class Containment { Contains, Excludes, Intersects };
/// Checks the relation between the range and the pick.
/// - Returns `Containment::Contains` if the pick is contained fully in the
/// range.
/// - Returns `Containment::Excludes` if the pick lies entirely outside of the
/// range.
/// - Returns `Containment::Intersects` if the pick is sliced by the range.
/// Undefs are ignored if `allowUndefs` is `true`; otherwise they count as
/// lying outside of the range.
Containment containsOrExcludes(const Pick &pick, bool allowUndefs) const;
/// Returns the pick of successive elements corresponding to the range.
Pick toPick() const;
/// Converts the pick to an `MBRange`. The pick must form a valid multi-block
/// range (groups of consecutive indices separated by a constant pitch), and
/// undefs are not allowed. If the pick is not a valid `MBRange`, returns
/// `std::nullopt`.
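/// For example, `fromPick({0,1,4,5,8,9,12,13})` yields `first=0, grSize=2,
/// grPitch=4, numOfGr=4`, while `fromPick({0,2,1})` yields `std::nullopt`.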
static std::optional<MBRange> fromPick(const Pick &pick);
};
} // unnamed namespace
// ===========================================================================
bool Pick::isTrivial(unsigned length) const {
if (size() != length)
return false;
for (unsigned n = 0; n < length; ++n) {
if ((*this)[n] != static_cast<int>(n))
return false;
}
return true;
}
Pick Pick::pickFrom(const Pick &origin) const {
Pick newPick = Pick(size());
std::transform(begin(), end(), newPick.begin(),
[&](int n) -> int { return 0 <= n && static_cast<unsigned>(n) < origin.size() ? origin[n] : -1; });
return newPick;
}
Pick Pick::narrowTo(const MBRange &mbr) const {
Pick newPick = Pick(size());
for (unsigned n = 0; n < size(); ++n) {
int idx = mbr.indexOf((*this)[n]);
newPick[n] = 0 <= idx ? idx : -1;
}
return newPick;
}
Pick Pick::createIdentityPick(unsigned size, int begin) {
Pick interval = Pick(size);
std::generate(interval.begin(), interval.end(), [&]() -> int { return begin++; });
return interval;
}
std::optional<Pick> Pick::fromMask(ArrayRef<int> mask, int begin, int end) {
Pick newPick = Pick(mask.size());
for (unsigned n = 0; n < mask.size(); ++n) {
int val = mask[n];
if (0 <= val) {
if (val < begin || end < val)
return std::nullopt;
newPick[n] = val - begin;
} else {
newPick[n] = -1;
}
}
return std::make_optional(std::move(newPick));
}
raw_ostream &operator<<(raw_ostream &os, const Pick &pick) {
os << "{ ";
for (int x : pick) {
os << x << ' ';
}
os << '}';
return os;
}
// ===========================================================================
int MBRange::operator[](int n) const { return (*this)(n / grSize, n % grSize); }
int MBRange::operator()(int group, int elt) const { return group * grPitch + first + elt; }
bool MBRange::contains(int x) const {
if (empty())
return false;
if (x < first || last() < x)
return false;
return numOfGr <= 1 || (x - first) % grPitch < static_cast<int>(grSize);
}
MBRange::Containment MBRange::containsOrExcludes(const Pick &pick, bool allowUndefs) const {
if (pick.empty())
return MBRange::Containment::Contains;
if (empty())
return MBRange::Containment::Excludes;
bool anyContained = std::any_of(pick.begin(), pick.end(), [&](int x) -> bool { return contains(x); });
if (anyContained) {
bool allContained = std::all_of(pick.begin(), pick.end(),
[&](int x) -> bool { return contains(x) || (allowUndefs && x < 0); });
return allContained ? MBRange::Containment::Contains : MBRange::Containment::Intersects;
}
bool allOutside = std::all_of(pick.begin(), pick.end(), [&](int x) -> bool { return !contains(x); });
return allOutside ? MBRange::Containment::Excludes : MBRange::Containment::Intersects;
}
int MBRange::indexOf(int x) const {
if (!contains(x))
return -1;
return 1 < numOfGr ? (x - first) / grPitch * grSize + (x - first) % grPitch : x - first;
}
Pick MBRange::toPick() const {
if (empty())
return Pick();
Pick pick = Pick(size());
for (int n = 0; n < size(); ++n) {
pick[n] = (*this)[n];
}
return pick;
}
std::optional<MBRange> MBRange::fromPick(const Pick &pick) {
if (pick.empty()) {
return MBRange::getEmpty();
}
if (pick.front() < 0) {
return std::nullopt;
}
int val;
MBRange ret(pick.front(), 0, 0, 0);
for (unsigned n = 1; n < pick.size(); ++n) {
val = pick[n];
// We don't allow undefs.
if (val < 0) {
return std::nullopt;
}
// The value jumps, so we possibly reached the group pitch.
if (val != ret.first + static_cast<int>(n)) {
ret.grSize = n;
ret.grPitch = val - ret.first;
break;
}
}
// Single group range.
if (ret.grSize == 0) {
ret.grSize = pick.size();
ret.numOfGr = 1;
return ret;
}
// grPitch must be positive and larger than grSize.
if (ret.grPitch <= static_cast<int>(ret.grSize)) {
return std::nullopt;
}
if (pick.size() % ret.grSize) {
return std::nullopt;
}
ret.numOfGr = pick.size() / ret.grSize;
for (unsigned gr = 1; gr < ret.numOfGr; ++gr) {
for (unsigned el = 0; el < ret.grSize; ++el) {
val = pick[gr * ret.grSize + el];
if (val < 0) {
return std::nullopt;
}
if (val != ret(gr, el)) {
return std::nullopt;
}
}
}
return ret;
}
raw_ostream &operator<<(raw_ostream &os, const MBRange &range) {
for (unsigned gr = 0; gr < range.numOfGr; ++gr) {
os << '[' << range(gr, 0) << ", " << range(gr, range.grSize - 1) << "] ";
}
return os;
}
// ==========================================================================
namespace {
constexpr unsigned NUM_OF_BLOCKLOAD_ARGS = 13;
/// Indices for arguments of GenISA_LSC2DBlockRead.
namespace LSC2D_BlockRead {
enum : unsigned {
argSurfacePtr = 0,
argSurfaceWidthLessOne_inBytes = 1,
argSurfaceHeightLessOne_inPitches = 2,
argSurfacePitchLessOne_inBytes = 3,
argXOffset_inElts = 4,
argYOffset_inPitches = 5,
argSizeInBits = 6,
argBlockWidth_inElts = 7,
argBlockHeight_inElts = 8,
argNumOfBlocks = 9,
argIsTranspose = 10,
argIsVNNI = 11,
argCacheFlags = 12
};
}
/// Returns the numeric value of the argument number `n` to the intrinsic `GII`
/// as `unsigned int`. Assumes `GII->getArgOperand(n)` exists and can be cast to
/// `ConstantInt`.
static unsigned getArgZ(GenIntrinsicInst *GII, unsigned n) {
return static_cast<unsigned>(cast<ConstantInt>(GII->getArgOperand(n))->getZExtValue());
}
/// Returns the numeric value of the argument number `n` to the intrinsic `GII`
/// as `signed int`. Assumes `GII->getArgOperand(n)` exists and can be cast to
/// `ConstantInt`.
static int getArgS(GenIntrinsicInst *GII, unsigned n) {
return static_cast<int>(cast<ConstantInt>(GII->getArgOperand(n))->getSExtValue());
}
#define DBG(x) LLVM_DEBUG(x)
using Picks = SmallVector<Pick, DEF_NUM_OF_PICKS_PER_LOAD>;
using MBRanges = SmallVector<MBRange, DEF_NUM_OF_PICKS_PER_LOAD>;
/// `TraceData` contains all the data about the structure of the splits of a
/// load vector. It gathers all picks from the load together with the associated
/// bitcasts and their users.
struct TraceData {
using Cast = SmallDenseMap<Type *, SmallVector<Instruction *, DEF_NUM_OF_CASTS_OR_USERS_PER_PICK>,
DEF_NUM_OF_CASTS_OR_USERS_PER_PICK>;
using Casts = SmallVector<Cast, DEF_NUM_OF_PICKS_PER_LOAD>;
using ToRemove = SmallVector<Instruction *, DEF_NUM_OF_PICKS_PER_LOAD + 1>;
BasicBlock *BB = nullptr;
Pick initialPick = Pick();
std::unique_ptr<Picks> picks = nullptr;
Casts typesToCastTo = Casts();
ToRemove toRemove = ToRemove();
/// Returns total vector length.
unsigned vectorLength() const { return initialPick.size(); }
/// Traces the pick tree starting from the load `GII` and returns `true` if
/// the `TraceData` is valid. If `true` is returned, then `picks` is non-null.
bool tracePicks(GenIntrinsicInst *GII);
/// Uses `builder` to create the sequence of LLVM instructions that represent
/// the splits: i) The tree is attached to `load` and ii) For each pick,
/// the corresponding sequence of `InsertElementInst` and `ExtractElementInst`
/// is inserted. iii) Final bitcasts are attached and the users are updated
/// appropriately.
void putPicks(IRBuilder<> &builder, Value *load);
/// Removes all instructions marked to remove.
void removeOldInstructions();
/// Returns new `TraceData` containing those picks in this, which are subpicks
/// of `largeRange`.
std::unique_ptr<TraceData> pickSubpicksOf(const MBRange &largeRange);
private:
struct Node : public Pick {
Type *type;
Instruction *fun;
explicit Node() : Pick(), type(nullptr), fun(nullptr) {}
Node(const Pick &pick, Type *type, Instruction *fun) : Pick(pick), type{type}, fun{fun} {}
};
std::optional<Node> addBitCast(const Node &previous, BitCastInst *BCI);
std::optional<Node> addShuffle(const Node &previous, ShuffleVectorInst *SVI);
std::optional<Node> addExtractInsertSequence(const Node &previous, ExtractElementInst *EEI,
SmallPtrSet<Value *, DEF_NUM_OF_PICKS_PER_LOAD> &extractsToSkip);
void addLeaf(const Node &leaf);
};
/// `LoadData` represents the data that is associated with a block load.
struct LoadData {
/// Block width in elements as specified in the intrinsic.
unsigned blockWidth_E = 0;
/// Block height in elements as specified in the intrinsic.
unsigned blockHeight_E = 0;
/// Number of blocks as specified in the intrinsic.
unsigned numOfBlocks = 0;
/// Length of the loaded vector in elements.
unsigned vectorLength = 0;
/// Bit width of a single element in the loaded vector.
unsigned scalarBitWidth = 0;
/// Bit width of a single element on the surface as specified in the
/// intrinsic.
unsigned elementBitWidth = 0;
/// Is the load transposed.
bool transposed = false;
/// Is the load VNNI-transformed.
bool vnni = false;
/// Checks if this `LoadData` describes the same 2D LSC load as `rhs`.
bool sameAsLoad(const LoadData &rhs) const;
/// Returns the length of a single block in the vector.
unsigned groupLength() const { return vectorLength / numOfBlocks; }
/// Uses parameters of the intrinsic to figure out SIMD.
/// If the SIMD is reported correctly, this is equal to config().actualSimd.
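/// For the v16i16 example in the header comment (16x16 block of 16-bit
/// elements, 1 block), SIMD() = (16 * 16 * 1 * 16) / (16 * 16) = 16.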
unsigned SIMD() const {
// From the point of view of the intrinsic, the total bit width of the load
// is: totalBW = blockWidth * blockHeight * numOfBlocks * elementBitWidth.
// From the point of view of the loaded vectors, the total bit width is:
// totalBW = vectorLength * SIMD * scalarBitWidth.
// From this:
return (blockWidth_E * blockHeight_E * numOfBlocks * elementBitWidth) / (vectorLength * scalarBitWidth);
}
/// Returns the size in bytes of a single scalar element multiplied by the
/// SIMD width.
return (blockWidth_E * blockHeight_E * numOfBlocks * elementBitWidth) / (8 * vectorLength);
}
/// Returns the minimum valid group length for the split load.
unsigned getMinGroupLength(unsigned atLeastThisLarge = 0) const;
/// Checks if the load has valid parameters as far as this pass is concerned.
bool isValidLoad() const;
/// Returns the name of the intrinsic with the data represented by this
/// `LoadData` and corresponding to the stand-alone block load.
std::string getBlockLoadName() const;
protected:
IntegerType *scalarTy = nullptr;
private:
bool isValidTransposed() const;
bool isValidVNNI() const;
};
/// `Load` contains all the data associated with the load and its picks.
/// It provides methods for splitting the load and creating the corresponding
/// LLVM IR.
struct Load : public LoadData {
GenIntrinsicInst *GII = nullptr;
int xOffset_E = 0; // X Offset in elements
int yOffset_P = 0; // Y Offset in surface pitch
std::unique_ptr<TraceData> trace = nullptr;
explicit Load() = default;
Load(const LoadData &data) : LoadData(data) {}
Load(const Load &rhs) : LoadData(rhs), GII(rhs.GII), xOffset_E(rhs.xOffset_E), yOffset_P(rhs.yOffset_P) {}
Load &operator=(const Load &rhs) {
if (this != &rhs) {
LoadData::operator=(rhs);
GII = rhs.GII;
xOffset_E = rhs.xOffset_E;
yOffset_P = rhs.yOffset_P;
}
return *this;
}
Load(Load &&) = default;
Load &operator=(Load &&) = default;
~Load() = default;
/// Reads the data from the load intrinsic.
bool readFromLoad(GenIntrinsicInst *GII);
/// Traces all picks attached to the load and returns `true` if the
/// `TraceData` is valid. If `true` is returned, then `trace` and
/// `trace->picks` is non-null.
bool tracePicks();
/// Returns the set of possible grid-uniform dimensions into which the load
/// can be split. It takes into account the limits from `Config`.
PossibleDims possibleDims();
/// Returns the new load corresponding to loading only part of the original
/// load. If the load is not valid, returns `nullptr`.
std::unique_ptr<Load> split(const MBRange &range);
/// Creates the LLVM call for the stand-alone block load.
/// Uses `GII` as the insertion point, so `GII` should point to the old load.
CallInst *putBlockLoad(IRBuilder<> &builder);
/// Deletes the original load and its picks.
void removeOldInstructions();
private:
bool fillBlockData();
Load &splitFlat(const MBRange &range);
Load &splitTransposed(const MBRange &range);
Load &splitVNNI(const MBRange &range);
std::unique_ptr<Load> splitLoadData(const MBRange &range);
};
} // unnamed namespace
// ==========================================================================
raw_ostream &operator<<(raw_ostream &os, const Dims &dims) {
os << dims.grSize << " x " << dims.numOfGr;
return os;
}
// ==========================================================================
/// Given a `Value` of a vector type, returns the pair representing the
/// scalar type and the size of the vector.
static std::pair<IntegerType *, unsigned> getScalarTypeAndSize(Value *V) {
std::pair<IntegerType *, unsigned> ret{nullptr, 0};
IGCLLVM::FixedVectorType *vectorTy = dyn_cast<IGCLLVM::FixedVectorType>(V->getType());
if (!vectorTy)
return ret;
ret.first = dyn_cast<IntegerType>(vectorTy->getElementType());
if (!ret.first)
return ret;
ret.second = vectorTy->getNumElements();
return ret;
}
/// Creates the function of a given name in LLVM.
static Function *createFunction(StringRef name, Module *currModule, ArrayRef<Value *> args, Type *retTy,
Function *copyAttrAfter = nullptr) {
SmallVector<Type *, NUM_OF_BLOCKLOAD_ARGS> argsTy;
argsTy.assign(args.size(), nullptr);
for (unsigned i = 0; i < argsTy.size(); ++i) {
argsTy[i] = args[i]->getType();
}
FunctionType *newFunTy = FunctionType::get(retTy, argsTy, false);
Function *newFun = Function::Create(newFunTy, GlobalValue::ExternalLinkage, name, currModule);
if (copyAttrAfter) {
newFun->copyAttributesFrom(copyAttrAfter);
if (isa<GlobalObject>(copyAttrAfter)) {
newFun->copyMetadata(cast<GlobalObject>(copyAttrAfter), 0);
}
}
return newFun;
}
/// Adds `offset` to `value` if `offset` is non-zero.
static Value *createAdd(IRBuilder<> &builder, Value *value, unsigned offset) {
return offset ? (isa<ConstantInt>(value)
? builder.getInt32(static_cast<unsigned>(cast<ConstantInt>(value)->getZExtValue()) + offset)
: builder.CreateAdd(value, builder.getInt32(offset), "", true, true))
: value;
}
// ===========================================================================
bool Config::initialize(Function *F, CodeGenContext *inCGC, IGCLivenessAnalysis *inRPE) {
CGC = inCGC;
RPE = inRPE;
if (!F || !CGC || !RPE)
return false;
if (!CGC->platform.hasLSC()) {
DBG(dbgs() << " [SKIP] No support for LSC on this platform.\n");
return false;
}
if (!IGC::ForceAlwaysInline(CGC)) {
if (F->isDeclaration())
return false;
} else {
if (!F->getReturnType()->isVoidTy())
return false;
}
// Actual SIMD is the SIMD as reported by the compiler.
// Default SIMD is the default SIMD associated with the architecture.
// Default SIMD is used only if actual SIMD is absent and mostly for testing
// purposes.
defaultSimd = 0;
switch (CGC->platform.getPlatformInfo().eProductFamily) {
case IGFX_DG2:
case IGFX_METEORLAKE:
case IGFX_ARROWLAKE:
defaultSimd = 16;
break;
default:
defaultSimd = 32;
break;
}
actualSimd = 0;
if (RPE->MDUtils && RPE->MDUtils->findFunctionsInfoItem(F) != RPE->MDUtils->end_FunctionsInfo()) {
IGC::IGCMD::FunctionInfoMetaDataHandle funcInfoMD = RPE->MDUtils->getFunctionsInfoItem(F);
actualSimd = funcInfoMD->getSubGroupSize()->getSIMDSize();
}
isLegitW8 = false;
sizeOfRegs_B = RPE->registerSizeInBytes();
numOfRegs = CGC->getNumGRFPerThread();
minSplitSize_B = minSplitSize_GRF * sizeOfRegs_B;
splitThreshold_B = (static_cast<int>(numOfRegs) + splitThresholdDelta_GRF) * sizeOfRegs_B;
DBG(Module *newM = F->getParent(); if (newM != M) {
M = newM;
dbgs() << "CONFIG DATA:\n";
dbgs() << " -- SPLITTING ENABLED / ignore reg pressure = " << enableLoadSplitting << " / "
<< ignoreSplitThreshold << "\n";
dbgs() << " -- register size [B] / number of registers = " << sizeOfRegs_B << " / " << numOfRegs << "\n";
dbgs() << " -- default SIMD / actual SIMD = " << defaultSimd << " / " << actualSimd << "\n";
dbgs() << " -- split threshold [B] = " << splitThreshold_B << "\n";
dbgs() << " -- min split size [E] / min split size [B] = " << minSplitSize_E << " / " << minSplitSize_B
<< "\n";
});
return true;
}
// ===========================================================================
std::optional<TraceData::Node> TraceData::addBitCast(const Node &previous, BitCastInst *BCI) {
IGCLLVM::FixedVectorType *srcTy = dyn_cast<IGCLLVM::FixedVectorType>(BCI->getSrcTy());
IGCLLVM::FixedVectorType *destTy = dyn_cast<IGCLLVM::FixedVectorType>(BCI->getDestTy());
if (!(srcTy && destTy))
return std::nullopt;
// We only allow bitcasts that preserve the size of the underlying scalar
// type.
if (srcTy->getElementType()->getScalarSizeInBits() != destTy->getElementType()->getScalarSizeInBits())
return std::nullopt;
return TraceData::Node(previous, destTy->getElementType(), BCI);
}
std::optional<TraceData::Node> TraceData::addShuffle(const Node &previous, ShuffleVectorInst *SVI) {
if (!isa<IGCLLVM::FixedVectorType>(SVI->getType()))
return std::nullopt;
// The previous node is a bitcast, shuffle vector, insert element, or the
// original load, so we have checked that previous.fun has a fixed vector type.
unsigned previousVectorLength = cast<IGCLLVM::FixedVectorType>(previous.fun->getType())->getNumElements();
// We must make sure that the shuffle vector is a pick from the previous
// vector. The other vector must be undef.
int beginPos, endPos;
if (SVI->getOperand(0) == previous.fun) {
beginPos = 0;
endPos = previousVectorLength - 1;
if (!isa<Constant>(SVI->getOperand(1)))
return std::nullopt;
} else {
beginPos = previousVectorLength;
endPos = 2 * previousVectorLength - 1;
if (!isa<Constant>(SVI->getOperand(0)))
return std::nullopt;
}
// Pick::fromMask guarantees that the pick is entirely contained in [beginPos,
// endPos].
std::optional<Pick> newPick = Pick::fromMask(SVI->getShuffleMask(), beginPos, endPos);
if (!newPick)
return std::nullopt;
return TraceData::Node(newPick->pickFrom(previous), previous.type, SVI);
}
std::optional<TraceData::Node>
TraceData::addExtractInsertSequence(const TraceData::Node &previous, ExtractElementInst *EEI,
SmallPtrSet<Value *, DEF_NUM_OF_PICKS_PER_LOAD> &extractsToSkip) {
auto isValidExtract = [&](ExtractElementInst *E) -> bool {
if (!isa<IGCLLVM::FixedVectorType>(E->getVectorOperand()->getType()))
return false;
if (!isa<ConstantInt>(E->getIndexOperand()))
return false;
if (!E->hasOneUse())
return false;
return true;
};
auto isValidInsert = [&](InsertElementInst *I) -> bool {
if (!isa<IGCLLVM::FixedVectorType>(I->getType()))
return false;
if (!isa<ConstantInt>(I->getOperand(2)))
return false;
return true;
};
auto getAssociatedInsert = [&](ExtractElementInst *E) -> InsertElementInst * {
return dyn_cast<InsertElementInst>(*E->user_begin());
};
auto getAssociatedExtract = [&](InsertElementInst *I) -> ExtractElementInst * {
return dyn_cast<ExtractElementInst>(I->getOperand(1));
};
auto getPreviousInsert = [&](InsertElementInst *I) -> InsertElementInst * {
return dyn_cast<InsertElementInst>(I->getOperand(0));
};
auto getNextInsert = [&](InsertElementInst *I) -> InsertElementInst * {
return dyn_cast<InsertElementInst>(*I->user_begin());
};
std::deque<std::pair<ExtractElementInst *, InsertElementInst *>> EIs;
// We want to find the chain of extract/insert elements.
// There is no guarantee that the first user of the load is the first extract
// element, so we must traverse the chain both up and down.
auto addPairFromExtract = [&](ExtractElementInst *E, bool front) -> bool {
if (!E)
return false;
extractsToSkip.insert(E);
if (!isValidExtract(E))
return false;
InsertElementInst *I = getAssociatedInsert(E);
if (!I || !isValidInsert(I))
return false;
if (front) {
EIs.push_front(std::make_pair(E, I));
} else {
EIs.push_back(std::make_pair(E, I));
}
return true;
};
auto addPairFromInsert = [&](InsertElementInst *I, bool front) -> bool {
if (!I || !isValidInsert(I))
return false;
ExtractElementInst *E = getAssociatedExtract(I);
if (!E)
return false;
extractsToSkip.insert(E);
if (!isValidExtract(E))
return false;
if (front) {
EIs.push_front(std::make_pair(E, I));
} else {
EIs.push_back(std::make_pair(E, I));
}
return true;
};
// First we move "up" to find the first pair of insert/extract elements.
if (!addPairFromExtract(EEI, true))
return std::nullopt;
InsertElementInst *insert = EIs.front().second;
do {
insert = getPreviousInsert(insert);
} while (addPairFromInsert(insert, true));
// Since addPairFromExtract succeeded, we know that the first pair in EIs
// exists. This pair is valid but the previous pair failed. So either there
// was no previous pair (which is what we want) or the previous pair was
// invalid and we bail out.
if (!isa<UndefValue>(EIs.front().second->getOperand(0)))
return std::nullopt;
// Now we move "down".
insert = EIs.back().second;
do {
// If the insert has more than one user, the sequence ends.
if (!insert->hasOneUse())
break;
insert = getNextInsert(insert);
} while (addPairFromInsert(insert, false));
// It is the role of tracePicks to figure out if the next instructions are
// valid leaves.
// The first pair in EIs determines:
// - from which vector we pick
// - how many elements we pick
unsigned newVectorLength = cast<IGCLLVM::FixedVectorType>(EIs.front().second->getType())->getNumElements();
Value *pickingFrom = EIs.front().first->getVectorOperand();
Value *currentBuildVector = nullptr;
Pick pick = Pick(newVectorLength, -1);
// While building the picks, we have to check that:
// - we pick from the same vector
// - we keep building the same vector
// - we assign each index only once
auto addPick = [&](std::pair<ExtractElementInst *, InsertElementInst *> &ei) -> bool {
if (ei.first->getVectorOperand() != pickingFrom)
return false;
if (currentBuildVector) {
if (currentBuildVector != ei.second->getOperand(0))
return false;
}
currentBuildVector = ei.second;
unsigned origIdx = cast<ConstantInt>(ei.first->getIndexOperand())->getZExtValue();
unsigned newIdx = cast<ConstantInt>(ei.second->getOperand(2))->getZExtValue();
if (pick[newIdx] != -1)
return false;
pick[newIdx] = origIdx;
return true;
};
for (unsigned n = 0; n < EIs.size() - 1; ++n) {
if (!addPick(EIs[n]))
return std::nullopt;
toRemove.push_back(EIs[n].first);
toRemove.push_back(EIs[n].second);
}
if (!addPick(EIs.back()))
return std::nullopt;
toRemove.push_back(EIs.back().first);
return TraceData::Node(pick.pickFrom(previous), previous.type, EIs.back().second);
}
void TraceData::addLeaf(const Node &leaf) {
if (leaf.empty())
return;
auto it = std::find(picks->begin(), picks->end(), leaf);
if (it != picks->end()) {
typesToCastTo[it - picks->begin()][leaf.type].push_back(leaf.fun);
} else {
picks->push_back(leaf);
typesToCastTo.push_back(Cast{{leaf.type, SmallVector<Instruction *, 1>(1, leaf.fun)}});
}
}
bool TraceData::tracePicks(GenIntrinsicInst *GII) {
auto [_, vecLen] = getScalarTypeAndSize(GII);
if (!vecLen)
return false;
BB = GII->getParent();
initialPick = Pick::createIdentityPick(vecLen);
picks = std::make_unique<Picks>();
typesToCastTo.clear();
toRemove.clear();
SmallVector<TraceData::Node, DEF_NUM_OF_PICKS_PER_LOAD + 1> activeNodes;
activeNodes.emplace_back(Pick::createIdentityPick(vectorLength()),
cast<IGCLLVM::FixedVectorType>(GII->getType())->getElementType(), GII);
TraceData::Node currNode;
// We need to skip the extract elements that are part of the already processed
// insert/extract sequence.
SmallPtrSet<Value *, DEF_NUM_OF_PICKS_PER_LOAD> extractsToSkip;
while (!activeNodes.empty()) {
currNode = activeNodes.back();
activeNodes.pop_back();
toRemove.push_back(currNode.fun);
for (User *nextUser : currNode.fun->users()) {
Instruction *next = dyn_cast<Instruction>(nextUser);
if (!next) {
return false;
}
if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(next)) {
// currNode.fun can be the starting load, bitcast, insert element, or
// shufflevector. In all those cases we checked that
// currNode.fun->getType() is a fixed vector type.
std::optional<TraceData::Node> newNode = addShuffle(currNode, SVI);
if (!newNode) {
return false;
}
activeNodes.push_back(std::move(*newNode));
} else if (BitCastInst *BCI = dyn_cast<BitCastInst>(next)) {
std::optional<TraceData::Node> newNode = addBitCast(currNode, BCI);
if (!newNode) {
return false;
}
activeNodes.push_back(std::move(*newNode));
} else if (ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(next)) {
if (extractsToSkip.count(EEI))
continue;
std::optional<TraceData::Node> newNode = addExtractInsertSequence(currNode, EEI, extractsToSkip);
if (!newNode) {
return false;
}
activeNodes.push_back(std::move(*newNode));
} else {
addLeaf(currNode);
}
}
}
return true;
}
void TraceData::putPicks(IRBuilder<> &builder, Value *load) {
if (!picks)
return;
// This assumes no repeated values in the pick.
auto putPick = [&](const Pick &pick, Value *loadOrCast, Type *scalarTy) -> Value * {
Value *elem;
Value *newVector = UndefValue::get(IGCLLVM::FixedVectorType::get(scalarTy, pick.size()));
for (unsigned insertPos = 0; insertPos < pick.size(); ++insertPos) {
if (pick[insertPos] < 0)
continue;
unsigned extractPos = static_cast<unsigned>(pick[insertPos]);
elem = builder.CreateExtractElement(loadOrCast, extractPos);
newVector = builder.CreateInsertElement(newVector, elem, insertPos);
}
return newVector;
};
// We attach the pick tree to the load.
Value *loadOrCast = load;
Type *scalarTy;
std::tie(scalarTy, std::ignore) = getScalarTypeAndSize(load);
Value *picksVal;
Value *pickAndCastVal;
for (unsigned n = 0; n < picks->size(); ++n) {
const Pick &pick = (*picks)[n];
// If a pick is non-trivial, we insert the insert/extract sequence.
// picksVal is the resulting value, or the original load if no pick is
// required.
picksVal = pick.isTrivial(vectorLength()) ? loadOrCast : putPick(pick, loadOrCast, scalarTy);
// For each pick we cast picksVal to the appropriate type and replace the
// users. pickAndCastVal is the resulting value of the cast, if needed, or
// is the original pick.
for (auto &[type, users] : typesToCastTo[n]) {
if (type == scalarTy) {
pickAndCastVal = picksVal;
} else {
pickAndCastVal = builder.CreateBitCast(picksVal, IGCLLVM::FixedVectorType::get(type, pick.size()));
}
for (Instruction *user : users) {
// The only case when user == pickAndCastVal is when the load goes
// directly into the user, without any shuffles or bitcasts.
if (user != pickAndCastVal) {
user->replaceAllUsesWith(pickAndCastVal);
}
}
users.clear();
users.push_back(dyn_cast<Instruction>(pickAndCastVal));
}
}
}
void TraceData::removeOldInstructions() {
for (Instruction *instr : toRemove) {
if (instr && instr->getType()) {
instr->replaceAllUsesWith(UndefValue::get(instr->getType()));
}
}
for (Instruction *instr : toRemove) {
if (instr) {
instr->eraseFromParent();
}
}
toRemove.clear();
}
std::unique_ptr<TraceData> TraceData::pickSubpicksOf(const MBRange &mbr) {
if (!picks)
return nullptr;
std::unique_ptr<TraceData> ret = std::make_unique<TraceData>();
ret->BB = BB;
ret->picks = std::make_unique<Picks>();
ret->initialPick = mbr.toPick();
for (unsigned n = 0; n < picks->size(); ++n) {
const Pick &pick = (*picks)[n];
if (mbr.contains(pick[0])) {
Pick narrowed = pick.narrowTo(mbr);
ret->picks->push_back(narrowed);
ret->typesToCastTo.push_back(typesToCastTo[n]);
}
}
return ret;
}
// ===========================================================================
/// Creates a vector of MBRange's, each of the same `grSize`, `grPitch`, and
/// `numOfGr`. The ranges cover the entire vector of length `vectorLength`.
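/// For example, `makeUniform(2, 4, 2, 16)` returns the ranges
/// { {0,1,4,5}, {2,3,6,7}, {8,9,12,13}, {10,11,14,15} }.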
static std::optional<MBRanges> makeUniform(unsigned grSize, int grPitch, unsigned numOfGr, unsigned vectorLength) {
if (!vectorLength)
return std::nullopt;
if (grSize == grPitch) {
grSize *= numOfGr;
numOfGr = 1;
}
MBRanges mbrs;
if (numOfGr <= 1) {
if (vectorLength % grSize)
return std::nullopt;
unsigned numOfGroups = vectorLength / grSize;
mbrs.resize(numOfGroups);
for (unsigned n = 0; n < numOfGroups; ++n) {
mbrs[n] = MBRange(n * grSize, grSize, grPitch, 1);
}
} else {
if (vectorLength % (grPitch * numOfGr) || grPitch % grSize || grPitch <= static_cast<int>(grSize) || grPitch <= 0)
return std::nullopt;
unsigned numInBlock = grPitch / grSize;
unsigned blockPitch = grPitch * numOfGr;
unsigned numOfBlocks = vectorLength / blockPitch;
mbrs.resize(numInBlock * numOfBlocks);
for (unsigned n = 0; n < numOfBlocks; ++n) {
for (unsigned m = 0; m < numInBlock; ++m) {
mbrs[n * numInBlock + m] = MBRange(n * blockPitch + m * grSize, grSize, grPitch, numOfGr);
}
}
}
return std::make_optional(std::move(mbrs));
}
/// Converts `picks` into a vector of MBRange's, each of the same `grSize`,
/// `grPitch`, and `numOfGr`, which grid-uniformly cover the entire vector of
/// length `vectorLength`. Each pick must correspond to exactly one range.
static std::optional<MBRanges> makeGridUniformPicks(const Picks &picks, unsigned vectorLength, unsigned blockLength) {
if (picks.empty())
return std::nullopt;
MBRanges mbrs;
mbrs.reserve(picks.size());
for (unsigned n = 0; n < picks.size(); ++n) {
// The picks must be grid-uniform. No undefs allowed.
std::optional<MBRange> mbr = MBRange::fromPick(picks[n]);
if (!mbr) {
DBG(dbgs() << " -- Pick " << picks[n] << " is not a valid multi-block range.\n");
return std::nullopt;
}
if (mbr->numOfGr == 1) {
mbr->grPitch = blockLength;
}
mbrs.push_back(*mbr);
}
MBRange mbr = mbrs.front();
if (mbrs.size() * mbr.size() != vectorLength) {
DBG(dbgs() << " -- Invalid sizes of multi-block ranges.\n");
return std::nullopt;
}
if (!std::all_of(std::next(mbrs.begin()), mbrs.end(), [&](const MBRange &x) {
return x.grPitch == mbr.grPitch && x.grSize == mbr.grSize && x.numOfGr == mbr.numOfGr;
})) {
DBG(dbgs() << " -- Multi-block ranges of different sizes.\n");
return std::nullopt;
}
if (mbr.numOfGr == 1) {
if (!isPowerOf2_32(mbr.grSize) || vectorLength % mbr.grSize || mbr.first % mbr.grSize) {
DBG(dbgs() << " -- Invalid size of multi-block range " << mbr << ".\n");
return std::nullopt;
}
} else {
if (!(isPowerOf2_32(mbr.grSize) && isPowerOf2_32(mbr.grPitch) && isPowerOf2_32(mbr.numOfGr))) {
DBG(dbgs() << " -- Invalid size of multi-block range " << mbr << ".\n");
return std::nullopt;
}
// - group size must be smaller than block length and be its divisor
// - group pitch must match the block length (there are no gaps between
// blocks)
if (blockLength <= mbr.grSize || blockLength % mbr.grSize || mbr.grPitch != blockLength) {
DBG(dbgs() << " -- Invalid size of multi-block range " << mbr << ".\n");
return std::nullopt;
}
unsigned numOfBlocks = vectorLength / blockLength;
unsigned firstInGr = mbr.first % blockLength;
unsigned grStart = mbr.first / blockLength;
// - first elt in each group must be a multiple of the group size
// - number of groups must be a divisor of the number of blocks
// - the block with first elt must be a multiple of the number of groups
if (firstInGr % mbr.grSize || numOfBlocks % mbr.numOfGr || grStart % mbr.numOfGr) {
DBG(dbgs() << " -- Invalid size of multi-block range " << mbr << ".\n");
return std::nullopt;
}
}
SmallVector<int, DEF_PICK_SIZE> allPicks(vectorLength, 0);
for (const MBRange &range : mbrs) {
for (int n = 0; n < range.size(); ++n) {
int &alreadyPicked = allPicks[range[n]];
if (alreadyPicked) {
DBG(dbgs() << " -- Multi-block ranges overlap.\n");
return std::nullopt;
}
alreadyPicked = 1;
}
}
if (std::find(allPicks.begin(), allPicks.end(), 0) != allPicks.end()) {
DBG(dbgs() << " -- Multi-block ranges do not cover the entire vector.\n");
return std::nullopt;
}
DBG(dbgs() << " -- Multi-block ranges are valid.\n";
dbgs() << " -- First MB range is = " << mbrs.front() << ".\n";);
return std::make_optional(std::move(mbrs));
}
// ===========================================================================
bool LoadData::isValidLoad() const {
// For this pass we assume the following:
// 1. We must avoid padding, otherwise the splitting would fail.
// 2. All parameters are powers of two, so that the splitting is possible.
// 3. The length of the vector is at least 2 (so we don't deal with v1s).
if (!isPowerOf2_32(vectorLength) || !isPowerOf2_32(blockWidth_E) || !isPowerOf2_32(blockHeight_E) ||
!isPowerOf2_32(numOfBlocks) || !isPowerOf2_32(scalarBitWidth) || !isPowerOf2_32(elementBitWidth))
return false;
if ((blockHeight_E * blockWidth_E * elementBitWidth / 8) % config().sizeOfRegs_B)
return false; // so we don't deal with padding
if (scalarBitWidth % elementBitWidth)
return false; // so scalarBitWidth >= elementBitWidth
if (vectorLength < 2) // We don't want to deal with v1's.
return false;
DBG(
bool ok = true; if (transposed && vnni) ok = false; else if (!transposed && !vnni) {
unsigned rowBytesPerBlk = ((elementBitWidth / 8) * blockWidth_E);
if ((rowBytesPerBlk * numOfBlocks) > 64 || rowBytesPerBlk < 4)
ok = false;
} else if (transposed) {
bool isValid64 = (elementBitWidth == 64 && blockHeight_E == 8 &&
(blockWidth_E <= 4 || (blockWidth_E == 8 && config().isLegitW8)));
bool isValid32 = (elementBitWidth == 32 && blockHeight_E <= 32 && blockWidth_E <= 8);
if (numOfBlocks != 1 || !(isValid32 || isValid64))
ok = false;
} else if (vnni) {
// scalarBitWidth / elementBitWidth is ok since scalarBitWidth %
// elementBitWidth == 0.
bool isValid8 = (elementBitWidth == 8 && blockHeight_E >= 4 && blockWidth_E >= 4);
bool isValid16 = (elementBitWidth == 16 && blockHeight_E >= 2 && blockWidth_E >= 2 && blockWidth_E <= 32);
if (!(isValid8 || isValid16))
ok = false;
} if (!ok) {
dbgs() << " -- [ERROR] Load is invalid. Parameters mismatch.\n";
return false;
}
if (config().actualSimd) {
if (config().actualSimd != SIMD()) {
dbgs() << " -- [ERROR] Load is invalid. SIMD mismatch.\n";
return false;
}
});
return true;
}
unsigned LoadData::getMinGroupLength(unsigned atLeastThisLarge) const {
// There are no lower bounds for straight loads.
unsigned minGroupLen = 1;
if (transposed)
minGroupLen = divideCeil(32, elementBitWidth);
else if (vnni)
minGroupLen = divideCeil(32, scalarBitWidth);
// The minimal group length must be equal to or larger than:
// 1. minGroupLen
// 2. atLeastThisLarge
// 3. Its size in bytes must be at least equal to the size of 1 GRF to avoid
// padding.
// 4. It also must be a power of 2.
// For 3: the size in bytes of a load group of N elements is
// N * scalarMemSize_B(), so it must be at least the size of 1 GRF.
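// For example, for the final v16i8 example in the header comment we have
// scalarMemSize_B() == 16, so with 64-byte registers a requested group length
// of 2 is bumped up to PowerOf2Ceil(divideCeil(64, 16)) == 4.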
minGroupLen = PowerOf2Ceil(std::max(minGroupLen, atLeastThisLarge));
if (minGroupLen * scalarMemSize_B() < config().sizeOfRegs_B) {
minGroupLen = PowerOf2Ceil(divideCeil(config().sizeOfRegs_B, scalarMemSize_B()));
}
return minGroupLen;
}
bool LoadData::sameAsLoad(const LoadData &rhs) const {
return blockWidth_E == rhs.blockWidth_E && blockHeight_E == rhs.blockHeight_E && numOfBlocks == rhs.numOfBlocks &&
vectorLength == rhs.vectorLength && scalarBitWidth == rhs.scalarBitWidth &&
elementBitWidth == rhs.elementBitWidth && transposed == rhs.transposed && vnni == rhs.vnni;
}
std::string LoadData::getBlockLoadName() const {
return std::string("llvm.genx.GenISA.LSC2DBlockRead.") +
(1 < vectorLength ? "v" + std::to_string(vectorLength) : std::string()) + "i" + std::to_string(scalarBitWidth);
}
// ===========================================================================
bool Load::readFromLoad(GenIntrinsicInst *inGII) {
GII = inGII;
if (!fillBlockData()) {
DBG(dbgs() << " -- Could not fill block data.\n");
return false;
}
if (!isValidLoad()) {
DBG(dbgs() << " -- [ERROR] Load is invalid.\n");
return false;
}
return true;
}
bool Load::tracePicks() {
trace = std::make_unique<TraceData>();
if (!trace->tracePicks(GII)) {
DBG(dbgs() << " -- Tracing picks failed.\n");
return false;
}
if (trace->picks->empty()) {
DBG(dbgs() << " -- Empty picks.\n");
return false;
}
return true;
}
PossibleDims Load::possibleDims() {
if (!trace || !trace->picks || trace->picks->empty())
return {};
std::optional<MBRanges> minSplitOpt = makeGridUniformPicks(*trace->picks, vectorLength, groupLength());
if (!minSplitOpt || !minSplitOpt->size())
return {};
// those are already PowerOf2Ceil'ed:
unsigned minGrSize = getMinGroupLength(minSplitOpt->front().grSize);
unsigned minNumOfGr = minSplitOpt->front().numOfGr;
DBG(dbgs() << " -- Minimal block size = " << minGrSize << " x " << minNumOfGr << ".\n");
// For multiple blocks, if dims.numOfGr > 1, the subgroup cannot cover the
// entire group, i.e., dims.grSize < groupLength(). However, if dims.numOfGr =
// 1, the group size can be as large as the vector size. For example, 2x2 is a
// proper subdimension of 4x4, but 4x2 is not, as it is equivalent to 8x1.
PossibleDims dims;
// First, dimensions with a single group.
for (unsigned grSize = std::max(groupLength() * minNumOfGr, minGrSize); grSize <= vectorLength; grSize *= 2) {
if (grSize < config().minSplitSize_E || grSize * scalarMemSize_B() < config().minSplitSize_B)
continue;
dims.insert({grSize, 1});
}
// Next, dimensions with multiple groups.
for (unsigned grSize = minGrSize; grSize < groupLength(); grSize *= 2) {
for (unsigned numOfGr = minNumOfGr; numOfGr <= numOfBlocks; numOfGr *= 2) {
if (grSize * numOfGr < config().minSplitSize_E || grSize * numOfGr * scalarMemSize_B() < config().minSplitSize_B)
continue;
dims.insert({grSize, numOfGr});
}
}
if (dims.empty()) {
DBG(dbgs() << " -- [SKIP] No possible dimensions (including no split) "
"satisfy all the conditions.\n");
} else if (dims.size() == 1) {
DBG(dbgs() << " -- [SKIP] No possible splits.\n");
}
DBG(dbgs() << " -- Possible dimensions:\n"; for (const Dims &d : dims) { dbgs() << " -- " << d << "\n"; });
return dims;
}
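// For the final example in the header comment, splitting the flat 4x4 load
// over the range {8..15} gives blockStart = 2, numOfBlocks = 2,
// xOffset_E += 32, and blockHeight_E = 4, which matches %vec.1.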
Load &Load::splitFlat(const MBRange &range) {
unsigned blockStart = range.first / blockHeight_E;
unsigned blockEnd = range.last() / blockHeight_E;
vectorLength = static_cast<unsigned>(range.size());
numOfBlocks = blockEnd - blockStart + 1;
xOffset_E += static_cast<int>(blockStart * blockWidth_E);
yOffset_P += static_cast<int>(range.first % blockHeight_E); // old block height
blockHeight_E = range.size() / numOfBlocks; // new block height
return *this;
}
Load &Load::splitTransposed(const MBRange &range) {
vectorLength = static_cast<unsigned>(range.size());
xOffset_E += range.first;
blockWidth_E = static_cast<unsigned>(range.size());
return *this;
}
Load &Load::splitVNNI(const MBRange &range) {
unsigned scalarInElts = scalarBitWidth / elementBitWidth;
unsigned blockHeight_S = blockHeight_E / scalarInElts;
unsigned blockStart = range.first / blockHeight_S;
unsigned blockEnd = range.last() / blockHeight_S;
vectorLength = static_cast<unsigned>(range.size());
numOfBlocks = blockEnd - blockStart + 1;
xOffset_E += static_cast<int>(blockStart * blockWidth_E);
yOffset_P += static_cast<int>((range.first % blockHeight_S) * scalarInElts); // old block height
blockHeight_E = (range.size() / numOfBlocks) * scalarInElts; // new block height
return *this;
}
std::unique_ptr<Load> Load::splitLoadData(const MBRange &range) {
unsigned scalarToEltRatio = scalarBitWidth / elementBitWidth; // scalarBitWidth % elementBitWidth == 0 was checked.
if (range.first % scalarToEltRatio) {
DBG(dbgs() << " -- [ERROR] Position of the first element does not "
"divide the scalar to element ratio.\n");
return nullptr;
}
// We copy GII as well, because we will use it as the insertion point
// for new instructions. The copy constructor does NOT copy traces.
std::unique_ptr<Load> ret = std::make_unique<Load>(*this);
if (!transposed && !vnni) {
ret->splitFlat(range);
} else if (transposed) {
ret->splitTransposed(range);
} else {
ret->splitVNNI(range);
}
if (!ret->isValidLoad())
return nullptr;
return ret;
}
std::unique_ptr<Load> Load::split(const MBRange &range) {
if (vectorLength == range.size())
return nullptr;
std::unique_ptr<Load> ret = splitLoadData(range);
if (!ret)
return nullptr;
DBG(dbgs() << " -- [OK] Split load created for range " << range << ".\n");
if (trace)
ret->trace = trace->pickSubpicksOf(range);
DBG(dbgs() << " -- [OK] Subpicks picked.\n");
return ret;
}
CallInst *Load::putBlockLoad(IRBuilder<> &builder) {
builder.SetInsertPoint(GII);
std::array<Value *, NUM_OF_BLOCKLOAD_ARGS> args;
for (unsigned i = 0; i < args.size(); ++i) {
args[i] = GII->getArgOperand(i);
}
Function *newLoadFun = builder.GetInsertBlock()->getModule()->getFunction(getBlockLoadName());
if (!newLoadFun) {
newLoadFun = createFunction(getBlockLoadName(), builder.GetInsertBlock()->getModule(), args,
IGCLLVM::FixedVectorType::get(scalarTy, vectorLength), GII->getCalledFunction());
}
args[LSC2D_BlockRead::argBlockWidth_inElts] = builder.getInt32(blockWidth_E);
args[LSC2D_BlockRead::argBlockHeight_inElts] = builder.getInt32(blockHeight_E);
args[LSC2D_BlockRead::argNumOfBlocks] = builder.getInt32(numOfBlocks);
Value *valX = createAdd(builder, GII->getArgOperand(LSC2D_BlockRead::argXOffset_inElts), xOffset_E);
Value *valY = createAdd(builder, GII->getArgOperand(LSC2D_BlockRead::argYOffset_inPitches), yOffset_P);
args[LSC2D_BlockRead::argXOffset_inElts] = valX;
args[LSC2D_BlockRead::argYOffset_inPitches] = valY;
CallInst *newGII = builder.CreateCall(newLoadFun->getFunctionType(), newLoadFun, args);
GII = cast<GenIntrinsicInst>(newGII);
return newGII;
}
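// Read the block geometry from the intrinsic's arguments. Splitting is only
// possible when the width, height, element size, block count and the
// transpose/VNNI flags are compile-time constants; the X/Y offsets and the
// base address may remain dynamic.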
bool Load::fillBlockData() {
if (!(isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argBlockHeight_inElts)) &&
isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argBlockWidth_inElts)) &&
isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argSizeInBits)) &&
isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argNumOfBlocks)) &&
isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argIsTranspose)) &&
isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argIsVNNI)))) {
return false;
}
std::tie(scalarTy, vectorLength) = getScalarTypeAndSize(GII);
if (!scalarTy)
return false;
// For block loads, the actual offsets may not be constant, so we only track
// the relative offsets here.
xOffset_E = 0;
yOffset_P = 0;
elementBitWidth = getArgZ(GII, LSC2D_BlockRead::argSizeInBits);
blockWidth_E = getArgZ(GII, LSC2D_BlockRead::argBlockWidth_inElts);
blockHeight_E = getArgZ(GII, LSC2D_BlockRead::argBlockHeight_inElts);
numOfBlocks = getArgZ(GII, LSC2D_BlockRead::argNumOfBlocks);
transposed = getArgZ(GII, LSC2D_BlockRead::argIsTranspose);
vnni = getArgZ(GII, LSC2D_BlockRead::argIsVNNI);
// We don't do anything with cache flags or the read address, so we will just
// copy their values in putBlockLoad.
scalarBitWidth = scalarTy->getBitWidth();
return true;
}
void Load::removeOldInstructions() {
if (trace)
trace->removeOldInstructions(); // The traced instructions already include the GII call.
}
// ==========================================================================
namespace IGC::LS {
struct LoadSplitter::Impl {
static std::unique_ptr<LoadSplitter::Impl> Create(Function *inF, CodeGenContext *inCGC, IGCLivenessAnalysis *inRPE);
bool isRPHigh(BasicBlock *BB);
PossibleDims possibleDims(GenIntrinsicInst *GII);
bool split(GenIntrinsicInst *GII, Dims dims);
bool splitAllToSmallest(BasicBlock *BB);
private:
SmallDenseMap<GenIntrinsicInst *, std::unique_ptr<Load>, DEF_NUM_OF_LOADS> blockLoadsMap;
SmallDenseMap<GenIntrinsicInst *, Dims, DEF_NUM_OF_LOADS> dimsMap;
using SplitLoads = SmallVector<std::unique_ptr<Load>, DEF_NUM_OF_LOADS>;
bool processBlockLoad(GenIntrinsicInst *GII);
SplitLoads splitBlockLoad(Load &load, const MBRanges &splits);
bool putBlockLoad(Load &load, const std::string &nameExt = std::string());
};
// ==========================================================================
std::unique_ptr<LoadSplitter> LoadSplitter::Create(Function *inF, CodeGenContext *inCGC, IGCLivenessAnalysis *inRPE) {
std::unique_ptr<LoadSplitter> ret = std::unique_ptr<LoadSplitter>(new LoadSplitter());
ret->impl = Impl::Create(inF, inCGC, inRPE);
if (!ret->impl) {
return nullptr;
}
return ret;
}
bool LoadSplitter::isRPHigh(BasicBlock *BB) { return impl->isRPHigh(BB); }
PossibleDims LoadSplitter::possibleDims(GenIntrinsicInst *GII) { return impl->possibleDims(GII); }
bool LoadSplitter::splitAllToSmallest(BasicBlock *BB) { return impl->splitAllToSmallest(BB); }
bool LoadSplitter::split(GenIntrinsicInst *GII, Dims dims) { return impl->split(GII, dims); }
// ==========================================================================
std::unique_ptr<LoadSplitter::Impl> LoadSplitter::Impl::Create(Function *inF, CodeGenContext *inCGC,
IGCLivenessAnalysis *inRPE) {
std::unique_ptr<LoadSplitter::Impl> ret = std::unique_ptr<LoadSplitter::Impl>(new LoadSplitter::Impl());
if (!config().initialize(inF, inCGC, inRPE)) {
return nullptr;
}
return ret;
}
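// Register pressure for the block, in bytes, is approximated as the peak
// register count reported by the liveness analysis times the register size.
// E.g. (hypothetical numbers): a peak of 128 registers of 32 B each gives
// 4096 B, which is then compared against splitThreshold_B.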
bool LoadSplitter::Impl::isRPHigh(BasicBlock *BB) {
int regPressure = config().RPE->getMaxRegCountForBB(*BB, config().SIMD()) * config().sizeOfRegs_B;
DBG(dbgs() << " -- Reg Pressure = " << regPressure << " B, threshold = " << config().splitThreshold_B << " B.\n");
if (regPressure <= config().splitThreshold_B) {
DBG(dbgs() << " [SKIP] Register pressure below threshold.\n");
return false;
}
DBG(dbgs() << " [OK] Reg pressure high.\n");
return true;
}
bool LoadSplitter::Impl::processBlockLoad(GenIntrinsicInst *GII) {
if (!GII)
return false;
auto ptr = blockLoadsMap.find(GII);
if (ptr != blockLoadsMap.end())
return true;
std::unique_ptr<Load> load = std::make_unique<Load>();
if (!load->readFromLoad(GII)) {
DBG(dbgs() << " -- [SKIP] Invalid block load.\n");
return false;
}
DBG(dbgs() << " -- [OK] Valid block load.\n");
if (!load->tracePicks()) {
DBG(dbgs() << " -- [SKIP] Invalid picks.\n");
return false;
}
DBG(dbgs() << " -- [OK] Picks are valid:\n";
for (const Pick &pick : *load->trace->picks) { dbgs() << " -- " << pick << "\n"; });
blockLoadsMap[GII] = std::move(load);
return true;
}
PossibleDims LoadSplitter::Impl::possibleDims(GenIntrinsicInst *GII) {
if (!GII)
return {};
DBG(dbgs() << "\nPossible split dimensions for: " << *GII << ".\n");
PossibleDims dims;
if (GII->getIntrinsicID() == GenISAIntrinsic::GenISA_LSC2DBlockRead) {
if (!processBlockLoad(GII))
return {};
Load &load = *blockLoadsMap[GII];
DBG(dbgs() << " -- load = " << *load.GII << ".\n");
dims = load.possibleDims();
} else {
DBG(dbgs() << " [ERROR] The intrinsic: " << *GII << " is not a load.\n");
return {};
}
return dims;
}
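// Create one split load per range. Ranges for which no picks remain (the new
// load would have no users) are dropped, so fewer than numOfSplits loads may
// be produced; any invalid split aborts the whole operation.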
LoadSplitter::Impl::SplitLoads LoadSplitter::Impl::splitBlockLoad(Load &load, const MBRanges &splits) {
unsigned numOfSplits = load.vectorLength / splits.front().size();
if (numOfSplits == 1) {
DBG(dbgs() << " -- [SKIP] No need to split the block load.\n");
return {};
}
SplitLoads splitLoads;
splitLoads.reserve(numOfSplits);
for (unsigned m = 0; m < numOfSplits; ++m) {
std::unique_ptr<Load> splitLoad = load.split(splits[m]);
if (!splitLoad) {
DBG(dbgs() << " -- [ERROR] Split is not valid.\n");
return {};
}
if (splitLoad->trace->picks->empty()) {
DBG(dbgs() << " -- [SKIP] The new load has no users.\n");
continue;
}
splitLoads.push_back(std::move(splitLoad));
}
return splitLoads;
}
bool LoadSplitter::Impl::putBlockLoad(Load &load, const std::string &nameExt) {
IRBuilder<> builder(load.GII);
CallInst *V = load.putBlockLoad(builder);
DBG(dbgs() << " -- [OK] New load put: " << *V << ".\n");
if (load.GII->hasName()) {
V->setName(load.GII->getName() + nameExt);
}
load.trace->putPicks(builder, V);
DBG(dbgs() << " -- [OK] Picks put.\n");
return true;
}
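// Split a single block load according to `dims`: build uniform ranges of
// dims.grSize elements and dims.numOfGr groups, verify that no traced pick
// straddles a range boundary, emit the new loads and their picks, and erase
// the original instructions on success.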
bool LoadSplitter::Impl::split(GenIntrinsicInst *GII, Dims dims) {
if (!GII)
return false;
if (GII->getIntrinsicID() != GenISAIntrinsic::GenISA_LSC2DBlockRead) {
DBG(dbgs() << " -- [ERROR] The intrinsic: " << *GII << " is not a load.\n");
return false;
}
DBG(dbgs() << " -- Block load: " << *GII << "\n");
DBG(dbgs() << " -- Dimensions: " << dims << "\n");
auto doPicksFit = [](const MBRanges &mbrs, const Picks &picks) -> bool {
for (unsigned r = 0; r < mbrs.size(); ++r) {
const MBRange &mbr = mbrs[r];
for (const Pick &pick : picks) {
if (mbr.containsOrExcludes(pick, false) == MBRange::Containment::Intersects) {
return false;
}
}
}
return true;
};
if (!processBlockLoad(GII))
return false;
Load &load = *blockLoadsMap[GII];
if (load.vectorLength <= dims.size()) {
DBG(dbgs() << " -- [SKIP] Nothing to split.\n");
return false;
}
std::optional<MBRanges> splitsOpt = makeUniform(dims.grSize, load.groupLength(), dims.numOfGr, load.vectorLength);
if (!splitsOpt) {
DBG(dbgs() << " -- [ERROR] Split is not valid.\n");
return false;
}
MBRanges &splits = *splitsOpt;
if (!doPicksFit(splits, *load.trace->picks)) {
DBG(dbgs() << " -- [ERROR] Picks do not fit the splits.\n");
return false;
}
auto newLoads = splitBlockLoad(load, splits);
if (newLoads.empty()) {
DBG(dbgs() << " -- [ERROR] Splitting failed.\n");
return false;
}
for (unsigned n = 0; n < newLoads.size(); ++n) {
putBlockLoad(*newLoads[n], ".split." + std::to_string(n));
}
load.removeOldInstructions();
return true;
}
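// Per-basic-block driver: unless the threshold check is disabled, bail out
// when register pressure is low; otherwise collect every splittable 2D block
// load in the block, compute the smallest possible split dimensions for each,
// and split them all.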
bool LoadSplitter::Impl::splitAllToSmallest(BasicBlock *BB) {
DBG(dbgs() << "\nNew BB: " << BB->getName() << ".\n");
if (!config().ignoreSplitThreshold && !isRPHigh(BB)) {
DBG(dbgs() << " [SKIP] Register pressure below threshold.\n");
return false;
}
blockLoadsMap.clear();
dimsMap.clear();
for (auto I = BB->begin(); I != BB->end(); ++I) {
GenIntrinsicInst *GII = dyn_cast<GenIntrinsicInst>(&*I);
if (!GII)
continue;
if (GII->getIntrinsicID() == GenISAIntrinsic::GenISA_LSC2DBlockRead) {
DBG(dbgs() << "\nNew block load: " << *GII << ".\n");
if (!processBlockLoad(GII)) {
DBG(dbgs() << " [SKIP] Invalid block load.\n");
continue;
}
DBG(dbgs() << " [OK] Load is potentially splittable.\n");
}
}
DBG(dbgs() << "\nLoads tracing done.\n");
if (blockLoadsMap.empty()) {
DBG(dbgs() << " [SKIP] No loads left to split.\n");
return false;
}
DBG(dbgs() << " -- Number of block loads = " << blockLoadsMap.size() << ".\n");
DBG(dbgs() << "\nCalculating possible splits:\n");
for (auto &[GII, loadPtr] : blockLoadsMap) {
DBG(dbgs() << " -- Block load: " << *GII << ".\n");
PossibleDims dims = possibleDims(GII);
if (dims.empty())
continue;
dimsMap[GII] = *std::min_element(dims.begin(), dims.end());
}
if (dimsMap.empty()) {
DBG(dbgs() << " [SKIP] No splits left.\n");
return false;
}
DBG(dbgs() << " [OK] Smallest possible splits found.\n");
bool codeChanges = false;
DBG(dbgs() << "\nSplitting loads:\n");
for (auto &[GII, dims] : dimsMap) {
if (!split(GII, dims)) {
DBG(dbgs() << " [SKIP] Split failed.\n");
continue;
}
codeChanges = true;
}
DBG(dbgs() << " [OK] Splitting done.\n");
return codeChanges;
}
} // namespace IGC::LS
// ==========================================================================
namespace {
class SplitLoads : public FunctionPass {
public:
// Pass identification, replacement for typeid
static char ID;
virtual StringRef getPassName() const override { return "SplitLoads"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<CodeGenContextWrapper>();
AU.addRequired<IGCLivenessAnalysis>();
AU.setPreservesCFG();
}
SplitLoads();
SplitLoads(const SplitLoads &) = delete;
SplitLoads &operator=(const SplitLoads &) = delete;
virtual bool runOnFunction(Function &F) override;
private:
std::unique_ptr<LoadSplitter> loadSplitter = nullptr;
};
} // namespace
char SplitLoads::ID = 0;
// Register pass to igc-opt
#define PASS_FLAG "igc-split-loads"
#define PASS_DESCRIPTION "Splits 2D LSC block loads into smaller chunks"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(SplitLoads, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_DEPENDENCY(IGCLivenessAnalysis)
IGC_INITIALIZE_PASS_END(SplitLoads, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
FunctionPass *IGC::createSplitLoadsPass() { return new SplitLoads(); }
SplitLoads::SplitLoads() : FunctionPass(ID) { initializeSplitLoadsPass(*PassRegistry::getPassRegistry()); }
bool SplitLoads::runOnFunction(Function &F) {
if (!config().enableLoadSplitting || skipFunction(F)) {
return false;
}
loadSplitter = LoadSplitter::Create(&F, getAnalysis<CodeGenContextWrapper>().getCodeGenContext(),
&getAnalysis<IGCLivenessAnalysis>());
if (!loadSplitter) {
return false;
}
DBG(dbgs() << "\nSPLITLOADS ON: " << F.getName() << "\n");
auto pad = [](const std::string &s, size_t len) -> std::string {
return s.size() < len ? s + std::string(len - s.size(), ' ') : s.substr(0, len);
};
bool codeChanged = false;
for (BasicBlock &BB : F) {
bool splitterChangesCode = loadSplitter->splitAllToSmallest(&BB);
codeChanged |= splitterChangesCode;
if (splitterChangesCode) {
DBG(dbgs() << "BB: " << pad(BB.getName().str(), 20) << " : SPLIT SUCCESSFUL.\n");
} else {
DBG(dbgs() << "BB: " << pad(BB.getName().str(), 20) << " : NO SPLITS.\n");
}
}
DBG(dbgs() << "\n");
return codeChanged;
}
#undef DEBUG_TYPE