mirror of
				https://github.com/intel/intel-graphics-compiler.git
				synced 2025-10-30 08:18:26 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			1947 lines
		
	
	
		
			72 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			1947 lines
		
	
	
		
			72 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*========================== begin_copyright_notice ============================
 | |
| 
 | |
| Copyright (C) 2025 Intel Corporation
 | |
| 
 | |
| SPDX-License-Identifier: MIT
 | |
| 
 | |
| ============================= end_copyright_notice ===========================*/
 | |
| 
 | |
| #include "Compiler/CISACodeGen/SplitLoads.h"
 | |
| 
 | |
| #include "Compiler/CISACodeGen/IGCLivenessAnalysis.h"
 | |
| #include "Compiler/CodeGenPublic.h"
 | |
| #include "Compiler/IGCPassSupport.h"
 | |
| #include "Compiler/MetaDataUtilsWrapper.h"
 | |
| 
 | |
| // clang-format off
 | |
| #include "common/LLVMWarningsPush.hpp"
 | |
| #include "llvmWrapper/IR/DerivedTypes.h"
 | |
| #include "llvmWrapper/IR/Function.h"
 | |
| #include "llvmWrapper/IR/Value.h"
 | |
| #include "llvm/Support/MathExtras.h"
 | |
| #include "common/LLVMWarningsPop.hpp"
 | |
| // clang-format on
 | |
| 
 | |
| #include "IGC/common/Types.hpp"
 | |
| 
 | |
| #include <algorithm>
 | |
| #include <array>
 | |
| #include <deque>
 | |
| #include <optional>
 | |
| #include <string>
 | |
| 
 | |
| using namespace llvm;
 | |
| using namespace IGC;
 | |
| using namespace IGC::LS;
 | |
| 
 | |
| #define DEBUG_TYPE "igc-split-loads"
 | |
| 
 | |
| // ============================================================================
 | |
| // The goal of this feature:
 | |
| // ============================================================================
 | |
| //
 | |
| // The file provides basic tools for splitting 2D LSC block loads of the form:
 | |
| //  -- <N x iX> @llvm.genx.GenISA.LSC2DBlockRead.vNiX(i64, i32, i32, i32, i32,
 | |
| //  i32, i32, i32, i32, i32, i1, i1, i32)
 | |
| //
 | |
| // For the load to be eligible for splitting, the loaded vector must be
 | |
| // subsequently split into smaller chunks. For example, consider the load of a
 | |
| // 16-element vector
 | |
| //  -- %vec = call <16 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v16i16(i64 %ptr,
 | |
| //  i32 127, i32 63, i32 127, i32 0, i32 0, i32 16, i32 16, i32 16, i32 1, i1
 | |
| //  false, i1 false, i32 0)
 | |
| //
 | |
| // that is subsequently split into two 8-element vectors:
 | |
| //  -- %pick.0 = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32
 | |
| //  0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 | |
| //  -- %pick.1 = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32
 | |
| //  8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 | |
| // that are finally fed into some users:
 | |
| //  -- call void @fun_v8i16(<8 x i16> %pick.0)
 | |
| //  -- call void @fun_v8i16(<8 x i16> %pick.1)
 | |
| //
 | |
| // This sequence can be replaced by 2 smaller loads that feed directly into the
 | |
| // users:
 | |
| //  -- %vec.0 = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %ptr,
 | |
| //  i32 127, i32 63, i32 127, i32 0, i32 0, i32 16, i32 16, i32 8, i32 1, i1
 | |
| //  false, i1 false, i32 0)
 | |
| //  -- %vec.1 = call <8 x i16> @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64 %ptr,
 | |
| //  i32 127, i32 63, i32 127, i32 0, i32 8, i32 16, i32 16, i32 8, i32 1, i1
 | |
| //  false, i1 false, i32 0)
 | |
| //  -- call void @fun_v8i16(<8 x i16> %vec.0)
 | |
| //  -- call void @fun_v8i16(<8 x i16> %vec.1)
 | |
| //
 | |
| // Whether this is beneficial or not depends on the register pressure and
 | |
| // rescheduling possibilities.
 | |
| //
 | |
| // ============================================================================
 | |
| // Outline of the tool:
 | |
| // ============================================================================
 | |
| //
 | |
| // A given load can be split by the instance of `LoadSplitter` created by the
 | |
| // factory function
 | |
| //  -- static std::unique_ptr<LoadSplitter> Create(Function *inF, CodeGenContext
 | |
| //  *inCGC, IGCLivenessAnalysis *inRPE);
 | |
| //
 | |
| // Given a load `GenIntrinsicInst *GII`, all possible split dimensions (see
 | |
| // below for details) can be obtained by calling
 | |
| //  -- PossibleDims LoadSplitter::possibleDims(GenIntrinsicInst *GII);
 | |
| //
 | |
| // The splitting is then carried out by
 | |
| //  -- bool LoadSplitter::split(GenIntrinsicInst *GII, Dims dims);
 | |
| // where `dims` represent the desired dimensions of the split.
 | |
| //
 | |
| // To split all loads in a basic block use:
 | |
| //  -- bool LoadSplitter::splitAllToSmallest(BasicBlock *BB);
 | |
| //
 | |
| // Splitting can be carried out automatically by the pass `SplitLoads`.
 | |
| // To activate the pass, set the IGC flag `LS_enableLoadSplitting=1`.
 | |
| //
 | |
| // The splitting procedure consists of the following phases:
 | |
| // 1. Process the load and its users to figure out the split structure of the
 | |
| // load.
 | |
| // 2. Calculate possible split dimensions.
 | |
| // 3. Split.
 | |
| //
 | |
| // ============================================================================
 | |
| // I.    Process the load
 | |
| // ============================================================================
 | |
| //
 | |
| // Parameters of the intrinsic are stored and managed in the class `LoadData`.
 | |
| // After verifying the validity of the parameters, the users of the load are
 | |
| // traced and stored by the class `TraceData`. The tracing is carried out in
 | |
| // `TraceData::tracePicks` and is done as follows:
 | |
| //
 | |
| // 1. A subvector of the loaded vector can be picked by either
 | |
| // `ShuffleVectorInst` or a sequence of `InsertElementInst` and
 | |
| // `ExtractElementInst`. If the picking is carried out by `ShuffleVectorInst`,
 | |
| // the picks must come entirely from one of its arguments, with another one
 | |
| // being an explicit constant (this includes undefs, zeroinitializers). The
 | |
| // indices must be constant. Here the indices can repeat and undefs are allowed.
 | |
| //
 | |
| // If the picking is carried out by the sequence of `InsertElementInst` and
 | |
| // `ExtractElementInst`, the sequence must start with:
 | |
| //  -- %ext.0 = extractelement <16 x i16> %vec, i32 [from]
 | |
| //  -- %pick.0 = insertelement <8 x i16> undef, i16 %ext.0, i32 [to]
 | |
| // and continue by the repetition of:
 | |
| //  -- %ext.n = extractelement <16 x i16> %vec, i32 [from]
 | |
| //  -- %pick.n = insertelement <8 x i16> %pick.(n-1), i16 %ext.n, i32 [to]
 | |
| // The indices [from] and [to] must be constant. Each extractelement must have a
 | |
| // single user, which is the corresponding insertelement. Each insertelement,
 | |
| // except for the last, must have a single user, which is the next
 | |
| // insertelement.
 | |
| //
 | |
| // The picks can be stacked. For example,
 | |
| // -- %pick.1 = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32
 | |
| // 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 | |
| // -- %pick.1.1 = shufflevector <8 x i16> %pick.1, <8 x i16> undef, <4 x i32>
 | |
| // <i32 4, i32 5, i32 6, i32 7>
 | |
| //
 | |
| // picks elements {12, 13, 14, 15} of the original %vec.
 | |
| //
 | |
| // The conditions above guarantee that the picks form a tree. Furthermore, no
 | |
| // other values, except explicit constants and undefs, are used in the picks.
 | |
| // This guarantees that the instructions can be safely erased after the split
 | |
| // is done.
 | |
| //
 | |
| // 2. `BitCastInst` are allowed to appear in the tree of the picks if the bit
 | |
| // width of the scalars does not change. Thus, a load of i32's can be cast to
 | |
| // float, i16's to hf's, etc. If multiple bitcasts appear in the tree, the
 | |
| // scalar type is recalculated each time.
 | |
| //
 | |
| // 3. Once a user that is not a node in the tree of picks is identified, it is
 | |
| // saved. The users are grouped by the picks they use as well as the types they
 | |
| // are cast to. Thus, it is possible to have multiple users of the same pick,
 | |
| // and with different types as well.
 | |
| //
 | |
| // ============================================================================
 | |
| // II.   Calculate possible split dimensions
 | |
| // ============================================================================
 | |
| //
 | |
| // Once the tree of picks (and casts) is created, we want to calculate possible
 | |
| // split dimensions. This is carried out in Load::possibleDims().
 | |
| // We require that the picks obey two conditions:
 | |
| // 1. Each pick must be a multi-block range (`MBRange`).
 | |
| // 2. All picks must be grid-uniform.
 | |
| //
 | |
| // Ad 1. A multi-block range (MBRange) is a sequence of groups, each group
 | |
| // containing the same number of consecutive integers. The gaps between the
 | |
| // consecutive groups must also be equal. For example, a pick
 | |
| //  -- {0,1,4,5,8,9,12,13}
 | |
| // is a valid MBRange as it contains four groups of equal size of consecutive
 | |
| // integers ( {0,1}, {4,5}, {8,9}, {12,13} ) and the gap
 | |
| // between the groups is constant. All picks must be valid MBRanges.
 | |
| //
 | |
| // Given an MBRange, we say it has dimensions RxC, where R is the size of each
 | |
| // group and C the number of groups. In the example above, RxC = 2x4. We can
 | |
| // think about R and C as numbers of rows and columns in the RxC grid.
 | |
| //
 | |
| // If the pick consists of consecutive integers only, e.g. {2,3,4,5}, we prefer
 | |
| // using C=1, so here we would have RxC = 4x1. However, 2x2 and 1x4 are also
 | |
| // valid dimensions for this pick.
 | |
| //
 | |
| // Ad 2. Possible splits are determined by the size of the loaded vector and the
 | |
| // number of blocks read by the intrinsic. Let's say the vector has length V
 | |
| // (such as <V x i16>) and B blocks are read (B is argument 9, counting from 0, of the
 | |
| // intrinsic or "vB" in the OpenCL intrinsic such as
 | |
| // @__builtin_IB_subgroup_block_read_cacheopts_u32_m8k16v2). We will say that
 | |
| // the dimensions of the load are GxB, where G=V/B is the size of the group. We
 | |
| // can think about the vector as a grid with G rows and B columns.
 | |
| //
 | |
| // The picks are
 | |
| // grid-uniform if they are all MBRanges of the same dimensions and they
 | |
| // constitute a tiling of the grid. They cannot overlap, repeat indices or miss
 | |
| // any indices.
 | |
| //
 | |
| // For example, consider a 16-element vector loaded by an intrinsic with 4
 | |
| // blocks. This constitutes a 4x4 grid. The grid can be tiled by the MBRanges of
 | |
| // the following dimensions:
 | |
| //  -- 1x1 ( {0}, {1}, ..., {15} )
 | |
| //  -- 2x1 ( {0,1}, {2,3}, {4,5}, {6,7}, {8,9}, {10,11}, {12,13}, {14,15})
 | |
| //  -- 4x1 ( {0,1,2,3}, {4,5,6,7}, {8,9,10,11}, {12,13,14,15})
 | |
| //  -- 8x1 = 4x2 ( {0,1,2,3,4,5,6,7}, {8,9,10,11,12,13,14,15})
 | |
| //  -- 16x1 = 8x2 = 4x4 ( {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15})
 | |
| //  -- 1x2 ( {0,4}, {1,5}, {2,6}, {3,7}, {8,12}, {9,13}, {10,14}, {11,15} )
 | |
| //  -- 2x2 ( {0,1,4,5}, {2,3,6,7}, {8,9,12,13}, {10,11,14,15} )
 | |
| //  -- 1x4 ( {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15} )
 | |
| //  -- 2x4 ( {0,1,4,5,8,9,12,13}, {2,3,6,7,10,11,14,15} )
 | |
| // Note that 8x1 = 4x2 and 16x1 = 8x2 = 4x4.
 | |
| //
 | |
| // In Load::possibleDims() we first check conditions 1 and 2. If they are
 | |
| // satisfied, we calculate the dimensions of the picks. This gives us the
 | |
| // smallest possible split. In addition, we calculate all other possible
 | |
| // dimensions. For example, if the picks form a uniform subgrid of dimension 2x2
 | |
| // of the grid 4x4, then the loads can be split into:
 | |
| //  -- 2x2, 2x4, 8x1, 16x1.
 | |
| //
 | |
| // ============================================================================
 | |
| // III.  Split
 | |
| // ============================================================================
 | |
| //
 | |
| // First, for each MBRange, the new parameters of the load after the split are
 | |
| // calculated and stored in the new instance of the `Load` class. After this is
 | |
| // done, the new loads are created and inserted into the IR. If the size of the
 | |
| // new loads is equal exactly to the size of the picks, the users can be
 | |
| // connected directly to the new loads (with possible bitcasts). If the new
 | |
| // loads produce larger vectors than the users consume, a sequence of
 | |
| // `InsertElementInst` and `ExtractElementInst` is inserted in order to split
 | |
| // the vectors into smaller chunks.
 | |
| //
 | |
| // As a final example consider a load that has 4 blocks and produces a 16-element
 | |
| // vector, i.e., it has dimension 4x4,
 | |
| //  -- %vec = call <16 x i8> @llvm.genx.GenISA.LSC2DBlockRead.v16i8(i64 %ptr,
 | |
| //  i32 127, i32 63, i32 127, i32 0, i32 0, i32 8, i32 16, i32 4, i32 4, i1
 | |
| //  false, i1 false, i32 0)
 | |
| //
 | |
| // The load is split into four 4-element vectors and the picks
 | |
| // are grid-uniform of dimension 2x2,
 | |
| //  -- %pick.0 = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32
 | |
| //  0, i32 1, i32 4, i32 5>
 | |
| //  -- %pick.1 = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32
 | |
| //  2, i32 3, i32 6, i32 7>
 | |
| //  -- %pick.2 = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32
 | |
| //  8, i32 9, i32 12, i32 13>
 | |
| //  -- %pick.3 = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32
 | |
| //  10, i32 11, i32 14, i32 15>
 | |
| //
 | |
| // The minimal possible split has dimension 2x2. However, one of the conditions
 | |
| // for the splits to work is that the size of the split vector x SIMD fills up
 | |
| // at least 1 GRF. Otherwise the load involves padding, which would break the
 | |
| // validity of the splits. Thus, the smallest valid split has dimension 4x2 =
 | |
| // 8x1. In such a case the new loads produce 8-element vector and have 2 blocks:
 | |
| //  -- %vec.0 = call <8 x i8> @llvm.genx.GenISA.LSC2DBlockRead.v8i8(i64 %ptr,
 | |
| //  i32 127, i32 63, i32 127, i32 0, i32 0, i32 8, i32 16, i32 4, i32 2, i1
 | |
| //  false, i1 false, i32 0)
 | |
| //  -- %vec.1 = call <8 x i8> @llvm.genx.GenISA.LSC2DBlockRead.v8i8(i64 %ptr,
 | |
| //  i32 127, i32 63, i32 127, i32 32, i32 0, i32 8, i32 16, i32 4, i32 2, i1
 | |
| //  false, i1 false, i32 0)
 | |
| //
 | |
| // We still have to pick the 4-element subvectors %pick.n, n=0,1,2,3, of the
 | |
| // 8-element vectors %vec.0 and %vec.1. This is done by inserting a sequence of
 | |
| // `InsertElementInst` and `ExtractElementInst`. In this particular case the
 | |
| // picks are {0,1,4,5} and {2,3,6,7} for both vectors. This example (and many
 | |
| // others) can be found in the test file `isa_flak_k16.ll` as the test function
 | |
| // `@i8_4x4_to_2x2`.
 | |
| //
 | |
| // ============================================================================
 | |
| 
 | |
| namespace IGC::LS {
 | |
| Config &config() { return Config::get(); }
 | |
| } // namespace IGC::LS
 | |
| 
 | |
| namespace {
 | |
| 
 | |
// Default inline capacities used by the small containers in this file.

/// Inline capacity of the `SmallVector` that backs a `Pick`.
constexpr unsigned DEF_PICK_SIZE{64};
/// Default number of loads (its use is outside the part of the file that
/// declares it).
constexpr unsigned DEF_NUM_OF_LOADS{4};
/// Inline capacity of the per-load containers of picks and ranges.
constexpr unsigned DEF_NUM_OF_PICKS_PER_LOAD{4};
/// Inline capacity of the per-pick containers of casts or users.
constexpr unsigned DEF_NUM_OF_CASTS_OR_USERS_PER_PICK{2};
/// Square of `DEF_NUM_OF_CASTS_OR_USERS_PER_PICK`.
constexpr unsigned DEF_NUM_OF_USERS_PER_PICK{DEF_NUM_OF_CASTS_OR_USERS_PER_PICK *
                                             DEF_NUM_OF_CASTS_OR_USERS_PER_PICK};
/// Default number of options (its use is outside the part of the file that
/// declares it).
constexpr unsigned DEF_NUM_OF_OPTS{4};
 | |
| 
 | |
| struct Pick;
 | |
| struct MBRange;
 | |
| 
 | |
| /// `Pick` represents a mask with some additional information.
 | |
| /// It is a vector of integers, where each element is either an index or `-1`
 | |
| /// (undef).
 | |
| struct Pick : public SmallVector<int, DEF_PICK_SIZE> {
 | |
|   explicit Pick() : SmallVector<int, DEF_PICK_SIZE>() {}
 | |
|   explicit Pick(unsigned size, int init = -1) : SmallVector<int, DEF_PICK_SIZE>(size, init) {}
 | |
|   Pick(std::initializer_list<int> init) : SmallVector<int, DEF_PICK_SIZE>(init) {}
 | |
| 
 | |
|   /// Returns `true` if the pick is `{0,1,2,...,length-1}`.
 | |
|   bool isTrivial(unsigned length) const;
 | |
| 
 | |
|   /// Returns `true` if the element is contained in the pick.
 | |
|   bool contains(int x) const { return std::find(begin(), end(), x) != end(); }
 | |
| 
 | |
|   /// Uses elements of `this` as indices to pick the elements from `origin`.
 | |
|   /// If the index is out of range, places -1.
 | |
|   Pick pickFrom(const Pick &origin) const;
 | |
| 
 | |
|   /// Creates the pick from a given mask. The elements must belong to the range
 | |
|   /// [`begin`, `end`] or be undefs. If successful, returns the `Pick` with
 | |
|   /// `begin` subtracted from all elements. Otherwise std::nullopt is returned.
 | |
|   static std::optional<Pick> fromMask(ArrayRef<int> mask, int begin, int end);
 | |
| 
 | |
|   /// The index corresponding to the `n`-th element of `mbr` in `this` is mapped
 | |
|   /// to `n`. If the index is absent in `mbr`, places -1.
 | |
|   Pick narrowTo(const MBRange &mbr) const;
 | |
| 
 | |
|   /// Creates the pick containing all numbers from `begin` to `begin + size -
 | |
|   /// 1`.
 | |
|   static Pick createIdentityPick(unsigned size, int begin = 0);
 | |
| };
 | |
| 
 | |
| /// `MBRange` (multi-block range) represents a sequence of groups, each group
 | |
| /// containing the same number of consecutive integers.
 | |
| /// - `first` denotes the first element in the `MBRange`.
 | |
| /// - `grSize` denotes the number of elements in each group.
 | |
| /// - `grPitch` denotes the distance between the first elements of two
 | |
| /// consecutive groups.
 | |
| /// - `numOfGr` denotes the number of groups.
 | |
| /// If `grPitch <= grSize`, then `numOfGr` must be equal to `1`.
 | |
| struct MBRange {
 | |
|   int first = 0;
 | |
|   unsigned grSize = 0;
 | |
|   int grPitch = 0;
 | |
|   unsigned numOfGr = 0;
 | |
| 
 | |
|   explicit MBRange() = default;
 | |
|   MBRange(int first, unsigned grSize, int grPitch, unsigned numOfGr)
 | |
|       : first(first), grSize(grSize), grPitch(grPitch), numOfGr(numOfGr) {}
 | |
| 
 | |
|   /// Returns empty range.
 | |
|   static MBRange getEmpty() { return MBRange(); }
 | |
| 
 | |
|   /// Returns `true` if the range is empty.
 | |
|   bool empty() const { return !grSize || !numOfGr || !grPitch; }
 | |
| 
 | |
|   /// Returns the total number of elements in the range.
 | |
|   int size() const { return grSize * numOfGr; }
 | |
| 
 | |
|   /// Returns the last element of the range.
 | |
|   int last() const {
 | |
|     return grPitch * (static_cast<int>(0 < numOfGr ? numOfGr : 1) - 1) + first + static_cast<int>(grSize) - 1;
 | |
|   }
 | |
| 
 | |
|   /// Returns the `n`-th element of the range.
 | |
|   int operator[](int n) const;
 | |
| 
 | |
|   /// Returns the `elt` element of the `group`-th group.
 | |
|   int operator()(int group, int elt) const;
 | |
| 
 | |
|   /// Returns the index corresponding to element `x` of the range. If `x` is not
 | |
|   /// in the range, returns `-1`.
 | |
|   int indexOf(int x) const;
 | |
| 
 | |
|   /// Returns `true` if the element is contained in the range.
 | |
|   bool contains(int x) const;
 | |
| 
 | |
|   enum class Containment { Contains, Excludes, Intersects };
 | |
|   /// Checks the relation between the range and the pick.
 | |
|   /// - Returns `Containment::Contains` if the pick is contained fully in the
 | |
|   /// range.
 | |
|   /// - Returns `Containment::Excludes` if the pick lies entirely outside of the
 | |
|   /// range.
 | |
|   /// - Returns `Containment::Intersects` if the pick is sliced by the range.
 | |
|   /// Undefs are discarded if `allowUndefs` is `true`. Otherise
 | |
|   /// `Containment::Excludes` is returned.
 | |
|   Containment containsOrExcludes(const Pick &pick, bool allowUndefs) const;
 | |
| 
 | |
|   /// Returns the pick of successive elements corresponding to the range.
 | |
|   Pick toPick() const;
 | |
| 
 | |
|   /// Converts the pick to `MBRange`. The pick must contain consecutive indices
 | |
|   /// and undefs are not allowed. If the pick is not a valid `MBRange`, returns
 | |
|   /// `std::nullopt`.
 | |
|   static std::optional<MBRange> fromPick(const Pick &pick);
 | |
| };
 | |
| 
 | |
| } // unnamed namespace
 | |
| 
 | |
| // ===========================================================================
 | |
| 
 | |
| bool Pick::isTrivial(unsigned length) const {
 | |
|   if (size() != length)
 | |
|     return false;
 | |
|   for (unsigned n = 0; n < length; ++n) {
 | |
|     if ((*this)[n] != static_cast<int>(n))
 | |
|       return false;
 | |
|   }
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| Pick Pick::pickFrom(const Pick &origin) const {
 | |
|   Pick newPick = Pick(size());
 | |
|   std::transform(begin(), end(), newPick.begin(),
 | |
|                  [&](int n) -> int { return 0 <= n && static_cast<unsigned>(n) < origin.size() ? origin[n] : -1; });
 | |
|   return newPick;
 | |
| }
 | |
| 
 | |
| Pick Pick::narrowTo(const MBRange &mbr) const {
 | |
|   Pick newPick = Pick(size());
 | |
|   for (unsigned n = 0; n < size(); ++n) {
 | |
|     int idx = mbr.indexOf((*this)[n]);
 | |
|     newPick[n] = 0 <= idx ? idx : -1;
 | |
|   }
 | |
|   return newPick;
 | |
| }
 | |
| 
 | |
| Pick Pick::createIdentityPick(unsigned size, int begin) {
 | |
|   Pick interval = Pick(size);
 | |
|   std::generate(interval.begin(), interval.end(), [&]() -> int { return begin++; });
 | |
|   return interval;
 | |
| }
 | |
| 
 | |
| std::optional<Pick> Pick::fromMask(ArrayRef<int> mask, int begin, int end) {
 | |
|   Pick newPick = Pick(mask.size());
 | |
|   for (unsigned n = 0; n < mask.size(); ++n) {
 | |
|     int val = mask[n];
 | |
|     if (0 <= val) {
 | |
|       if (val < begin || end < val)
 | |
|         return std::nullopt;
 | |
|       newPick[n] = val - begin;
 | |
|     } else {
 | |
|       newPick[n] = -1;
 | |
|     }
 | |
|   }
 | |
|   return std::make_optional(std::move(newPick));
 | |
| }
 | |
| 
 | |
| raw_ostream &operator<<(raw_ostream &os, const Pick &pick) {
 | |
|   os << "{ ";
 | |
|   for (int x : pick) {
 | |
|     os << x << ' ';
 | |
|   }
 | |
|   os << '}';
 | |
|   return os;
 | |
| }
 | |
| 
 | |
| // ===========================================================================
 | |
| 
 | |
| int MBRange::operator[](int n) const { return (*this)(n / grSize, n % grSize); }
 | |
| 
 | |
| int MBRange::operator()(int group, int elt) const { return group * grPitch + first + elt; }
 | |
| 
 | |
| bool MBRange::contains(int x) const {
 | |
|   return empty()       ? false
 | |
|          : 1 < numOfGr ? first <= x && x <= last() && (x - first) % grPitch < static_cast<int>(grSize)
 | |
|                        : first <= x && x <= last();
 | |
| }
 | |
| 
 | |
| MBRange::Containment MBRange::containsOrExcludes(const Pick &pick, bool allowUndefs) const {
 | |
|   if (pick.empty())
 | |
|     return MBRange::Containment::Contains;
 | |
|   if (empty())
 | |
|     return MBRange::Containment::Excludes;
 | |
|   return std::any_of(pick.begin(), pick.end(), [&](int x) -> bool { return contains(x); })
 | |
|              ? std::all_of(pick.begin(), pick.end(),
 | |
|                            [&](int x) -> bool { return contains(x) || (allowUndefs && x < 0); })
 | |
|                    ? MBRange::Containment::Contains
 | |
|                    : MBRange::Containment::Intersects
 | |
|          : std::all_of(pick.begin(), pick.end(), [&](int x) -> bool { return !contains(x); })
 | |
|              ? MBRange::Containment::Excludes
 | |
|              : MBRange::Containment::Intersects;
 | |
| }
 | |
| 
 | |
| int MBRange::indexOf(int x) const {
 | |
|   return contains(x) ? 1 < numOfGr ? (x - first) / grPitch * grSize + (x - first) % grPitch : x - first : -1;
 | |
| }
 | |
| 
 | |
| Pick MBRange::toPick() const {
 | |
|   if (empty())
 | |
|     return Pick();
 | |
|   Pick pick = Pick(size());
 | |
|   for (int n = 0; n < size(); ++n) {
 | |
|     pick[n] = (*this)[n];
 | |
|   }
 | |
|   return pick;
 | |
| }
 | |
| 
 | |
| std::optional<MBRange> MBRange::fromPick(const Pick &pick) {
 | |
|   if (pick.empty()) {
 | |
|     return MBRange::getEmpty();
 | |
|   }
 | |
|   if (pick.front() < 0) {
 | |
|     return std::nullopt;
 | |
|   }
 | |
| 
 | |
|   int val;
 | |
|   MBRange ret(pick.front(), 0, 0, 0);
 | |
|   for (unsigned n = 1; n < pick.size(); ++n) {
 | |
|     val = pick[n];
 | |
|     // We don't allow undefs.
 | |
|     if (val < 0) {
 | |
|       return std::nullopt;
 | |
|     }
 | |
|     // The value jumps, so we possibly reached the group pitch.
 | |
|     if (val != ret.first + static_cast<int>(n)) {
 | |
|       ret.grSize = n;
 | |
|       ret.grPitch = val - ret.first;
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
|   // Single group range.
 | |
|   if (ret.grSize == 0) {
 | |
|     ret.grSize = pick.size();
 | |
|     ret.numOfGr = 1;
 | |
|     return ret;
 | |
|   }
 | |
|   // grPitch should be positive and larger than the grSize
 | |
|   if (ret.grPitch <= static_cast<int>(ret.grSize)) {
 | |
|     return std::nullopt;
 | |
|   }
 | |
| 
 | |
|   if (pick.size() % ret.grSize) {
 | |
|     return std::nullopt;
 | |
|   }
 | |
|   ret.numOfGr = pick.size() / ret.grSize;
 | |
|   for (unsigned gr = 1; gr < ret.numOfGr; ++gr) {
 | |
|     for (unsigned el = 0; el < ret.grSize; ++el) {
 | |
|       val = pick[gr * ret.grSize + el];
 | |
|       if (val < 0) {
 | |
|         return std::nullopt;
 | |
|       }
 | |
|       if (val != ret(gr, el)) {
 | |
|         return std::nullopt;
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   return ret;
 | |
| }
 | |
| 
 | |
| raw_ostream &operator<<(raw_ostream &os, const MBRange &range) {
 | |
|   for (unsigned gr = 0; gr < range.numOfGr; ++gr) {
 | |
|     os << '[' << range(gr, 0) << ", " << range(gr, range.grSize - 1) << "] ";
 | |
|   }
 | |
|   return os;
 | |
| }
 | |
| 
 | |
| // ==========================================================================
 | |
| 
 | |
| namespace {
 | |
| 
 | |
/// Number of arguments of GenISA_LSC2DBlockRead.
constexpr unsigned NUM_OF_BLOCKLOAD_ARGS = 13;

/// Positions (zero-based) of the arguments of GenISA_LSC2DBlockRead.
namespace LSC2D_BlockRead {
enum : unsigned {
  argSurfacePtr = 0,                 // 0
  argSurfaceWidthLessOne_inBytes,    // 1
  argSurfaceHeightLessOne_inPitches, // 2
  argSurfacePitchLessOne_inBytes,    // 3
  argXOffset_inElts,                 // 4
  argYOffset_inPitches,              // 5
  argSizeInBits,                     // 6
  argBlockWidth_inElts,              // 7
  argBlockHeight_inElts,             // 8
  argNumOfBlocks,                    // 9
  argIsTranspose,                    // 10
  argIsVNNI,                         // 11
  argCacheFlags                      // 12
};
// The enumerators are numbered implicitly; pin the numbering at compile time.
static_assert(argNumOfBlocks == 9, "unexpected position of the number-of-blocks argument");
static_assert(argCacheFlags + 1 == NUM_OF_BLOCKLOAD_ARGS, "argument indices must cover all block-read arguments");
} // namespace LSC2D_BlockRead
 | |
| 
 | |
| /// Returns the numeric value of the argument number `n` to the intrinsic `GII`
 | |
| /// as `unsigned int`. Assumes `GII->getArgOperand(n)` exists and can be cast to
 | |
| /// `ConstantInt`.
 | |
| static unsigned getArgZ(GenIntrinsicInst *GII, unsigned n) {
 | |
|   return static_cast<unsigned>(cast<ConstantInt>(GII->getArgOperand(n))->getZExtValue());
 | |
| }
 | |
| 
 | |
| /// Returns the numeric value of the argument number `n` to the intrinsic `GII`
 | |
| /// as `signed int`. Assumes `GII->getArgOperand(n)` exists and can be cast to
 | |
| /// `ConstantInt`.
 | |
| static int getArgS(GenIntrinsicInst *GII, unsigned n) {
 | |
|   return static_cast<int>(cast<ConstantInt>(GII->getArgOperand(n))->getSExtValue());
 | |
| }
 | |
| 
 | |
/// Shorthand that forwards its argument to `LLVM_DEBUG`.
#define DBG(x) LLVM_DEBUG(x)

/// Container of picks with inline room for `DEF_NUM_OF_PICKS_PER_LOAD` entries.
using Picks = SmallVector<Pick, DEF_NUM_OF_PICKS_PER_LOAD>;
/// Container of multi-block ranges with inline room for
/// `DEF_NUM_OF_PICKS_PER_LOAD` entries.
using MBRanges = SmallVector<MBRange, DEF_NUM_OF_PICKS_PER_LOAD>;
 | |
| 
 | |
| /// `TraceData` contains all the data about the structure of the splits of a
 | |
| /// load vector. It gathers all picks from the load together with the associated
 | |
| /// bitcasts and their users.
 | |
| struct TraceData {
 | |
|   using Cast = SmallDenseMap<Type *, SmallVector<Instruction *, DEF_NUM_OF_CASTS_OR_USERS_PER_PICK>,
 | |
|                              DEF_NUM_OF_CASTS_OR_USERS_PER_PICK>;
 | |
|   using Casts = SmallVector<Cast, DEF_NUM_OF_PICKS_PER_LOAD>;
 | |
|   using ToRemove = SmallVector<Instruction *, DEF_NUM_OF_PICKS_PER_LOAD + 1>;
 | |
| 
 | |
|   BasicBlock *BB = nullptr;
 | |
|   Pick initialPick = Pick();
 | |
|   std::unique_ptr<Picks> picks = nullptr;
 | |
|   Casts typesToCastTo = Casts();
 | |
|   ToRemove toRemove = ToRemove();
 | |
| 
 | |
|   /// Returns total vector length.
 | |
|   unsigned vectorLength() const { return initialPick.size(); }
 | |
| 
 | |
|   /// Traces the pick tree starting from the load `GII` and returns `true` if
 | |
|   /// the `TraceData` is valid. If `true` is returned, then `picks` is non-null.
 | |
|   bool tracePicks(GenIntrinsicInst *GII);
 | |
| 
 | |
|   /// Uses `builder` to create the sequence of LLVM instructions that represent
 | |
|   /// the splits: i) The tree is attached to `load` and ii) For each pick,
 | |
|   /// the corresponding sequence of `InsertElementInst` and `ExtractElementInst`
 | |
|   /// is inserted. iii) Final bitcasts are attached and the users are updated
 | |
|   /// appropriately.
 | |
|   void putPicks(IRBuilder<> &builder, Value *load);
 | |
| 
 | |
|   /// Removes all instructions marked to remove.
 | |
|   void removeOldInstructions();
 | |
| 
 | |
|   /// Returns new `TraceData` containing those picks in this, which are subpicks
 | |
|   /// of `largeRange`.
 | |
|   std::unique_ptr<TraceData> pickSubpicksOf(const MBRange &largeRange);
 | |
| 
 | |
| private:
 | |
|   struct Node : public Pick {
 | |
|     Type *type;
 | |
|     Instruction *fun;
 | |
| 
 | |
|     explicit Node() : Pick(), type(nullptr), fun(nullptr) {}
 | |
|     Node(const Pick &pick, Type *type, Instruction *fun) : Pick(pick), type{type}, fun{fun} {}
 | |
|   };
 | |
| 
 | |
|   std::optional<Node> addBitCast(const Node &previous, BitCastInst *BCI);
 | |
|   std::optional<Node> addShuffle(const Node &previous, ShuffleVectorInst *SVI);
 | |
|   std::optional<Node> addExtractInsertSequence(const Node &previous, ExtractElementInst *EEI,
 | |
|                                                SmallPtrSet<Value *, DEF_NUM_OF_PICKS_PER_LOAD> &extractsToSkip);
 | |
|   void addLeaf(const Node &leaf);
 | |
| };
 | |
| 
 | |
| /// `LoadData` represents the data that is associated with a block load.
 | |
| struct LoadData {
 | |
|   /// Block width in elements as specified in the intrinsic.
 | |
|   unsigned blockWidth_E = 0;
 | |
| 
 | |
|   /// Block height in elements as specified in the intrinsic.
 | |
|   unsigned blockHeight_E = 0;
 | |
| 
 | |
|   /// Number of blocks as specified in the intrinsic.
 | |
|   unsigned numOfBlocks = 0;
 | |
| 
 | |
|   /// Length of the loaded vector in elements.
 | |
|   unsigned vectorLength = 0;
 | |
| 
 | |
|   /// Bit width of a single element in the loaded vector.
 | |
|   unsigned scalarBitWidth = 0;
 | |
| 
 | |
|   /// Bit width of a single element on the surface as specified in the
 | |
|   /// intrinsic.
 | |
|   unsigned elementBitWidth = 0;
 | |
| 
 | |
|   /// Is the load transposed.
 | |
|   bool transposed = false;
 | |
| 
 | |
|   /// Is the load VNNI-transformed.
 | |
|   bool vnni = false;
 | |
| 
 | |
|   /// Checks if this `LoadData` is the same as `rhs` as the 2D LSC load data.
 | |
|   bool sameAsLoad(const LoadData &rhs) const;
 | |
| 
 | |
|   /// Returns the length of a single block in the vector.
 | |
|   unsigned groupLength() const { return vectorLength / numOfBlocks; }
 | |
| 
 | |
|   /// Uses parameters of the intrinsic to figure out SIMD.
 | |
|   /// If the SIMD is reported correctly, this is equal to config().actualSimd.
 | |
|   unsigned SIMD() const {
 | |
|     // From the point of view of the intrinsic, the total bit width of the load
 | |
|     // is: totalBW = blockWidth * blockHeight * numOfBlocks * elementBitWidth.
 | |
|     // From the point of view of the loaded vectors, the total bit width is:
 | |
|     // totalBW = vectorLength * SIMD * scalarBitWidth.
 | |
|     // From this:
 | |
|     return (blockWidth_E * blockHeight_E * numOfBlocks * elementBitWidth) / (vectorLength * scalarBitWidth);
 | |
|   }
 | |
| 
 | |
|   /// Returns the size of a single scalar multiplied by the SIMD.
 | |
|   unsigned scalarMemSize_B() const {
 | |
|     return (blockWidth_E * blockHeight_E * numOfBlocks * elementBitWidth) / (8 * vectorLength);
 | |
|   }
 | |
| 
 | |
|   /// Returns the minimum valid group length for the split load.
 | |
|   unsigned getMinGroupLength(unsigned atLeastThisLarge = 0) const;
 | |
| 
 | |
|   /// Checks if the load has valid parameters as long as this pass in concerned.
 | |
|   bool isValidLoad() const;
 | |
| 
 | |
|   /// Returns the name of the intrinsic with the data represented by this
 | |
|   /// `LoadData` and corresponding to the stand-alone block load.
 | |
|   std::string getBlockLoadName() const;
 | |
| 
 | |
| protected:
 | |
|   IntegerType *scalarTy = nullptr;
 | |
| 
 | |
| private:
 | |
|   bool isValidTransposed() const;
 | |
|   bool isValidVNNI() const;
 | |
| };
 | |
| 
 | |
| /// `Load` contains all the data associated with the load and its picks.
 | |
| /// It provides methods for splitting the load and creating the corresponding
 | |
| /// LLVM IR.
 | |
| struct Load : public LoadData {
 | |
|   GenIntrinsicInst *GII = nullptr;
 | |
|   int xOffset_E = 0; // X Offset in elements
 | |
|   int yOffset_P = 0; // Y Offset in surface pitch
 | |
|   std::unique_ptr<TraceData> trace = nullptr;
 | |
| 
 | |
|   explicit Load() = default;
 | |
|   Load(const LoadData &data) : LoadData(data) {}
 | |
|   Load(const Load &rhs) : LoadData(rhs), GII(rhs.GII), xOffset_E(rhs.xOffset_E), yOffset_P(rhs.yOffset_P) {}
 | |
|   Load &operator=(const Load &rhs) {
 | |
|     if (this != &rhs) {
 | |
|       LoadData::operator=(rhs);
 | |
|       GII = rhs.GII;
 | |
|       xOffset_E = rhs.xOffset_E;
 | |
|       yOffset_P = rhs.yOffset_P;
 | |
|     }
 | |
|     return *this;
 | |
|   }
 | |
|   Load(Load &&) = default;
 | |
|   Load &operator=(Load &&) = default;
 | |
|   ~Load() = default;
 | |
| 
 | |
|   /// Reads the data from the load intrinsic.
 | |
|   bool readFromLoad(GenIntrinsicInst *GII);
 | |
| 
 | |
|   /// Traces all picks attached to the load and returns `true` if the
 | |
|   /// `TraceData` is valid. If `true` is returned, then `trace` and
 | |
|   /// `trace->picks` is non-null.
 | |
|   bool tracePicks();
 | |
| 
 | |
|   /// Returns the set of possible grid-uniform dimensions into which the load
 | |
|   /// can be split. It takes into account the limits from `Config`.
 | |
|   PossibleDims possibleDims();
 | |
| 
 | |
|   /// Returns the new load corresponding to loading only part of the original
 | |
|   /// load. If the load is not valid, returns `nullptr`.
 | |
|   std::unique_ptr<Load> split(const MBRange &range);
 | |
| 
 | |
|   /// Creates the LLVM call for the stand-alone block load.
 | |
|   /// Uses `GII` as the insertion point, so `GII` should point to the old load.
 | |
|   CallInst *putBlockLoad(IRBuilder<> &builder);
 | |
| 
 | |
|   /// Deletes the original load and its picks.
 | |
|   void removeOldInstructions();
 | |
| 
 | |
| private:
 | |
|   bool fillBlockData();
 | |
| 
 | |
|   Load &splitFlat(const MBRange &range);
 | |
|   Load &splitTransposed(const MBRange &range);
 | |
|   Load &splitVNNI(const MBRange &range);
 | |
| 
 | |
|   std::unique_ptr<Load> splitLoadData(const MBRange &range);
 | |
| };
 | |
| 
 | |
| } // unnamed namespace
 | |
| 
 | |
| // ==========================================================================
 | |
| 
 | |
| raw_ostream &operator<<(raw_ostream &os, const Dims &dims) {
 | |
|   os << dims.grSize << " x " << dims.numOfGr;
 | |
|   return os;
 | |
| }
 | |
| 
 | |
| // ==========================================================================
 | |
| 
 | |
| /// Given a `Value` of a vectorial type, returns the pair representing the
 | |
| /// scalar type and the size of the vector.
 | |
| static std::pair<IntegerType *, unsigned> getScalarTypeAndSize(Value *V) {
 | |
|   std::pair<IntegerType *, unsigned> ret{nullptr, 0};
 | |
|   IGCLLVM::FixedVectorType *vectorTy = dyn_cast<IGCLLVM::FixedVectorType>(V->getType());
 | |
|   if (!vectorTy)
 | |
|     return ret;
 | |
|   ret.first = dyn_cast<IntegerType>(vectorTy->getElementType());
 | |
|   if (!ret.first)
 | |
|     return ret;
 | |
|   ret.second = vectorTy->getNumElements();
 | |
|   return ret;
 | |
| }
 | |
| 
 | |
| /// Creates the function of a given name in LLVM.
 | |
| static Function *createFunction(StringRef name, Module *currModule, ArrayRef<Value *> args, Type *retTy,
 | |
|                                 Function *copyAttrAfter = nullptr) {
 | |
|   SmallVector<Type *, NUM_OF_BLOCKLOAD_ARGS> argsTy;
 | |
|   argsTy.assign(args.size(), nullptr);
 | |
|   for (unsigned i = 0; i < argsTy.size(); ++i) {
 | |
|     argsTy[i] = args[i]->getType();
 | |
|   }
 | |
|   FunctionType *newFunTy = FunctionType::get(retTy, argsTy, false);
 | |
|   Function *newFun = Function::Create(newFunTy, GlobalValue::ExternalLinkage, name, currModule);
 | |
|   if (copyAttrAfter) {
 | |
|     newFun->copyAttributesFrom(copyAttrAfter);
 | |
|     if (isa<GlobalObject>(copyAttrAfter)) {
 | |
|       newFun->copyMetadata(cast<GlobalObject>(copyAttrAfter), 0);
 | |
|     }
 | |
|   }
 | |
|   return newFun;
 | |
| }
 | |
| 
 | |
| /// Adds `offset` to `value` if `offset` is non-zero.
 | |
| static Value *createAdd(IRBuilder<> &builder, Value *value, unsigned offset) {
 | |
|   return offset ? (isa<ConstantInt>(value)
 | |
|                        ? builder.getInt32(static_cast<unsigned>(cast<ConstantInt>(value)->getZExtValue()) + offset)
 | |
|                        : builder.CreateAdd(value, builder.getInt32(offset), "", true, true))
 | |
|                 : value;
 | |
| }
 | |
| 
 | |
| // ===========================================================================
 | |
| 
 | |
| bool Config::initialize(Function *F, CodeGenContext *inCGC, IGCLivenessAnalysis *inRPE) {
 | |
|   CGC = inCGC;
 | |
|   RPE = inRPE;
 | |
|   if (!F || !CGC || !RPE)
 | |
|     return false;
 | |
|   if (!CGC->platform.hasLSC()) {
 | |
|     DBG(dbgs() << " [SKIP] No support for LSC on this platform.\n");
 | |
|     return false;
 | |
|   }
 | |
|   if (!IGC::ForceAlwaysInline(CGC)) {
 | |
|     if (F->isDeclaration())
 | |
|       return false;
 | |
|   } else {
 | |
|     if (!F->getReturnType()->isVoidTy())
 | |
|       return false;
 | |
|   }
 | |
| 
 | |
|   // Actual SIMD is the SIMD as reported by the compiler.
 | |
|   // Default SIMD is the default SIMD associated with the architecture.
 | |
|   // Default SIMD is used only if actual SIMD is absent and mostly for testing
 | |
|   // purposes.
 | |
|   defaultSimd = 0;
 | |
|   switch (CGC->platform.getPlatformInfo().eProductFamily) {
 | |
|   case IGFX_DG2:
 | |
|   case IGFX_METEORLAKE:
 | |
|   case IGFX_ARROWLAKE:
 | |
|     defaultSimd = 16;
 | |
|     break;
 | |
|   default:
 | |
|     defaultSimd = 32;
 | |
|     break;
 | |
|   }
 | |
|   actualSimd = 0;
 | |
|   if (RPE->MDUtils && RPE->MDUtils->findFunctionsInfoItem(F) != RPE->MDUtils->end_FunctionsInfo()) {
 | |
|     IGC::IGCMD::FunctionInfoMetaDataHandle funcInfoMD = RPE->MDUtils->getFunctionsInfoItem(F);
 | |
|     actualSimd = funcInfoMD->getSubGroupSize()->getSIMDSize();
 | |
|   }
 | |
| 
 | |
|   isLegitW8 = false;
 | |
|   sizeOfRegs_B = RPE->registerSizeInBytes();
 | |
|   numOfRegs = CGC->getNumGRFPerThread();
 | |
| 
 | |
|   minSplitSize_B = minSplitSize_GRF * sizeOfRegs_B;
 | |
|   splitThreshold_B = (static_cast<int>(numOfRegs) + splitThresholdDelta_GRF) * sizeOfRegs_B;
 | |
| 
 | |
|   DBG(Module *newM = F->getParent(); if (newM != M) {
 | |
|     M = newM;
 | |
|     dbgs() << "CONFIG DATA:\n";
 | |
|     dbgs() << " -- SPLITTING ENABLED / ignore reg pressure          = " << enableLoadSplitting << " / "
 | |
|            << ignoreSplitThreshold << "\n";
 | |
|     dbgs() << " -- register size [B] / number of registers          = " << sizeOfRegs_B << " / " << numOfRegs << "\n";
 | |
|     dbgs() << " -- default SIMD / actual SIMD                       = " << defaultSimd << " / " << actualSimd << "\n";
 | |
|     dbgs() << " -- split threshold [B]                              = " << splitThreshold_B << "\n";
 | |
|     dbgs() << " -- min split size [E] / min split size [B]          = " << minSplitSize_E << " / " << minSplitSize_B
 | |
|            << "\n";
 | |
|   });
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| // ===========================================================================
 | |
| 
 | |
| std::optional<TraceData::Node> TraceData::addBitCast(const Node &previous, BitCastInst *BCI) {
 | |
|   IGCLLVM::FixedVectorType *srcTy = dyn_cast<IGCLLVM::FixedVectorType>(BCI->getSrcTy());
 | |
|   IGCLLVM::FixedVectorType *destTy = dyn_cast<IGCLLVM::FixedVectorType>(BCI->getDestTy());
 | |
|   if (!(srcTy && destTy))
 | |
|     return std::nullopt;
 | |
|   // We only allow bitcasts that preserve the size of the underlying scalar
 | |
|   // type.
 | |
|   if (srcTy->getElementType()->getScalarSizeInBits() != destTy->getElementType()->getScalarSizeInBits())
 | |
|     return std::nullopt;
 | |
|   return TraceData::Node(previous, destTy->getElementType(), BCI);
 | |
| }
 | |
| 
 | |
| std::optional<TraceData::Node> TraceData::addShuffle(const Node &previous, ShuffleVectorInst *SVI) {
 | |
|   if (!isa<IGCLLVM::FixedVectorType>(SVI->getType()))
 | |
|     return std::nullopt;
 | |
|   // Previous node is a bitcast, shuffle vector, insert element, or the original
 | |
|   // load, so we checked that previous->fun has the type of a fixed vector.
 | |
|   unsigned previousVectorLength = cast<IGCLLVM::FixedVectorType>(previous.fun->getType())->getNumElements();
 | |
|   // We must make sure that the shuffle vector is a pick from the previous
 | |
|   // vector. The other vector must be undef.
 | |
|   int beginPos, endPos;
 | |
|   if (SVI->getOperand(0) == previous.fun) {
 | |
|     beginPos = 0;
 | |
|     endPos = previousVectorLength - 1;
 | |
|     if (!isa<Constant>(SVI->getOperand(1)))
 | |
|       return std::nullopt;
 | |
|   } else {
 | |
|     beginPos = previousVectorLength;
 | |
|     endPos = 2 * previousVectorLength - 1;
 | |
|     if (!isa<Constant>(SVI->getOperand(0)))
 | |
|       return std::nullopt;
 | |
|   }
 | |
|   // Pick::fromMask guarantees that the pick is entirely contained in [beginPos,
 | |
|   // endPos].
 | |
|   std::optional<Pick> newPick = Pick::fromMask(SVI->getShuffleMask(), beginPos, endPos);
 | |
|   if (!newPick)
 | |
|     return std::nullopt;
 | |
|   return TraceData::Node(newPick->pickFrom(previous), previous.type, SVI);
 | |
| }
 | |
| 
 | |
| std::optional<TraceData::Node>
 | |
| TraceData::addExtractInsertSequence(const TraceData::Node &previous, ExtractElementInst *EEI,
 | |
|                                     SmallPtrSet<Value *, DEF_NUM_OF_PICKS_PER_LOAD> &extractsToSkip) {
 | |
| 
 | |
|   auto isValidExtract = [&](ExtractElementInst *E) -> bool {
 | |
|     if (!isa<IGCLLVM::FixedVectorType>(E->getVectorOperand()->getType()))
 | |
|       return false;
 | |
|     if (!isa<ConstantInt>(E->getIndexOperand()))
 | |
|       return false;
 | |
|     if (!E->hasOneUse())
 | |
|       return false;
 | |
|     return true;
 | |
|   };
 | |
| 
 | |
|   auto isValidInsert = [&](InsertElementInst *I) -> bool {
 | |
|     if (!isa<IGCLLVM::FixedVectorType>(I->getType()))
 | |
|       return false;
 | |
|     if (!isa<ConstantInt>(I->getOperand(2)))
 | |
|       return false;
 | |
|     return true;
 | |
|   };
 | |
| 
 | |
|   auto getAssociatedInsert = [&](ExtractElementInst *E) -> InsertElementInst * {
 | |
|     return dyn_cast<InsertElementInst>(*E->user_begin());
 | |
|   };
 | |
| 
 | |
|   auto getAssociatedExtract = [&](InsertElementInst *I) -> ExtractElementInst * {
 | |
|     return dyn_cast<ExtractElementInst>(I->getOperand(1));
 | |
|   };
 | |
| 
 | |
|   auto getPreviousInsert = [&](InsertElementInst *I) -> InsertElementInst * {
 | |
|     return dyn_cast<InsertElementInst>(I->getOperand(0));
 | |
|   };
 | |
| 
 | |
|   auto getNextInsert = [&](InsertElementInst *I) -> InsertElementInst * {
 | |
|     return dyn_cast<InsertElementInst>(*I->user_begin());
 | |
|   };
 | |
| 
 | |
|   std::deque<std::pair<ExtractElementInst *, InsertElementInst *>> EIs;
 | |
| 
 | |
|   // We want to find the chain of extract/insert elements.
 | |
|   // There is no guarantee that the first user of the load is the first extract
 | |
|   // element, so we must traverse the chain both up and down.
 | |
| 
 | |
|   auto addPairFromExtract = [&](ExtractElementInst *E, bool front) -> bool {
 | |
|     if (!E)
 | |
|       return false;
 | |
|     extractsToSkip.insert(E);
 | |
|     if (!isValidExtract(E))
 | |
|       return false;
 | |
|     InsertElementInst *I = getAssociatedInsert(E);
 | |
|     if (!I || !isValidInsert(I))
 | |
|       return false;
 | |
|     if (front) {
 | |
|       EIs.push_front(std::make_pair(E, I));
 | |
|     } else {
 | |
|       EIs.push_back(std::make_pair(E, I));
 | |
|     }
 | |
|     return true;
 | |
|   };
 | |
| 
 | |
|   auto addPairFromInsert = [&](InsertElementInst *I, bool front) -> bool {
 | |
|     if (!I || !isValidInsert(I))
 | |
|       return false;
 | |
|     ExtractElementInst *E = getAssociatedExtract(I);
 | |
|     if (!E)
 | |
|       return false;
 | |
|     extractsToSkip.insert(E);
 | |
|     if (!isValidExtract(E))
 | |
|       return false;
 | |
|     if (front) {
 | |
|       EIs.push_front(std::make_pair(E, I));
 | |
|     } else {
 | |
|       EIs.push_back(std::make_pair(E, I));
 | |
|     }
 | |
|     return true;
 | |
|   };
 | |
| 
 | |
|   // First we move "up" to find the first pair of insert/extract elements.
 | |
|   if (!addPairFromExtract(EEI, true))
 | |
|     return std::nullopt;
 | |
|   InsertElementInst *insert = EIs.front().second;
 | |
|   do {
 | |
|     insert = getPreviousInsert(insert);
 | |
|   } while (addPairFromInsert(insert, true));
 | |
|   // Since addPairFromExtract succeeded, we know that the first pair in EIs
 | |
|   // exists. This pair is valid but the previous pair failed. So either there
 | |
|   // was no previous pair (which is what we want) or the previous pair was
 | |
|   // invalid and we bail out.
 | |
|   if (!isa<UndefValue>(EIs.front().second->getOperand(0)))
 | |
|     return std::nullopt;
 | |
| 
 | |
|   // Now we move "down".
 | |
|   insert = EIs.back().second;
 | |
|   do {
 | |
|     // If the insert has more than one user, the sequence ends.
 | |
|     if (!insert->hasOneUse())
 | |
|       break;
 | |
|     insert = getNextInsert(insert);
 | |
|   } while (addPairFromInsert(insert, false));
 | |
|   // It is the role of tracePicks to figure out if the next instructions are
 | |
|   // valid leaves.
 | |
| 
 | |
|   // The first pair in EIs determines:
 | |
|   //  - from which vector we pick
 | |
|   //  - how many elements we pick
 | |
| 
 | |
|   unsigned newVectorLength = cast<IGCLLVM::FixedVectorType>(EIs.front().second->getType())->getNumElements();
 | |
|   Value *pickingFrom = EIs.front().first->getVectorOperand();
 | |
|   Value *currentBuildVector = nullptr;
 | |
|   Pick pick = Pick(newVectorLength, -1);
 | |
| 
 | |
|   // While building the picks, we have to check that:
 | |
|   //  - we pick from the same vector
 | |
|   //  - we keep building the same vector
 | |
|   //  - we assign each index only once
 | |
| 
 | |
|   auto addPick = [&](std::pair<ExtractElementInst *, InsertElementInst *> &ei) -> bool {
 | |
|     if (ei.first->getVectorOperand() != pickingFrom)
 | |
|       return false;
 | |
|     if (currentBuildVector) {
 | |
|       if (currentBuildVector != ei.second->getOperand(0))
 | |
|         return false;
 | |
|     }
 | |
|     currentBuildVector = ei.second;
 | |
| 
 | |
|     unsigned origIdx = cast<ConstantInt>(ei.first->getIndexOperand())->getZExtValue();
 | |
|     unsigned newIdx = cast<ConstantInt>(ei.second->getOperand(2))->getZExtValue();
 | |
|     if (pick[newIdx] != -1)
 | |
|       return false;
 | |
|     pick[newIdx] = origIdx;
 | |
|     return true;
 | |
|   };
 | |
| 
 | |
|   for (unsigned n = 0; n < EIs.size() - 1; ++n) {
 | |
|     if (!addPick(EIs[n]))
 | |
|       return std::nullopt;
 | |
|     toRemove.push_back(EIs[n].first);
 | |
|     toRemove.push_back(EIs[n].second);
 | |
|   }
 | |
|   if (!addPick(EIs.back()))
 | |
|     return std::nullopt;
 | |
|   toRemove.push_back(EIs.back().first);
 | |
|   return TraceData::Node(pick.pickFrom(previous), previous.type, EIs.back().second);
 | |
| }
 | |
| 
 | |
| void TraceData::addLeaf(const Node &leaf) {
 | |
|   if (leaf.empty())
 | |
|     return;
 | |
|   auto it = std::find(picks->begin(), picks->end(), leaf);
 | |
|   if (it != picks->end()) {
 | |
|     typesToCastTo[it - picks->begin()][leaf.type].push_back(leaf.fun);
 | |
|   } else {
 | |
|     picks->push_back(leaf);
 | |
|     typesToCastTo.push_back(Cast{{leaf.type, SmallVector<Instruction *, 1>(1, leaf.fun)}});
 | |
|   }
 | |
| }
 | |
| 
 | |
| bool TraceData::tracePicks(GenIntrinsicInst *GII) {
 | |
|   auto [_, vecLen] = getScalarTypeAndSize(GII);
 | |
|   if (!vecLen)
 | |
|     return false;
 | |
|   BB = GII->getParent();
 | |
|   initialPick = Pick::createIdentityPick(vecLen);
 | |
|   picks = std::make_unique<Picks>();
 | |
|   typesToCastTo.clear();
 | |
|   toRemove.clear();
 | |
| 
 | |
|   SmallVector<TraceData::Node, DEF_NUM_OF_PICKS_PER_LOAD + 1> activeNodes;
 | |
|   activeNodes.emplace_back(Pick::createIdentityPick(vectorLength()),
 | |
|                            cast<IGCLLVM::FixedVectorType>(GII->getType())->getElementType(), GII);
 | |
|   TraceData::Node currNode;
 | |
|   // We need to skip the extract elements that are part of the already processed
 | |
|   // insert/extract sequence.
 | |
|   SmallPtrSet<Value *, DEF_NUM_OF_PICKS_PER_LOAD> extractsToSkip;
 | |
| 
 | |
|   while (!activeNodes.empty()) {
 | |
|     currNode = activeNodes.back();
 | |
|     activeNodes.pop_back();
 | |
|     toRemove.push_back(currNode.fun);
 | |
|     for (User *nextUser : currNode.fun->users()) {
 | |
|       Instruction *next = dyn_cast<Instruction>(nextUser);
 | |
|       if (!next) {
 | |
|         return false;
 | |
|       }
 | |
|       if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(next)) {
 | |
|         // currNode->fun can be the starting load, bitcast, insert element, or
 | |
|         // shufflevector. In all those cases we checked that
 | |
|         // currNode->fun->getType() is a fixed vector type.
 | |
|         std::optional<TraceData::Node> newNode = addShuffle(currNode, SVI);
 | |
|         if (!newNode) {
 | |
|           return false;
 | |
|         }
 | |
|         activeNodes.push_back(std::move(*newNode));
 | |
|       } else if (BitCastInst *BCI = dyn_cast<BitCastInst>(next)) {
 | |
|         std::optional<TraceData::Node> newNode = addBitCast(currNode, BCI);
 | |
|         if (!newNode) {
 | |
|           return false;
 | |
|         }
 | |
|         activeNodes.push_back(std::move(*newNode));
 | |
|       } else if (ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(next)) {
 | |
|         if (extractsToSkip.count(EEI))
 | |
|           continue;
 | |
|         std::optional<TraceData::Node> newNode = addExtractInsertSequence(currNode, EEI, extractsToSkip);
 | |
|         if (!newNode) {
 | |
|           return false;
 | |
|         }
 | |
|         activeNodes.push_back(std::move(*newNode));
 | |
|       } else {
 | |
|         addLeaf(currNode);
 | |
|       }
 | |
|     }
 | |
|   }
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| void TraceData::putPicks(IRBuilder<> &builder, Value *load) {
 | |
|   if (!picks)
 | |
|     return;
 | |
| 
 | |
|   // This assumes no repeated values in the pick.
 | |
|   auto putPick = [&](const Pick &pick, Value *loadOrCast, Type *scalarTy) -> Value * {
 | |
|     Value *elem;
 | |
|     Value *newVector = UndefValue::get(IGCLLVM::FixedVectorType::get(scalarTy, pick.size()));
 | |
|     for (unsigned insertPos = 0; insertPos < pick.size(); ++insertPos) {
 | |
|       if (pick[insertPos] < 0)
 | |
|         continue;
 | |
|       unsigned extractPos = static_cast<unsigned>(pick[insertPos]);
 | |
|       elem = builder.CreateExtractElement(loadOrCast, extractPos);
 | |
|       newVector = builder.CreateInsertElement(newVector, elem, insertPos);
 | |
|     }
 | |
|     return newVector;
 | |
|   };
 | |
| 
 | |
|   // We attach the pick tree to the load.
 | |
| 
 | |
|   Value *loadOrCast = load;
 | |
|   Type *scalarTy;
 | |
|   std::tie(scalarTy, std::ignore) = getScalarTypeAndSize(load);
 | |
| 
 | |
|   Value *picksVal;
 | |
|   Value *pickAndCastVal;
 | |
|   for (unsigned n = 0; n < picks->size(); ++n) {
 | |
|     const Pick &pick = (*picks)[n];
 | |
|     // If a pick is non-trivial, we insert the insert/extract sequence.
 | |
|     // picksVal is the resulting value, or the origianl load if no pick is
 | |
|     // required.
 | |
|     picksVal = pick.isTrivial(vectorLength()) ? loadOrCast : putPick(pick, loadOrCast, scalarTy);
 | |
| 
 | |
|     // For each pick we cast picksVal to the appropriate type and replace the
 | |
|     // users. pickAndCastVal is the resulting value of the cast, if needed, or
 | |
|     // is the original pick.
 | |
|     for (auto &[type, users] : typesToCastTo[n]) {
 | |
|       if (type == scalarTy) {
 | |
|         pickAndCastVal = picksVal;
 | |
|       } else {
 | |
|         pickAndCastVal = builder.CreateBitCast(picksVal, IGCLLVM::FixedVectorType::get(type, pick.size()));
 | |
|       }
 | |
|       for (Instruction *user : users) {
 | |
|         // The only case when fun == newCallAndCast is when the load goes
 | |
|         // directly into the call, without any shuffles or bitcasts.
 | |
|         if (user != pickAndCastVal) {
 | |
|           user->replaceAllUsesWith(pickAndCastVal);
 | |
|         }
 | |
|       }
 | |
|       users.clear();
 | |
|       users.push_back(dyn_cast<Instruction>(pickAndCastVal));
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| void TraceData::removeOldInstructions() {
 | |
|   for (Instruction *instr : toRemove) {
 | |
|     if (instr && instr->getType()) {
 | |
|       instr->replaceAllUsesWith(UndefValue::get(instr->getType()));
 | |
|     }
 | |
|   }
 | |
|   for (Instruction *instr : toRemove) {
 | |
|     if (instr) {
 | |
|       instr->eraseFromParent();
 | |
|     }
 | |
|   }
 | |
|   toRemove.clear();
 | |
| }
 | |
| 
 | |
| std::unique_ptr<TraceData> TraceData::pickSubpicksOf(const MBRange &mbr) {
 | |
|   if (!picks)
 | |
|     return nullptr;
 | |
|   std::unique_ptr<TraceData> ret = std::make_unique<TraceData>();
 | |
|   ret->BB = BB;
 | |
|   ret->picks = std::make_unique<Picks>();
 | |
|   ret->initialPick = mbr.toPick();
 | |
|   for (unsigned n = 0; n < picks->size(); ++n) {
 | |
|     const Pick &pick = (*picks)[n];
 | |
|     if (mbr.contains(pick[0])) {
 | |
|       Pick narrowed = pick.narrowTo(mbr);
 | |
|       ret->picks->push_back(narrowed);
 | |
|       ret->typesToCastTo.push_back(typesToCastTo[n]);
 | |
|     }
 | |
|   }
 | |
|   return ret;
 | |
| }
 | |
| 
 | |
| // ===========================================================================
 | |
| 
 | |
| /// Creates a vector of MBRange's, each of the same `grSize`, `grPitch`, and
 | |
| /// `numOfGr`. The ranges cover the entire vector of length `vectorLength`.
 | |
| static std::optional<MBRanges> makeUniform(unsigned grSize, int grPitch, unsigned numOfGr, unsigned vectorLength) {
 | |
|   if (!vectorLength)
 | |
|     return std::nullopt;
 | |
|   if (grSize == grPitch) {
 | |
|     grSize *= numOfGr;
 | |
|     numOfGr = 1;
 | |
|   }
 | |
|   MBRanges mbrs;
 | |
|   if (numOfGr <= 1) {
 | |
|     if (vectorLength % grSize)
 | |
|       return std::nullopt;
 | |
|     unsigned numOfGroups = vectorLength / grSize;
 | |
|     mbrs.resize(numOfGroups);
 | |
|     for (unsigned n = 0; n < numOfGroups; ++n) {
 | |
|       mbrs[n] = MBRange(n * grSize, grSize, grPitch, 1);
 | |
|     }
 | |
|   } else {
 | |
|     if (vectorLength % (grPitch * numOfGr) || grPitch % grSize || grPitch <= static_cast<int>(grSize) || grPitch <= 0)
 | |
|       return std::nullopt;
 | |
|     unsigned numInBlock = grPitch / grSize;
 | |
|     unsigned blockPitch = grPitch * numOfGr;
 | |
|     unsigned numOfBlocks = vectorLength / blockPitch;
 | |
|     mbrs.resize(numInBlock * numOfBlocks);
 | |
|     for (unsigned n = 0; n < numOfBlocks; ++n) {
 | |
|       for (unsigned m = 0; m < numInBlock; ++m) {
 | |
|         mbrs[n * numInBlock + m] = MBRange(n * blockPitch + m * grSize, grSize, grPitch, numOfGr);
 | |
|       }
 | |
|     }
 | |
|   }
 | |
|   return std::make_optional(std::move(mbrs));
 | |
| }
 | |
| 
 | |
| /// Creates a vector of MBRange's, each of the same `grSize`, `grPitch`, and
 | |
| /// `numOfGr` and covering the entire vector of length `vectorLength`
 | |
| /// grid-uniformly. Each pick of `picks` must fit into exactly one range.
 | |
| static std::optional<MBRanges> makeGridUniformPicks(const Picks &picks, unsigned vectorLength, unsigned blockLength) {
 | |
|   if (picks.empty())
 | |
|     return std::nullopt;
 | |
|   MBRanges mbrs;
 | |
|   mbrs.reserve(picks.size());
 | |
|   for (unsigned n = 0; n < picks.size(); ++n) {
 | |
|     // The picks must be grid-uniform. No undefs allowed.
 | |
|     std::optional<MBRange> mbr = MBRange::fromPick(picks[n]);
 | |
|     if (!mbr) {
 | |
|       DBG(dbgs() << "    -- Pick " << picks[n] << " is not a valid multi-block range.\n");
 | |
|       return std::nullopt;
 | |
|     }
 | |
|     if (mbr->numOfGr == 1) {
 | |
|       mbr->grPitch = blockLength;
 | |
|     }
 | |
|     mbrs.push_back(*mbr);
 | |
|   }
 | |
|   MBRange mbr = mbrs.front();
 | |
|   if (mbrs.size() * mbr.size() != vectorLength) {
 | |
|     DBG(dbgs() << "    -- Invalid sizes of multi-block ranges.\n");
 | |
|     return std::nullopt;
 | |
|   }
 | |
|   if (!std::all_of(std::next(mbrs.begin()), mbrs.end(), [&](const MBRange &x) {
 | |
|         return x.grPitch == mbr.grPitch && x.grSize == mbr.grSize && x.numOfGr == mbr.numOfGr;
 | |
|       })) {
 | |
|     DBG(dbgs() << "    -- Multi-block ranges of different sizes.\n");
 | |
|     return std::nullopt;
 | |
|   }
 | |
|   if (mbr.numOfGr == 1) {
 | |
|     if (!isPowerOf2_32(mbr.grSize) || vectorLength % mbr.grSize || mbr.first % mbr.grSize) {
 | |
|       DBG(dbgs() << "    -- Invalid size of multi-block range " << mbr << ".\n");
 | |
|       return std::nullopt;
 | |
|     }
 | |
|   } else {
 | |
|     if (!(isPowerOf2_32(mbr.grSize) && isPowerOf2_32(mbr.grPitch) && isPowerOf2_32(mbr.numOfGr))) {
 | |
|       DBG(dbgs() << "    -- Invalid size of multi-block range " << mbr << ".\n");
 | |
|       return std::nullopt;
 | |
|     }
 | |
|     // - group size must be smaller than block length and be its divisor
 | |
|     // - group pitch must match the block length (there are no gaps between
 | |
|     // blocks)
 | |
|     if (blockLength <= mbr.grSize || blockLength % mbr.grSize || mbr.grPitch != blockLength) {
 | |
|       DBG(dbgs() << "    -- Invalid size of multi-block range " << mbr << ".\n");
 | |
|       return std::nullopt;
 | |
|     }
 | |
|     unsigned numOfBlocks = vectorLength / blockLength;
 | |
|     unsigned firstInGr = mbr.first % blockLength;
 | |
|     unsigned grStart = mbr.first / blockLength;
 | |
|     // - first elt in each group must be a multiple of the group size
 | |
|     // - number of groups must be a divisor of the number of blocks
 | |
|     // - the block with first elt must be a multiple of the number of groups
 | |
|     if (firstInGr % mbr.grSize || numOfBlocks % mbr.numOfGr || grStart % mbr.numOfGr) {
 | |
|       DBG(dbgs() << "    -- Invalid size of multi-block range " << mbr << ".\n");
 | |
|       return std::nullopt;
 | |
|     }
 | |
|   }
 | |
|   SmallVector<int, DEF_PICK_SIZE> allPicks(vectorLength, 0);
 | |
|   for (const MBRange &range : mbrs) {
 | |
|     for (int n = 0; n < range.size(); ++n) {
 | |
|       int &alreadyPicked = allPicks[range[n]];
 | |
|       if (alreadyPicked) {
 | |
|         DBG(dbgs() << " -- Multi-block ranges overlap.\n");
 | |
|         return std::nullopt;
 | |
|       }
 | |
|       alreadyPicked = 1;
 | |
|     }
 | |
|   }
 | |
|   if (std::find(allPicks.begin(), allPicks.end(), 0) != allPicks.end()) {
 | |
|     DBG(dbgs() << "    -- Multi-block ranges do not cover the entire vector.\n");
 | |
|     return std::nullopt;
 | |
|   }
 | |
|   DBG(dbgs() << "    -- Multi-block ranges are valid.\n";
 | |
|       dbgs() << "       -- First MB range is = " << mbrs.front() << ".\n";);
 | |
|   return std::make_optional(std::move(mbrs));
 | |
| }
 | |
| 
 | |
| // ===========================================================================
 | |
| 
 | |
| bool LoadData::isValidLoad() const {
 | |
|   // For this pass we assume the following:
 | |
|   // 1. We must avoid padding, otherwise the splitting would fail.
 | |
|   // 2. All parameters are power of two, so that the splitting is possible.
 | |
|   // 3. The length of the vector is at least 2 (so we don't deal with v1s).
 | |
|   if (!isPowerOf2_32(vectorLength) || !isPowerOf2_32(blockWidth_E) || !isPowerOf2_32(blockHeight_E) ||
 | |
|       !isPowerOf2_32(numOfBlocks) || !isPowerOf2_32(scalarBitWidth) || !isPowerOf2_32(elementBitWidth))
 | |
|     return false;
 | |
|   if ((blockHeight_E * blockWidth_E * elementBitWidth / 8) % config().sizeOfRegs_B)
 | |
|     return false; // so we don't deal with padding
 | |
|   if (scalarBitWidth % elementBitWidth)
 | |
|     return false;       // so scalarBitLength >= elementBitLength
 | |
|   if (vectorLength < 2) // We don't want to deal with v1's.
 | |
|     return false;
 | |
| 
 | |
| 
 | |
|   DBG(
 | |
|       bool ok = true; if (transposed && vnni) ok = false; else if (!transposed and !vnni) {
 | |
|         unsigned rowBytesPerBlk = ((elementBitWidth / 8) * blockWidth_E);
 | |
|         if ((rowBytesPerBlk * numOfBlocks) > 64 || rowBytesPerBlk < 4)
 | |
|           ok = false;
 | |
|       } else if (transposed) {
 | |
|         bool isValid64 = (elementBitWidth == 64 && blockHeight_E == 8 &&
 | |
|                           (blockWidth_E <= 4 || (blockWidth_E == 8 && config().isLegitW8)));
 | |
|         bool isValid32 = (elementBitWidth == 32 && blockHeight_E <= 32 && blockWidth_E <= 8);
 | |
|         if (numOfBlocks != 1 || !(isValid32 || isValid64))
 | |
|           ok = false;
 | |
|       } else if (vnni) {
 | |
|         // scalarBitWidth / elementBitWidth is ok since scalarBitLength %
 | |
|         // elementBitLength == 0.
 | |
|         bool isValid8 = (elementBitWidth == 8 && blockHeight_E >= 4 && blockWidth_E >= 4);
 | |
|         bool isValid16 = (elementBitWidth == 16 && blockHeight_E >= 2 && blockWidth_E >= 2 && blockWidth_E <= 32);
 | |
|         if (!(isValid8 || isValid16))
 | |
|           ok = false;
 | |
|       } if (!ok) {
 | |
|         dbgs() << " -- [ERROR] Load is invalid. Parameters mismatch.\n";
 | |
|         return false;
 | |
|       }
 | |
| 
 | |
|       if (config().actualSimd) {
 | |
|         if (config().actualSimd != SIMD()) {
 | |
|           dbgs() << " -- [ERROR] Load is invalid. SIMD mismatch.\n";
 | |
|           return false;
 | |
|         }
 | |
|       });
 | |
| 
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| unsigned LoadData::getMinGroupLength(unsigned atLeastThisLarge) const {
 | |
|   // There are no lower bounds for straight loads.
 | |
|   unsigned minGroupLen = 1;
 | |
| 
 | |
|   if (transposed)
 | |
|     minGroupLen = divideCeil(32, elementBitWidth);
 | |
| 
 | |
|   else if (vnni)
 | |
|     minGroupLen = divideCeil(32, scalarBitWidth);
 | |
| 
 | |
|   // Minimal block length must be equal or larger than:
 | |
|   // 1. minGroupLen
 | |
|   // 2. atLeastThisLarge
 | |
|   // 3. its size in bytes must be at least equal to the size of 1 GRF to avoid
 | |
|   // padding.
 | |
|   // 4. It also must be a power of 2.
 | |
| 
 | |
|   // For 3: the bit width of a load group of size N is N * scalarMemSize_B()
 | |
|   // so N must be greater or equal than the bit width of 1 GRF.
 | |
|   minGroupLen = PowerOf2Ceil(std::max(minGroupLen, atLeastThisLarge));
 | |
|   if (minGroupLen * scalarMemSize_B() < config().sizeOfRegs_B) {
 | |
|     minGroupLen = PowerOf2Ceil(divideCeil(config().sizeOfRegs_B, scalarMemSize_B()));
 | |
|   }
 | |
|   return minGroupLen;
 | |
| }
 | |
| 
 | |
| bool LoadData::sameAsLoad(const LoadData &rhs) const {
 | |
|   return blockWidth_E == rhs.blockWidth_E && blockHeight_E == rhs.blockHeight_E && numOfBlocks == rhs.numOfBlocks &&
 | |
|          vectorLength == rhs.vectorLength && scalarBitWidth == rhs.scalarBitWidth &&
 | |
|          elementBitWidth == rhs.elementBitWidth && transposed == rhs.transposed && vnni == rhs.vnni;
 | |
| }
 | |
| 
 | |
| std::string LoadData::getBlockLoadName() const {
 | |
|   return std::string("llvm.genx.GenISA.LSC2DBlockRead.") +
 | |
|          (1 < vectorLength ? "v" + std::to_string(vectorLength) : std::string()) + "i" + std::to_string(scalarBitWidth);
 | |
| }
 | |
| 
 | |
| // ===========================================================================
 | |
| 
 | |
| bool Load::readFromLoad(GenIntrinsicInst *inGII) {
 | |
|   GII = inGII;
 | |
|   if (!fillBlockData()) {
 | |
|     DBG(dbgs() << "    -- Could not fill block data.\n");
 | |
|     return false;
 | |
|   }
 | |
|   if (!isValidLoad()) {
 | |
|     DBG(dbgs() << " -- [ERROR] Load is invalid.\n");
 | |
|     return false;
 | |
|   }
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| bool Load::tracePicks() {
 | |
|   trace = std::make_unique<TraceData>();
 | |
|   if (!trace->tracePicks(GII)) {
 | |
|     DBG(dbgs() << "    -- Tracing picks failed.\n");
 | |
|     return false;
 | |
|   }
 | |
|   if (trace->picks->empty()) {
 | |
|     DBG(dbgs() << "    -- Empty picks.\n");
 | |
|     return false;
 | |
|   }
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| PossibleDims Load::possibleDims() {
 | |
|   if (!trace || !trace->picks || trace->picks->empty())
 | |
|     return {};
 | |
|   std::optional<MBRanges> minSplitOpt = makeGridUniformPicks(*trace->picks, vectorLength, groupLength());
 | |
|   if (!minSplitOpt || !minSplitOpt->size())
 | |
|     return {};
 | |
|   // those are already PowerOf2Ceil'ed:
 | |
|   unsigned minGrSize = getMinGroupLength(minSplitOpt->front().grSize);
 | |
|   unsigned minNumOfGr = minSplitOpt->front().numOfGr;
 | |
|   DBG(dbgs() << "    -- Minimal block size = " << minGrSize << " x " << minNumOfGr << ".\n");
 | |
| 
 | |
|   // For multiple blocks, if dims.numOfGr > 1, the subgroup cannot cover the
 | |
|   // entire group, i.e., dims.grSize < groupLength(). However, if dims.numOfGr =
 | |
|   // 1, the group size can be as large as the vector size. For example, 2x2 is a
 | |
|   // proper subdimension of 4x4, but 4x2 is not, as it is equivalent to 8x1.
 | |
|   PossibleDims dims;
 | |
|   // First, dimensions with a single group.
 | |
|   for (unsigned grSize = std::max(groupLength() * minNumOfGr, minGrSize); grSize <= vectorLength; grSize *= 2) {
 | |
|     if (grSize < config().minSplitSize_E || grSize * scalarMemSize_B() < config().minSplitSize_B)
 | |
|       continue;
 | |
|     dims.insert({grSize, 1});
 | |
|   }
 | |
|   // Next, dimensions with a multiple groups.
 | |
|   for (unsigned grSize = minGrSize; grSize < groupLength(); grSize *= 2) {
 | |
|     for (unsigned numOfGr = minNumOfGr; numOfGr <= numOfBlocks; numOfGr *= 2) {
 | |
|       if (grSize * numOfGr < config().minSplitSize_E || grSize * numOfGr * scalarMemSize_B() < config().minSplitSize_B)
 | |
|         continue;
 | |
|       dims.insert({grSize, numOfGr});
 | |
|     }
 | |
|   }
 | |
|   if (dims.empty()) {
 | |
|     DBG(dbgs() << " -- [SKIP] No possible dimensions (including no split) "
 | |
|                   "satisfy all the conditions.\n");
 | |
|   } else if (dims.size() == 1) {
 | |
|     DBG(dbgs() << " -- [SKIP] No possible splits.\n");
 | |
|   }
 | |
|   DBG(dbgs() << "    -- Possible dimensions:\n"; for (const Dims &d : dims) { dbgs() << "       -- " << d << "\n"; });
 | |
|   return dims;
 | |
| }
 | |
| 
 | |
| Load &Load::splitFlat(const MBRange &range) {
 | |
|   unsigned blockStart = range.first / blockHeight_E;
 | |
|   unsigned blockEnd = range.last() / blockHeight_E;
 | |
|   vectorLength = static_cast<unsigned>(range.size());
 | |
|   numOfBlocks = blockEnd - blockStart + 1;
 | |
|   xOffset_E += static_cast<int>(blockStart * blockWidth_E);
 | |
|   yOffset_P += static_cast<int>(range.first % blockHeight_E); // old block height
 | |
|   blockHeight_E = range.size() / numOfBlocks;                 // new block height
 | |
|   return *this;
 | |
| }
 | |
| 
 | |
| Load &Load::splitTransposed(const MBRange &range) {
 | |
|   vectorLength = static_cast<unsigned>(range.size());
 | |
|   xOffset_E += range.first;
 | |
|   blockWidth_E = static_cast<unsigned>(range.size());
 | |
|   return *this;
 | |
| }
 | |
| 
 | |
| Load &Load::splitVNNI(const MBRange &range) {
 | |
|   unsigned scalarInElts = scalarBitWidth / elementBitWidth;
 | |
|   unsigned blockHeight_S = blockHeight_E / scalarInElts;
 | |
|   unsigned blockStart = range.first / blockHeight_S;
 | |
|   unsigned blockEnd = range.last() / blockHeight_S;
 | |
|   vectorLength = static_cast<unsigned>(range.size());
 | |
|   numOfBlocks = blockEnd - blockStart + 1;
 | |
|   xOffset_E += static_cast<int>(blockStart * blockWidth_E);
 | |
|   yOffset_P += static_cast<int>((range.first % blockHeight_S) * scalarInElts); // old block height
 | |
|   blockHeight_E = (range.size() / numOfBlocks) * scalarInElts;                 // new block height
 | |
|   return *this;
 | |
| }
 | |
| 
 | |
| std::unique_ptr<Load> Load::splitLoadData(const MBRange &range) {
 | |
|   unsigned scalarToEltRatio = scalarBitWidth / elementBitWidth; // scalarBitWidth % elementBitWidth == 0 was checked.
 | |
|   if (range.first % scalarToEltRatio) {
 | |
|     DBG(dbgs() << " -- [ERROR] Position of the first element does not "
 | |
|                   "divide the scalar to element ratio.\n");
 | |
|     return nullptr;
 | |
|   }
 | |
| 
 | |
|   // We copy GII and AP as well, because we will use them as the insertion point
 | |
|   // for new instructions. The copy constructor does NOT copy traces.
 | |
|   std::unique_ptr<Load> ret = std::make_unique<Load>(*this);
 | |
| 
 | |
|   if (!transposed && !vnni) {
 | |
|     ret->splitFlat(range);
 | |
|   } else if (transposed) {
 | |
|     ret->splitTransposed(range);
 | |
|   } else {
 | |
|     ret->splitVNNI(range);
 | |
|   }
 | |
| 
 | |
|   if (!ret->isValidLoad())
 | |
|     return nullptr;
 | |
|   return ret;
 | |
| }
 | |
| 
 | |
| std::unique_ptr<Load> Load::split(const MBRange &range) {
 | |
|   if (vectorLength == range.size())
 | |
|     return nullptr;
 | |
|   std::unique_ptr<Load> ret = splitLoadData(range);
 | |
|   if (!ret)
 | |
|     return nullptr;
 | |
|   DBG(dbgs() << "    -- [OK] Split load created for range " << range << ".\n");
 | |
|   if (trace)
 | |
|     ret->trace = trace->pickSubpicksOf(range);
 | |
|   DBG(dbgs() << "    -- [OK] Subpicks picked.\n");
 | |
|   return ret;
 | |
| }
 | |
| 
 | |
| CallInst *Load::putBlockLoad(IRBuilder<> &builder) {
 | |
|   builder.SetInsertPoint(GII);
 | |
|   std::array<Value *, NUM_OF_BLOCKLOAD_ARGS> args;
 | |
|   for (unsigned i = 0; i < args.size(); ++i) {
 | |
|     args[i] = GII->getArgOperand(i);
 | |
|   }
 | |
| 
 | |
|   Function *newLoadFun = builder.GetInsertBlock()->getModule()->getFunction(getBlockLoadName());
 | |
|   if (!newLoadFun) {
 | |
|     newLoadFun = createFunction(getBlockLoadName(), builder.GetInsertBlock()->getModule(), args,
 | |
|                                 IGCLLVM::FixedVectorType::get(scalarTy, vectorLength), GII->getCalledFunction());
 | |
|   }
 | |
| 
 | |
|   args[LSC2D_BlockRead::argBlockWidth_inElts] = builder.getInt32(blockWidth_E);
 | |
|   args[LSC2D_BlockRead::argBlockHeight_inElts] = builder.getInt32(blockHeight_E);
 | |
|   args[LSC2D_BlockRead::argNumOfBlocks] = builder.getInt32(numOfBlocks);
 | |
|   Value *valX = createAdd(builder, GII->getArgOperand(LSC2D_BlockRead::argXOffset_inElts), xOffset_E);
 | |
|   Value *valY = createAdd(builder, GII->getArgOperand(LSC2D_BlockRead::argYOffset_inPitches), yOffset_P);
 | |
|   args[LSC2D_BlockRead::argXOffset_inElts] = valX;
 | |
|   args[LSC2D_BlockRead::argYOffset_inPitches] = valY;
 | |
| 
 | |
|   CallInst *newGII = builder.CreateCall(newLoadFun->getFunctionType(), newLoadFun, args);
 | |
|   GII = cast<GenIntrinsicInst>(newGII);
 | |
|   return newGII;
 | |
| }
 | |
| 
 | |
| bool Load::fillBlockData() {
 | |
|   if (!(isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argBlockHeight_inElts)) &&
 | |
|         isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argBlockWidth_inElts)) &&
 | |
|         isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argSizeInBits)) &&
 | |
|         isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argNumOfBlocks)) &&
 | |
|         isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argIsTranspose)) &&
 | |
|         isa<ConstantInt>(GII->getArgOperand(LSC2D_BlockRead::argIsVNNI)))) {
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   std::tie(scalarTy, vectorLength) = getScalarTypeAndSize(GII);
 | |
|   if (!scalarTy)
 | |
|     return false;
 | |
|   // For block loads actual offsets may not be constant.
 | |
|   // We only track the relative offsets in this case.
 | |
|   xOffset_E = 0;
 | |
|   yOffset_P = 0;
 | |
|   elementBitWidth = getArgZ(GII, LSC2D_BlockRead::argSizeInBits);
 | |
|   blockWidth_E = getArgZ(GII, LSC2D_BlockRead::argBlockWidth_inElts);
 | |
|   blockHeight_E = getArgZ(GII, LSC2D_BlockRead::argBlockHeight_inElts);
 | |
|   numOfBlocks = getArgZ(GII, LSC2D_BlockRead::argNumOfBlocks);
 | |
|   transposed = getArgZ(GII, LSC2D_BlockRead::argIsTranspose);
 | |
|   vnni = getArgZ(GII, LSC2D_BlockRead::argIsVNNI);
 | |
|   // We don't do anything with cache flags or the read address, so we will just
 | |
|   // copy their values in putBlockLoad.
 | |
|   scalarBitWidth = scalarTy->getBitWidth();
 | |
| 
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| void Load::removeOldInstructions() {
 | |
|   if (trace)
 | |
|     trace->removeOldInstructions(); // This already contains the GII call.
 | |
| }
 | |
| 
 | |
| // ==========================================================================
 | |
| 
 | |
| namespace IGC::LS {
 | |
| 
 | |
| struct LoadSplitter::Impl {
 | |
|   static std::unique_ptr<LoadSplitter::Impl> Create(Function *inF, CodeGenContext *inCGC, IGCLivenessAnalysis *inRPE);
 | |
| 
 | |
|   bool isRPHigh(BasicBlock *BB);
 | |
|   PossibleDims possibleDims(GenIntrinsicInst *GII);
 | |
|   bool split(GenIntrinsicInst *GII, Dims dims);
 | |
|   bool splitAllToSmallest(BasicBlock *BB);
 | |
| 
 | |
| private:
 | |
|   SmallDenseMap<GenIntrinsicInst *, std::unique_ptr<Load>, DEF_NUM_OF_LOADS> blockLoadsMap;
 | |
|   SmallDenseMap<GenIntrinsicInst *, Dims, DEF_NUM_OF_LOADS> dimsMap;
 | |
| 
 | |
|   using SplitLoads = SmallVector<std::unique_ptr<Load>, DEF_NUM_OF_LOADS>;
 | |
|   bool processBlockLoad(GenIntrinsicInst *GII);
 | |
|   SplitLoads splitBlockLoad(Load &load, const MBRanges &splits);
 | |
|   bool putBlockLoad(Load &load, const std::string &nameExt = std::string());
 | |
| };
 | |
| 
 | |
| // ==========================================================================
 | |
| 
 | |
| std::unique_ptr<LoadSplitter> LoadSplitter::Create(Function *inF, CodeGenContext *inCGC, IGCLivenessAnalysis *inRPE) {
 | |
|   std::unique_ptr<LoadSplitter> ret = std::unique_ptr<LoadSplitter>(new LoadSplitter());
 | |
|   ret->impl = Impl::Create(inF, inCGC, inRPE);
 | |
|   if (!ret->impl) {
 | |
|     return nullptr;
 | |
|   }
 | |
|   return ret;
 | |
| }
 | |
| 
 | |
| bool LoadSplitter::isRPHigh(BasicBlock *BB) { return impl->isRPHigh(BB); }
 | |
| 
 | |
| PossibleDims LoadSplitter::possibleDims(GenIntrinsicInst *GII) { return impl->possibleDims(GII); }
 | |
| 
 | |
| bool LoadSplitter::splitAllToSmallest(BasicBlock *BB) { return impl->splitAllToSmallest(BB); }
 | |
| 
 | |
| bool LoadSplitter::split(GenIntrinsicInst *GII, Dims dims) { return impl->split(GII, dims); }
 | |
| 
 | |
| // ==========================================================================
 | |
| 
 | |
| std::unique_ptr<LoadSplitter::Impl> LoadSplitter::Impl::Create(Function *inF, CodeGenContext *inCGC,
 | |
|                                                                IGCLivenessAnalysis *inRPE) {
 | |
|   std::unique_ptr<LoadSplitter::Impl> ret = std::unique_ptr<LoadSplitter::Impl>(new LoadSplitter::Impl());
 | |
|   if (!config().initialize(inF, inCGC, inRPE)) {
 | |
|     return nullptr;
 | |
|   }
 | |
|   return ret;
 | |
| }
 | |
| 
 | |
| bool LoadSplitter::Impl::isRPHigh(BasicBlock *BB) {
 | |
|   int regPressure = config().RPE->getMaxRegCountForBB(*BB, config().SIMD()) * config().sizeOfRegs_B;
 | |
|   DBG(dbgs() << " -- Reg Pressure = " << regPressure << " B, threshold = " << config().splitThreshold_B << " B.\n");
 | |
|   if (regPressure <= config().splitThreshold_B) {
 | |
|     DBG(dbgs() << " [SKIP] Register pressure below threshold.\n");
 | |
|     return false;
 | |
|   }
 | |
|   DBG(dbgs() << " [OK] Reg pressure high.\n");
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| bool LoadSplitter::Impl::processBlockLoad(GenIntrinsicInst *GII) {
 | |
|   if (!GII)
 | |
|     return false;
 | |
|   auto ptr = blockLoadsMap.find(GII);
 | |
|   if (ptr != blockLoadsMap.end())
 | |
|     return true;
 | |
| 
 | |
|   std::unique_ptr<Load> load = std::make_unique<Load>();
 | |
|   if (!load->readFromLoad(GII)) {
 | |
|     DBG(dbgs() << " -- [SKIP] Invalid block load.\n");
 | |
|     return false;
 | |
|   }
 | |
|   DBG(dbgs() << " -- [OK] Valid block load.\n");
 | |
|   if (!load->tracePicks()) {
 | |
|     DBG(dbgs() << " -- [SKIP] Invalid picks.\n");
 | |
|     return false;
 | |
|   }
 | |
|   DBG(dbgs() << " -- [OK] Picks are valid:\n";
 | |
|       for (const Pick &pick : *load->trace->picks) { dbgs() << "    -- " << pick << "\n"; });
 | |
|   blockLoadsMap[GII] = std::move(load);
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| PossibleDims LoadSplitter::Impl::possibleDims(GenIntrinsicInst *GII) {
 | |
|   if (!GII)
 | |
|     return {};
 | |
|   DBG(dbgs() << "\nPossible split dimensions for: " << *GII << ".\n");
 | |
|   PossibleDims dims;
 | |
|   if (GII->getIntrinsicID() == GenISAIntrinsic::GenISA_LSC2DBlockRead) {
 | |
|     if (!processBlockLoad(GII))
 | |
|       return {};
 | |
|     Load &load = *blockLoadsMap[GII];
 | |
|     DBG(dbgs() << " -- load = " << *load.GII << ".\n");
 | |
|     dims = load.possibleDims();
 | |
|   } else {
 | |
|     DBG(dbgs() << " [ERROR] The intrinsic: " << *GII << " is not a load.\n");
 | |
|     return {};
 | |
|   }
 | |
|   return dims;
 | |
| }
 | |
| 
 | |
| LoadSplitter::Impl::SplitLoads LoadSplitter::Impl::splitBlockLoad(Load &load, const MBRanges &splits) {
 | |
|   unsigned numOfSplits = load.vectorLength / splits.front().size();
 | |
|   if (numOfSplits == 1) {
 | |
|     DBG(dbgs() << " -- [SKIP] No need to split the block load.\n");
 | |
|     return {};
 | |
|   }
 | |
|   SplitLoads splitLoads;
 | |
|   splitLoads.reserve(numOfSplits);
 | |
|   for (unsigned m = 0; m < numOfSplits; ++m) {
 | |
|     std::unique_ptr<Load> splitLoad = load.split(splits[m]);
 | |
|     if (!splitLoad) {
 | |
|       DBG(dbgs() << " -- [ERROR] Split is not valid.\n");
 | |
|       return {};
 | |
|     }
 | |
|     if (splitLoad->trace->picks->empty()) {
 | |
|       DBG(dbgs() << " -- [SKIP] The new load has no users.\n");
 | |
|       continue;
 | |
|     }
 | |
|     splitLoads.push_back(std::move(splitLoad));
 | |
|   }
 | |
|   return splitLoads;
 | |
| }
 | |
| 
 | |
| bool LoadSplitter::Impl::putBlockLoad(Load &load, const std::string &nameExt) {
 | |
|   IRBuilder<> builder(load.GII);
 | |
|   CallInst *V = load.putBlockLoad(builder);
 | |
|   DBG(dbgs() << "    -- [OK] New load put: " << *V << ".\n");
 | |
|   if (load.GII->hasName()) {
 | |
|     V->setName(load.GII->getName() + nameExt);
 | |
|   }
 | |
|   load.trace->putPicks(builder, V);
 | |
|   DBG(dbgs() << "    -- [OK] Picks put.\n");
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| bool LoadSplitter::Impl::split(GenIntrinsicInst *GII, Dims dims) {
 | |
|   if (!GII)
 | |
|     return false;
 | |
|   if (GII->getIntrinsicID() != GenISAIntrinsic::GenISA_LSC2DBlockRead) {
 | |
|     DBG(dbgs() << " -- [ERROR] The intrinsic: " << *GII << " is not a load.\n");
 | |
|     return false;
 | |
|   }
 | |
|   DBG(dbgs() << " -- Block load: " << *GII << "\n");
 | |
|   DBG(dbgs() << "    -- Dimensions: " << dims << "\n");
 | |
| 
 | |
|   auto doPicksFit = [](const MBRanges &mbrs, const Picks &picks) -> bool {
 | |
|     for (unsigned r = 0; r < mbrs.size(); ++r) {
 | |
|       const MBRange &mbr = mbrs[r];
 | |
|       for (const Pick &pick : picks) {
 | |
|         if (mbr.containsOrExcludes(pick, false) == MBRange::Containment::Intersects) {
 | |
|           return false;
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|     return true;
 | |
|   };
 | |
| 
 | |
|   if (!processBlockLoad(GII))
 | |
|     return false;
 | |
|   Load &load = *blockLoadsMap[GII];
 | |
|   if (load.vectorLength <= dims.size()) {
 | |
|     DBG(dbgs() << " -- [SKIP] Nothing to split.\n");
 | |
|     return false;
 | |
|   }
 | |
|   std::optional<MBRanges> splitsOpt = makeUniform(dims.grSize, load.groupLength(), dims.numOfGr, load.vectorLength);
 | |
|   if (!splitsOpt) {
 | |
|     DBG(dbgs() << " -- [ERROR] Split is not valid.\n");
 | |
|     return false;
 | |
|   }
 | |
|   MBRanges &splits = *splitsOpt;
 | |
|   if (!doPicksFit(splits, *load.trace->picks)) {
 | |
|     DBG(dbgs() << " -- [ERROR] Picks do not fit the splits.\n");
 | |
|     return false;
 | |
|   }
 | |
|   auto newLoads = splitBlockLoad(load, splits);
 | |
|   if (newLoads.empty()) {
 | |
|     DBG(dbgs() << " -- [ERROR] Splitting failed.\n");
 | |
|     return false;
 | |
|   }
 | |
|   for (unsigned n = 0; n < newLoads.size(); ++n) {
 | |
|     putBlockLoad(*newLoads[n], ".split." + std::to_string(n));
 | |
|   }
 | |
|   load.removeOldInstructions();
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| bool LoadSplitter::Impl::splitAllToSmallest(BasicBlock *BB) {
 | |
|   DBG(dbgs() << "\nNew BB: " << BB->getName() << ".\n");
 | |
| 
 | |
|   if (!config().ignoreSplitThreshold && !isRPHigh(BB)) {
 | |
|     DBG(dbgs() << " [SKIP] Register pressure below threshold.\n");
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   blockLoadsMap.clear();
 | |
|   dimsMap.clear();
 | |
| 
 | |
|   for (auto I = BB->begin(); I != BB->end(); ++I) {
 | |
|     GenIntrinsicInst *GII = dyn_cast<GenIntrinsicInst>(&*I);
 | |
|     if (!GII)
 | |
|       continue;
 | |
|     if (GII->getIntrinsicID() == GenISAIntrinsic::GenISA_LSC2DBlockRead) {
 | |
|       DBG(dbgs() << "\nNew block load: " << *GII << ".\n");
 | |
|       if (!processBlockLoad(GII)) {
 | |
|         DBG(dbgs() << " [SKIP] Invalid block load.\n");
 | |
|         continue;
 | |
|       }
 | |
|       DBG(dbgs() << " [OK] Load is potentially splittable.\n");
 | |
|     }
 | |
|   }
 | |
|   DBG(dbgs() << "\nLoads tracing done.\n");
 | |
|   if (blockLoadsMap.empty()) {
 | |
|     DBG(dbgs() << " [SKIP] No loads left to split.\n");
 | |
|     return false;
 | |
|   }
 | |
|   DBG(dbgs() << " -- Number of block loads = " << blockLoadsMap.size() << ".\n");
 | |
| 
 | |
|   DBG(dbgs() << "\nCalculating possible splits:\n");
 | |
|   for (auto &[GII, loadPtr] : blockLoadsMap) {
 | |
|     DBG(dbgs() << " -- Block load: " << *GII << ".\n");
 | |
|     PossibleDims dims = possibleDims(GII);
 | |
|     if (dims.empty())
 | |
|       continue;
 | |
|     dimsMap[GII] = *std::min_element(dims.begin(), dims.end());
 | |
|   }
 | |
|   if (dimsMap.empty()) {
 | |
|     DBG(dbgs() << " [SKIP] No splits left.\n");
 | |
|     return false;
 | |
|   }
 | |
|   DBG(dbgs() << " [OK] Smallest possible splits found.\n");
 | |
| 
 | |
|   bool codeChanges = false;
 | |
|   DBG(dbgs() << "\nSplitting loads:\n");
 | |
|   for (auto &[GII, dims] : dimsMap) {
 | |
|     if (!split(GII, dims)) {
 | |
|       DBG(dbgs() << " [SKIP] Split failed.\n");
 | |
|       continue;
 | |
|     }
 | |
|     codeChanges = true;
 | |
|   }
 | |
|   DBG(dbgs() << " [OK] Splitting done.\n");
 | |
|   return codeChanges;
 | |
| }
 | |
| 
 | |
| } // namespace IGC::LS
 | |
| 
 | |
| // ==========================================================================
 | |
| 
 | |
| namespace {
 | |
| class SplitLoads : public FunctionPass {
 | |
| public:
 | |
|   // Pass identification, replacement for typeid
 | |
|   static char ID;
 | |
| 
 | |
|   virtual StringRef getPassName() const override { return "SplitLoads"; }
 | |
| 
 | |
|   void getAnalysisUsage(AnalysisUsage &AU) const override {
 | |
|     AU.addRequired<CodeGenContextWrapper>();
 | |
|     AU.addRequired<IGCLivenessAnalysis>();
 | |
|     AU.setPreservesCFG();
 | |
|   }
 | |
| 
 | |
|   SplitLoads();
 | |
|   SplitLoads(const SplitLoads &) = delete;
 | |
|   SplitLoads &operator=(const SplitLoads &) = delete;
 | |
| 
 | |
|   virtual bool runOnFunction(Function &F) override;
 | |
| 
 | |
| private:
 | |
|   std::unique_ptr<LoadSplitter> loadSplitter = nullptr;
 | |
| };
 | |
| } // namespace
 | |
| 
 | |
| char SplitLoads::ID = 0;
 | |
| 
 | |
| // Register pass to igc-opt
 | |
| #define PASS_FLAG "igc-split-loads"
 | |
| #define PASS_DESCRIPTION "Splits 2D LSC block loads into smaller chunks"
 | |
| #define PASS_CFG_ONLY false
 | |
| #define PASS_ANALYSIS false
 | |
| IGC_INITIALIZE_PASS_BEGIN(SplitLoads, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
 | |
| IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
 | |
| IGC_INITIALIZE_PASS_DEPENDENCY(IGCLivenessAnalysis)
 | |
| IGC_INITIALIZE_PASS_END(SplitLoads, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
 | |
| 
 | |
| FunctionPass *IGC::createSplitLoadsPass() { return new SplitLoads(); }
 | |
| 
 | |
| SplitLoads::SplitLoads() : FunctionPass(ID) { initializeSplitLoadsPass(*PassRegistry::getPassRegistry()); }
 | |
| 
 | |
| bool SplitLoads::runOnFunction(Function &F) {
 | |
|   if (!config().enableLoadSplitting || skipFunction(F)) {
 | |
|     return false;
 | |
|   }
 | |
|   loadSplitter = LoadSplitter::Create(&F, getAnalysis<CodeGenContextWrapper>().getCodeGenContext(),
 | |
|                                       &getAnalysis<IGCLivenessAnalysis>());
 | |
|   if (!loadSplitter) {
 | |
|     return false;
 | |
|   }
 | |
| 
 | |
|   DBG(dbgs() << "\nSPLITLOADS ON: " << F.getName() << "\n");
 | |
| 
 | |
|   auto pad = [](const std::string &s, size_t len) -> std::string {
 | |
|     return s.size() < len ? s + std::string(len - s.size(), ' ') : s.substr(0, len);
 | |
|   };
 | |
| 
 | |
|   bool codeChanged = false;
 | |
|   for (BasicBlock &BB : F) {
 | |
|     bool splitterChangesCode = loadSplitter->splitAllToSmallest(&BB);
 | |
|     codeChanged |= splitterChangesCode;
 | |
|     if (splitterChangesCode) {
 | |
|       DBG(dbgs() << "BB: " << pad(BB.getName().str(), 20) << " : SPLIT SUCCESSFUL.\n");
 | |
|     } else {
 | |
|       DBG(dbgs() << "BB: " << pad(BB.getName().str(), 20) << " : NO SPLITS.\n");
 | |
|     }
 | |
|   }
 | |
|   DBG(dbgs() << "\n");
 | |
| 
 | |
|   return codeChanged;
 | |
| }
 | |
| 
 | |
| #undef DEBUG_TYPE
 | 
