[RISCV] Improve codegen for shuffles with LHS/RHS splats
Shuffles that are broken into separate LHS/RHS halves can reveal splats, where one half reads its source through a single index; such gathers can be lowered to "vrgather.vi".

This optimization could be achieved by adding extra patterns to match `vrgather_vv_vl` whose index operand is a splat, but this patch instead identifies splats earlier. This way, future optimizations can build on top of the data gathered here, e.g., to splat-gather dominant indices and insert any leftovers.

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D107449
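To make the idea concrete, here is a minimal standalone C++ sketch (not the patch's code; the mask value is borrowed from the splat_ve2_we0 test updated below) that splits a shuffle mask over two N-element sources into its LHS/RHS halves and checks whether each half reads only a single source element, which is the case vrgather.vi can handle:

#include <cstdio>
#include <set>
#include <vector>

int main() {
  const int NumElts = 8;
  // Mask from the splat_ve2_we0 test below: lanes read either element 2 of
  // the first source or element 0 (index 8 - NumElts) of the second source.
  std::vector<int> Mask = {2, 8, 2, 2, 2, 2, 8, 2}; // -1 would mean "undef"
  std::set<int> LHSUsed, RHSUsed;
  for (int M : Mask) {
    if (M < 0)
      continue;                    // undef lanes constrain nothing
    if (M < NumElts)
      LHSUsed.insert(M);           // lane reads the first source
    else
      RHSUsed.insert(M - NumElts); // lane reads the second source
  }
  // One distinct index per half means a vrgather.vi-style splat gather works.
  std::printf("LHS splat: %s, RHS splat: %s\n",
              LHSUsed.size() == 1 ? "yes" : "no",
              RHSUsed.size() == 1 ? "yes" : "no");
}

With this mask both halves collapse to a single index (2 and 0), which is why the updated check lines below use two vrgather.vi instructions instead of building index vectors.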
@@ -1971,6 +1971,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
bool InvertMask = IsSelect == SwapOps;

// Keep a track of which non-undef indices are used by each LHS/RHS shuffle
// half.
DenseMap<int, unsigned> LHSIndexCounts, RHSIndexCounts;

// Now construct the mask that will be used by the vselect or blended
// vrgather operation. For vrgathers, construct the appropriate indices into
// each vector.
@@ -1985,6 +1989,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
GatherIndicesRHS.push_back(
    IsLHSOrUndefIndex ? DAG.getUNDEF(XLenVT)
                      : DAG.getConstant(MaskIndex - NumElts, DL, XLenVT));
if (IsLHSOrUndefIndex && MaskIndex >= 0)
  ++LHSIndexCounts[MaskIndex];
if (!IsLHSOrUndefIndex)
  ++RHSIndexCounts[MaskIndex - NumElts];
}
}
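A rough model of the bookkeeping in this hunk, assuming std::map in place of DenseMap and -1 in place of DAG.getUNDEF, with the GatherIndicesLHS handling (which sits just above this hunk) reconstructed from context:

#include <cstdio>
#include <map>
#include <vector>

int main() {
  const int NumElts = 4;
  std::vector<int> Mask = {1, 2, 0, 5}; // vrgather_shuffle_vv_v4i16 below
  std::vector<int> GatherIndicesLHS, GatherIndicesRHS;
  std::map<int, unsigned> LHSIndexCounts, RHSIndexCounts;
  for (int MaskIndex : Mask) {
    bool IsLHSOrUndefIndex = MaskIndex < NumElts;
    // Each half gets a real index for its own lanes and "undef" elsewhere.
    GatherIndicesLHS.push_back(
        IsLHSOrUndefIndex && MaskIndex >= 0 ? MaskIndex : -1);
    GatherIndicesRHS.push_back(IsLHSOrUndefIndex ? -1 : MaskIndex - NumElts);
    if (IsLHSOrUndefIndex && MaskIndex >= 0)
      ++LHSIndexCounts[MaskIndex];
    if (!IsLHSOrUndefIndex)
      ++RHSIndexCounts[MaskIndex - NumElts];
  }
  std::printf("distinct LHS indices: %zu, distinct RHS indices: %zu\n",
              LHSIndexCounts.size(), RHSIndexCounts.size());
}

For the mask <1, 2, 0, 5> from vrgather_shuffle_vv_v4i16 below, RHSIndexCounts ends up with a single entry (index 1), so the RHS half can be lowered with vrgather.vi in the updated check lines.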
@@ -2008,13 +2016,14 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
return SDValue();
}

unsigned GatherOpc = RISCVISD::VRGATHER_VV_VL;
unsigned GatherVXOpc = RISCVISD::VRGATHER_VX_VL;
unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL;
MVT IndexVT = VT.changeTypeToInteger();
// Since we can't introduce illegal index types at this stage, use i16 and
// vrgatherei16 if the corresponding index type for plain vrgather is greater
// than XLenVT.
if (IndexVT.getScalarType().bitsGT(XLenVT)) {
GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL;
IndexVT = IndexVT.changeVectorElementType(MVT::i16);
}
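The GatherVXOpc/GatherVVOpc split above keeps the existing i16 fallback intact: the lowering builds the index vector with the same element width as the data, and switches to vrgatherei16 when that exceeds XLEN. A small sketch of that width choice (values are illustrative, not taken from the patch):

#include <cstdio>

unsigned gatherIndexWidth(unsigned DataSEW, unsigned XLen) {
  // The index element type defaults to the data element type.
  if (DataSEW <= XLen)
    return DataSEW;
  // Otherwise fall back to 16-bit indices (vrgatherei16.vv), which are wide
  // enough for the fixed-length vectors handled here.
  return 16;
}

int main() {
  std::printf("SEW=64, RV32 -> %u-bit indices\n", gatherIndexWidth(64, 32));
  std::printf("SEW=64, RV64 -> %u-bit indices\n", gatherIndexWidth(64, 64));
}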
@@ -2027,28 +2036,48 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) {
Gather = lowerScalarSplat(SplatValue, VL, ContainerVT, DL, DAG, Subtarget);
} else {
SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
LHSIndices =
    convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);

V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
Gather =
    DAG.getNode(GatherOpc, DL, ContainerVT, V1, LHSIndices, TrueMask, VL);
// If only one index is used, we can use a "splat" vrgather.
// TODO: We can splat the most-common index and fix-up any stragglers, if
// that's beneficial.
if (LHSIndexCounts.size() == 1) {
int SplatIndex = LHSIndexCounts.begin()->getFirst();
Gather =
    DAG.getNode(GatherVXOpc, DL, ContainerVT, V1,
                DAG.getConstant(SplatIndex, DL, XLenVT), TrueMask, VL);
} else {
SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
LHSIndices =
    convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget);

Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
                     TrueMask, VL);
}
}

// If a second vector operand is used by this shuffle, blend it in with an
// additional vrgather.
if (!V2.isUndef()) {
V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
// If only one index is used, we can use a "splat" vrgather.
// TODO: We can splat the most-common index and fix-up any stragglers, if
// that's beneficial.
if (RHSIndexCounts.size() == 1) {
int SplatIndex = RHSIndexCounts.begin()->getFirst();
V2 = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2,
                 DAG.getConstant(SplatIndex, DL, XLenVT), TrueMask, VL);
} else {
SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
RHSIndices =
    convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);
V2 = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, TrueMask,
                 VL);
}

MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1);
SelectMask =
    convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget);

SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS);
RHSIndices =
    convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget);

V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
V2 = DAG.getNode(GatherOpc, DL, ContainerVT, V2, RHSIndices, TrueMask, VL);
Gather = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, SelectMask, V2,
                     Gather, VL);
}
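As a reference model of what the lowered sequence computes, here is a standalone C++ sketch on plain arrays (purely illustrative; data values and the mask are made up from the splat_ve2_we0 test): gather each source, using a single scalar index when its half is a splat, then blend the RHS result over the LHS result under the select mask, as the final VSELECT_VL does.

#include <cstdio>
#include <vector>

// vrgather.vv semantics on plain data: Out[i] = Src[Idx[i]]; undef indices
// (modeled as -1) may produce any value, 0 here.
std::vector<int> gatherVV(const std::vector<int> &Src,
                          const std::vector<int> &Idx) {
  std::vector<int> Out(Idx.size());
  for (size_t i = 0; i < Idx.size(); ++i)
    Out[i] = Idx[i] < 0 ? 0 : Src[Idx[i]];
  return Out;
}

// vrgather.vi/.vx semantics: every lane reads the same source element.
std::vector<int> gatherVX(const std::vector<int> &Src, int SplatIndex,
                          size_t Len) {
  return std::vector<int>(Len, Src[SplatIndex]);
}

int main() {
  std::vector<int> V1 = {10, 11, 12, 13, 14, 15, 16, 17};
  std::vector<int> V2 = {20, 21, 22, 23, 24, 25, 26, 27};
  // Shuffle mask <2,8,2,2,2,2,8,2>: lanes 1 and 6 take the second source,
  // every other lane takes element 2 of the first source.
  std::vector<bool> SelectMask = {false, true,  false, false,
                                  false, false, true,  false};
  // General path: index-vector gather of the first source (undef on RHS lanes).
  std::vector<int> Gather = gatherVV(V1, {2, -1, 2, 2, 2, 2, -1, 2});
  // Splat path for the second source: only index 0 is used, so one scalar
  // index (a vrgather.vi) is enough.
  std::vector<int> RHS = gatherVX(V2, 0, 8);
  // Blend under the select mask, as the final VSELECT_VL does.
  for (size_t i = 0; i < Gather.size(); ++i)
    if (SelectMask[i])
      Gather[i] = RHS[i];
  for (int X : Gather)
    std::printf("%d ", X); // 12 20 12 12 12 12 20 12
  std::printf("\n");
}

The vrgather.vv path and the vrgather.vi shortcut produce the same lanes whenever a half uses only one index, which is exactly the LHSIndexCounts.size() == 1 / RHSIndexCounts.size() == 1 test above.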
@@ -1314,6 +1314,18 @@ foreach vti = AllIntegerVectors in {
vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;

def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
                                         (riscv_vrgather_vx_vl
                                            vti.RegClass:$rs2,
                                            uimm5:$imm,
                                            (vti.Mask true_mask),
                                            VLOpFrag),
                                         vti.RegClass:$merge,
                                         VLOpFrag)),
          (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
               vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
               vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;

// emul = lmul * 16 / sew
defvar vlmul = vti.LMul;
defvar octuple_lmul = vlmul.octuple;
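The new pattern above folds a vselect whose true operand is an unmasked splat-index gather into a single masked PseudoVRGATHER_VI. A small reference model of why the fold is sound (made-up values, plain C++ rather than TableGen):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Rs2 = {40, 41, 42, 43};           // gather source ($rs2)
  std::vector<int> Merge = {0, 1, 2, 3};             // pass-through ($merge)
  std::vector<bool> VM = {true, false, true, false}; // select / gather mask
  const int Imm = 3;                                 // uimm5 splat index

  std::vector<int> Unfolded(4), Folded(4);
  for (int i = 0; i < 4; ++i) {
    // Unfolded: unmasked vrgather.vx under a true mask, then a vselect.
    int Gathered = Rs2[Imm];
    Unfolded[i] = VM[i] ? Gathered : Merge[i];
    // Folded: masked vrgather.vi writing over the merge operand; inactive
    // lanes keep the merge value, so the results agree lane for lane.
    Folded[i] = VM[i] ? Rs2[Imm] : Merge[i];
  }
  for (int i = 0; i < 4; ++i)
    std::printf("%d/%d ", Unfolded[i], Folded[i]); // 43/43 1/1 43/43 3/3
  std::printf("\n");
}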
@@ -1385,6 +1397,18 @@ foreach vti = AllFloatVectors in {
vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1,
vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;

def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask VMV0:$vm),
                                         (riscv_vrgather_vx_vl
                                            vti.RegClass:$rs2,
                                            uimm5:$imm,
                                            (vti.Mask true_mask),
                                            VLOpFrag),
                                         vti.RegClass:$merge,
                                         VLOpFrag)),
          (!cast<Instruction>("PseudoVRGATHER_VI_"# vti.LMul.MX#"_MASK")
               vti.RegClass:$merge, vti.RegClass:$rs2, uimm5:$imm,
               vti.Mask:$vm, GPR:$vl, vti.Log2SEW)>;

defvar vlmul = vti.LMul;
defvar octuple_lmul = vlmul.octuple;
defvar octuple_emul = !srl(!mul(octuple_lmul, 16), vti.Log2SEW);
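Both the integer and float pattern blocks compute the index-vector register group for vrgatherei16 as emul = lmul * 16 / sew. A worked sketch of the octuple arithmetic used above, where LMUL is tracked as 8*LMUL so fractional values stay integral and the division becomes a right shift by Log2SEW (the example configurations are illustrative):

#include <cstdio>

int main() {
  struct Cfg { unsigned OctupleLMul; unsigned Log2SEW; };
  const Cfg Cases[] = {
      {8, 6},  // LMUL=1,   SEW=64 -> EMUL=1/4 (octuple 2)
      {16, 5}, // LMUL=2,   SEW=32 -> EMUL=1   (octuple 8)
      {4, 4},  // LMUL=1/2, SEW=16 -> EMUL=1/2 (octuple 4)
  };
  for (const Cfg &C : Cases) {
    // Mirrors !srl(!mul(octuple_lmul, 16), vti.Log2SEW) in the patterns above.
    unsigned OctupleEMul = (C.OctupleLMul * 16) >> C.Log2SEW;
    std::printf("octuple_lmul=%u, sew=%u -> octuple_emul=%u\n", C.OctupleLMul,
                1u << C.Log2SEW, OctupleEMul);
  }
}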
@@ -57,28 +57,25 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x,
;
; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization:
; LMULMAX2: # %bb.0:
; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX2-NEXT: vmv.v.i v25, 0
; LMULMAX2-NEXT: vrgather.vv v26, v8, v25
; LMULMAX2-NEXT: addi a0, zero, 2
; LMULMAX2-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; LMULMAX2-NEXT: vmv.s.x v0, a0
; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX2-NEXT: vmv.v.i v27, 3
; LMULMAX2-NEXT: vrgather.vi v25, v8, 0
; LMULMAX2-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; LMULMAX2-NEXT: vrgather.vv v26, v9, v27, v0.t
; LMULMAX2-NEXT: vsetvli zero, zero, e32, m1, ta, mu
; LMULMAX2-NEXT: vrgather.vv v28, v10, v25
; LMULMAX2-NEXT: vrgather.vi v25, v9, 3, v0.t
; LMULMAX2-NEXT: addi a0, zero, 8
; LMULMAX2-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; LMULMAX2-NEXT: vmv.s.x v0, a0
; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, tu, mu
; LMULMAX2-NEXT: vrgather.vv v28, v11, v27, v0.t
; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX2-NEXT: vrgather.vi v26, v10, 0
; LMULMAX2-NEXT: vsetvli zero, zero, e32, m1, tu, mu
; LMULMAX2-NEXT: vrgather.vi v26, v11, 3, v0.t
; LMULMAX2-NEXT: addi a0, zero, 3
; LMULMAX2-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; LMULMAX2-NEXT: vmv.s.x v0, a0
; LMULMAX2-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; LMULMAX2-NEXT: vmerge.vvm v8, v28, v26, v0
; LMULMAX2-NEXT: vmerge.vvm v8, v26, v25, v0
; LMULMAX2-NEXT: ret
%z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> <i32 0, i32 7, i32 8, i32 15>
ret <4 x float> %z
@@ -142,10 +142,8 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
; RV32-NEXT: addi a0, zero, 8
; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV32-NEXT: vmv.s.x v0, a0
; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; RV32-NEXT: vmv.v.i v25, 1
; RV32-NEXT: vsetvli zero, zero, e64, m2, tu, mu
; RV32-NEXT: vrgatherei16.vv v26, v10, v25, v0.t
; RV32-NEXT: vsetivli zero, 4, e64, m2, tu, mu
; RV32-NEXT: vrgather.vi v26, v10, 1, v0.t
; RV32-NEXT: vmv2r.v v8, v26
; RV32-NEXT: ret
;
@@ -159,10 +157,8 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y)
; RV64-NEXT: addi a0, zero, 8
; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV64-NEXT: vmv.s.x v0, a0
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu
; RV64-NEXT: vmv.v.i v28, 1
; RV64-NEXT: vsetvli zero, zero, e64, m2, tu, mu
; RV64-NEXT: vrgather.vv v26, v10, v28, v0.t
; RV64-NEXT: vsetivli zero, 4, e64, m2, tu, mu
; RV64-NEXT: vrgather.vi v26, v10, 1, v0.t
; RV64-NEXT: vmv2r.v v8, v26
; RV64-NEXT: ret
%s = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
@@ -93,10 +93,8 @@ define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) {
; CHECK-NEXT: addi a0, zero, 8
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; CHECK-NEXT: vmv.v.i v26, 1
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, mu
; CHECK-NEXT: vrgather.vv v25, v9, v26, v0.t
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, mu
; CHECK-NEXT: vrgather.vi v25, v9, 1, v0.t
; CHECK-NEXT: vmv1r.v v8, v25
; CHECK-NEXT: ret
%s = shufflevector <4 x i16> %x, <4 x i16> %y, <4 x i32> <i32 1, i32 2, i32 0, i32 5>
@@ -388,16 +386,13 @@ define <8 x i8> @splat_ve4_ins_i1ve3(<8 x i8> %v) {
define <8 x i8> @splat_ve2_we0(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: splat_ve2_we0:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vmv.v.i v26, 2
; CHECK-NEXT: vrgather.vv v25, v8, v26
; CHECK-NEXT: addi a0, zero, 66
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vmv.v.i v26, 0
; CHECK-NEXT: vrgather.vi v25, v8, 2
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu
; CHECK-NEXT: vrgather.vv v25, v9, v26, v0.t
; CHECK-NEXT: vrgather.vi v25, v9, 0, v0.t
; CHECK-NEXT: vmv1r.v v8, v25
; CHECK-NEXT: ret
%shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 2, i32 2, i32 2, i32 2, i32 8, i32 2>
@@ -417,10 +412,8 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) {
; CHECK-NEXT: addi a0, zero, 66
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vmv.v.i v26, 0
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu
; CHECK-NEXT: vrgather.vv v25, v9, v26, v0.t
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu
; CHECK-NEXT: vrgather.vi v25, v9, 0, v0.t
; CHECK-NEXT: vmv1r.v v8, v25
; CHECK-NEXT: ret
%shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 4, i32 8, i32 2, i32 2, i32 2, i32 2, i32 8, i32 2>
@@ -430,12 +423,11 @@ define <8 x i8> @splat_ve2_we0_ins_i0ve4(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @splat_ve2_we0_ins_i0we4(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: splat_ve2_we0_ins_i0we4:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vmv.v.i v26, 2
; CHECK-NEXT: vrgather.vv v25, v8, v26
; CHECK-NEXT: addi a0, zero, 67
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vrgather.vi v25, v8, 2
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
; CHECK-NEXT: vmv.v.i v26, 4
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu
@@ -458,10 +450,8 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
; RV32-NEXT: addi a0, zero, 66
; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV32-NEXT: vmv.s.x v0, a0
; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; RV32-NEXT: vmv.v.i v26, 0
; RV32-NEXT: vsetvli zero, zero, e8, mf2, tu, mu
; RV32-NEXT: vrgather.vv v25, v9, v26, v0.t
; RV32-NEXT: vsetivli zero, 8, e8, mf2, tu, mu
; RV32-NEXT: vrgather.vi v25, v9, 0, v0.t
; RV32-NEXT: vmv1r.v v8, v25
; RV32-NEXT: ret
;
@@ -476,10 +466,8 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
; RV64-NEXT: addi a0, zero, 66
; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; RV64-NEXT: vmv.s.x v0, a0
; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; RV64-NEXT: vmv.v.i v26, 0
; RV64-NEXT: vsetvli zero, zero, e8, mf2, tu, mu
; RV64-NEXT: vrgather.vv v25, v9, v26, v0.t
; RV64-NEXT: vsetivli zero, 8, e8, mf2, tu, mu
; RV64-NEXT: vrgather.vi v25, v9, 0, v0.t
; RV64-NEXT: vmv1r.v v8, v25
; RV64-NEXT: ret
%shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 4, i32 2, i32 2, i32 2, i32 8, i32 2>
@@ -489,19 +477,19 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4(<8 x i8> %v, <8 x i8> %w) {
define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
; CHECK-LABEL: splat_ve2_we0_ins_i2we4:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vmv.v.i v26, 2
; CHECK-NEXT: vrgather.vv v25, v8, v26
; CHECK-NEXT: addi a0, zero, 4
; CHECK-NEXT: vmv.s.x v26, a0
; CHECK-NEXT: vmv.v.i v27, 0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vmv.s.x v25, a0
; CHECK-NEXT: vmv.v.i v26, 0
; CHECK-NEXT: vsetivli zero, 3, e8, mf2, tu, mu
; CHECK-NEXT: vslideup.vi v27, v26, 2
; CHECK-NEXT: vslideup.vi v26, v25, 2
; CHECK-NEXT: addi a0, zero, 70
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
; CHECK-NEXT: vmv.s.x v0, a0
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, tu, mu
; CHECK-NEXT: vrgather.vv v25, v9, v27, v0.t
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vrgather.vi v25, v8, 2
; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, mu
; CHECK-NEXT: vrgather.vv v25, v9, v26, v0.t
; CHECK-NEXT: vmv1r.v v8, v25
; CHECK-NEXT: ret
%shuff = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 2, i32 8, i32 12, i32 2, i32 2, i32 2, i32 8, i32 2>