diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index da04880348af..cd441638bac9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5352,8 +5352,24 @@ static bool isLocalRepeatingShuffle(ArrayRef<int> Mask, int Span) {
 
 /// Is this mask only using elements from the first span of the input?
 static bool isLowSourceShuffle(ArrayRef<int> Mask, int Span) {
-  return all_of(Mask,
-                [&](const auto &Idx) { return Idx == -1 || Idx < Span; });
+  return all_of(Mask, [&](const auto &Idx) { return Idx == -1 || Idx < Span; });
+}
+
+/// Return true for a mask which performs an arbitrary shuffle within the first
+/// span, and then repeats that same result across all remaining spans. Note
+/// that this doesn't check if all the inputs come from a single span!
+static bool isSpanSplatShuffle(ArrayRef<int> Mask, int Span) {
+  SmallVector<int> LowSpan(Span, -1);
+  for (auto [I, M] : enumerate(Mask)) {
+    if (M == -1)
+      continue;
+    int SpanIdx = I % Span;
+    if (LowSpan[SpanIdx] == -1)
+      LowSpan[SpanIdx] = M;
+    if (LowSpan[SpanIdx] != M)
+      return false;
+  }
+  return true;
 }
 
 /// Try to widen element type to get a new mask value for a better permutation
@@ -5771,6 +5787,35 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
         Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather,
                              SubVec, SubIdx);
       }
+    } else if (NumElts > MinVLMAX && isLowSourceShuffle(Mask, MinVLMAX) &&
+               isSpanSplatShuffle(Mask, MinVLMAX)) {
+      // If we have a shuffle which only uses the first register in our source
+      // register group, and repeats the same index across all spans, we can
+      // use a single vrgather (and possibly some register moves).
+      // TODO: This can be generalized for m2 or m4, or for any shuffle for
+      // which we can do a linear number of shuffles to form an m1 which
+      // contains all the output elements.
+      const MVT M1VT = getLMUL1VT(ContainerVT);
+      EVT SubIndexVT = M1VT.changeVectorElementType(IndexVT.getScalarType());
+      auto [InnerTrueMask, InnerVL] =
+          getDefaultScalableVLOps(M1VT, DL, DAG, Subtarget);
+      int N = ContainerVT.getVectorMinNumElements() /
+              M1VT.getVectorMinNumElements();
+      assert(isPowerOf2_32(N) && N <= 8);
+      SDValue SubV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, V1,
+                                  DAG.getVectorIdxConstant(0, DL));
+      SDValue SubIndex =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubIndexVT, LHSIndices,
+                      DAG.getVectorIdxConstant(0, DL));
+      SDValue SubVec = DAG.getNode(GatherVVOpc, DL, M1VT, SubV1, SubIndex,
+                                   DAG.getUNDEF(M1VT), InnerTrueMask, InnerVL);
+      Gather = DAG.getUNDEF(ContainerVT);
+      for (int i = 0; i < N; i++) {
+        SDValue SubIdx =
+            DAG.getVectorIdxConstant(M1VT.getVectorMinNumElements() * i, DL);
+        Gather = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Gather,
+                             SubVec, SubIdx);
+      }
     } else if (NumElts > MinVLMAX && isLowSourceShuffle(Mask, MinVLMAX)) {
       // If we have a shuffle which only uses the first register in our
       // source register group, we can do a linear number of m1 vrgathers
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index d7120b4a1693..3e31c9de6165 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -1311,22 +1311,14 @@ define void @shuffle_i128_splat(ptr %p) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    lui a2, 16
-; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    lui a1, 16
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a2
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v10, v9, a1
-; CHECK-NEXT:    vslidedown.vx v11, v10, a1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v13, v8, v10
-; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
-; CHECK-NEXT:    vrgatherei16.vv v14, v8, v11
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v9, v11, a1
+; CHECK-NEXT:    vmv.v.x v9, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v15, v8, v9
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
+; CHECK-NEXT:    vmv.v.v v13, v12
+; CHECK-NEXT:    vmv.v.v v14, v12
+; CHECK-NEXT:    vmv.v.v v15, v12
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vse64.v v12, (a0)
 ; CHECK-NEXT:    ret
@@ -1435,3 +1427,20 @@ define <4 x i16> @vmerge_3(<4 x i16> %x) {
   %s = shufflevector <4 x i16> %x, <4 x i16> , <4 x i32> 
   ret <4 x i16> %s
 }
+
+
+define <8 x i64> @shuffle_v8i64_span_splat(<8 x i64> %a) nounwind {
+; CHECK-LABEL: shuffle_v8i64_span_splat:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v9, 1
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
+; CHECK-NEXT:    vmv.v.v v13, v12
+; CHECK-NEXT:    vmv.v.v v14, v12
+; CHECK-NEXT:    vmv.v.v v15, v12
+; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    ret
+  %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> <i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0>
+  ret <8 x i64> %res
+}
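---

A note for reviewers: below is a minimal standalone sketch of the new isSpanSplatShuffle predicate, so it can be exercised without an LLVM build. It is an approximation under stated assumptions, not the patch itself: std::vector stands in for llvm::ArrayRef/llvm::SmallVector, a plain index loop replaces llvm::enumerate, and the masks come from the tests above (-1 marks an undef lane, as in LLVM shuffle masks).

// Standalone sketch of isSpanSplatShuffle; assumes no LLVM headers.
#include <cassert>
#include <cstddef>
#include <vector>

// True if the mask shuffles arbitrarily within the first span of Span
// elements and then repeats that same result in every remaining span.
static bool isSpanSplatShuffle(const std::vector<int> &Mask, int Span) {
  std::vector<int> LowSpan(Span, -1);
  for (std::size_t I = 0; I < Mask.size(); ++I) {
    int M = Mask[I];
    if (M == -1) // Undef lanes are compatible with any repeated value.
      continue;
    int SpanIdx = static_cast<int>(I % Span);
    if (LowSpan[SpanIdx] == -1)
      LowSpan[SpanIdx] = M;
    if (LowSpan[SpanIdx] != M)
      return false;
  }
  return true;
}

int main() {
  // The mask from shuffle_v8i64_span_splat: at e64 with VLEN=128 each m1
  // span holds 2 elements, and the pattern [1, 0] repeats in every span.
  assert(isSpanSplatShuffle({1, 0, 1, 0, 1, 0, 1, 0}, 2));
  // Undef lanes do not break the repetition.
  assert(isSpanSplatShuffle({1, -1, 1, 0, 1, 0, -1, 0}, 2));
  // Not a span splat: the second span permutes differently than the first.
  assert(!isSpanSplatShuffle({1, 0, 0, 1, 1, 0, 1, 0}, 2));
  return 0;
}

As in the patch, the predicate deliberately does not check where the indices come from; the lowering path also requires isLowSourceShuffle, so that all inputs lie within the first span of the source register group.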