mirror of https://github.com/intel/llvm.git
[RISCV] Use v(f)slide1up for shuffle+insert idiom
This is pretty straightforward in the basic form. I did need to move the slideup matching earlier, but that looks generally profitable on its own. As follow-ups, I plan to explore the v(f)slide1down variants and see what I can do to canonicalize the shuffle-then-insert pattern (see the _inverse tests at the end of the vslide1up.ll test).

Differential Revision: https://reviews.llvm.org/D151468
committed by Philip Reames
parent 891fad0448
commit 544a240ff7
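For context, the idiom being matched is a scalar placed (or splatted) into lane 0, followed by a shuffle that slides the existing vector up by one lane, producing [b, v0, v1, ...]. A minimal LLVM IR sketch of the pattern (illustrative only; the function name and exact shuffle mask are reconstructed from the tests in the diff below, not copied verbatim from the committed test file):

define <4 x i8> @slide1up_idiom(<4 x i8> %v, i8 %b) {
  ; Splat the scalar, then shuffle it in front of the first three lanes of %v.
  %vb = insertelement <4 x i8> poison, i8 %b, i64 0
  %splat = shufflevector <4 x i8> %vb, <4 x i8> poison, <4 x i32> zeroinitializer
  %res = shufflevector <4 x i8> %splat, <4 x i8> %v, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
  ret <4 x i8> %res
}

As the updated CHECK lines below show (e.g. vslide1up_4xi8), such a pattern now lowers to a single vslide1up.vx (vfslide1up.vf for floating-point element types) instead of a vmv.v.x splat followed by a vslideup.vi. The diff touches lowerVECTOR_SHUFFLEAsVSlideup in the RISC-V lowering code and updates the affected tests, including vslide1up.ll mentioned in the commit message.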
@@ -3731,6 +3731,20 @@ static SDValue lowerVECTOR_SHUFFLEAsVSlideup(const SDLoc &DL, MVT VT,
  MVT XLenVT = Subtarget.getXLenVT();
  MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
  auto TrueMask = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).first;
  if (Index == 1 && NumSubElts + Index == (int)NumElts &&
      isa<BuildVectorSDNode>(InPlace)) {
    if (SDValue Splat = cast<BuildVectorSDNode>(InPlace)->getSplatValue()) {
      auto OpCode =
          VT.isFloatingPoint() ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VSLIDE1UP_VL;
      auto Vec = DAG.getNode(OpCode, DL, ContainerVT,
                             DAG.getUNDEF(ContainerVT),
                             convertToScalableVector(ContainerVT, ToInsert, DAG, Subtarget),
                             Splat, TrueMask,
                             DAG.getConstant(NumSubElts + Index, DL, XLenVT));
      return convertFromScalableVector(VT, Vec, DAG, Subtarget);
    }
  }

  // We slide up by the index that the subvector is being inserted at, and set
  // VL to the index + the number of elements being inserted.
  unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED | RISCVII::MASK_AGNOSTIC;
@@ -3967,6 +3981,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                       Subtarget, DAG);
  }

  if (SDValue V =
          lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Detect an interleave shuffle and lower to
  // (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1))
  int EvenSrc, OddSrc;
@@ -3989,10 +4007,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
    return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
  }

  if (SDValue V =
          lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Detect shuffles which can be re-expressed as vector selects; these are
  // shuffles in which each element in the destination is taken from an element
  // at the corresponding index in either source vectors.

@@ -171,11 +171,8 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) {
define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) {
; CHECK-LABEL: trn1.v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT: vwaddu.vv v10, v8, v9
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
  %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> <i32 0, i32 2>
  ret <2 x i32> %tmp0
@@ -256,11 +253,8 @@ define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) {
define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) {
; CHECK-LABEL: trn1.v2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT: vwaddu.vv v10, v8, v9
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v10, a0, v9
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: ret
  %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> <i32 0, i32 2>
  ret <2 x float> %tmp0

@@ -8,11 +8,7 @@ define <2 x i8> @vslide1up_2xi8(<2 x i8> %v, i8 %b) {
; CHECK-LABEL: vslide1up_2xi8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.x v10, a0
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vwaddu.vv v9, v10, v8
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v9, a0, v8
; CHECK-NEXT: vslide1up.vx v9, v8, a0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
  %vb = insertelement <2 x i8> poison, i8 %b, i64 0
@@ -33,8 +29,7 @@ define <4 x i8> @vslide1up_4xi8(<4 x i8> %v, i8 %b) {
; RV64-LABEL: vslide1up_4xi8:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; RV64-NEXT: vmv.v.x v9, a0
; RV64-NEXT: vslideup.vi v9, v8, 1
; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
  %vb = insertelement <4 x i8> poison, i8 %b, i64 0
@@ -55,8 +50,7 @@ define <4 x i8> @vslide1up_4xi8_swapped(<4 x i8> %v, i8 %b) {
; RV64-LABEL: vslide1up_4xi8_swapped:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; RV64-NEXT: vmv.v.x v9, a0
; RV64-NEXT: vslideup.vi v9, v8, 1
; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
  %vb = insertelement <4 x i8> poison, i8 %b, i64 0
@@ -68,22 +62,16 @@ define <2 x i16> @vslide1up_2xi16(<2 x i16> %v, i16 %b) {
; RV32-LABEL: vslide1up_2xi16:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; RV32-NEXT: vmv.s.x v10, a0
; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; RV32-NEXT: vwaddu.vv v9, v10, v8
; RV32-NEXT: li a0, -1
; RV32-NEXT: vwmaccu.vx v9, a0, v8
; RV32-NEXT: vmv.s.x v9, a0
; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; RV32-NEXT: vslideup.vi v9, v8, 1
; RV32-NEXT: vmv1r.v v8, v9
; RV32-NEXT: ret
;
; RV64-LABEL: vslide1up_2xi16:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; RV64-NEXT: vmv.v.x v10, a0
; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; RV64-NEXT: vwaddu.vv v9, v10, v8
; RV64-NEXT: li a0, -1
; RV64-NEXT: vwmaccu.vx v9, a0, v8
; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
  %vb = insertelement <2 x i16> poison, i16 %b, i64 0
@@ -95,8 +83,7 @@ define <4 x i16> @vslide1up_4xi16(<4 x i16> %v, i16 %b) {
; RV32-LABEL: vslide1up_4xi16:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; RV32-NEXT: vmv.v.x v9, a0
; RV32-NEXT: vslideup.vi v9, v8, 1
; RV32-NEXT: vslide1up.vx v9, v8, a0
; RV32-NEXT: vmv1r.v v8, v9
; RV32-NEXT: ret
;
@@ -117,22 +104,16 @@ define <2 x i32> @vslide1up_2xi32(<2 x i32> %v, i32 %b) {
; RV32-LABEL: vslide1up_2xi32:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV32-NEXT: vmv.v.x v10, a0
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; RV32-NEXT: vwaddu.vv v9, v10, v8
; RV32-NEXT: li a0, -1
; RV32-NEXT: vwmaccu.vx v9, a0, v8
; RV32-NEXT: vslide1up.vx v9, v8, a0
; RV32-NEXT: vmv1r.v v8, v9
; RV32-NEXT: ret
;
; RV64-LABEL: vslide1up_2xi32:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT: vmv.s.x v10, a0
; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; RV64-NEXT: vwaddu.vv v9, v10, v8
; RV64-NEXT: li a0, -1
; RV64-NEXT: vwmaccu.vx v9, a0, v8
; RV64-NEXT: vmv.s.x v9, a0
; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; RV64-NEXT: vslideup.vi v9, v8, 1
; RV64-NEXT: vmv1r.v v8, v9
; RV64-NEXT: ret
  %vb = insertelement <2 x i32> poison, i32 %b, i64 0
@@ -144,8 +125,7 @@ define <4 x i32> @vslide1up_4xi32(<4 x i32> %v, i32 %b) {
; CHECK-LABEL: vslide1up_4xi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vslide1up.vx v9, v8, a0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
  %vb = insertelement <4 x i32> poison, i32 %b, i64 0
@@ -171,8 +151,7 @@ define <2 x i64> @vslide1up_2xi64(<2 x i64> %v, i64 %b) {
; RV64-LABEL: vslide1up_2xi64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vmv.v.x v9, a0
; RV64-NEXT: vslideup.vi v9, v8, 1
; RV64-NEXT: vslide1up.vx v9, v8, a0
; RV64-NEXT: vmv.v.v v8, v9
; RV64-NEXT: ret
  %vb = insertelement <2 x i64> poison, i64 %b, i64 0
@@ -198,8 +177,7 @@ define <4 x i64> @vslide1up_4xi64(<4 x i64> %v, i64 %b) {
; RV64-LABEL: vslide1up_4xi64:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vmv.v.x v10, a0
; RV64-NEXT: vslideup.vi v10, v8, 1
; RV64-NEXT: vslide1up.vx v10, v8, a0
; RV64-NEXT: vmv.v.v v8, v10
; RV64-NEXT: ret
  %vb = insertelement <4 x i64> poison, i64 %b, i64 0
@@ -211,11 +189,7 @@ define <2 x half> @vslide1up_2xf16(<2 x half> %v, half %b) {
; CHECK-LABEL: vslide1up_2xf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vfmv.v.f v10, fa0
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vwaddu.vv v9, v10, v8
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v9, a0, v8
; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
  %vb = insertelement <2 x half> poison, half %b, i64 0
@@ -227,8 +201,7 @@ define <4 x half> @vslide1up_4xf16(<4 x half> %v, half %b) {
; CHECK-LABEL: vslide1up_4xf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
  %vb = insertelement <4 x half> poison, half %b, i64 0
@@ -240,11 +213,7 @@ define <2 x float> @vslide1up_2xf32(<2 x float> %v, float %b) {
; CHECK-LABEL: vslide1up_2xf32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT: vfmv.v.f v10, fa0
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT: vwaddu.vv v9, v10, v8
; CHECK-NEXT: li a0, -1
; CHECK-NEXT: vwmaccu.vx v9, a0, v8
; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
  %vb = insertelement <2 x float> poison, float %b, i64 0
@@ -256,8 +225,7 @@ define <4 x float> @vslide1up_4xf32(<4 x float> %v, float %b) {
; CHECK-LABEL: vslide1up_4xf32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
  %vb = insertelement <4 x float> poison, float %b, i64 0
@@ -269,8 +237,7 @@ define <2 x double> @vslide1up_2xf64(<2 x double> %v, double %b) {
; CHECK-LABEL: vslide1up_2xf64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT: vfmv.v.f v9, fa0
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vfslide1up.vf v9, v8, fa0
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
  %vb = insertelement <2 x double> poison, double %b, i64 0
@@ -291,6 +258,24 @@ define <4 x double> @vslide1up_4xf64(<4 x double> %v, double %b) {
  ret <4 x double> %v1
}

define <4 x i8> @vslide1up_4xi8_with_splat(<4 x i8> %v, i8 %b) {
; CHECK-LABEL: vslide1up_4xi8_with_splat:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 14
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT: vmv.s.x v0, a1
; CHECK-NEXT: vid.v v9
; CHECK-NEXT: vadd.vi v10, v9, -1
; CHECK-NEXT: vmv.v.x v9, a0
; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t
; CHECK-NEXT: vmv1r.v v8, v9
; CHECK-NEXT: ret
  %vb = insertelement <4 x i8> poison, i8 %b, i64 0
  %v1 = shufflevector <4 x i8> %vb, <4 x i8> poison, <4 x i32> zeroinitializer
  %v2 = shufflevector <4 x i8> %v1, <4 x i8> %v, <4 x i32> <i32 1, i32 4, i32 5, i32 6>
  ret <4 x i8> %v2
}

define <2 x double> @vslide1up_v2f64_inverted(<2 x double> %v, double %b) {
; CHECK-LABEL: vslide1up_v2f64_inverted:
; CHECK: # %bb.0:
@@ -320,7 +305,8 @@ define <4 x i8> @vslide1up_4xi8_inverted(<4 x i8> %v, i8 %b) {
}

; The length of the shift is less than the suffix
; The length of the shift is less than the suffix, since we'd have to
; materialize the splat, using the vslide1up doesn't help us.
define <4 x i32> @vslide1up_4xi32_neg1(<4 x i32> %v, i32 %b) {
; CHECK-LABEL: vslide1up_4xi32_neg1:
; CHECK: # %bb.0:
@@ -335,3 +321,15 @@ define <4 x i32> @vslide1up_4xi32_neg1(<4 x i32> %v, i32 %b) {
  %v1 = shufflevector <4 x i32> %v, <4 x i32> %vb2, <4 x i32> <i32 4, i32 0, i32 1, i32 7>
  ret <4 x i32> %v1
}

; We don't know the scalar to do the vslide1up
define <4 x i32> @vslide1up_4xi32_neg2(<4 x i32> %v1, <4 x i32> %v2) {
; CHECK-LABEL: vslide1up_4xi32_neg2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vi v9, v8, 1
; CHECK-NEXT: vmv.v.v v8, v9
; CHECK-NEXT: ret
  %res = shufflevector <4 x i32> %v1, <4 x i32> %v2, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
  ret <4 x i32> %res
}