[X86] Fold CONCAT(VPERMV3(X,Y,M0),VPERMV3(Z,W,M1)) -> VPERMV3(CONCAT(X,Z),CONCAT(Y,W),CONCAT(M0,M1))

Further prep work toward supporting different subvector sizes in combineX86ShufflesRecursively
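As an illustration of the mask handling (a standalone sketch, not code from the patch; the helper name and driver are made up), the fold remaps each subvector's VPERMV3 indices into the index space of the concatenated node: indices below the subvector width keep selecting the first source, indices into the second source are shifted past both halves of the first concatenated source, and the second subvector's indices are additionally offset by the subvector width:

// Standalone sketch, assuming two equal-sized subvector permutes (NumOps == 2).
#include <cstdio>
#include <vector>

std::vector<int> concatVPERMV3Masks(const std::vector<int> &M0,
                                    const std::vector<int> &M1,
                                    int NumSrcElts) {
  std::vector<int> ConcatMask;
  const std::vector<int> *SubMasks[2] = {&M0, &M1};
  for (int i = 0; i != 2; ++i) {
    for (int M : *SubMasks[i]) {
      if (0 <= M) {
        M += M < NumSrcElts ? 0 : NumSrcElts; // second-source index -> upper bank
        M += i * NumSrcElts;                  // pick subvector i's half of each source
      }
      ConcatMask.push_back(M); // negative (undef) indices pass through unchanged
    }
  }
  return ConcatMask;
}

int main() {
  // The two 4 x i64 masks from the updated tests below.
  for (int M : concatVPERMV3Masks({0, 4, 1, 5}, {2, 6, 3, 7}, 4))
    printf("%d ", M); // prints: 0 8 1 9 6 14 7 15
  printf("\n");
}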
Simon Pilgrim
2020-12-08 18:35:23 +00:00
parent 6f13445fb6
commit 24184dbb82
2 changed files with 68 additions and 42 deletions

@@ -48813,6 +48813,38 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
      return DAG.getBitcast(VT, Res);
    }
    break;
  case X86ISD::VPERMV3:
    if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
      MVT OpVT = Op0.getSimpleValueType();
      int NumSrcElts = OpVT.getVectorNumElements();
      SmallVector<int, 64> ConcatMask;
      for (unsigned i = 0; i != NumOps; ++i) {
        bool IsUnary;
        SmallVector<int, 64> SubMask;
        SmallVector<SDValue, 2> SubOps;
        if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
                                  SubMask, IsUnary))
          break;
        for (int M : SubMask) {
          if (0 <= M) {
            M += M < NumSrcElts ? 0 : NumSrcElts;
            M += i * NumSrcElts;
          }
          ConcatMask.push_back(M);
        }
      }
      if (ConcatMask.size() == (NumOps * NumSrcElts)) {
        SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
                                        Ops[1].getOperand(0), DAG, DL);
        SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
                                        Ops[1].getOperand(2), DAG, DL);
        MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
        MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
        SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
        return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
      }
    }
    break;
  case X86ISD::VSHLI:
  case X86ISD::VSRAI:
  case X86ISD::VSRLI:
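
In the regenerated tests below this plays out concretely: two 256-bit vpermi2q ops with 4 x i64 masks [0,4,1,5] and [2,6,3,7], whose results were then concatenated, become a single 512-bit vpermi2q with mask [0,8,1,9,6,14,7,15] (second-source indices gain +4, and the second subvector's indices gain a further +4). Because both original permutes read the same pair of ymm registers, the concatenated sources are simply each ymm duplicated into both halves of a zmm, which is what the new vinserti64x4 $1, %ymmN, %zmmN, ... instructions do.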

@@ -145,11 +145,10 @@ define <32 x i16> @concat_trunc_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) nou
; AVX512-NEXT: vpsrad $23, %zmm1, %zmm1
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
; AVX512-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
; AVX512-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
; AVX512-NEXT: retq
%1 = ashr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
%2 = ashr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
@@ -166,11 +165,10 @@ define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nou
; AVX512-NEXT: vpsrld $23, %zmm1, %zmm1
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
; AVX512-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
; AVX512-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
; AVX512-NEXT: retq
%1 = lshr <16 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
%2 = lshr <16 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
@@ -183,26 +181,25 @@ define <32 x i16> @concat_trunc_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) nou
define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512F-LABEL: concat_trunc_packsswb_512:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: concat_trunc_packsswb_512:
@@ -211,11 +208,10 @@ define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
; AVX512BW-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
; AVX512BW-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
; AVX512BW-NEXT: retq
%1 = ashr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
@@ -228,26 +224,25 @@ define <64 x i8> @concat_trunc_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) noun
define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512F-LABEL: concat_trunc_packuswb_512:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: concat_trunc_packuswb_512:
@@ -256,11 +251,10 @@ define <64 x i8> @concat_trunc_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) noun
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,3,7]
; AVX512BW-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5]
; AVX512BW-NEXT: vpermi2q %ymm1, %ymm0, %ymm3
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,8,1,9,6,14,7,15]
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm0
; AVX512BW-NEXT: retq
%1 = lshr <32 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
%2 = and <32 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>