From ae33cbc49408c90cef0b1246a7bae59bd467c93b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 15 Jun 2020 15:30:06 +0100 Subject: [PATCH] [X86][SSE] LowerVectorAllZeroTest - add support for >256-bit vectors Reduce by splitting the vector until we reach the target size for PTEST/MOVMSK_PCMPEQ. There might be some cases where AVX512 can perform this with 512-bit vectors but so far I haven't encountered any such pattern that reaches LowerVectorAllZeroTest. Prep work for D81547 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 18 +++++-- llvm/test/CodeGen/X86/ptest.ll | 63 ++++--------------------- 2 files changed, 23 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 83bc40ace2ab..b80c94661d74 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -21366,9 +21366,9 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, }) && "Reduction source vector mismatch"); - // Quit if not 128/256-bit vector. + // Quit if less than 128-bits or not splittable to 128/256-bit vector. EVT VT = VecIns[0].getValueType(); - if (!VT.is128BitVector() && !VT.is256BitVector()) + if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits())) return SDValue(); SDLoc DL(Op); @@ -21382,18 +21382,28 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS)); } + SDValue V = VecIns.back(); + + // Split down to 128/256-bit vector. + unsigned TestSize = Subtarget.hasAVX()? 256 : 128; + while (VT.getSizeInBits() > TestSize) { + auto Split = DAG.SplitVector(V, DL); + VT = Split.first.getValueType(); + V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second); + } + X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL, MVT::i8); bool UsePTEST = Subtarget.hasSSE41(); if (UsePTEST) { MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; - SDValue V = DAG.getBitcast(TestVT, VecIns.back()); + V = DAG.getBitcast(TestVT, V); return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V); } SDValue Result = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, - DAG.getBitcast(MVT::v16i8, VecIns.back()), + DAG.getBitcast(MVT::v16i8, V), getZeroVector(MVT::v16i8, Subtarget, DAG, DL)); Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll index 13ca7195bca3..a93a03fea670 100644 --- a/llvm/test/CodeGen/X86/ptest.ll +++ b/llvm/test/CodeGen/X86/ptest.ll @@ -148,24 +148,9 @@ define i32 @veccond512(<16 x i32> %input) { ; ; AVX512-LABEL: veccond512: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-NEXT: vmovq %xmm2, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512-NEXT: vmovq %xmm3, %rax -; AVX512-NEXT: orq %rcx, %rax -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: vpextrq $1, %xmm3, %rax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: orq %rcx, %rdx +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: je .LBB2_2 ; AVX512-NEXT: # %bb.1: # %if-true-block ; AVX512-NEXT: xorl %eax, %eax @@ -283,25 +268,10 @@ define i32 @vectest512(<16 x i32> %input) { ; ; AVX512-LABEL: vectest512: ; AVX512: # %bb.0: -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-NEXT: vmovq %xmm2, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512-NEXT: vmovq %xmm3, %rax -; AVX512-NEXT: orq %rcx, %rax -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %rax -; AVX512-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: vpextrq $1, %xmm3, %rax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: orq %rax, %rdx +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: orq %rcx, %rdx +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -410,24 +380,9 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) { ; AVX512-LABEL: vecsel512: ; AVX512: # %bb.0: ; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vmovq %xmm1, %rcx -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512-NEXT: vmovq %xmm2, %rdx -; AVX512-NEXT: orq %rcx, %rdx -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; AVX512-NEXT: vmovq %xmm3, %rcx -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: vmovq %xmm0, %rdx -; AVX512-NEXT: orq %rcx, %rdx -; AVX512-NEXT: vpextrq $1, %xmm1, %rcx -; AVX512-NEXT: vpextrq $1, %xmm2, %rdi -; AVX512-NEXT: orq %rcx, %rdi -; AVX512-NEXT: vpextrq $1, %xmm3, %rcx -; AVX512-NEXT: orq %rdi, %rcx -; AVX512-NEXT: vpextrq $1, %xmm0, %rdi -; AVX512-NEXT: orq %rcx, %rdi -; AVX512-NEXT: orq %rdx, %rdi +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vptest %ymm0, %ymm0 ; AVX512-NEXT: cmovel %esi, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq