[X86][AVX] lowerV2X128Shuffle - attempt to recognise broadcastf128 subvector load

As noticed on PR50053, we were failing to recognise when a shuffle of a load was really a subvector broadcast load.
This commit is contained in:
Simon Pilgrim
2021-07-23 13:07:36 +01:00
parent b63833ac1f
commit 71d0fd3564
2 changed files with 37 additions and 23 deletions

View File

@@ -16054,9 +16054,33 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
if (V2.isUndef()) {
// Attempt to match VBROADCAST*128 subvector broadcast load.
bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
if (!Ld->isNonTemporal()) {
MVT MemVT = VT.getHalfNumVectorElementsVT();
unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
TypeSize::Fixed(Ofs), DL);
SDValue Ops[] = {Ld->getChain(), Ptr};
SDValue BcastLd = DAG.getMemIntrinsicNode(
X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
return BcastLd;
}
}
// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
if (Subtarget.hasAVX2())
return SDValue();
}
bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());

View File

@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX2
define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45670123:
@@ -60,15 +60,10 @@ entry:
}
define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123_mem:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 $34, (%rdi), %ymm0, %ymm0 # ymm0 = mem[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_01230123_mem:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd $68, (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
; AVX2-NEXT: retq
; ALL-LABEL: shuffle_v8f32_01230123_mem:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vbroadcastf128 (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
; ALL-NEXT: retq
entry:
%a = load <8 x float>, <8 x float>* %pa
%b = load <8 x float>, <8 x float>* %pb
@@ -92,15 +87,10 @@ entry:
}
define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45674567_mem:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_45674567_mem:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd $238, (%rdi), %ymm0 # ymm0 = mem[2,3,2,3]
; AVX2-NEXT: retq
; ALL-LABEL: shuffle_v8f32_45674567_mem:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vbroadcastf128 16(%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
; ALL-NEXT: retq
entry:
%a = load <8 x float>, <8 x float>* %pa
%b = load <8 x float>, <8 x float>* %pb