mirror of
https://github.com/intel/llvm.git
synced 2026-01-13 11:02:04 +08:00
[SLP]Enable Shl as a base opcode in copyables (#156766)
Enables Shl as a main opcode for nodes with copyable elements, where a copyable value %v can be modelled as shl %v, 0.
This commit is contained in:
@@ -10620,7 +10620,8 @@ class InstructionsCompatibilityAnalysis {
|
||||
/// Checks if the opcode is supported as the main opcode for copyable
|
||||
/// elements.
|
||||
static bool isSupportedOpcode(const unsigned Opcode) {
|
||||
return Opcode == Instruction::Add || Opcode == Instruction::LShr;
|
||||
return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
|
||||
Opcode == Instruction::Shl;
|
||||
}
|
||||
|
||||
/// Identifies the best candidate value, which represents main opcode
|
||||
@@ -10937,6 +10938,7 @@ public:
|
||||
switch (MainOpcode) {
|
||||
case Instruction::Add:
|
||||
case Instruction::LShr:
|
||||
case Instruction::Shl:
|
||||
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
|
||||
break;
|
||||
default:
|
||||
@@ -22006,6 +22008,8 @@ bool BoUpSLP::collectValuesToDemote(
|
||||
return all_of(E.Scalars, [&](Value *V) {
|
||||
if (isa<PoisonValue>(V))
|
||||
return true;
|
||||
if (E.isCopyableElement(V))
|
||||
return true;
|
||||
auto *I = cast<Instruction>(V);
|
||||
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
|
||||
return AmtKnownBits.getMaxValue().ult(BitWidth);
|
||||
|
||||
@@ -10,14 +10,10 @@ define void @test() {
|
||||
; CHECK-NEXT: [[SUB4_I_I65_US:%.*]] = or i64 0, 1
|
||||
; CHECK-NEXT: br label [[BODY:%.*]]
|
||||
; CHECK: body:
|
||||
; CHECK-NEXT: [[ADD_I_I62_US:%.*]] = shl i64 0, 0
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> <i64 poison, i64 1>, i64 [[ADD_I_I62_US]], i32 0
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP0]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP2]], i32 4, <2 x i1> splat (i1 true), <2 x i32> poison)
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
|
||||
; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> getelementptr ([[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> <i64 0, i64 1>), i32 4, <2 x i1> splat (i1 true), <2 x i32> poison)
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
|
||||
; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
|
||||
; CHECK-NEXT: [[SPEC_SELECT_I_I68_US:%.*]] = select i1 false, i64 [[SUB4_I_I65_US]], i64 0
|
||||
; CHECK-NEXT: br label [[BODY]]
|
||||
;
|
||||
|
||||
@@ -8,8 +8,8 @@ define i32 @test() {
|
||||
; CHECK-NEXT: [[ENTRY:.*:]]
|
||||
; CHECK-NEXT: store i32 152, ptr @f, align 4
|
||||
; CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD_I:%.*]] = load i32, ptr @f, align 4
|
||||
; CHECK-NEXT: [[ADD_I_I:%.*]] = shl i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], 24
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 poison, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080>, i32 [[ADD_I_I]], i32 0
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> <i32 poison, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080>, i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], i32 0
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = shl <8 x i32> [[TMP3]], <i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> <i32 83886080, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, [[TMP0]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i32> [[TMP1]], splat (i32 24)
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i32> [[TMP2]], <i32 66440127, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
||||
|
||||
@@ -247,32 +247,12 @@ entry:
|
||||
}
|
||||
|
||||
define void @shl0(ptr noalias %dst, ptr noalias %src) {
|
||||
; NON-POW2-LABEL: @shl0(
|
||||
; NON-POW2-NEXT: entry:
|
||||
; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
|
||||
; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
|
||||
; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
|
||||
; NON-POW2-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
|
||||
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4
|
||||
; NON-POW2-NEXT: [[TMP2:%.*]] = shl <3 x i32> [[TMP1]], <i32 1, i32 2, i32 3>
|
||||
; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
|
||||
; NON-POW2-NEXT: ret void
|
||||
;
|
||||
; POW2-ONLY-LABEL: @shl0(
|
||||
; POW2-ONLY-NEXT: entry:
|
||||
; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
|
||||
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
|
||||
; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
|
||||
; POW2-ONLY-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
|
||||
; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
|
||||
; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
|
||||
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
|
||||
; POW2-ONLY-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 2>
|
||||
; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
|
||||
; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
|
||||
; POW2-ONLY-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
|
||||
; POW2-ONLY-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
|
||||
; POW2-ONLY-NEXT: ret void
|
||||
; CHECK-LABEL: @shl0(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
|
||||
; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
|
||||
|
||||
@@ -1,25 +1,44 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
|
||||
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
|
||||
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
|
||||
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
|
||||
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}
|
||||
|
||||
|
||||
define i1 @test(i32 %0, i32 %1, i32 %p) {
|
||||
; CHECK-LABEL: define i1 @test(
|
||||
; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
|
||||
; CHECK-NEXT: [[CMP6:%.*]] = icmp slt i32 0, [[P]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
|
||||
; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]]
|
||||
; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[OP_RDX]]
|
||||
; CHECK-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
|
||||
; CHECK-NEXT: ret i1 [[OP_RDX2]]
|
||||
; X86-LABEL: define i1 @test(
|
||||
; X86-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
|
||||
; X86-NEXT: entry:
|
||||
; X86-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
|
||||
; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
|
||||
; X86-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
|
||||
; X86-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
|
||||
; X86-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
|
||||
; X86-NEXT: [[CMP6:%.*]] = icmp slt i32 0, [[P]]
|
||||
; X86-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
|
||||
; X86-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
|
||||
; X86-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]]
|
||||
; X86-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
|
||||
; X86-NEXT: [[TMP8:%.*]] = freeze i1 [[OP_RDX]]
|
||||
; X86-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
|
||||
; X86-NEXT: ret i1 [[OP_RDX2]]
|
||||
;
|
||||
; AARCH64-LABEL: define i1 @test(
|
||||
; AARCH64-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
|
||||
; AARCH64-NEXT: entry:
|
||||
; AARCH64-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
|
||||
; AARCH64-NEXT: [[SHL4:%.*]] = shl i32 0, [[TMP1]]
|
||||
; AARCH64-NEXT: [[CMP5:%.*]] = icmp slt i32 [[SHL4]], 0
|
||||
; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP1]], i32 1
|
||||
; AARCH64-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
|
||||
; AARCH64-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
|
||||
; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[P]], i32 0
|
||||
; AARCH64-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP4]], [[TMP5]]
|
||||
; AARCH64-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]]
|
||||
; AARCH64-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
|
||||
; AARCH64-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 true, i1 [[CMP5]]
|
||||
; AARCH64-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
|
||||
; AARCH64-NEXT: [[TMP9:%.*]] = freeze i1 [[OP_RDX]]
|
||||
; AARCH64-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 true, i1 [[OP_RDX1]]
|
||||
; AARCH64-NEXT: ret i1 [[OP_RDX2]]
|
||||
;
|
||||
entry:
|
||||
%cmp1 = icmp sgt i32 %0, 0
|
||||
|
||||
Reference in New Issue
Block a user