[SLP]Change the insertion point for outside-block-used nodes and prevec phi operand gathers

Need to set the insertion point for (non-schedulable) vector node after
the last instruction in the node to avoid def-use breakage. But it also
causes miscompilation with gather/buildvector operands of the phi nodes,
used in the same phi only in the block.
These nodes supposed to be inserted at the end of the block and after
changing the insertion point for the non-schedulable vec block, it also
may break def-use dependencies. Need to prevector such nodes, to emit
them as early as possible, so the vectorized nodes are inserted before
these nodes.

Fixes #139728

Reviewers: hiraditya, HanKuanChen, RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/139917
This commit is contained in:
Alexey Bataev
2025-05-16 12:52:27 -04:00
committed by GitHub
parent 7674d6fa9e
commit d79d9b8fbf
10 changed files with 99 additions and 33 deletions

View File

@@ -16142,16 +16142,10 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
[](Value *V) {
return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
})) ||
all_of(E->Scalars,
[](Value *V) {
return isa<PoisonValue>(V) ||
(!isVectorLikeInstWithConstOps(V) &&
isUsedOutsideBlock(V));
}) ||
(E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
return isa<ExtractElementInst, UndefValue>(V) ||
areAllOperandsNonInsts(V);
})))
all_of(E->Scalars, [](Value *V) {
return isa<PoisonValue>(V) ||
(!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
}))
Res = FindLastInst();
else
Res = FindFirstInst();
@@ -16210,7 +16204,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
}
if (IsPHI ||
(!E->isGather() && E->State != TreeEntry::SplitVectorize &&
doesNotNeedToSchedule(E->Scalars)) ||
all_of(E->Scalars, areAllOperandsNonInsts)) ||
(GatheredLoadsEntriesFirst.has_value() &&
E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
E->getOpcode() == Instruction::Load)) {
@@ -17799,17 +17793,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *VecOp = NewPhi->getIncomingValueForBlock(IBB);
NewPhi->addIncoming(VecOp, IBB);
TreeEntry *OpTE = getOperandEntry(E, I);
assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
OpTE->VectorizedValue = VecOp;
continue;
}
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
Value *Vec = vectorizeOperand(E, I);
const TreeEntry *OpE = getOperandEntry(E, I);
Value *Vec;
if (OpE->isGather()) {
assert(OpE->VectorizedValue && "Expected vectorized value.");
Vec = OpE->VectorizedValue;
if (auto *IVec = dyn_cast<Instruction>(Vec))
Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
} else {
Vec = vectorizeOperand(E, I);
}
if (VecTy != Vec->getType()) {
assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
MinBWs.contains(getOperandEntry(E, I))) &&
"Expected item in MinBWs.");
assert(
(It != MinBWs.end() || OpE->isGather() || MinBWs.contains(OpE)) &&
"Expected item in MinBWs.");
Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
}
NewPhi->addIncoming(Vec, IBB);
@@ -18696,6 +18700,28 @@ Value *BoUpSLP::vectorizeTree(
else
Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
// Vectorize gather operands of the PHI nodes.
for (const std::unique_ptr<TreeEntry> &TE : reverse(VectorizableTree)) {
if (TE->isGather() && TE->UserTreeIndex.UserTE &&
TE->UserTreeIndex.UserTE->hasState() &&
!TE->UserTreeIndex.UserTE->isAltShuffle() &&
TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI &&
!TE->VectorizedValue) {
auto *PH = cast<PHINode>(TE->UserTreeIndex.UserTE->getMainOp());
BasicBlock *IBB = PH->getIncomingBlock(TE->UserTreeIndex.EdgeIdx);
// If there is the same incoming block earlier - skip, it will be handled
// in PHI node.
if (TE->UserTreeIndex.EdgeIdx > 0 &&
any_of(seq<unsigned>(TE->UserTreeIndex.EdgeIdx), [&](unsigned Idx) {
return PH->getIncomingBlock(Idx) == IBB;
}))
continue;
Builder.SetInsertPoint(IBB->getTerminator());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
(void)vectorizeTree(TE.get());
}
}
// Emit gathered loads first to emit better code for the users of those
// gathered loads.
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {

View File

@@ -37,19 +37,19 @@ define void @test() {
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[I69]], i32 15
; CHECK-NEXT: br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]]
; CHECK: [[BB77]]:
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0
; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP17]], <8 x i32> <i32 8, i32 poison, i32 poison, i32 poison, i32 4, i32 5, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP14]], float [[I68]], i32 2
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP19]], float [[I66]], i32 3
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP16]], float [[I67]], i32 6
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I69]], i32 7
; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0
; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> [[TMP23]], <8 x i32> <i32 8, i32 poison, i32 poison, i32 poison, i32 4, i32 5, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP25]], <16 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 poison, i32 poison>
; CHECK-NEXT: br label %[[BB78:.*]]
; CHECK: [[BB78]]:
; CHECK-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP23]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
; CHECK-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP30]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
; CHECK-NEXT: [[TMP22:%.*]] = phi <8 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>
; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
@@ -58,8 +58,8 @@ define void @test() {
; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]]
; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], poison
; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison
; CHECK-NEXT: [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP31]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 12, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 14, i32 15>
; CHECK-NEXT: [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15, i32 poison, i32 poison>
; CHECK-NEXT: br i1 poison, label %[[BB78]], label %[[BB167]]
; CHECK: [[BB167]]:
; CHECK-NEXT: [[TMP32:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP29]], %[[BB78]] ]

View File

@@ -5,14 +5,14 @@ define void @test(ptr %0, float %1) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: ptr [[TMP0:%.*]], float [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP3]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[TMP3]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP3]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[BB8:.*]]
; CHECK: [[BB8]]:
; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP15:%.*]], %[[BB8]] ], [ [[TMP5]], [[TMP2:%.*]] ]
; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP7]], %[[BB8]] ], [ [[TMP4]], [[TMP2]] ]
; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP7]], %[[BB8]] ], [ [[TMP8]], [[TMP2]] ]
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x float> [[TMP9]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = fadd <4 x float> [[TMP12]], zeroinitializer

View File

@@ -6,9 +6,9 @@ define i32 @test(i64 %l.549) {
; CHECK-SAME: i64 [[L_549:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[CONV3:%.*]] = sext i32 0 to i64
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i32 1
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 3
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> poison, i64 [[L_549]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>
; CHECK-NEXT: br label %[[IF_THEN19:.*]]

View File

@@ -7,11 +7,11 @@ define void @test() {
; CHECK-NEXT: br i1 false, label %[[BB1:.*]], label %[[BB5:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], %[[BB1]] ], [ zeroinitializer, %[[BB]] ]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i32> [[TMP0]], [[TMP4]]
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 0, i32 0
; CHECK-NEXT: [[TMP3]] = or <2 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
; CHECK-NEXT: [[TMP5:%.*]] = or <2 x i32> [[TMP0]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
; CHECK-NEXT: [[OR3:%.*]] = or i32 [[TMP6]], 0
; CHECK-NEXT: br i1 false, label %[[BB1]], label %[[BB5]]

View File

@@ -37,18 +37,18 @@ define i32 @test(i32 %s.0) {
; CHECK: [[IF_THEN18:.*]]:
; CHECK-NEXT: br label %[[T]]
; CHECK: [[T]]:
; CHECK-NEXT: [[TMP30:%.*]] = phi <8 x i32> [ [[TMP27:%.*]], %[[O]] ], [ poison, %[[IF_THEN18]] ]
; CHECK-NEXT: [[TMP19:%.*]] = phi <8 x i32> [ [[TMP27:%.*]], %[[O]] ], [ poison, %[[IF_THEN18]] ]
; CHECK-NEXT: [[TMP17]] = extractelement <4 x i32> [[TMP23:%.*]], i32 0
; CHECK-NEXT: br i1 false, label %[[IF_END24]], label %[[K]]
; CHECK: [[IF_END24]]:
; CHECK-NEXT: [[TMP18:%.*]] = phi <8 x i32> [ [[TMP29]], %[[IF_THEN11]] ], [ [[TMP11]], %[[IF_END6]] ], [ [[TMP30]], %[[T]] ]
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <2 x i32> <i32 7, i32 1>
; CHECK-NEXT: [[TMP18:%.*]] = phi <8 x i32> [ [[TMP29]], %[[IF_THEN11]] ], [ [[TMP11]], %[[IF_END6]] ], [ [[TMP19]], %[[T]] ]
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <2 x i32> <i32 7, i32 1>
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> <i32 2, i32 3, i32 4, i32 6>
; CHECK-NEXT: br label %[[O]]
; CHECK: [[O]]:
; CHECK-NEXT: [[TMP22]] = phi <2 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP19]], %[[IF_END24]] ]
; CHECK-NEXT: [[TMP23]] = phi <4 x i32> [ [[TMP1]], %[[K]] ], [ [[TMP20]], %[[IF_END24]] ]
; CHECK-NEXT: [[TMP22]] = phi <2 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP30]], %[[IF_END24]] ]
; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP21]], %[[IF_END24]] ]
; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i32> [[TMP25]], <8 x i32> <i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>, <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>

View File

@@ -0,0 +1,40 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-99999 < %s | FileCheck %s
define i64 @test() {
; CHECK-LABEL: define i64 @test() {
; CHECK-NEXT: [[BB:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 0, i32 1
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB5:.*]] ]
; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP4]] = or <2 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT: br label %[[BB5]]
; CHECK: [[BB5]]:
; CHECK-NEXT: br i1 false, label %[[BB6:.*]], label %[[BB1]]
; CHECK: [[BB6]]:
; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP2]], %[[BB5]] ]
; CHECK-NEXT: ret i64 0
;
bb:
br label %bb1
bb1:
%phi = phi i32 [ 0, %bb ], [ %or, %bb5 ]
%phi2 = phi i32 [ 0, %bb ], [ %or4, %bb5 ]
%or = or i32 %phi, 0
%add = add i32 0, 0
%or3 = or i32 %add, %phi2
%or4 = or i32 %or3, 0
br label %bb5
bb5:
br i1 false, label %bb6, label %bb1
bb6:
%phi7 = phi i32 [ %or, %bb5 ]
%phi8 = phi i32 [ %or3, %bb5 ]
ret i64 0
}

View File

@@ -16,10 +16,10 @@ define i32 @test(i1 %cond) {
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
; CHECK-NEXT: [[OR92]] = or i32 1, 0
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP6]], [[OR92]]
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[TMP6]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[OR92]], i32 0
; CHECK-NEXT: [[TMP8]] = xor <2 x i32> [[TMP9]], [[TMP7]]
; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP6]], [[OR92]]
; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[BB]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret i32 [[OP_RDX]]

View File

@@ -15,8 +15,8 @@ define i64 @Foo(ptr align 8 dereferenceable(344) %0, i64 %1) {
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP8]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> <i64 0, i64 poison>, i64 [[TMP1]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0
; CHECK-NEXT: br label %[[BB16:.*]]
; CHECK: [[BB16]]:
; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i64> [ [[TMP11]], [[TMP2:%.*]] ], [ zeroinitializer, %[[TMP25:.*]] ]

View File

@@ -234,12 +234,12 @@ define void @test7() {
define void @test8() {
; CHECK-LABEL: @test8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> zeroinitializer, i64 0)
; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> zeroinitializer, i64 2)
; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> poison, <2 x float> zeroinitializer, i64 0)
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP0]], <2 x float> zeroinitializer, i64 2)
; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP1]], <2 x float> zeroinitializer, i64 4)
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP2]], <2 x float> zeroinitializer, i64 6)
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> zeroinitializer, i64 0)
; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> zeroinitializer, i64 2)
; CHECK-NEXT: br i1 false, label [[FOR0:%.*]], label [[FOR_BODY:%.*]]
; CHECK: for0:
; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x float> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ]