[AArch64][ARM] Remove load from dup and vmul tests. NFC

These tests needn't use loads in their testing of dup and mul
instructions, and as the load changes the test may no longer test what
they are intending (as in D140069).
This commit is contained in:
David Green
2022-12-20 15:23:38 +00:00
parent e51b7bff19
commit 752819e813
3 changed files with 342 additions and 532 deletions

View File

@@ -6,15 +6,15 @@ define <8 x i8> @v_dup8(i8 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.8b v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
ret <8 x i8> %tmp8
%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
ret <8 x i8> %tmp8
}
define <4 x i16> @v_dup16(i16 %A) nounwind {
@@ -22,11 +22,11 @@ define <4 x i16> @v_dup16(i16 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.4h v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
ret <4 x i16> %tmp4
%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_dup32(i32 %A) nounwind {
@@ -34,9 +34,9 @@ define <2 x i32> @v_dup32(i32 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.2s v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
ret <2 x i32> %tmp2
%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
ret <2 x i32> %tmp2
}
define <2 x float> @v_dupfloat(float %A) nounwind {
@@ -45,9 +45,9 @@ define <2 x float> @v_dupfloat(float %A) nounwind {
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup.2s v0, v0[0]
; CHECK-NEXT: ret
%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
ret <2 x float> %tmp2
%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
ret <2 x float> %tmp2
}
define <16 x i8> @v_dupQ8(i8 %A) nounwind {
@@ -55,23 +55,23 @@ define <16 x i8> @v_dupQ8(i8 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.16b v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
%tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
%tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
%tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
%tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
%tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
%tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
%tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
%tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
ret <16 x i8> %tmp16
%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
%tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
%tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
%tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
%tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
%tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
%tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
%tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
%tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
ret <16 x i8> %tmp16
}
define <8 x i16> @v_dupQ16(i16 %A) nounwind {
@@ -79,15 +79,15 @@ define <8 x i16> @v_dupQ16(i16 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.8h v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
%tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
%tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
%tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
%tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
ret <8 x i16> %tmp8
%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
%tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
%tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
%tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
%tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
ret <8 x i16> %tmp8
}
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
@@ -95,11 +95,11 @@ define <4 x i32> @v_dupQ32(i32 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.4s v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
%tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
ret <4 x i32> %tmp4
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
%tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
ret <4 x i32> %tmp4
}
define <4 x float> @v_dupQfloat(float %A) nounwind {
@@ -108,11 +108,11 @@ define <4 x float> @v_dupQfloat(float %A) nounwind {
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: ret
%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
%tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
ret <4 x float> %tmp4
%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
%tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
ret <4 x float> %tmp4
}
; Check to make sure it works with shuffles, too.
@@ -122,9 +122,9 @@ define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.8b v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %tmp2
%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %tmp2
}
define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
@@ -132,9 +132,9 @@ define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.4h v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %tmp2
%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %tmp2
}
define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
@@ -142,9 +142,9 @@ define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.2s v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
ret <2 x i32> %tmp2
%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
ret <2 x i32> %tmp2
}
define <2 x float> @v_shuffledupfloat(float %A) nounwind {
@@ -153,9 +153,9 @@ define <2 x float> @v_shuffledupfloat(float %A) nounwind {
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup.2s v0, v0[0]
; CHECK-NEXT: ret
%tmp1 = insertelement <2 x float> undef, float %A, i32 0
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
ret <2 x float> %tmp2
%tmp1 = insertelement <2 x float> undef, float %A, i32 0
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
ret <2 x float> %tmp2
}
define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
@@ -163,9 +163,9 @@ define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.16b v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %tmp2
%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %tmp2
}
define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
@@ -173,9 +173,9 @@ define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.8h v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %tmp2
%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %tmp2
}
define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
@@ -183,9 +183,9 @@ define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: dup.4s v0, w0
; CHECK-NEXT: ret
%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %tmp2
%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %tmp2
}
define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
@@ -194,97 +194,89 @@ define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: ret
%tmp1 = insertelement <4 x float> undef, float %A, i32 0
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %tmp2
%tmp1 = insertelement <4 x float> undef, float %A, i32 0
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %tmp2
}
define <8 x i8> @vduplane8(ptr %A) nounwind {
define <8 x i8> @vduplane8(<8 x i8> %A) nounwind {
; CHECK-LABEL: vduplane8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.8b v0, v0[1]
; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i8> %tmp2
%tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vduplane16(ptr %A) nounwind {
define <4 x i16> @vduplane16(<4 x i16> %A) nounwind {
; CHECK-LABEL: vduplane16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.4h v0, v0[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i16> %tmp2
%tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vduplane32(ptr %A) nounwind {
define <2 x i32> @vduplane32(<2 x i32> %A) nounwind {
; CHECK-LABEL: vduplane32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.2s v0, v0[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x i32> %tmp2
%tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x i32> %tmp2
}
define <2 x float> @vduplanefloat(ptr %A) nounwind {
define <2 x float> @vduplanefloat(<2 x float> %A) nounwind {
; CHECK-LABEL: vduplanefloat:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.2s v0, v0[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x float>, ptr %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x float> %tmp2
%tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x float> %tmp2
}
define <16 x i8> @vduplaneQ8(ptr %A) nounwind {
define <16 x i8> @vduplaneQ8(<8 x i8> %A) nounwind {
; CHECK-LABEL: vduplaneQ8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.16b v0, v0[1]
; CHECK-NEXT: ret
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <16 x i8> %tmp2
%tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vduplaneQ16(ptr %A) nounwind {
define <8 x i16> @vduplaneQ16(<4 x i16> %A) nounwind {
; CHECK-LABEL: vduplaneQ16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.8h v0, v0[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i16> %tmp2
%tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vduplaneQ32(ptr %A) nounwind {
define <4 x i32> @vduplaneQ32(<2 x i32> %A) nounwind {
; CHECK-LABEL: vduplaneQ32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.4s v0, v0[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i32> %tmp2
%tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i32> %tmp2
}
define <4 x float> @vduplaneQfloat(ptr %A) nounwind {
define <4 x float> @vduplaneQfloat(<2 x float> %A) nounwind {
; CHECK-LABEL: vduplaneQfloat:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup.4s v0, v0[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x float>, ptr %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x float> %tmp2
%tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x float> %tmp2
}
define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {

View File

@@ -1081,59 +1081,45 @@ declare <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float>, <2 x f
declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
define <4 x i16> @mul_4h(ptr %A, ptr %B) nounwind {
define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: mul_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mul.4h v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = mul <4 x i16> %tmp1, %tmp3
%tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = mul <4 x i16> %A, %tmp3
ret <4 x i16> %tmp4
}
define <8 x i16> @mul_8h(ptr %A, ptr %B) nounwind {
define <8 x i16> @mul_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: mul_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: mul.8h v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%tmp4 = mul <8 x i16> %tmp1, %tmp3
%tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%tmp4 = mul <8 x i16> %A, %tmp3
ret <8 x i16> %tmp4
}
define <2 x i32> @mul_2s(ptr %A, ptr %B) nounwind {
define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: mul_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mul.2s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp4 = mul <2 x i32> %tmp1, %tmp3
%tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp4 = mul <2 x i32> %A, %tmp3
ret <2 x i32> %tmp4
}
define <4 x i32> @mul_4s(ptr %A, ptr %B) nounwind {
define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: mul_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: mul.4s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
%tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = mul <4 x i32> %tmp1, %tmp3
%tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = mul <4 x i32> %A, %tmp3
ret <4 x i32> %tmp4
}
@@ -1153,45 +1139,34 @@ define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
ret <2 x i64> %tmp1
}
define <2 x float> @fmul_lane_2s(ptr %A, ptr %B) nounwind {
define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
; CHECK-LABEL: fmul_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmul.2s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x float>, ptr %A
%tmp2 = load <2 x float>, ptr %B
%tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp4 = fmul <2 x float> %tmp1, %tmp3
%tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
%tmp4 = fmul <2 x float> %A, %tmp3
ret <2 x float> %tmp4
}
define <4 x float> @fmul_lane_4s(ptr %A, ptr %B) nounwind {
define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
; CHECK-LABEL: fmul_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: fmul.4s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x float>, ptr %A
%tmp2 = load <4 x float>, ptr %B
%tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = fmul <4 x float> %tmp1, %tmp3
%tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = fmul <4 x float> %A, %tmp3
ret <4 x float> %tmp4
}
define <2 x double> @fmul_lane_2d(ptr %A, ptr %B) nounwind {
define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
; CHECK-LABEL: fmul_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: fmul.2d v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x double>, ptr %A
%tmp2 = load <2 x double>, ptr %B
%tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp4 = fmul <2 x double> %tmp1, %tmp3
%tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
%tmp4 = fmul <2 x double> %A, %tmp3
ret <2 x double> %tmp4
}
@@ -1217,101 +1192,76 @@ define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
define <2 x float> @fmulx_lane_2s(ptr %A, ptr %B) nounwind {
define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind {
; CHECK-LABEL: fmulx_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmulx.2s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x float>, ptr %A
%tmp2 = load <2 x float>, ptr %B
%tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3)
%tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %A, <2 x float> %tmp3)
ret <2 x float> %tmp4
}
define <4 x float> @fmulx_lane_4s(ptr %A, ptr %B) nounwind {
define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind {
; CHECK-LABEL: fmulx_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: fmulx.4s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x float>, ptr %A
%tmp2 = load <4 x float>, ptr %B
%tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3)
%tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %A, <4 x float> %tmp3)
ret <4 x float> %tmp4
}
define <2 x double> @fmulx_lane_2d(ptr %A, ptr %B) nounwind {
define <2 x double> @fmulx_lane_2d(<2 x double> %A, <2 x double> %B) nounwind {
; CHECK-LABEL: fmulx_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: fmulx.2d v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x double>, ptr %A
%tmp2 = load <2 x double>, ptr %B
%tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3)
%tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %A, <2 x double> %tmp3)
ret <2 x double> %tmp4
}
define <4 x i16> @sqdmulh_lane_4h(ptr %A, ptr %B) nounwind {
define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh.4h v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
%tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3)
ret <4 x i16> %tmp4
}
define <8 x i16> @sqdmulh_lane_8h(ptr %A, ptr %B) nounwind {
define <8 x i16> @sqdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sqdmulh.8h v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
%tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3)
ret <8 x i16> %tmp4
}
define <2 x i32> @sqdmulh_lane_2s(ptr %A, ptr %B) nounwind {
define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmulh.2s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
%tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3)
ret <2 x i32> %tmp4
}
define <4 x i32> @sqdmulh_lane_4s(ptr %A, ptr %B) nounwind {
define <4 x i32> @sqdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqdmulh_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sqdmulh.4s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
%tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
%tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3)
ret <4 x i32> %tmp4
}
@@ -1327,59 +1277,45 @@ define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
ret i32 %tmp2
}
define <4 x i16> @sqrdmulh_lane_4h(ptr %A, ptr %B) nounwind {
define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_4h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh.4h v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
%tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3)
ret <4 x i16> %tmp4
}
define <8 x i16> @sqrdmulh_lane_8h(ptr %A, ptr %B) nounwind {
define <8 x i16> @sqrdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_8h:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sqrdmulh.8h v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <8 x i16>, ptr %A
%tmp2 = load <8 x i16>, ptr %B
%tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
%tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3)
ret <8 x i16> %tmp4
}
define <2 x i32> @sqrdmulh_lane_2s(ptr %A, ptr %B) nounwind {
define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_2s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqrdmulh.2s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
%tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3)
ret <2 x i32> %tmp4
}
define <4 x i32> @sqrdmulh_lane_4s(ptr %A, ptr %B) nounwind {
define <4 x i32> @sqrdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqrdmulh_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: ldr q1, [x1]
; CHECK-NEXT: sqrdmulh.4s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x i32>, ptr %A
%tmp2 = load <4 x i32>, ptr %B
%tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
%tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3)
ret <4 x i32> %tmp4
}
@@ -1395,221 +1331,169 @@ define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
ret i32 %tmp2
}
define <4 x i32> @sqdmull_lane_4s(ptr %A, ptr %B) nounwind {
define <4 x i32> @sqdmull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: sqdmull_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmull.4s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
%tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @sqdmull_lane_2d(ptr %A, ptr %B) nounwind {
define <2 x i64> @sqdmull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: sqdmull_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmull.2d v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
%tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <4 x i32> @sqdmull2_lane_4s(ptr %A, ptr %B) nounwind {
define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-LABEL: sqdmull2_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, #8]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sqdmull.4s v0, v0, v1[1]
; CHECK-NEXT: sqdmull2.4s v0, v0, v1[1]
; CHECK-NEXT: ret
%load1 = load <8 x i16>, ptr %A
%load2 = load <8 x i16>, ptr %B
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp4
}
define <2 x i64> @sqdmull2_lane_2d(ptr %A, ptr %B) nounwind {
define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: sqdmull2_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, #8]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: sqdmull.2d v0, v0, v1[1]
; CHECK-NEXT: sqdmull2.2d v0, v0, v1[1]
; CHECK-NEXT: ret
%load1 = load <4 x i32>, ptr %A
%load2 = load <4 x i32>, ptr %B
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
%tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp4
}
define <4 x i32> @umull_lane_4s(ptr %A, ptr %B) nounwind {
define <4 x i32> @umull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: umull_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umull.4s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
%tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @umull_lane_2d(ptr %A, ptr %B) nounwind {
define <2 x i64> @umull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: umull_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umull.2d v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
%tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <4 x i32> @smull_lane_4s(ptr %A, ptr %B) nounwind {
define <4 x i32> @smull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind {
; CHECK-LABEL: smull_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smull.4s v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
%tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @smull_lane_2d(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind {
; CHECK-LABEL: smull_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smull.2d v0, v0, v1[1]
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
%tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <4 x i32> @smlal_lane_4s(ptr %A, ptr %B, ptr %C) nounwind {
define <4 x i32> @smlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: smlal_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: smlal.4s v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlal.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = load <4 x i32>, ptr %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
%tmp6 = add <4 x i32> %tmp3, %tmp5
%tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
%tmp6 = add <4 x i32> %C, %tmp5
ret <4 x i32> %tmp6
}
define <2 x i64> @smlal_lane_2d(ptr %A, ptr %B, ptr %C) nounwind {
define <2 x i64> @smlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: smlal_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: smlal.2d v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlal.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = load <2 x i64>, ptr %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
%tmp6 = add <2 x i64> %tmp3, %tmp5
%tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
%tmp6 = add <2 x i64> %C, %tmp5
ret <2 x i64> %tmp6
}
define <4 x i32> @sqdmlal_lane_4s(ptr %A, ptr %B, ptr %C) nounwind {
define <4 x i32> @sqdmlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlal_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: sqdmlal.4s v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlal.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = load <4 x i32>, ptr %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
%tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
%tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
%tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
ret <4 x i32> %tmp6
}
define <2 x i64> @sqdmlal_lane_2d(ptr %A, ptr %B, ptr %C) nounwind {
define <2 x i64> @sqdmlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: sqdmlal_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: sqdmlal.2d v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlal.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = load <2 x i64>, ptr %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
%tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
%tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
%tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
ret <2 x i64> %tmp6
}
define <4 x i32> @sqdmlal2_lane_4s(ptr %A, ptr %B, ptr %C) nounwind {
define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlal2_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: ldr d1, [x0, #8]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: sqdmlal.4s v0, v1, v2[1]
; CHECK-NEXT: sqdmlal2.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%load1 = load <8 x i16>, ptr %A
%load2 = load <8 x i16>, ptr %B
%tmp3 = load <4 x i32>, ptr %C
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
%tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
ret <4 x i32> %tmp6
}
define <2 x i64> @sqdmlal2_lane_2d(ptr %A, ptr %B, ptr %C) nounwind {
define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: sqdmlal2_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: ldr d1, [x0, #8]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: sqdmlal.2d v0, v1, v2[1]
; CHECK-NEXT: sqdmlal2.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%load1 = load <4 x i32>, ptr %A
%load2 = load <4 x i32>, ptr %B
%tmp3 = load <2 x i64>, ptr %C
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
%tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
%tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
ret <2 x i64> %tmp6
}
@@ -1715,176 +1599,134 @@ define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
define <4 x i32> @umlal_lane_4s(ptr %A, ptr %B, ptr %C) nounwind {
define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: umlal_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: umlal.4s v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlal.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = load <4 x i32>, ptr %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
%tmp6 = add <4 x i32> %tmp3, %tmp5
%tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
%tmp6 = add <4 x i32> %C, %tmp5
ret <4 x i32> %tmp6
}
define <2 x i64> @umlal_lane_2d(ptr %A, ptr %B, ptr %C) nounwind {
define <2 x i64> @umlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: umlal_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: umlal.2d v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlal.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = load <2 x i64>, ptr %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
%tmp6 = add <2 x i64> %tmp3, %tmp5
%tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
%tmp6 = add <2 x i64> %C, %tmp5
ret <2 x i64> %tmp6
}
define <4 x i32> @smlsl_lane_4s(ptr %A, ptr %B, ptr %C) nounwind {
define <4 x i32> @smlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: smlsl_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: smlsl.4s v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlsl.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = load <4 x i32>, ptr %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
%tmp6 = sub <4 x i32> %tmp3, %tmp5
%tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
%tmp6 = sub <4 x i32> %C, %tmp5
ret <4 x i32> %tmp6
}
define <2 x i64> @smlsl_lane_2d(ptr %A, ptr %B, ptr %C) nounwind {
define <2 x i64> @smlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: smlsl_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: smlsl.2d v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: smlsl.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = load <2 x i64>, ptr %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
%tmp6 = sub <2 x i64> %tmp3, %tmp5
%tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
%tmp6 = sub <2 x i64> %C, %tmp5
ret <2 x i64> %tmp6
}
define <4 x i32> @sqdmlsl_lane_4s(ptr %A, ptr %B, ptr %C) nounwind {
define <4 x i32> @sqdmlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlsl_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: sqdmlsl.4s v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlsl.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = load <4 x i32>, ptr %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
%tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
%tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
%tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
ret <4 x i32> %tmp6
}
define <2 x i64> @sqdmlsl_lane_2d(ptr %A, ptr %B, ptr %C) nounwind {
define <2 x i64> @sqdmlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: sqdmlsl_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: sqdmlsl.2d v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: sqdmlsl.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = load <2 x i64>, ptr %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
%tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
%tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
%tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
ret <2 x i64> %tmp6
}
define <4 x i32> @sqdmlsl2_lane_4s(ptr %A, ptr %B, ptr %C) nounwind {
define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: sqdmlsl2_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: ldr d1, [x0, #8]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: sqdmlsl.4s v0, v1, v2[1]
; CHECK-NEXT: sqdmlsl2.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%load1 = load <8 x i16>, ptr %A
%load2 = load <8 x i16>, ptr %B
%tmp3 = load <4 x i32>, ptr %C
%tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
%tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
%tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5)
ret <4 x i32> %tmp6
}
define <2 x i64> @sqdmlsl2_lane_2d(ptr %A, ptr %B, ptr %C) nounwind {
define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: sqdmlsl2_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: ldr d1, [x0, #8]
; CHECK-NEXT: ldr d2, [x1]
; CHECK-NEXT: sqdmlsl.2d v0, v1, v2[1]
; CHECK-NEXT: sqdmlsl2.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%load1 = load <4 x i32>, ptr %A
%load2 = load <4 x i32>, ptr %B
%tmp3 = load <2 x i64>, ptr %C
%tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
%tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
%tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
%tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5)
ret <2 x i64> %tmp6
}
define <4 x i32> @umlsl_lane_4s(ptr %A, ptr %B, ptr %C) nounwind {
define <4 x i32> @umlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: umlsl_lane_4s:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: umlsl.4s v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlsl.4s v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = load <4 x i16>, ptr %B
%tmp3 = load <4 x i32>, ptr %C
%tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
%tmp6 = sub <4 x i32> %tmp3, %tmp5
%tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4)
%tmp6 = sub <4 x i32> %C, %tmp5
ret <4 x i32> %tmp6
}
define <2 x i64> @umlsl_lane_2d(ptr %A, ptr %B, ptr %C) nounwind {
define <2 x i64> @umlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind {
; CHECK-LABEL: umlsl_lane_2d:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x1]
; CHECK-NEXT: ldr d2, [x0]
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: umlsl.2d v0, v2, v1[1]
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: umlsl.2d v2, v0, v1[1]
; CHECK-NEXT: mov.16b v0, v2
; CHECK-NEXT: ret
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = load <2 x i32>, ptr %B
%tmp3 = load <2 x i64>, ptr %C
%tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
%tmp6 = sub <2 x i64> %tmp3, %tmp5
%tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
%tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4)
%tmp6 = sub <2 x i64> %C, %tmp5
ret <2 x i64> %tmp6
}

View File

@@ -219,103 +219,79 @@ define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
ret <4 x float> %tmp2
}
define <8 x i8> @vduplane8(ptr %A) nounwind {
define arm_aapcs_vfpcc <8 x i8> @vduplane8(<8 x i8> %A) nounwind {
; CHECK-LABEL: vduplane8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.8 d16, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vdup.8 d0, d0[1]
; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
%tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vduplane16(ptr %A) nounwind {
define arm_aapcs_vfpcc <4 x i16> @vduplane16(<4 x i16> %A) nounwind {
; CHECK-LABEL: vduplane16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.16 d16, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vdup.16 d0, d0[1]
; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
%tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vduplane32(ptr %A) nounwind {
define arm_aapcs_vfpcc <2 x i32> @vduplane32(<2 x i32> %A) nounwind {
; CHECK-LABEL: vduplane32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.32 d16, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vdup.32 d0, d0[1]
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
%tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x i32> %tmp2
}
define <2 x float> @vduplanefloat(ptr %A) nounwind {
define arm_aapcs_vfpcc <2 x float> @vduplanefloat(<2 x float> %A) nounwind {
; CHECK-LABEL: vduplanefloat:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.32 d16, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vdup.32 d0, d0[1]
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x float>, ptr %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
%tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x float> %tmp2
}
define <16 x i8> @vduplaneQ8(ptr %A) nounwind {
define arm_aapcs_vfpcc <16 x i8> @vduplaneQ8(<8 x i8> %A) nounwind {
; CHECK-LABEL: vduplaneQ8:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.8 q8, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: vdup.8 q0, d0[1]
; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i8>, ptr %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
%tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vduplaneQ16(ptr %A) nounwind {
define arm_aapcs_vfpcc <8 x i16> @vduplaneQ16(<4 x i16> %A) nounwind {
; CHECK-LABEL: vduplaneQ16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.16 q8, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: vdup.16 q0, d0[1]
; CHECK-NEXT: mov pc, lr
%tmp1 = load <4 x i16>, ptr %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
%tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vduplaneQ32(ptr %A) nounwind {
define arm_aapcs_vfpcc <4 x i32> @vduplaneQ32(<2 x i32> %A) nounwind {
; CHECK-LABEL: vduplaneQ32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.32 q8, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: vdup.32 q0, d0[1]
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x i32>, ptr %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
%tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i32> %tmp2
}
define <4 x float> @vduplaneQfloat(ptr %A) nounwind {
define arm_aapcs_vfpcc <4 x float> @vduplaneQfloat(<2 x float> %A) nounwind {
; CHECK-LABEL: vduplaneQfloat:
; CHECK: @ %bb.0:
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vdup.32 q8, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: vdup.32 q0, d0[1]
; CHECK-NEXT: mov pc, lr
%tmp1 = load <2 x float>, ptr %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
%tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x float> %tmp2
}