mirror of
https://github.com/intel/llvm.git
synced 2026-01-26 12:26:52 +08:00
[GlobalISelEmitter] handle operand without MVT/class
There are some patterns in td files without MVT/class set for some operands in target pattern that are from the source pattern. This prevents GlobalISelEmitter from adding them as a valid rule, because the target child operand is an unsupported kind of operand. For now, for a leaf child, only IntInit and DefInit are handled in GlobalISelEmitter. This issue can be worked around by adding MVT/class to the patterns in the td files, like the workarounds for patterns anyext and setcc in PPCInstrInfo.td in D140878. To avoid adding the same workarounds for other patterns in td files, this patch tries to handle the UnsetInit case in GlobalISelEmitter. Adding the new handling allows us to remove the workarounds in the td files and also generates many selection rules for PPC target. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D141247
This commit is contained in:
@@ -3348,9 +3348,9 @@ def : Pat<(i64 (sext i1:$in)),
|
||||
// FIXME: We should choose either a zext or a sext based on other constants
|
||||
// already around.
|
||||
def : Pat<(i32 (anyext i1:$in)),
|
||||
(SELECT_I4 crbitrc:$in, (LI 1), (LI 0))>;
|
||||
(SELECT_I4 $in, (LI 1), (LI 0))>;
|
||||
def : Pat<(i64 (anyext i1:$in)),
|
||||
(SELECT_I8 crbitrc:$in, (LI8 1), (LI8 0))>;
|
||||
(SELECT_I8 $in, (LI8 1), (LI8 0))>;
|
||||
|
||||
// match setcc on i1 variables.
|
||||
// CRANDC is:
|
||||
@@ -3756,34 +3756,34 @@ defm : CRNotPat<(i1 (setcc i64:$s1, i64:$s2, SETNE)),
|
||||
|
||||
multiclass FSetCCPat<SDPatternOperator SetCC, ValueType Ty, I FCmp> {
|
||||
defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUGE)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
|
||||
defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGE)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
|
||||
defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETULE)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
|
||||
defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLE)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
|
||||
defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUNE)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
|
||||
defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETNE)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
|
||||
defm : CRNotPat<(i1 (SetCC Ty:$s1, Ty:$s2, SETO)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_un)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>;
|
||||
|
||||
def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOLT)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
|
||||
def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETLT)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_lt)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_lt)>;
|
||||
def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOGT)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
|
||||
def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETGT)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_gt)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_gt)>;
|
||||
def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETOEQ)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
|
||||
def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETEQ)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_eq)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_eq)>;
|
||||
def : Pat<(i1 (SetCC Ty:$s1, Ty:$s2, SETUO)),
|
||||
(EXTRACT_SUBREG (FCmp Ty:$s1, Ty:$s2), sub_un)>;
|
||||
(EXTRACT_SUBREG (FCmp $s1, $s2), sub_un)>;
|
||||
}
|
||||
|
||||
let Predicates = [HasFPU] in {
|
||||
|
||||
@@ -402,8 +402,8 @@ define <2 x i64> @fcvtzs_2d_intrinsic(<2 x double> %A) nounwind {
|
||||
define <1 x i64> @fcvtzs_1d_intrinsic(<1 x double> %A) nounwind {
|
||||
;CHECK-LABEL: fcvtzs_1d_intrinsic:
|
||||
;CHECK-NOT: ld1
|
||||
;CHECK: fcvtzs d0, d0
|
||||
;CHECK-NEXT: ret
|
||||
;CHECK: fcvtzs{{.*}}, d0
|
||||
;CHECK: ret
|
||||
%tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %A)
|
||||
ret <1 x i64> %tmp3
|
||||
}
|
||||
@@ -481,8 +481,8 @@ define <2 x i64> @fcvtzu_2d_intrinsic(<2 x double> %A) nounwind {
|
||||
define <1 x i64> @fcvtzu_1d_intrinsic(<1 x double> %A) nounwind {
|
||||
;CHECK-LABEL: fcvtzu_1d_intrinsic:
|
||||
;CHECK-NOT: ld1
|
||||
;CHECK: fcvtzu d0, d0
|
||||
;CHECK-NEXT: ret
|
||||
;CHECK: fcvtzu{{.*}}, d0
|
||||
;CHECK: ret
|
||||
%tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %A)
|
||||
ret <1 x i64> %tmp3
|
||||
}
|
||||
|
||||
@@ -40,19 +40,22 @@ define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_ushort v6, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:2
|
||||
; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4
|
||||
; GFX9-NEXT: global_load_ushort v9, v[2:3], off
|
||||
; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:2
|
||||
; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:4
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
|
||||
; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
|
||||
; GFX9-NEXT: global_load_ushort v8, v[2:3], off
|
||||
; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:4
|
||||
; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:2
|
||||
; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v8
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: v_pk_add_u16 v2, v7, v9
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_perm_b32 v1, v10, v9, s4
|
||||
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v10, 16, v0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_add_u16 v2, v8, v11
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1
|
||||
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
|
||||
; GFX9-NEXT: global_store_short v[4:5], v0, off
|
||||
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
|
||||
; GFX9-NEXT: global_store_short v[4:5], v2, off offset:4
|
||||
@@ -202,27 +205,34 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_ushort v6, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:2
|
||||
; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4
|
||||
; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:6
|
||||
; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:8
|
||||
; GFX9-NEXT: global_load_ushort v11, v[2:3], off
|
||||
; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:2
|
||||
; GFX9-NEXT: global_load_ushort v13, v[2:3], off offset:4
|
||||
; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:6
|
||||
; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:8
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
|
||||
; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
|
||||
; GFX9-NEXT: global_load_ushort v9, v[2:3], off
|
||||
; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:4
|
||||
; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:8
|
||||
; GFX9-NEXT: global_load_ushort v12, v[0:1], off offset:2
|
||||
; GFX9-NEXT: global_load_ushort v13, v[0:1], off offset:6
|
||||
; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:2
|
||||
; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:6
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v7
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX9-NEXT: v_perm_b32 v1, v9, v8, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v9
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v10
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX9-NEXT: v_pk_add_u16 v6, v8, v11
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX9-NEXT: v_perm_b32 v2, v12, v11, s4
|
||||
; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v12, 16, v0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v13, 16, v1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_perm_b32 v3, v14, v13, s4
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v14, 16, v2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_add_u16 v6, v10, v15
|
||||
; GFX9-NEXT: v_lshl_or_b32 v3, v15, 16, v3
|
||||
; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
|
||||
; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
|
||||
; GFX9-NEXT: global_store_short v[4:5], v0, off
|
||||
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
|
||||
@@ -409,36 +419,47 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_ushort v6, v[0:1], off
|
||||
; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:2
|
||||
; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4
|
||||
; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:6
|
||||
; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:8
|
||||
; GFX9-NEXT: global_load_ushort v11, v[0:1], off offset:10
|
||||
; GFX9-NEXT: global_load_ushort v12, v[0:1], off offset:12
|
||||
; GFX9-NEXT: global_load_ushort v13, v[2:3], off
|
||||
; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:2
|
||||
; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:4
|
||||
; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:6
|
||||
; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:8
|
||||
; GFX9-NEXT: global_load_ushort v18, v[2:3], off offset:10
|
||||
; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:12
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
|
||||
; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
|
||||
; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:12
|
||||
; GFX9-NEXT: global_load_ushort v10, v[2:3], off
|
||||
; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:4
|
||||
; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:8
|
||||
; GFX9-NEXT: global_load_ushort v13, v[2:3], off offset:12
|
||||
; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:2
|
||||
; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:6
|
||||
; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:10
|
||||
; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:2
|
||||
; GFX9-NEXT: global_load_ushort v18, v[2:3], off offset:6
|
||||
; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:10
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(13)
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(10)
|
||||
; GFX9-NEXT: v_perm_b32 v1, v9, v8, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v7
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v8
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v10
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX9-NEXT: v_perm_b32 v2, v11, v10, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v11
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v12
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX9-NEXT: v_pk_add_u16 v8, v9, v13
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX9-NEXT: v_perm_b32 v3, v14, v13, s4
|
||||
; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v15, 16, v1
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX9-NEXT: v_perm_b32 v6, v16, v15, s4
|
||||
; GFX9-NEXT: v_pk_add_u16 v1, v1, v6
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_perm_b32 v7, v18, v17, s4
|
||||
; GFX9-NEXT: v_lshl_or_b32 v6, v18, 16, v6
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_add_u16 v8, v12, v19
|
||||
; GFX9-NEXT: v_lshl_or_b32 v7, v19, 16, v7
|
||||
; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
|
||||
; GFX9-NEXT: v_pk_add_u16 v1, v1, v6
|
||||
; GFX9-NEXT: v_pk_add_u16 v2, v2, v7
|
||||
; GFX9-NEXT: global_store_short v[4:5], v0, off
|
||||
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
|
||||
@@ -697,28 +718,30 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
|
||||
; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
|
||||
; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:18
|
||||
; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:16
|
||||
; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:18
|
||||
; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16
|
||||
; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
|
||||
; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:20
|
||||
; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:20
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:20
|
||||
; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:20
|
||||
; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:18
|
||||
; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:18
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX9-NEXT: v_perm_b32 v14, v15, v14, s4
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX9-NEXT: v_perm_b32 v15, v17, v16, s4
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX9-NEXT: v_pk_add_u16 v0, v6, v10
|
||||
; GFX9-NEXT: v_pk_add_u16 v1, v7, v11
|
||||
; GFX9-NEXT: v_pk_add_u16 v2, v8, v12
|
||||
; GFX9-NEXT: v_pk_add_u16 v3, v9, v13
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_lshl_or_b32 v7, v18, 16, v14
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_pk_add_u16 v6, v18, v19
|
||||
; GFX9-NEXT: v_pk_add_u16 v7, v14, v15
|
||||
; GFX9-NEXT: v_lshl_or_b32 v8, v19, 16, v15
|
||||
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
|
||||
; GFX9-NEXT: global_store_short v[4:5], v7, off offset:16
|
||||
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v7, off offset:18
|
||||
; GFX9-NEXT: v_pk_add_u16 v6, v16, v17
|
||||
; GFX9-NEXT: v_pk_add_u16 v0, v7, v8
|
||||
; GFX9-NEXT: global_store_short v[4:5], v0, off offset:16
|
||||
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:18
|
||||
; GFX9-NEXT: global_store_short v[4:5], v6, off offset:20
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
|
||||
@@ -99,7 +99,7 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
|
||||
; GCN-LABEL: v_andn2_i32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GCN-NEXT: v_not_b32_e32 v1, v1
|
||||
; GCN-NEXT: v_and_b32_e32 v0, v0, v1
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@@ -107,7 +107,7 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%not.src1 = xor i32 %src1, -1
|
||||
@@ -118,13 +118,13 @@ define i32 @v_andn2_i32(i32 %src0, i32 %src1) {
|
||||
define amdgpu_ps float @v_andn2_i32_sv(i32 inreg %src0, i32 %src1) {
|
||||
; GCN-LABEL: v_andn2_i32_sv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GCN-NEXT: v_not_b32_e32 v0, v0
|
||||
; GCN-NEXT: v_and_b32_e32 v0, s2, v0
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_andn2_i32_sv:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
%not.src1 = xor i32 %src1, -1
|
||||
@@ -248,8 +248,8 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
|
||||
; GCN-LABEL: v_andn2_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GCN-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GCN-NEXT: v_not_b32_e32 v2, v2
|
||||
; GCN-NEXT: v_not_b32_e32 v3, v3
|
||||
; GCN-NEXT: v_and_b32_e32 v0, v0, v2
|
||||
; GCN-NEXT: v_and_b32_e32 v1, v1, v3
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -258,8 +258,8 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -271,16 +271,16 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
|
||||
define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) {
|
||||
; GCN-LABEL: v_andn2_i64_sv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GCN-NEXT: v_not_b32_e32 v0, v0
|
||||
; GCN-NEXT: v_not_b32_e32 v1, v1
|
||||
; GCN-NEXT: v_and_b32_e32 v0, s2, v0
|
||||
; GCN-NEXT: v_and_b32_e32 v1, s3, v1
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_andn2_i64_sv:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v1, s3, v1
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
@@ -466,14 +466,14 @@ define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, i16 %src1) {
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GCN-NEXT: v_and_b32_e32 v0, s2, v0
|
||||
; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_andn2_i16_sv:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
|
||||
; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
%not.src1 = xor i16 %src1, -1
|
||||
%and = and i16 %src0, %not.src1
|
||||
@@ -487,14 +487,14 @@ define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_xor_b32 s0, s2, -1
|
||||
; GCN-NEXT: v_and_b32_e32 v0, s0, v0
|
||||
; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_andn2_i16_vs:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0
|
||||
; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
%not.src1 = xor i16 %src1, -1
|
||||
%and = and i16 %src0, %not.src1
|
||||
|
||||
@@ -458,8 +458,8 @@ define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
|
||||
; GFX7-NEXT: s_lshl_b32 s2, s1, 8
|
||||
; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80008
|
||||
; GFX7-NEXT: s_or_b32 s1, s1, s2
|
||||
; GFX7-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX7-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; GFX7-NEXT: s_and_b32 s0, 0xffff, s0
|
||||
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX7-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
@@ -497,7 +497,7 @@ define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
|
||||
; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8
|
||||
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX8-LABEL: v_bswap_i16_zext_to_i32:
|
||||
|
||||
@@ -1530,15 +1530,16 @@ define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) {
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
|
||||
; SI-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; VI-LABEL: v_test_uitofp_i16_byte_to_f32:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
|
||||
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
%masked = and i16 %arg0, 255
|
||||
%itofp = uitofp i16 %masked to float
|
||||
|
||||
@@ -669,10 +669,9 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
|
||||
; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fdiv_v2f16:
|
||||
@@ -762,10 +761,8 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
|
||||
; GFX8-NEXT: v_rcp_f16_e32 v2, v1
|
||||
; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fdiv_v2f16_afn:
|
||||
@@ -894,10 +891,9 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
|
||||
; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fdiv_v2f16_ulp25:
|
||||
@@ -1051,9 +1047,8 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
|
||||
; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_rcp_v2f16:
|
||||
@@ -1202,9 +1197,8 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
|
||||
; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_rcp_v2f16_arcp:
|
||||
@@ -1285,11 +1279,9 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
|
||||
; GFX8-LABEL: v_rcp_v2f16_arcp_afn:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_rcp_f16_e32 v0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_rcp_f16_e32 v1, v0
|
||||
; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_rcp_v2f16_arcp_afn:
|
||||
@@ -1397,11 +1389,9 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
|
||||
; GFX8-LABEL: v_rcp_v2f16_ulp25:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_rcp_f16_e32 v0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_rcp_f16_e32 v1, v0
|
||||
; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_rcp_v2f16_ulp25:
|
||||
@@ -1457,10 +1447,8 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
|
||||
; GFX8-NEXT: v_rcp_f16_e32 v2, v1
|
||||
; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
|
||||
@@ -1589,10 +1577,9 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
|
||||
; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
|
||||
@@ -1682,10 +1669,8 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
|
||||
; GFX8-NEXT: v_rcp_f16_e32 v2, v1
|
||||
; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
|
||||
|
||||
@@ -269,9 +269,8 @@ define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) {
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
||||
; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
|
||||
; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fma_v2f16:
|
||||
@@ -327,9 +326,8 @@ define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half>
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
||||
; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
|
||||
; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fma_v2f16_fneg_lhs:
|
||||
@@ -386,9 +384,8 @@ define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half>
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
||||
; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
|
||||
; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fma_v2f16_fneg_rhs:
|
||||
@@ -439,9 +436,8 @@ define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x h
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
||||
; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
|
||||
; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fma_v2f16_fneg_lhs_rhs:
|
||||
@@ -498,11 +494,9 @@ define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) {
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4
|
||||
; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4
|
||||
; GFX8-NEXT: v_fma_f16 v2, v6, v7, v8
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fma_v3f16:
|
||||
@@ -568,13 +562,12 @@ define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v5
|
||||
; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4
|
||||
; GFX8-NEXT: v_fma_f16 v2, v6, v8, v10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 16
|
||||
; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5
|
||||
; GFX8-NEXT: v_fma_f16 v3, v7, v9, v11
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_fma_v4f16:
|
||||
|
||||
@@ -297,11 +297,10 @@ define <2 x half> @test_min_max_v2f16(<2 x half> %a) #0 {
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 0x4000
|
||||
; GFX8-NEXT: v_max_f16_e32 v1, 2.0, v0
|
||||
; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_min_f16_e32 v0, 4.0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 0x4400
|
||||
; GFX8-NEXT: v_min_f16_e32 v1, 4.0, v1
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
%maxnum = call nnan <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
|
||||
%fmed = call nnan <2 x half> @llvm.minnum.v2f16(<2 x half> %maxnum, <2 x half> <half 4.0, half 4.0>)
|
||||
|
||||
@@ -15,10 +15,8 @@ define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v2f16:
|
||||
@@ -43,10 +41,8 @@ define <2 x half> @v_fmul_v2f16_fneg_lhs(<2 x half> %a, <2 x half> %b) {
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v2f16_fneg_lhs:
|
||||
@@ -72,10 +68,8 @@ define <2 x half> @v_fmul_v2f16_fneg_rhs(<2 x half> %a, <2 x half> %b) {
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v2f16_fneg_rhs:
|
||||
@@ -100,10 +94,8 @@ define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b)
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mul_f16_e32 v2, v0, v1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v2f16_fneg_lhs_fneg_rhs:
|
||||
@@ -130,12 +122,9 @@ define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v3f16:
|
||||
@@ -164,12 +153,9 @@ define <3 x half> @v_fmul_v3f16_fneg_lhs(<3 x half> %a, <3 x half> %b) {
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000
|
||||
; GFX8-NEXT: v_xor_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v3f16_fneg_lhs:
|
||||
@@ -199,12 +185,9 @@ define <3 x half> @v_fmul_v3f16_fneg_rhs(<3 x half> %a, <3 x half> %b) {
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 0x80008000
|
||||
; GFX8-NEXT: v_xor_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v3f16_fneg_rhs:
|
||||
@@ -231,12 +214,9 @@ define <3 x half> @v_fmul_v3f16_fneg_lhs_fneg_rhs(<3 x half> %a, <3 x half> %b)
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v3f16_fneg_lhs_fneg_rhs:
|
||||
@@ -264,14 +244,11 @@ define <4 x half> @v_fmul_v4f16(<4 x half> %a, <4 x half> %b) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v4f16:
|
||||
@@ -299,14 +276,11 @@ define <4 x half> @v_fmul_v4f16_fneg_lhs(<4 x half> %a, <4 x half> %b) {
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v4f16_fneg_lhs:
|
||||
@@ -335,14 +309,11 @@ define <4 x half> @v_fmul_v4f16_fneg_rhs(<4 x half> %a, <4 x half> %b) {
|
||||
; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
|
||||
; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v4f16_fneg_rhs:
|
||||
@@ -369,14 +340,11 @@ define <4 x half> @v_fmul_v4f16_fneg_lhs_fneg_rhs(<4 x half> %a, <4 x half> %b)
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v2, v1, v3
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v4f16_fneg_lhs_fneg_rhs:
|
||||
@@ -405,19 +373,14 @@ define <6 x half> @v_fmul_v6f16(<6 x half> %a, <6 x half> %b) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v6f16:
|
||||
@@ -448,19 +411,14 @@ define <6 x half> @v_fmul_v6f16_fneg_lhs(<6 x half> %a, <6 x half> %b) {
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
|
||||
; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v6f16_fneg_lhs:
|
||||
@@ -492,19 +450,14 @@ define <6 x half> @v_fmul_v6f16_fneg_rhs(<6 x half> %a, <6 x half> %b) {
|
||||
; GFX8-NEXT: v_xor_b32_e32 v4, 0x80008000, v4
|
||||
; GFX8-NEXT: v_xor_b32_e32 v5, 0x80008000, v5
|
||||
; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v6f16_fneg_rhs:
|
||||
@@ -533,19 +486,14 @@ define <6 x half> @v_fmul_v6f16_fneg_lhs_fneg_rhs(<6 x half> %a, <6 x half> %b)
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mul_f16_e32 v6, v0, v3
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v3, v1, v4
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v2, v5
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v6f16_fneg_lhs_fneg_rhs:
|
||||
@@ -576,23 +524,17 @@ define <8 x half> @v_fmul_v8f16(<8 x half> %a, <8 x half> %b) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
|
||||
; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v8f16:
|
||||
@@ -626,23 +568,17 @@ define <8 x half> @v_fmul_v8f16_fneg_lhs(<8 x half> %a, <8 x half> %b) {
|
||||
; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
|
||||
; GFX8-NEXT: v_xor_b32_e32 v3, 0x80008000, v3
|
||||
; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
|
||||
; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v8f16_fneg_lhs:
|
||||
@@ -677,23 +613,17 @@ define <8 x half> @v_fmul_v8f16_fneg_rhs(<8 x half> %a, <8 x half> %b) {
|
||||
; GFX8-NEXT: v_xor_b32_e32 v6, 0x80008000, v6
|
||||
; GFX8-NEXT: v_xor_b32_e32 v7, 0x80008000, v7
|
||||
; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
|
||||
; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v8f16_fneg_rhs:
|
||||
@@ -724,23 +654,17 @@ define <8 x half> @v_fmul_v8f16_fneg_lhs_fneg_rhs(<8 x half> %a, <8 x half> %b)
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mul_f16_e32 v8, v0, v4
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v4, v1, v5
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v5, v2, v6
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mul_f16_e32 v6, v3, v7
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
|
||||
; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_fmul_v8f16_fneg_lhs_fneg_rhs:
|
||||
|
||||
@@ -208,13 +208,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX8-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_pow_v2f16:
|
||||
@@ -226,14 +224,13 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX9-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_pow_v2f16:
|
||||
@@ -248,37 +245,39 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX10-NEXT: v_exp_f16_e32 v1, v2
|
||||
; GFX10-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: v_pow_v2f16:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; GFX11-NEXT: v_log_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_log_f16_e32 v2, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_log_f16_e32 v2, v2
|
||||
; GFX11-NEXT: v_log_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
|
||||
; GFX11-NEXT: s_waitcnt_depctr 0xfff
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GFX11-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX11-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX11-NEXT: s_waitcnt_depctr 0xfff
|
||||
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
|
||||
ret <2 x half> %pow
|
||||
@@ -319,13 +318,11 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX8-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
|
||||
@@ -338,14 +335,13 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX9-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_pow_v2f16_fneg_lhs:
|
||||
@@ -361,11 +357,12 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX10-NEXT: v_exp_f16_e32 v1, v2
|
||||
; GFX10-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: v_pow_v2f16_fneg_lhs:
|
||||
@@ -376,23 +373,25 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; GFX11-NEXT: v_log_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_log_f16_e32 v2, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_log_f16_e32 v2, v2
|
||||
; GFX11-NEXT: v_log_f16_e32 v0, v0
|
||||
; GFX11-NEXT: s_waitcnt_depctr 0xfff
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX11-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX11-NEXT: s_waitcnt_depctr 0xfff
|
||||
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%x.fneg = fneg <2 x half> %x
|
||||
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
|
||||
@@ -429,18 +428,16 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
|
||||
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX8-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_exp_f16_e32 v1, v2
|
||||
; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
|
||||
@@ -453,14 +450,13 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_exp_f16_e32 v1, v2
|
||||
; GFX9-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_pow_v2f16_fneg_rhs:
|
||||
@@ -468,46 +464,48 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_log_f16_e32 v2, v0
|
||||
; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
|
||||
; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX10-NEXT: v_exp_f16_e32 v1, v2
|
||||
; GFX10-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: v_pow_v2f16_fneg_rhs:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; GFX11-NEXT: v_log_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_log_f16_e32 v2, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_log_f16_e32 v2, v2
|
||||
; GFX11-NEXT: v_log_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GFX11-NEXT: s_waitcnt_depctr 0xfff
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GFX11-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX11-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX11-NEXT: s_waitcnt_depctr 0xfff
|
||||
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%y.fneg = fneg <2 x half> %y
|
||||
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
|
||||
@@ -550,18 +548,16 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
|
||||
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX8-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_exp_f16_e32 v1, v2
|
||||
; GFX8-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:
|
||||
@@ -575,14 +571,13 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_exp_f16_e32 v1, v2
|
||||
; GFX9-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs:
|
||||
@@ -599,11 +594,12 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
|
||||
; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX10-NEXT: v_exp_f16_e32 v1, v2
|
||||
; GFX10-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs:
|
||||
@@ -613,26 +609,27 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
|
||||
; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; GFX11-NEXT: v_log_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_log_f16_e32 v2, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_log_f16_e32 v2, v2
|
||||
; GFX11-NEXT: v_log_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
|
||||
; GFX11-NEXT: s_waitcnt_depctr 0xfff
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v1 :: v_dual_mul_dx9_zero_f32 v1, v2, v3
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GFX11-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX11-NEXT: v_exp_f16_e32 v1, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_exp_f16_e32 v0, v0
|
||||
; GFX11-NEXT: s_waitcnt_depctr 0xfff
|
||||
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%x.fneg = fneg <2 x half> %x
|
||||
%y.fneg = fneg <2 x half> %y
|
||||
|
||||
@@ -532,12 +532,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; CI-NEXT: v_fma_f32 v3, -v3, v6, v4
|
||||
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
||||
; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
|
||||
; CI-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v1
|
||||
; CI-NEXT: v_trunc_f32_e32 v3, v3
|
||||
; CI-NEXT: v_fma_f32 v1, -v3, v2, v1
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; CI-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; CI-NEXT: v_or_b32_e32 v2, v0, v1
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s4
|
||||
@@ -573,9 +571,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1
|
||||
; VI-NEXT: v_trunc_f16_e32 v1, v1
|
||||
; VI-NEXT: v_fma_f16 v1, -v1, v2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 16
|
||||
; VI-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, v0, v1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
@@ -670,18 +667,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; CI-NEXT: v_fma_f32 v5, -v5, v8, v6
|
||||
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
|
||||
; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
|
||||
; CI-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; CI-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; CI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; CI-NEXT: v_bfe_u32 v1, v2, 0, 16
|
||||
; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v3
|
||||
; CI-NEXT: v_trunc_f32_e32 v5, v5
|
||||
; CI-NEXT: v_fma_f32 v3, -v5, v4, v3
|
||||
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; CI-NEXT: v_bfe_u32 v2, v3, 0, 16
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; CI-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
|
||||
; CI-NEXT: v_or_b32_e32 v1, v2, v1
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
@@ -723,6 +716,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; VI-NEXT: v_trunc_f16_e32 v1, v1
|
||||
; VI-NEXT: v_fma_f16 v1, -v1, v2, s6
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; VI-NEXT: v_mul_f32_e32 v2, v2, v4
|
||||
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s9
|
||||
@@ -735,11 +730,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7
|
||||
; VI-NEXT: v_trunc_f16_e32 v3, v3
|
||||
; VI-NEXT: v_fma_f16 v3, -v3, v4, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v4, 16
|
||||
; VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
|
||||
; VI-NEXT: v_or_b32_e32 v1, v2, v1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -283,7 +283,7 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1
|
||||
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v3, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@@ -300,7 +300,7 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX8-NEXT: s_and_b32 s2, s4, 0xffff
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -317,7 +317,7 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX7-NEXT: s_and_b32 s1, s4, 0xffff
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v3, s0, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -334,7 +334,7 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1
|
||||
; GFX10-NEXT: v_not_b32_e32 v3, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@@ -352,7 +352,7 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v1
|
||||
; GFX11-NEXT: v_not_b32_e32 v3, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@@ -375,7 +375,7 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX9-NEXT: s_mov_b32 s1, 0xffff
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v1, s1
|
||||
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v3, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@@ -391,7 +391,7 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX8-NEXT: s_mov_b32 s1, 0xffff
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v0, v1, s1
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -408,7 +408,7 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v1
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v3, s0, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -426,7 +426,7 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX10-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_and_or_b32 v2, s0, v2, v3
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
@@ -443,7 +443,7 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX11-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_and_or_b32 v2, s0, v2, v3
|
||||
@@ -466,7 +466,7 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX9-NEXT: s_and_b32 s1, s2, 0xffff
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v4, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -483,7 +483,7 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -500,7 +500,7 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v3, v0, v1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -517,7 +517,7 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1
|
||||
; GFX10-NEXT: v_not_b32_e32 v4, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -535,7 +535,7 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v4, -1, v1
|
||||
; GFX11-NEXT: v_not_b32_e32 v4, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -647,7 +647,7 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX9-NEXT: s_mov_b32 s0, 0xffff
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v3, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -663,7 +663,7 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -680,7 +680,7 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v3, v0, v1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -696,7 +696,7 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1
|
||||
; GFX10-NEXT: v_not_b32_e32 v3, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -715,7 +715,7 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX11-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3
|
||||
@@ -1053,7 +1053,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX9-NEXT: v_and_or_b32 v4, v1, v0, v3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
@@ -1080,7 +1080,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v4, v0, v3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
@@ -1107,7 +1107,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
|
||||
; GFX7-NEXT: v_or_b32_e32 v4, v0, v3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
@@ -1130,7 +1130,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX10-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo
|
||||
@@ -1161,7 +1161,7 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
|
||||
; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX11-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_and_or_b32 v5, v5, v2, v3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
@@ -1192,7 +1192,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
@@ -1218,7 +1218,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
@@ -1245,7 +1245,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
|
||||
; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v4, v1, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
@@ -1267,7 +1267,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX10-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
|
||||
@@ -1296,7 +1296,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX11-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
||||
@@ -1327,7 +1327,7 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
|
||||
; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX9-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, 0
|
||||
@@ -1350,7 +1350,7 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
|
||||
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX8-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 0
|
||||
@@ -1373,7 +1373,7 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v6, s0, v2
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
|
||||
; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX7-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, 0
|
||||
@@ -1397,7 +1397,7 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4
|
||||
; GFX10-NEXT: v_not_b32_e32 v3, v4
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
|
||||
; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2
|
||||
@@ -1421,7 +1421,7 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v3, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v5
|
||||
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v4
|
||||
; GFX11-NEXT: v_not_b32_e32 v3, v4
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
@@ -1569,7 +1569,7 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
|
||||
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX9-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, 0
|
||||
@@ -1591,7 +1591,7 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
|
||||
; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX8-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, 0
|
||||
@@ -1614,7 +1614,7 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v3
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
|
||||
; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX7-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
|
||||
; GFX7-NEXT: v_mov_b32_e32 v5, 0
|
||||
@@ -1637,7 +1637,7 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5
|
||||
; GFX10-NEXT: v_not_b32_e32 v3, v5
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
|
||||
; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2
|
||||
@@ -1662,7 +1662,7 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v5
|
||||
; GFX11-NEXT: v_not_b32_e32 v3, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
|
||||
; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2
|
||||
@@ -2222,7 +2222,7 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s9
|
||||
@@ -2259,7 +2259,7 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v6, v0, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s8
|
||||
@@ -2296,7 +2296,7 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
|
||||
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
|
||||
; GFX7-NEXT: v_or_b32_e32 v5, v0, v2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s8
|
||||
@@ -2327,7 +2327,7 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
|
||||
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2
|
||||
; GFX10-NEXT: v_not_b32_e32 v5, v2
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo
|
||||
@@ -2366,7 +2366,7 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v5, -1, v2
|
||||
; GFX11-NEXT: v_not_b32_e32 v5, v2
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
|
||||
@@ -2408,7 +2408,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
@@ -2444,7 +2444,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v6, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
@@ -2481,7 +2481,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
|
||||
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v5, v1, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
@@ -2512,7 +2512,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3
|
||||
; GFX10-NEXT: v_not_b32_e32 v5, v3
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo
|
||||
@@ -2551,7 +2551,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v5, -1, v3
|
||||
; GFX11-NEXT: v_not_b32_e32 v5, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
|
||||
@@ -2588,7 +2588,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, 0
|
||||
@@ -2617,7 +2617,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, 0
|
||||
@@ -2649,7 +2649,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX7-NEXT: s_mov_b32 s10, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -2678,7 +2678,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7
|
||||
; GFX10-NEXT: v_not_b32_e32 v7, v7
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0
|
||||
@@ -2708,7 +2708,7 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s1
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1
|
||||
; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7
|
||||
; GFX11-NEXT: v_not_b32_e32 v7, v7
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
@@ -2895,7 +2895,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v9, 0
|
||||
@@ -2923,7 +2923,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v9, 0
|
||||
@@ -2955,7 +2955,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX7-NEXT: s_mov_b32 s10, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -2983,7 +2983,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8
|
||||
; GFX10-NEXT: v_not_b32_e32 v2, v8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -3017,7 +3017,7 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
|
||||
; GFX11-NEXT: v_xor_b32_e32 v2, -1, v8
|
||||
; GFX11-NEXT: v_not_b32_e32 v2, v8
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
@@ -3587,7 +3587,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s17
|
||||
@@ -3647,7 +3647,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v9, v0, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s16
|
||||
@@ -3707,7 +3707,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0
|
||||
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
|
||||
; GFX7-NEXT: v_or_b32_e32 v9, v0, v2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s16
|
||||
@@ -3754,7 +3754,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2
|
||||
; GFX10-NEXT: v_not_b32_e32 v9, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1
|
||||
@@ -3808,7 +3808,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12
|
||||
; GFX11-NEXT: v_xor_b32_e32 v9, -1, v2
|
||||
; GFX11-NEXT: v_not_b32_e32 v9, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0
|
||||
@@ -3880,7 +3880,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s20
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s13
|
||||
@@ -3939,7 +3939,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s20
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s12
|
||||
@@ -3999,7 +3999,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
|
||||
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11]
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v9, v1, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s12
|
||||
@@ -4046,7 +4046,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s9
|
||||
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3
|
||||
; GFX10-NEXT: v_not_b32_e32 v9, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1
|
||||
@@ -4100,7 +4100,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, s9
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v8, v1, v0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v9, -1, v3
|
||||
; GFX11-NEXT: v_not_b32_e32 v9, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
|
||||
@@ -4160,7 +4160,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc
|
||||
@@ -4208,7 +4208,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc
|
||||
@@ -4257,7 +4257,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
|
||||
; GFX7-NEXT: s_mov_b32 s18, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(1)
|
||||
@@ -4305,7 +4305,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s5
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11
|
||||
; GFX10-NEXT: v_not_b32_e32 v11, v11
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0
|
||||
@@ -4351,7 +4351,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, s5
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v11, -1, v11
|
||||
; GFX11-NEXT: v_not_b32_e32 v11, v11
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
@@ -4556,7 +4556,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
|
||||
@@ -4603,7 +4603,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
|
||||
@@ -4652,7 +4652,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
|
||||
; GFX7-NEXT: s_mov_b32 s18, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(1)
|
||||
@@ -4699,7 +4699,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12
|
||||
; GFX10-NEXT: v_not_b32_e32 v3, v12
|
||||
; GFX10-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v13, 0
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
@@ -4745,7 +4745,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v2, v3, v2
|
||||
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v12
|
||||
; GFX11-NEXT: v_not_b32_e32 v3, v12
|
||||
; GFX11-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v13, 0
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(1)
|
||||
|
||||
@@ -1077,7 +1077,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX9-NEXT: s_and_b32 s2, s4, 0xff
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1
|
||||
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v3, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@@ -1094,7 +1094,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX8-NEXT: s_and_b32 s2, s4, 0xff
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -1111,7 +1111,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX7-NEXT: s_and_b32 s1, s4, 0xff
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
|
||||
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
@@ -1129,7 +1129,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1
|
||||
; GFX10-NEXT: v_not_b32_e32 v3, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@@ -1147,7 +1147,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v1
|
||||
; GFX11-NEXT: v_not_b32_e32 v3, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@@ -1170,7 +1170,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX9-NEXT: s_movk_i32 s1, 0xff
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v1, s1
|
||||
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v3, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@@ -1186,7 +1186,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX8-NEXT: s_movk_i32 s1, 0xff
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v0, v1, s1
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -1203,7 +1203,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
@@ -1222,7 +1222,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX10-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_and_or_b32 v2, s0, v2, v3
|
||||
; GFX10-NEXT: global_store_dword v[0:1], v2, off
|
||||
@@ -1239,7 +1239,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX11-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_and_or_b32 v2, s0, v2, v3
|
||||
@@ -1262,7 +1262,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX9-NEXT: s_and_b32 s1, s2, 0xff
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v4, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -1279,7 +1279,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -1299,7 +1299,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
|
||||
@@ -1315,7 +1315,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1
|
||||
; GFX10-NEXT: v_not_b32_e32 v4, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -1333,7 +1333,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v4, -1, v1
|
||||
; GFX11-NEXT: v_not_b32_e32 v4, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -1447,7 +1447,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX9-NEXT: s_movk_i32 s0, 0xff
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v3, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -1463,7 +1463,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0
|
||||
@@ -1483,7 +1483,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
|
||||
@@ -1498,7 +1498,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1
|
||||
; GFX10-NEXT: v_not_b32_e32 v3, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -1517,7 +1517,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX11-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3
|
||||
@@ -1935,7 +1935,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX9-NEXT: v_and_or_b32 v4, v1, v0, v3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
@@ -1962,7 +1962,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v4, v0, v3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
@@ -1989,7 +1989,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
|
||||
; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
@@ -2013,7 +2013,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX10-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo
|
||||
@@ -2044,7 +2044,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
|
||||
; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX11-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_and_or_b32 v5, v5, v2, v3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
@@ -2075,7 +2075,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
@@ -2101,7 +2101,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
@@ -2128,7 +2128,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
|
||||
; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v3, v1, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
@@ -2151,7 +2151,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX10-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
|
||||
@@ -2180,7 +2180,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX11-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
|
||||
@@ -2211,7 +2211,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
|
||||
; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX9-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, 0
|
||||
@@ -2234,7 +2234,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
|
||||
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX8-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 0
|
||||
@@ -2260,7 +2260,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
|
||||
; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX7-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -2283,7 +2283,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4
|
||||
; GFX10-NEXT: v_not_b32_e32 v3, v4
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
|
||||
; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2
|
||||
@@ -2307,7 +2307,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v3, s0
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v5
|
||||
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v4
|
||||
; GFX11-NEXT: v_not_b32_e32 v3, v4
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
@@ -2457,7 +2457,7 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
|
||||
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX9-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, 0
|
||||
@@ -2479,7 +2479,7 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
|
||||
; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX8-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, 0
|
||||
@@ -2505,7 +2505,7 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
|
||||
; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX7-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
|
||||
; GFX7-NEXT: s_mov_b32 s6, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -2527,7 +2527,7 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xff
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5
|
||||
; GFX10-NEXT: v_not_b32_e32 v3, v5
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
|
||||
; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2
|
||||
@@ -2552,7 +2552,7 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v5
|
||||
; GFX11-NEXT: v_not_b32_e32 v3, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
|
||||
; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2
|
||||
@@ -3112,7 +3112,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX9-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s9
|
||||
@@ -3149,7 +3149,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v6, v0, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s8
|
||||
@@ -3186,7 +3186,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
|
||||
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
|
||||
; GFX7-NEXT: v_or_b32_e32 v5, v0, v2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s8
|
||||
@@ -3217,7 +3217,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
|
||||
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2
|
||||
; GFX10-NEXT: v_not_b32_e32 v5, v2
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo
|
||||
@@ -3256,7 +3256,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_xor_b32_e32 v5, -1, v2
|
||||
; GFX11-NEXT: v_not_b32_e32 v5, v2
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
|
||||
@@ -3298,7 +3298,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
@@ -3334,7 +3334,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v6, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
@@ -3371,7 +3371,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
|
||||
; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
|
||||
; GFX7-NEXT: v_or_b32_e32 v5, v1, v0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
@@ -3402,7 +3402,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3
|
||||
; GFX10-NEXT: v_not_b32_e32 v5, v3
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo
|
||||
@@ -3441,7 +3441,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0
|
||||
; GFX11-NEXT: v_xor_b32_e32 v5, -1, v3
|
||||
; GFX11-NEXT: v_not_b32_e32 v5, v3
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
|
||||
@@ -3478,7 +3478,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, 0
|
||||
@@ -3507,7 +3507,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, 0
|
||||
@@ -3539,7 +3539,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX7-NEXT: s_mov_b32 s10, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -3568,7 +3568,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7
|
||||
; GFX10-NEXT: v_not_b32_e32 v7, v7
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0
|
||||
@@ -3598,7 +3598,7 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v7, v0, 0xff
|
||||
; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s1
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1
|
||||
; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7
|
||||
; GFX11-NEXT: v_not_b32_e32 v7, v7
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
@@ -3785,7 +3785,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX9-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v9, 0
|
||||
@@ -3813,7 +3813,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v9, 0
|
||||
@@ -3845,7 +3845,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
|
||||
; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
|
||||
; GFX7-NEXT: s_mov_b32 s10, -1
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -3873,7 +3873,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
|
||||
; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xff
|
||||
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8
|
||||
; GFX10-NEXT: v_not_b32_e32 v2, v8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -3907,7 +3907,7 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
|
||||
; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
|
||||
; GFX11-NEXT: v_xor_b32_e32 v2, -1, v8
|
||||
; GFX11-NEXT: v_not_b32_e32 v2, v8
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v9, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
|
||||
@@ -77,7 +77,7 @@ body: |
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: %src0:vgpr_32 = COPY $vgpr0
|
||||
; GFX6-NEXT: %zero:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; GFX6-NEXT: %ineg:vgpr_32, dead %4:sreg_64_xexec = V_SUB_CO_U32_e64 %zero, %src0, 0, implicit $exec
|
||||
; GFX6-NEXT: %ineg:vgpr_32, dead %4:sreg_64 = V_SUB_CO_U32_e64 %zero, %src0, 0, implicit $exec
|
||||
; GFX6-NEXT: %smax:vgpr_32 = V_MAX_I32_e64 %src0, %ineg, implicit $exec
|
||||
; GFX6-NEXT: S_ENDPGM 0, implicit %smax
|
||||
; GFX9-LABEL: name: smax_neg_abs_pattern_s32_vv
|
||||
|
||||
@@ -22,10 +22,10 @@ body: |
|
||||
; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
|
||||
; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY]], [[COPY1]], implicit-def $scc
|
||||
; GFX6-NEXT: %7:vgpr_32, dead %12:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[S_ADD_I32_]], 0, implicit $exec
|
||||
; GFX6-NEXT: %8:vgpr_32, dead %11:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_ADD_I32_]], %7, 0, implicit $exec
|
||||
; GFX6-NEXT: %9:vgpr_32, dead %10:sreg_64_xexec = V_ADD_CO_U32_e64 %8, [[COPY2]], 0, implicit $exec
|
||||
; GFX6-NEXT: S_ENDPGM 0, implicit [[S_ADD_I32_]], implicit %7, implicit %8, implicit %9
|
||||
; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY2]], [[S_ADD_I32_]], 0, implicit $exec
|
||||
; GFX6-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[S_ADD_I32_]], [[V_ADD_CO_U32_e64_]], 0, implicit $exec
|
||||
; GFX6-NEXT: [[V_ADD_CO_U32_e64_4:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_5:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_ADD_CO_U32_e64_2]], [[COPY2]], 0, implicit $exec
|
||||
; GFX6-NEXT: S_ENDPGM 0, implicit [[S_ADD_I32_]], implicit [[V_ADD_CO_U32_e64_]], implicit [[V_ADD_CO_U32_e64_2]], implicit [[V_ADD_CO_U32_e64_4]]
|
||||
; GFX9-LABEL: name: add_s32
|
||||
; GFX9: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr3_vgpr4
|
||||
; GFX9-NEXT: {{ $}}
|
||||
@@ -103,8 +103,8 @@ body: |
|
||||
; GFX6: liveins: $vgpr0
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX6-NEXT: %2:vgpr_32, dead %3:sreg_64 = V_SUB_CO_U32_e64 [[COPY]], 64, 0, implicit $exec
|
||||
; GFX6-NEXT: S_ENDPGM 0, implicit %2
|
||||
; GFX6-NEXT: [[V_SUB_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_SUB_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_SUB_CO_U32_e64 [[COPY]], 64, 0, implicit $exec
|
||||
; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_CO_U32_e64_]]
|
||||
; GFX9-LABEL: name: add_neg_inline_const_64_to_sub_s32_v
|
||||
; GFX9: liveins: $vgpr0
|
||||
; GFX9-NEXT: {{ $}}
|
||||
@@ -164,8 +164,8 @@ body: |
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX6-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
|
||||
; GFX6-NEXT: %2:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
|
||||
; GFX6-NEXT: S_ENDPGM 0, implicit %2
|
||||
; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
|
||||
; GFX6-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]]
|
||||
; GFX9-LABEL: name: add_neg_inline_const_16_to_sub_s32_v
|
||||
; GFX9: liveins: $vgpr0
|
||||
; GFX9-NEXT: {{ $}}
|
||||
|
||||
@@ -62,8 +62,9 @@ body: |
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
|
||||
; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_NC_U16_e64_]], 0, 16, implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
|
||||
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ADD_NC_U16_e64_]], implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
%2:vgpr(s16) = G_TRUNC %0
|
||||
@@ -125,8 +126,9 @@ body: |
|
||||
; GFX10-NEXT: {{ $}}
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
|
||||
; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_SUB_NC_U16_e64_]], 0, 16, implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
|
||||
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_SUB_NC_U16_e64_]], implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s16) = G_TRUNC %0
|
||||
%2:vgpr(s16) = G_CONSTANT i16 -64
|
||||
|
||||
@@ -22,7 +22,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
|
||||
; WAVE32-LABEL: name: and_s1_vcc_vcc_vcc
|
||||
; WAVE32: liveins: $vgpr0, $vgpr1
|
||||
@@ -32,7 +32,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_AND_B32_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
@@ -401,7 +401,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
|
||||
; WAVE32-LABEL: name: and_s1_vcc_copy_to_vcc
|
||||
; WAVE32: liveins: $vgpr0, $vgpr1
|
||||
@@ -412,7 +412,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_AND_B32_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
@@ -446,7 +446,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_AND_B64_]]
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]]
|
||||
; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32
|
||||
@@ -458,7 +458,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_AND_B32_1]]
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]]
|
||||
%1:vgpr(s32) = COPY $vgpr0
|
||||
@@ -494,7 +494,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_AND_B64_]]
|
||||
; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64
|
||||
; WAVE32: liveins: $vgpr0, $sgpr0
|
||||
@@ -505,7 +505,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_AND_B32_1]]
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]]
|
||||
%1:vgpr(s32) = COPY $vgpr0
|
||||
|
||||
@@ -36,8 +36,8 @@ body: |
|
||||
; GCN-LABEL: name: anyext_sgpr_s32_to_sgpr_s64
|
||||
; GCN: liveins: $sgpr0
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY $sgpr0
|
||||
; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF
|
||||
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1
|
||||
; GCN-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
%0:sgpr(s32) = COPY $sgpr0
|
||||
@@ -100,13 +100,6 @@ body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0
|
||||
|
||||
; GCN-LABEL: name: anyext_vgpr_s16_to_vgpr_s64
|
||||
; GCN: liveins: $vgpr0
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
|
||||
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[DEF]], %subreg.sub1
|
||||
; GCN-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s16) = G_TRUNC %0
|
||||
%2:vgpr(s64) = G_ANYEXT %1
|
||||
@@ -143,8 +136,9 @@ body: |
|
||||
; GCN: liveins: $sgpr0
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[COPY]], 1048576, implicit-def $scc
|
||||
; GCN-NEXT: $sgpr0 = COPY [[S_BFE_U32_]]
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_]], [[COPY]], implicit-def $scc
|
||||
; GCN-NEXT: $sgpr0 = COPY [[S_AND_B32_]]
|
||||
%0:sgpr(s32) = COPY $sgpr0
|
||||
%1:sgpr(s1) = G_TRUNC %0
|
||||
%2:sgpr(s16) = G_ANYEXT %1
|
||||
@@ -207,8 +201,9 @@ body: |
|
||||
; GCN: liveins: $vgpr0
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[COPY]], 0, 16, implicit $exec
|
||||
; GCN-NEXT: $vgpr0 = COPY [[V_BFE_U32_e64_]]
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec
|
||||
; GCN-NEXT: $vgpr0 = COPY [[V_AND_B32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s1) = G_TRUNC %0
|
||||
%2:vgpr(s16) = G_ANYEXT %1
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
# ERR-NOT: remark
|
||||
# ERR: remark: <unknown>:0:0: cannot select: %4:sgpr(s16) = G_ASHR %2:sgpr, %3:sgpr(s16) (in function: ashr_s16_s16_ss)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:vgpr, %1:vgpr(s32) (in function: ashr_s16_s32_vv)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %5:vgpr(s64) = G_ZEXT %4:vgpr(s16) (in function: ashr_s16_vv_zext_to_s64)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:sgpr(s16) = G_ASHR %2:sgpr, %1:sgpr(s32) (in function: ashr_s16_s32_ss)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:sgpr, %1:vgpr(s32) (in function: ashr_s16_s32_sv)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:vgpr(s16) = G_ASHR %2:vgpr, %1:sgpr(s32) (in function: ashr_s16_s32_vs)
|
||||
@@ -240,16 +239,18 @@ body: |
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ASHRREV_I16_e64_]], 0, 16, implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
|
||||
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ASHRREV_I16_e64_]], implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
|
||||
; GFX11-LABEL: name: ashr_s16_s16_vv_zext_to_s32
|
||||
; GFX11: liveins: $vgpr0, $vgpr1
|
||||
; GFX11-NEXT: {{ $}}
|
||||
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX11-NEXT: [[V_ASHRREV_I16_t16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_t16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX11-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ASHRREV_I16_t16_e64_]], 0, 16, implicit $exec
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
|
||||
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ASHRREV_I16_t16_e64_]], implicit $exec
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
%2:vgpr(s16) = G_TRUNC %0
|
||||
@@ -271,43 +272,51 @@ body: |
|
||||
; GFX8-LABEL: name: ashr_s16_vv_zext_to_s64
|
||||
; GFX8: liveins: $vgpr0, $vgpr1
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX8-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX8-NEXT: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX8-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[ASHR]](s16)
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX8-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec
|
||||
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
; GFX9-LABEL: name: ashr_s16_vv_zext_to_s64
|
||||
; GFX9: liveins: $vgpr0, $vgpr1
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX9-NEXT: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX9-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[ASHR]](s16)
|
||||
; GFX9-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX9-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
; GFX10-LABEL: name: ashr_s16_vv_zext_to_s64
|
||||
; GFX10: liveins: $vgpr0, $vgpr1
|
||||
; GFX10-NEXT: {{ $}}
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX10-NEXT: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX10-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[ASHR]](s16)
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX10-NEXT: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_e64_]], implicit $exec
|
||||
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
; GFX11-LABEL: name: ashr_s16_vv_zext_to_s64
|
||||
; GFX11: liveins: $vgpr0, $vgpr1
|
||||
; GFX11-NEXT: {{ $}}
|
||||
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX11-NEXT: [[ASHR:%[0-9]+]]:vgpr(s16) = G_ASHR [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX11-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[ASHR]](s16)
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX11-NEXT: [[V_ASHRREV_I16_t16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_t16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_ASHRREV_I16_t16_e64_]], implicit $exec
|
||||
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
%2:vgpr(s16) = G_TRUNC %0
|
||||
|
||||
@@ -243,8 +243,8 @@ body: |
|
||||
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
|
||||
; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
|
||||
; GCN-NEXT: %5:sreg_64_xexec = nofpexcept V_CMP_EQ_F32_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], %5, implicit-def dead $scc
|
||||
; GCN-NEXT: [[V_CMP_EQ_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_EQ_F32_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec
|
||||
; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_F32_e64_]], implicit-def $scc
|
||||
; GCN-NEXT: $vcc = COPY [[S_AND_B64_]]
|
||||
; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
|
||||
; GCN-NEXT: {{ $}}
|
||||
@@ -283,7 +283,7 @@ body: |
|
||||
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY2]], implicit-def $scc
|
||||
; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
|
||||
; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_NE_U32_e64_]], implicit-def $scc
|
||||
; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_AND_B64_]], $exec, implicit-def $scc
|
||||
; GCN-NEXT: $vcc = COPY [[S_AND_B64_1]]
|
||||
; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
|
||||
@@ -321,7 +321,7 @@ body: |
|
||||
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec
|
||||
; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
|
||||
; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], [[S_MOV_B64_]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], [[S_MOV_B64_]], implicit-def $scc
|
||||
; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[S_XOR_B64_]], $exec, implicit-def $scc
|
||||
; GCN-NEXT: $vcc = COPY [[S_AND_B64_]]
|
||||
; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc
|
||||
|
||||
@@ -572,9 +572,11 @@ body: |
|
||||
; GFX9PLUS-LABEL: name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant
|
||||
; GFX9PLUS: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
|
||||
; GFX9PLUS-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123
|
||||
; GFX9PLUS-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[DEF]], 1048576, implicit-def $scc
|
||||
; GFX9PLUS-NEXT: [[S_BFE_U32_1:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_MOV_B32_]], 1048576, implicit-def $scc
|
||||
; GFX9PLUS-NEXT: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_BFE_U32_]], [[S_BFE_U32_1]]
|
||||
; GFX9PLUS-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX9PLUS-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_1]], [[DEF]], implicit-def $scc
|
||||
; GFX9PLUS-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX9PLUS-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_2]], [[S_MOV_B32_]], implicit-def $scc
|
||||
; GFX9PLUS-NEXT: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_AND_B32_]], [[S_AND_B32_1]]
|
||||
; GFX9PLUS-NEXT: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
|
||||
%0:sgpr(s16) = G_IMPLICIT_DEF
|
||||
%1:sgpr(s16) = G_CONSTANT i16 123
|
||||
|
||||
@@ -79,8 +79,9 @@ body: |
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]]
|
||||
; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], [[COPY1]], 0, implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
%2:vgpr(s32) = G_CTPOP %0
|
||||
@@ -103,8 +104,9 @@ body: |
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]]
|
||||
; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY1]], [[V_BCNT_U32_B32_e64_]], 0, implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
%2:vgpr(s32) = G_CTPOP %0
|
||||
@@ -153,8 +155,9 @@ body: |
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]]
|
||||
; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], [[COPY1]], 0, implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:sgpr(s32) = COPY $sgpr0
|
||||
%2:vgpr(s32) = G_CTPOP %0
|
||||
@@ -178,8 +181,9 @@ body: |
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]]
|
||||
; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY1]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_BCNT_U32_B32_e64_]], [[COPY]], 0, implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:sgpr(s32) = COPY $sgpr0
|
||||
%2:vgpr(s32) = G_CTPOP %1
|
||||
@@ -203,8 +207,9 @@ body: |
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; CHECK-NEXT: [[V_BCNT_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_BCNT_U32_B32_e64 [[COPY]], [[COPY1]], implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_BCNT_U32_B32_e64_]]
|
||||
; CHECK-NEXT: [[S_BCNT1_I32_B32_:%[0-9]+]]:sreg_32 = S_BCNT1_I32_B32 [[COPY]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[S_BCNT1_I32_B32_]], [[COPY1]], 0, implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]]
|
||||
%0:sgpr(s32) = COPY $sgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr0
|
||||
%2:sgpr(s32) = G_CTPOP %0
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
# ERR-NOT: remark
|
||||
# ERR: remark: <unknown>:0:0: cannot select: %4:sgpr(s16) = G_LSHR %2:sgpr, %3:sgpr(s16) (in function: lshr_s16_s16_ss)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:vgpr, %1:vgpr(s32) (in function: lshr_s16_s32_vv)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %5:vgpr(s64) = G_ZEXT %4:vgpr(s16) (in function: lshr_s16_vv_zext_to_s64)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:sgpr(s16) = G_LSHR %2:sgpr, %1:sgpr(s32) (in function: lshr_s16_s32_ss)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:sgpr, %1:vgpr(s32) (in function: lshr_s16_s32_sv)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:vgpr(s16) = G_LSHR %2:vgpr, %1:sgpr(s32) (in function: lshr_s16_s32_vs)
|
||||
@@ -238,16 +237,18 @@ body: |
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_LSHRREV_B16_e64_]], 0, 16, implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
|
||||
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHRREV_B16_e64_]], implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
|
||||
; GFX11-LABEL: name: lshr_s16_s16_vv_zext_to_s32
|
||||
; GFX11: liveins: $vgpr0, $vgpr1
|
||||
; GFX11-NEXT: {{ $}}
|
||||
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX11-NEXT: [[V_LSHRREV_B16_t16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_t16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX11-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_LSHRREV_B16_t16_e64_]], 0, 16, implicit $exec
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
|
||||
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHRREV_B16_t16_e64_]], implicit $exec
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
%2:vgpr(s16) = G_TRUNC %0
|
||||
@@ -269,43 +270,51 @@ body: |
|
||||
; GFX8-LABEL: name: lshr_s16_vv_zext_to_s64
|
||||
; GFX8: liveins: $vgpr0, $vgpr1
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX8-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX8-NEXT: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX8-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[LSHR]](s16)
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX8-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec
|
||||
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
; GFX9-LABEL: name: lshr_s16_vv_zext_to_s64
|
||||
; GFX9: liveins: $vgpr0, $vgpr1
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX9-NEXT: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX9-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[LSHR]](s16)
|
||||
; GFX9-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX9-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
; GFX10-LABEL: name: lshr_s16_vv_zext_to_s64
|
||||
; GFX10: liveins: $vgpr0, $vgpr1
|
||||
; GFX10-NEXT: {{ $}}
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX10-NEXT: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX10-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[LSHR]](s16)
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX10-NEXT: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_e64_]], implicit $exec
|
||||
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
; GFX11-LABEL: name: lshr_s16_vv_zext_to_s64
|
||||
; GFX11: liveins: $vgpr0, $vgpr1
|
||||
; GFX11-NEXT: {{ $}}
|
||||
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX11-NEXT: [[LSHR:%[0-9]+]]:vgpr(s16) = G_LSHR [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX11-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[LSHR]](s16)
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX11-NEXT: [[V_LSHRREV_B16_t16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_t16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHRREV_B16_t16_e64_]], implicit $exec
|
||||
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
%2:vgpr(s16) = G_TRUNC %0
|
||||
|
||||
@@ -22,7 +22,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_OR_B64_]]
|
||||
; WAVE32-LABEL: name: or_s1_vcc_vcc_vcc
|
||||
; WAVE32: liveins: $vgpr0, $vgpr1
|
||||
@@ -32,7 +32,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_OR_B32_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
@@ -401,7 +401,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_OR_B64_]]
|
||||
; WAVE32-LABEL: name: or_s1_vcc_copy_to_vcc
|
||||
; WAVE32: liveins: $vgpr0, $vgpr1
|
||||
@@ -412,7 +412,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_OR_B32_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
@@ -446,7 +446,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_OR_B64_]]
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]]
|
||||
; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32
|
||||
@@ -458,7 +458,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_OR_B32_]]
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]]
|
||||
%1:vgpr(s32) = COPY $vgpr0
|
||||
@@ -494,7 +494,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_OR_B64_]]
|
||||
; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64
|
||||
; WAVE32: liveins: $vgpr0, $sgpr0
|
||||
@@ -505,7 +505,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_OR_B32_]]
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]]
|
||||
%1:vgpr(s32) = COPY $vgpr0
|
||||
|
||||
@@ -56,9 +56,9 @@ body: |
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 %3, [[COPY2]], 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit %4
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_ADD_CO_U32_e64_]], [[COPY2]], 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]]
|
||||
; GFX9-LABEL: name: add_s32_vgpr_vgpr_vgpr
|
||||
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GFX9-NEXT: {{ $}}
|
||||
@@ -91,9 +91,9 @@ body: |
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 %3, [[COPY2]], 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit %4, implicit %3
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_ADD_CO_U32_e64_]], [[COPY2]], 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]], implicit [[V_ADD_CO_U32_e64_]]
|
||||
; GFX9-LABEL: name: add_s32_vgpr_vgpr_vgpr_multi_use
|
||||
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GFX9-NEXT: {{ $}}
|
||||
@@ -128,9 +128,9 @@ body: |
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 %3, [[COPY2]], 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit %4
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_ADD_CO_U32_e64_]], [[COPY2]], 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]]
|
||||
; GFX9-LABEL: name: add_p3_vgpr_vgpr_vgpr
|
||||
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GFX9-NEXT: {{ $}}
|
||||
@@ -164,9 +164,9 @@ body: |
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 %3, [[COPY2]], 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit %4
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_ADD_CO_U32_e64_]], [[COPY2]], 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]]
|
||||
; GFX9-LABEL: name: add_p5_vgpr_vgpr_vgpr
|
||||
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GFX9-NEXT: {{ $}}
|
||||
@@ -200,9 +200,9 @@ body: |
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], %3, 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit %4
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[V_ADD_CO_U32_e64_]], 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]]
|
||||
; GFX9-LABEL: name: add_p3_s32_vgpr_vgpr_vgpr
|
||||
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GFX9-NEXT: {{ $}}
|
||||
@@ -237,9 +237,9 @@ body: |
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX8-NEXT: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], %3, 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit %4
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[V_ADD_CO_U32_e64_]], 0, implicit $exec
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_2]]
|
||||
; GFX9-LABEL: name: add_p5_s32_vgpr_vgpr_vgpr
|
||||
; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GFX9-NEXT: {{ $}}
|
||||
|
||||
@@ -15,8 +15,9 @@ body: |
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; GCN-NEXT: [[S_BFE_I32_:%[0-9]+]]:sreg_32 = S_BFE_I32 [[COPY]], 65536, implicit-def $scc
|
||||
; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_BFE_I32_]], 1048576, implicit-def $scc
|
||||
; GCN-NEXT: $sgpr0 = COPY [[S_BFE_U32_]]
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_]], [[S_BFE_I32_]], implicit-def $scc
|
||||
; GCN-NEXT: $sgpr0 = COPY [[S_AND_B32_]]
|
||||
%0:sgpr(s32) = COPY $sgpr0
|
||||
%1:sgpr(s1) = G_TRUNC %0
|
||||
%2:sgpr(s16) = G_SEXT %1
|
||||
@@ -165,8 +166,9 @@ body: |
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[COPY]], 0, 1, implicit $exec
|
||||
; GCN-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_BFE_I32_e64_]], 0, 16, implicit $exec
|
||||
; GCN-NEXT: $vgpr0 = COPY [[V_BFE_U32_e64_]]
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_BFE_I32_e64_]], implicit $exec
|
||||
; GCN-NEXT: $vgpr0 = COPY [[V_AND_B32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s1) = G_TRUNC %0
|
||||
%2:vgpr(s16) = G_SEXT %1
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
# ERR-NOT: remark
|
||||
# ERR: remark: <unknown>:0:0: cannot select: %4:sgpr(s16) = G_SHL %2:sgpr, %3:sgpr(s16) (in function: shl_s16_s16_ss)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:vgpr(s16) = G_SHL %2:vgpr, %1:vgpr(s32) (in function: shl_s16_s32_vv)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %5:vgpr(s64) = G_ZEXT %4:vgpr(s16) (in function: shl_s16_vv_zext_to_s64)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:sgpr(s16) = G_SHL %2:sgpr, %1:sgpr(s32) (in function: shl_s16_s32_ss)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:vgpr(s16) = G_SHL %2:sgpr, %1:vgpr(s32) (in function: shl_s16_s32_sv)
|
||||
# ERR-NEXT: remark: <unknown>:0:0: cannot select: %3:vgpr(s16) = G_SHL %2:vgpr, %1:sgpr(s32) (in function: shl_s16_s32_vs)
|
||||
@@ -238,16 +237,18 @@ body: |
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX10-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_LSHLREV_B16_e64_]], 0, 16, implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
|
||||
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHLREV_B16_e64_]], implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
|
||||
; GFX11-LABEL: name: shl_s16_s16_vv_zext_to_s32
|
||||
; GFX11: liveins: $vgpr0, $vgpr1
|
||||
; GFX11-NEXT: {{ $}}
|
||||
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX11-NEXT: [[V_LSHLREV_B16_t16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_t16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX11-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_LSHLREV_B16_t16_e64_]], 0, 16, implicit $exec
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_BFE_U32_e64_]]
|
||||
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_LSHLREV_B16_t16_e64_]], implicit $exec
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
%2:vgpr(s16) = G_TRUNC %0
|
||||
@@ -269,43 +270,51 @@ body: |
|
||||
; GFX8-LABEL: name: shl_s16_vv_zext_to_s64
|
||||
; GFX8: liveins: $vgpr0, $vgpr1
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX8-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX8-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX8-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[SHL]](s16)
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX8-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX8-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX8-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec
|
||||
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX8-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
; GFX9-LABEL: name: shl_s16_vv_zext_to_s64
|
||||
; GFX9: liveins: $vgpr0, $vgpr1
|
||||
; GFX9-NEXT: {{ $}}
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX9-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX9-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX9-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[SHL]](s16)
|
||||
; GFX9-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX9-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX9-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec
|
||||
; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX9-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
; GFX10-LABEL: name: shl_s16_vv_zext_to_s64
|
||||
; GFX10: liveins: $vgpr0, $vgpr1
|
||||
; GFX10-NEXT: {{ $}}
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX10-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX10-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX10-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[SHL]](s16)
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX10-NEXT: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_e64_]], implicit $exec
|
||||
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
; GFX11-LABEL: name: shl_s16_vv_zext_to_s64
|
||||
; GFX11: liveins: $vgpr0, $vgpr1
|
||||
; GFX11-NEXT: {{ $}}
|
||||
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
|
||||
; GFX11-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY]](s32)
|
||||
; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:vgpr(s16) = G_TRUNC [[COPY1]](s32)
|
||||
; GFX11-NEXT: [[SHL:%[0-9]+]]:vgpr(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16)
|
||||
; GFX11-NEXT: [[ZEXT:%[0-9]+]]:vgpr(s64) = G_ZEXT [[SHL]](s16)
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[ZEXT]](s64)
|
||||
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX11-NEXT: [[V_LSHLREV_B16_t16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_t16_e64 [[COPY1]], [[COPY]], implicit $exec
|
||||
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_1]], [[V_LSHLREV_B16_t16_e64_]], implicit $exec
|
||||
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[COPY2]], %subreg.sub1
|
||||
; GFX11-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
%2:vgpr(s16) = G_TRUNC %0
|
||||
|
||||
@@ -861,15 +861,25 @@ body: |
|
||||
; GFX6-LABEL: name: store_atomic_global_s32
|
||||
; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2
|
||||
; GFX6-NEXT: G_STORE [[COPY1]](s32), [[COPY]](p1) :: (store monotonic (s32), addrspace 1)
|
||||
; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
|
||||
; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
|
||||
; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
|
||||
; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
|
||||
; GFX6-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1)
|
||||
; GFX7-LABEL: name: store_atomic_global_s32
|
||||
; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX7-NEXT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s32), addrspace 1)
|
||||
; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
|
||||
; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
|
||||
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
|
||||
; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
|
||||
; GFX7-NEXT: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s32), addrspace 1)
|
||||
; GFX7-FLAT-LABEL: name: store_atomic_global_s32
|
||||
; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2
|
||||
; GFX7-FLAT-NEXT: {{ $}}
|
||||
@@ -914,15 +924,25 @@ body: |
|
||||
; GFX6-LABEL: name: store_atomic_global_s64
|
||||
; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
|
||||
; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3
|
||||
; GFX6-NEXT: G_STORE [[COPY1]](s64), [[COPY]](p1) :: (store monotonic (s64), addrspace 1)
|
||||
; GFX6-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
|
||||
; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
|
||||
; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
|
||||
; GFX6-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
|
||||
; GFX6-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
|
||||
; GFX6-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1)
|
||||
; GFX7-LABEL: name: store_atomic_global_s64
|
||||
; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
|
||||
; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
|
||||
; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic (s64), addrspace 1)
|
||||
; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
|
||||
; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1
|
||||
; GFX7-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
|
||||
; GFX7-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3
|
||||
; GFX7-NEXT: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store monotonic (s64), addrspace 1)
|
||||
; GFX7-FLAT-LABEL: name: store_atomic_global_s64
|
||||
; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
|
||||
; GFX7-FLAT-NEXT: {{ $}}
|
||||
|
||||
@@ -23,10 +23,10 @@ body: |
|
||||
; GFX6-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
|
||||
; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX6-NEXT: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY]], [[COPY1]], implicit-def $scc
|
||||
; GFX6-NEXT: %7:vgpr_32, dead %12:sreg_64_xexec = V_SUB_CO_U32_e64 [[COPY2]], [[S_SUB_I32_]], 0, implicit $exec
|
||||
; GFX6-NEXT: %8:vgpr_32, dead %11:sreg_64_xexec = V_SUB_CO_U32_e64 [[S_SUB_I32_]], %7, 0, implicit $exec
|
||||
; GFX6-NEXT: %9:vgpr_32, dead %10:sreg_64_xexec = V_SUB_CO_U32_e64 %8, [[COPY2]], 0, implicit $exec
|
||||
; GFX6-NEXT: S_ENDPGM 0, implicit %9
|
||||
; GFX6-NEXT: [[V_SUB_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_SUB_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_SUB_CO_U32_e64 [[COPY2]], [[S_SUB_I32_]], 0, implicit $exec
|
||||
; GFX6-NEXT: [[V_SUB_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_SUB_CO_U32_e64_3:%[0-9]+]]:sreg_64 = V_SUB_CO_U32_e64 [[S_SUB_I32_]], [[V_SUB_CO_U32_e64_]], 0, implicit $exec
|
||||
; GFX6-NEXT: [[V_SUB_CO_U32_e64_4:%[0-9]+]]:vgpr_32, dead [[V_SUB_CO_U32_e64_5:%[0-9]+]]:sreg_64 = V_SUB_CO_U32_e64 [[V_SUB_CO_U32_e64_2]], [[COPY2]], 0, implicit $exec
|
||||
; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_CO_U32_e64_4]]
|
||||
; GFX9-LABEL: name: sub_s32
|
||||
; GFX9: liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr3_vgpr4
|
||||
; GFX9-NEXT: {{ $}}
|
||||
|
||||
@@ -22,7 +22,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_XOR_B64_]]
|
||||
; WAVE32-LABEL: name: xor_s1_vcc_vcc_vcc
|
||||
; WAVE32: liveins: $vgpr0, $vgpr1
|
||||
@@ -32,7 +32,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_EQ_U32_e64_]], [[V_CMP_EQ_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_XOR_B32_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
@@ -402,7 +402,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_XOR_B64_]]
|
||||
; WAVE32-LABEL: name: xor_s1_vcc_copy_to_vcc
|
||||
; WAVE32: liveins: $vgpr0, $vgpr1
|
||||
@@ -413,7 +413,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY1]], implicit $exec
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_1]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[S_XOR_B32_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
@@ -447,7 +447,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_XOR_B64_]]
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[COPY1]]
|
||||
; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32
|
||||
@@ -459,7 +459,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_XOR_B32_]]
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]]
|
||||
%1:vgpr(s32) = COPY $vgpr0
|
||||
@@ -495,7 +495,7 @@ body: |
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE64-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE64-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE64-NEXT: S_ENDPGM 0, implicit [[S_XOR_B64_]]
|
||||
; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64
|
||||
; WAVE32: liveins: $vgpr0, $sgpr0
|
||||
@@ -506,7 +506,7 @@ body: |
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
|
||||
; WAVE32-NEXT: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
|
||||
; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
|
||||
; WAVE32-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def $scc
|
||||
; WAVE32-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_XOR_B32_]]
|
||||
; WAVE32-NEXT: S_ENDPGM 0, implicit [[COPY1]]
|
||||
%1:vgpr(s32) = COPY $vgpr0
|
||||
|
||||
@@ -81,8 +81,9 @@ body: |
|
||||
; GCN: liveins: $sgpr0
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; GCN-NEXT: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[COPY]], 1048576, implicit-def $scc
|
||||
; GCN-NEXT: $sgpr0 = COPY [[S_BFE_U32_]]
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_MOV_B32_]], [[COPY]], implicit-def $scc
|
||||
; GCN-NEXT: $sgpr0 = COPY [[S_AND_B32_]]
|
||||
%0:sgpr(s32) = COPY $sgpr0
|
||||
%1:sgpr(s16) = G_TRUNC %0
|
||||
%2:sgpr(s32) = G_ZEXT %1
|
||||
@@ -126,8 +127,8 @@ body: |
|
||||
; GCN-LABEL: name: zext_sgpr_s32_to_sgpr_s64
|
||||
; GCN: liveins: $sgpr0
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xexec_hi_and_sreg_32_xm0 = COPY $sgpr0
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
|
||||
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
|
||||
; GCN-NEXT: $sgpr0_sgpr1 = COPY [[REG_SEQUENCE]]
|
||||
%0:sgpr(s32) = COPY $sgpr0
|
||||
@@ -208,8 +209,9 @@ body: |
|
||||
; GCN: liveins: $vgpr0
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[COPY]], 0, 16, implicit $exec
|
||||
; GCN-NEXT: $vgpr0 = COPY [[V_BFE_U32_e64_]]
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
|
||||
; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec
|
||||
; GCN-NEXT: $vgpr0 = COPY [[V_AND_B32_e64_]]
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s16) = G_TRUNC %0
|
||||
%2:vgpr(s32) = G_ZEXT %1
|
||||
|
||||
@@ -10,13 +10,6 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_fdot2:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
|
||||
ret float %r
|
||||
}
|
||||
@@ -44,13 +37,6 @@ define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_fdot2_neg_a:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg <2 x half> %a
|
||||
%r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false)
|
||||
ret float %r
|
||||
@@ -62,13 +48,6 @@ define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_fdot2_neg_b:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.b = fneg <2 x half> %b
|
||||
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
|
||||
ret float %r
|
||||
@@ -80,13 +59,6 @@ define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_fdot2_neg_a_neg_b:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg <2 x half> %b
|
||||
%neg.b = fneg <2 x half> %b
|
||||
%r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %neg.b, float %c, i1 false)
|
||||
@@ -100,14 +72,6 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
|
||||
; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
|
||||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_fdot2_neg_c:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.c = fneg float %c
|
||||
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
|
||||
ret float %r
|
||||
@@ -119,13 +83,6 @@ define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_fdot2_inline_literal_a:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
|
||||
ret float %ret
|
||||
}
|
||||
@@ -136,13 +93,6 @@ define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_fdot2_inline_literal_b:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
|
||||
ret float %ret
|
||||
}
|
||||
@@ -153,13 +103,6 @@ define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) {
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_fdot2_inline_literal_c:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 1.0, i1 false)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
@@ -448,27 +448,28 @@ main_body:
|
||||
define amdgpu_ps float @atomic_add_i32_2d(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t) {
|
||||
; GFX9-LABEL: atomic_add_i32_2d:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i32_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -485,27 +486,28 @@ main_body:
|
||||
define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %r) {
|
||||
; GFX9-LABEL: atomic_add_i32_3d:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i32_3d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v2, v2, v1, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -522,27 +524,28 @@ main_body:
|
||||
define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %face) {
|
||||
; GFX9-LABEL: atomic_add_i32_cube:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 da
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i32_cube:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v2, v2, v1, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -559,27 +562,28 @@ main_body:
|
||||
define amdgpu_ps float @atomic_add_i32_1darray(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %slice) {
|
||||
; GFX9-LABEL: atomic_add_i32_1darray:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16 da
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i32_1darray:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -596,27 +600,28 @@ main_body:
|
||||
define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %slice) {
|
||||
; GFX9-LABEL: atomic_add_i32_2darray:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16 da
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i32_2darray:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v2, v2, v1, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -633,27 +638,28 @@ main_body:
|
||||
define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %fragid) {
|
||||
; GFX9-LABEL: atomic_add_i32_2dmsaa:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v2, v2, v1, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX9-NEXT: image_atomic_add v0, v[2:3], s[0:7] dmask:0x1 unorm glc a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i32_2dmsaa:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v2, v2, v1, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -670,28 +676,31 @@ main_body:
|
||||
define amdgpu_ps float @atomic_add_i32_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
|
||||
; GFX9-LABEL: atomic_add_i32_2darraymsaa:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s8
|
||||
; GFX9-NEXT: v_perm_b32 v2, v4, v3, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2
|
||||
; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i32_2darraymsaa:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
@@ -1185,27 +1194,28 @@ main_body:
|
||||
define amdgpu_ps <2 x float> @atomic_add_i64_2d(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t) {
|
||||
; GFX9-LABEL: atomic_add_i64_2d:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v2, v3, v2, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX9-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i64_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -1222,27 +1232,28 @@ main_body:
|
||||
define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %r) {
|
||||
; GFX9-LABEL: atomic_add_i64_3d:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i64_3d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -1259,27 +1270,28 @@ main_body:
|
||||
define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %face) {
|
||||
; GFX9-LABEL: atomic_add_i64_cube:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i64_cube:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -1296,27 +1308,28 @@ main_body:
|
||||
define amdgpu_ps <2 x float> @atomic_add_i64_1darray(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %slice) {
|
||||
; GFX9-LABEL: atomic_add_i64_1darray:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v2, v3, v2, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX9-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 da
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i64_1darray:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -1333,27 +1346,28 @@ main_body:
|
||||
define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %slice) {
|
||||
; GFX9-LABEL: atomic_add_i64_2darray:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16 da
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i64_2darray:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -1370,27 +1384,28 @@ main_body:
|
||||
define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %fragid) {
|
||||
; GFX9-LABEL: atomic_add_i64_2dmsaa:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v3, v3, v2, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX9-NEXT: image_atomic_add v[0:1], v[3:4], s[0:7] dmask:0x3 unorm glc a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i64_2dmsaa:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v3, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10-NEXT: s_mov_b32 s6, s8
|
||||
@@ -1407,28 +1422,31 @@ main_body:
|
||||
define amdgpu_ps <2 x float> @atomic_add_i64_2darraymsaa(<8 x i32> inreg %rsrc, i64 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
|
||||
; GFX9-LABEL: atomic_add_i64_2darraymsaa:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v4
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v2, v3, v2, s8
|
||||
; GFX9-NEXT: v_perm_b32 v3, v5, v4, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v3, v5, 16, v3
|
||||
; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: atomic_add_i64_2darraymsaa:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v3, v5, v4, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10-NEXT: s_mov_b32 s5, s7
|
||||
|
||||
@@ -9,19 +9,19 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
|
||||
; GFX9-NEXT: s_mov_b64 s[14:15], exec
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_wqm_b64 exec, exec
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
|
||||
; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -32,6 +32,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
|
||||
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
@@ -43,7 +44,7 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
|
||||
; GFX10NSA-NEXT: s_mov_b32 s9, s11
|
||||
; GFX10NSA-NEXT: s_mov_b32 s10, s12
|
||||
; GFX10NSA-NEXT: s_mov_b32 s11, s13
|
||||
; GFX10NSA-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
|
||||
; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -59,19 +60,19 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
|
||||
; GFX9-NEXT: s_mov_b64 s[14:15], exec
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_wqm_b64 exec, exec
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
|
||||
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
|
||||
; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -82,6 +83,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
|
||||
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
@@ -93,7 +95,7 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
|
||||
; GFX10NSA-NEXT: s_mov_b32 s9, s11
|
||||
; GFX10NSA-NEXT: s_mov_b32 s10, s12
|
||||
; GFX10NSA-NEXT: s_mov_b32 s11, s13
|
||||
; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
|
||||
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
|
||||
; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
|
||||
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -109,19 +111,19 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX9-NEXT: s_mov_b64 s[14:15], exec
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_wqm_b64 exec, exec
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
|
||||
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
|
||||
; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -132,6 +134,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
@@ -143,7 +146,7 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX10NSA-NEXT: s_mov_b32 s9, s11
|
||||
; GFX10NSA-NEXT: s_mov_b32 s10, s12
|
||||
; GFX10NSA-NEXT: s_mov_b32 s11, s13
|
||||
; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
|
||||
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
|
||||
; GFX10NSA-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
|
||||
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -159,19 +162,19 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
|
||||
; GFX9-NEXT: s_mov_b64 s[14:15], exec
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_wqm_b64 exec, exec
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
|
||||
; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -182,6 +185,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
|
||||
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
@@ -193,7 +197,7 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
|
||||
; GFX10NSA-NEXT: s_mov_b32 s9, s11
|
||||
; GFX10NSA-NEXT: s_mov_b32 s10, s12
|
||||
; GFX10NSA-NEXT: s_mov_b32 s11, s13
|
||||
; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
|
||||
; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -209,19 +213,19 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
|
||||
; GFX9-NEXT: s_mov_b64 s[14:15], exec
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_wqm_b64 exec, exec
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
|
||||
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
|
||||
; GFX9-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -232,6 +236,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
|
||||
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
@@ -243,7 +248,7 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
|
||||
; GFX10NSA-NEXT: s_mov_b32 s9, s11
|
||||
; GFX10NSA-NEXT: s_mov_b32 s10, s12
|
||||
; GFX10NSA-NEXT: s_mov_b32 s11, s13
|
||||
; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
|
||||
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
|
||||
; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -260,20 +265,20 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_wqm_b64 exec, exec
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX9-NEXT: v_perm_b32 v2, v2, v4, s12
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0
|
||||
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
|
||||
; GFX9-NEXT: image_gather4_c_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -284,6 +289,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
@@ -295,7 +301,7 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX10NSA-NEXT: s_mov_b32 s9, s11
|
||||
; GFX10NSA-NEXT: s_mov_b32 s10, s12
|
||||
; GFX10NSA-NEXT: s_mov_b32 s11, s13
|
||||
; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
|
||||
; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -311,19 +317,19 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
|
||||
; GFX9-NEXT: s_mov_b64 s[14:15], exec
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_wqm_b64 exec, exec
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
|
||||
; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -334,6 +340,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
|
||||
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
@@ -345,7 +352,7 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
|
||||
; GFX10NSA-NEXT: s_mov_b32 s9, s11
|
||||
; GFX10NSA-NEXT: s_mov_b32 s10, s12
|
||||
; GFX10NSA-NEXT: s_mov_b32 s11, s13
|
||||
; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
|
||||
; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -361,19 +368,19 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
; GFX9-NEXT: s_mov_b64 s[14:15], exec
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_wqm_b64 exec, exec
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_perm_b32 v2, v3, v2, s12
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
|
||||
; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -384,6 +391,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
@@ -395,7 +403,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
; GFX10NSA-NEXT: s_mov_b32 s9, s11
|
||||
; GFX10NSA-NEXT: s_mov_b32 s10, s12
|
||||
; GFX10NSA-NEXT: s_mov_b32 s11, s13
|
||||
; GFX10NSA-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
|
||||
; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -412,20 +420,20 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_wqm_b64 exec, exec
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX9-NEXT: v_perm_b32 v2, v2, v4, s12
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0
|
||||
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
|
||||
; GFX9-NEXT: image_gather4_b_cl v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -436,6 +444,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
@@ -447,7 +456,7 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX10NSA-NEXT: s_mov_b32 s9, s11
|
||||
; GFX10NSA-NEXT: s_mov_b32 s10, s12
|
||||
; GFX10NSA-NEXT: s_mov_b32 s11, s13
|
||||
; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
|
||||
; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -464,20 +473,20 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_wqm_b64 exec, exec
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, v4
|
||||
; GFX9-NEXT: v_perm_b32 v2, v5, v2, s12
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v5, 16, v2
|
||||
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
|
||||
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -488,6 +497,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
@@ -499,7 +509,7 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX10NSA-NEXT: s_mov_b32 s9, s11
|
||||
; GFX10NSA-NEXT: s_mov_b32 s10, s12
|
||||
; GFX10NSA-NEXT: s_mov_b32 s11, s13
|
||||
; GFX10NSA-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
|
||||
; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -512,31 +522,32 @@ main_body:
|
||||
define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t, half %lod) {
|
||||
; GFX9-LABEL: gather4_l_2d:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_perm_b32 v1, v1, v0, s12
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
|
||||
; GFX9-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10NSA-LABEL: gather4_l_2d:
|
||||
; GFX10NSA: ; %bb.0: ; %main_body
|
||||
; GFX10NSA-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v1, 16, v0
|
||||
; GFX10NSA-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10NSA-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10NSA-NEXT: s_mov_b32 s6, s8
|
||||
@@ -556,33 +567,34 @@ main_body:
|
||||
define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t, half %lod) {
|
||||
; GFX9-LABEL: gather4_c_l_2d:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX9-NEXT: v_perm_b32 v2, v2, v4, s12
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v0
|
||||
; GFX9-NEXT: image_gather4_c_l v[0:3], v[1:3], s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10NSA-LABEL: gather4_c_l_2d:
|
||||
; GFX10NSA: ; %bb.0: ; %main_body
|
||||
; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10NSA-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10NSA-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10NSA-NEXT: s_mov_b32 s6, s8
|
||||
@@ -602,31 +614,32 @@ main_body:
|
||||
define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
|
||||
; GFX9-LABEL: gather4_lz_2d:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s12
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10NSA-LABEL: gather4_lz_2d:
|
||||
; GFX10NSA: ; %bb.0: ; %main_body
|
||||
; GFX10NSA-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10NSA-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10NSA-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10NSA-NEXT: s_mov_b32 s6, s8
|
||||
@@ -646,31 +659,32 @@ main_body:
|
||||
define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %s, half %t) {
|
||||
; GFX9-LABEL: gather4_c_lz_2d:
|
||||
; GFX9: ; %bb.0: ; %main_body
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s12, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s8, s10
|
||||
; GFX9-NEXT: s_mov_b32 s9, s11
|
||||
; GFX9-NEXT: s_mov_b32 s10, s12
|
||||
; GFX9-NEXT: s_mov_b32 s11, s13
|
||||
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s12
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10NSA-LABEL: gather4_c_lz_2d:
|
||||
; GFX10NSA: ; %bb.0: ; %main_body
|
||||
; GFX10NSA-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10NSA-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10NSA-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10NSA-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10NSA-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10NSA-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10NSA-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10NSA-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10NSA-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10NSA-NEXT: s_mov_b32 s6, s8
|
||||
|
||||
@@ -6,28 +6,31 @@
|
||||
define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
|
||||
; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s8
|
||||
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: load_2darraymsaa_v4f32_xyzw:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10PLUS-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10PLUS-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10PLUS-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s5, s7
|
||||
@@ -43,22 +46,23 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc,
|
||||
define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
|
||||
; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX9-NEXT: v_perm_b32 v10, v1, v0, s8
|
||||
; GFX9-NEXT: v_perm_b32 v11, v3, v2, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v9, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v5
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, v6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v7
|
||||
@@ -73,14 +77,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
|
||||
; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX10-NEXT: v_perm_b32 v10, v1, v0, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v11, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, v5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, v5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, v5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v10, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v11, v3, 16, v2
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
@@ -101,14 +107,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
|
||||
; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX11-NEXT: v_perm_b32 v10, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v11, v3, v2, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-NEXT: s_mov_b32 s0, s2
|
||||
; GFX11-NEXT: s_mov_b32 s1, s3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-NEXT: v_mov_b32_e32 v7, v5
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, v5
|
||||
; GFX11-NEXT: v_mov_b32_e32 v9, v5
|
||||
; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v0
|
||||
; GFX11-NEXT: v_lshl_or_b32 v11, v3, 16, v2
|
||||
; GFX11-NEXT: s_mov_b32 s2, s4
|
||||
; GFX11-NEXT: s_mov_b32 s3, s5
|
||||
; GFX11-NEXT: s_mov_b32 s4, s6
|
||||
@@ -135,22 +143,23 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
|
||||
define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
|
||||
; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX9-NEXT: v_perm_b32 v10, v1, v0, s8
|
||||
; GFX9-NEXT: v_perm_b32 v11, v3, v2, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v11, v3, 16, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v9, v5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v5
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, v6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v7
|
||||
@@ -165,14 +174,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
|
||||
; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX10-NEXT: v_perm_b32 v10, v1, v0, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v11, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, v5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, v5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, v5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v10, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v11, v3, 16, v2
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
@@ -193,14 +204,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
|
||||
; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_mov_b32_e32 v5, 0
|
||||
; GFX11-NEXT: v_perm_b32 v10, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v11, v3, v2, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-NEXT: s_mov_b32 s0, s2
|
||||
; GFX11-NEXT: s_mov_b32 s1, s3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GFX11-NEXT: v_mov_b32_e32 v7, v5
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, v5
|
||||
; GFX11-NEXT: v_mov_b32_e32 v9, v5
|
||||
; GFX11-NEXT: v_lshl_or_b32 v10, v1, 16, v0
|
||||
; GFX11-NEXT: v_lshl_or_b32 v11, v3, 16, v2
|
||||
; GFX11-NEXT: s_mov_b32 s2, s4
|
||||
; GFX11-NEXT: s_mov_b32 s3, s5
|
||||
; GFX11-NEXT: s_mov_b32 s4, s6
|
||||
|
||||
@@ -6,27 +6,28 @@
|
||||
define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %r) {
|
||||
; GFX9-LABEL: load_3d_v4f32_xyzw:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_perm_b32 v1, v1, v0, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
|
||||
; GFX9-NEXT: image_load v[0:3], v[1:2], s[0:7] dmask:0xf unorm a16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: load_3d_v4f32_xyzw:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: v_perm_b32 v1, v1, v0, 0x5040100
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10PLUS-NEXT: v_lshl_or_b32 v1, v1, 16, v0
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s4, s6
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s5, s7
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s6, s8
|
||||
@@ -41,22 +42,22 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
|
||||
define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %r) {
|
||||
; GFX9-LABEL: load_3d_v4f32_xyzw_tfe:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, v2
|
||||
; GFX9-NEXT: v_perm_b32 v5, v1, v0, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v10, v7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v11, v7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v7
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, v8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v9
|
||||
@@ -71,14 +72,15 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
|
||||
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, v2
|
||||
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x5040100
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, v7
|
||||
; GFX10-NEXT: v_mov_b32_e32 v11, v7
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v1, 16, v0
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
@@ -99,7 +101,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
|
||||
; GFX11-LABEL: load_3d_v4f32_xyzw_tfe:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0
|
||||
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: s_mov_b32 s0, s2
|
||||
; GFX11-NEXT: s_mov_b32 s1, s3
|
||||
; GFX11-NEXT: s_mov_b32 s2, s4
|
||||
@@ -107,6 +109,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
|
||||
; GFX11-NEXT: v_mov_b32_e32 v11, v7
|
||||
; GFX11-NEXT: v_mov_b32_e32 v10, v7
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0
|
||||
; GFX11-NEXT: s_mov_b32 s3, s5
|
||||
; GFX11-NEXT: s_mov_b32 s4, s6
|
||||
; GFX11-NEXT: s_mov_b32 s5, s7
|
||||
@@ -130,22 +133,22 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
|
||||
define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i16 %s, i16 %t, i16 %r) {
|
||||
; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s8, 0x5040100
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v6, v2
|
||||
; GFX9-NEXT: v_perm_b32 v5, v1, v0, s8
|
||||
; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v10, v7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v11, v7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v7
|
||||
; GFX9-NEXT: s_mov_b32 s0, s2
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_mov_b32 s2, s4
|
||||
; GFX9-NEXT: s_mov_b32 s3, s5
|
||||
; GFX9-NEXT: s_mov_b32 s4, s6
|
||||
; GFX9-NEXT: s_mov_b32 s5, s7
|
||||
; GFX9-NEXT: s_mov_b32 s6, s8
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, v8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, v9
|
||||
@@ -160,14 +163,15 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
|
||||
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, v2
|
||||
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x5040100
|
||||
; GFX10-NEXT: s_mov_b32 s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, v7
|
||||
; GFX10-NEXT: v_mov_b32_e32 v11, v7
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v1, 16, v0
|
||||
; GFX10-NEXT: s_mov_b32 s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s3, s5
|
||||
; GFX10-NEXT: s_mov_b32 s4, s6
|
||||
@@ -188,7 +192,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
|
||||
; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, 0
|
||||
; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: s_mov_b32 s0, s2
|
||||
; GFX11-NEXT: s_mov_b32 s1, s3
|
||||
; GFX11-NEXT: s_mov_b32 s2, s4
|
||||
@@ -196,6 +200,7 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
|
||||
; GFX11-NEXT: v_mov_b32_e32 v11, v7
|
||||
; GFX11-NEXT: v_mov_b32_e32 v10, v7
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0
|
||||
; GFX11-NEXT: s_mov_b32 s3, s5
|
||||
; GFX11-NEXT: s_mov_b32 s4, s6
|
||||
; GFX11-NEXT: s_mov_b32 s5, s7
|
||||
|
||||
@@ -15,8 +15,10 @@ main_body:
|
||||
define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
|
||||
; GFX10-LABEL: sample_cd_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
@@ -39,8 +41,10 @@ main_body:
|
||||
define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
|
||||
; GFX10-LABEL: sample_c_cd_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3
|
||||
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
@@ -63,8 +67,10 @@ main_body:
|
||||
define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
|
||||
; GFX10-LABEL: sample_cd_cl_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
@@ -88,10 +94,11 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX10-LABEL: sample_c_cd_cl_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, v2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, v3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v3
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v1
|
||||
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
|
||||
@@ -22,16 +22,20 @@ main_body:
|
||||
define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
|
||||
; GFX10-LABEL: sample_d_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX11-LABEL: sample_d_2d:
|
||||
; GFX11: ; %bb.0: ; %main_body
|
||||
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
@@ -44,17 +48,21 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
|
||||
; GFX10-LABEL: sample_d_3d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, v3
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX10-NEXT: v_perm_b32 v2, v1, v0, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff, v9
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9
|
||||
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX11-LABEL: sample_d_3d:
|
||||
; GFX11: ; %bb.0: ; %main_body
|
||||
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v1, v4, v3, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3
|
||||
; GFX11-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v1, v5, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
@@ -83,16 +91,20 @@ main_body:
|
||||
define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
|
||||
; GFX10-LABEL: sample_c_d_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v4, 16, v3
|
||||
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX11-LABEL: sample_c_d_2d:
|
||||
; GFX11: ; %bb.0: ; %main_body
|
||||
; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v2, v4, 16, v3
|
||||
; GFX11-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
@@ -121,16 +133,20 @@ main_body:
|
||||
define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
|
||||
; GFX10-LABEL: sample_d_cl_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX11-LABEL: sample_d_cl_2d:
|
||||
; GFX11: ; %bb.0: ; %main_body
|
||||
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX11-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
@@ -160,18 +176,21 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX10-LABEL: sample_c_d_cl_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, v2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, v3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX10-NEXT: v_perm_b32 v3, v8, v1, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v3
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v1
|
||||
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX11-LABEL: sample_c_d_cl_2d:
|
||||
; GFX11: ; %bb.0: ; %main_body
|
||||
; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v2, v4, 16, v3
|
||||
; GFX11-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
@@ -185,19 +204,22 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, v2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, v3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v11, v4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX10-NEXT: v_perm_b32 v4, v10, v9, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v5, v5, v11, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v9
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0
|
||||
; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX11-LABEL: sample_c_d_o_2darray_V1:
|
||||
; GFX11: ; %bb.0: ; %main_body
|
||||
; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v3, v5, v4, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX11-NEXT: v_lshl_or_b32 v3, v5, 16, v4
|
||||
; GFX11-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
@@ -211,19 +233,22 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, v2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, v3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v11, v4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX10-NEXT: v_perm_b32 v4, v10, v9, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v5, v5, v11, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v9
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0
|
||||
; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX11-LABEL: sample_c_d_o_2darray_V2:
|
||||
; GFX11: ; %bb.0: ; %main_body
|
||||
; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v3, v5, v4, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX11-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX11-NEXT: v_lshl_or_b32 v3, v5, 16, v4
|
||||
; GFX11-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
|
||||
@@ -64,10 +64,13 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %
|
||||
;
|
||||
; GFX11-LABEL: image_bvh_intersect_ray_a16:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_perm_b32 v9, v5, v7, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v10, v5, v7, 0x7060302
|
||||
; GFX11-NEXT: v_perm_b32 v11, v6, v8, 0x5040100
|
||||
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v[2:4], v[9:11]], s[0:3] a16
|
||||
; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v7
|
||||
; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v8
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_lshl_or_b32 v8, v5, 16, v9
|
||||
; GFX11-NEXT: v_perm_b32 v9, v5, v7, 0x7060302
|
||||
; GFX11-NEXT: v_lshl_or_b32 v10, v6, 16, v10
|
||||
; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, v[2:4], v[8:10]], s[0:3] a16
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
|
||||
@@ -124,10 +127,13 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
|
||||
;
|
||||
; GFX11-LABEL: image_bvh64_intersect_ray_a16:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_perm_b32 v10, v6, v8, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v11, v6, v8, 0x7060302
|
||||
; GFX11-NEXT: v_perm_b32 v12, v7, v9, 0x5040100
|
||||
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[0:1], v2, v[3:5], v[10:12]], s[0:3] a16
|
||||
; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v8
|
||||
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v9
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_lshl_or_b32 v9, v6, 16, v10
|
||||
; GFX11-NEXT: v_perm_b32 v10, v6, v8, 0x7060302
|
||||
; GFX11-NEXT: v_lshl_or_b32 v11, v7, 16, v11
|
||||
; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[0:1], v2, v[3:5], v[9:11]], s[0:3] a16
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: ; return to shader part epilog
|
||||
%v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
|
||||
@@ -327,12 +333,14 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
|
||||
; GFX11-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
|
||||
; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_and_b32 v0, 0xffff, v7
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v8
|
||||
; GFX11-NEXT: v_dual_mov_b32 v13, v2 :: v_dual_mov_b32 v14, v3
|
||||
; GFX11-NEXT: v_mov_b32_e32 v15, v4
|
||||
; GFX11-NEXT: v_perm_b32 v4, v5, v7, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v5, v5, v7, 0x7060302
|
||||
; GFX11-NEXT: v_perm_b32 v6, v6, v8, 0x5040100
|
||||
; GFX11-NEXT: s_mov_b32 s1, exec_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_lshl_or_b32 v4, v5, 16, v0
|
||||
; GFX11-NEXT: v_perm_b32 v5, v5, v7, 0x7060302
|
||||
; GFX11-NEXT: v_lshl_or_b32 v6, v6, 16, v1
|
||||
; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s4, v9
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s5, v10
|
||||
@@ -558,11 +566,14 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
|
||||
; GFX11-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v1
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v8
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v9
|
||||
; GFX11-NEXT: v_dual_mov_b32 v19, v2 :: v_dual_mov_b32 v14, v3
|
||||
; GFX11-NEXT: v_dual_mov_b32 v15, v4 :: v_dual_mov_b32 v16, v5
|
||||
; GFX11-NEXT: v_perm_b32 v4, v6, v8, 0x5040100
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
||||
; GFX11-NEXT: v_lshl_or_b32 v4, v6, 16, v0
|
||||
; GFX11-NEXT: v_perm_b32 v5, v6, v8, 0x7060302
|
||||
; GFX11-NEXT: v_perm_b32 v6, v7, v9, 0x5040100
|
||||
; GFX11-NEXT: v_lshl_or_b32 v6, v7, 16, v1
|
||||
; GFX11-NEXT: s_mov_b32 s1, exec_lo
|
||||
; GFX11-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s4, v10
|
||||
@@ -713,41 +724,23 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
|
||||
; GFX1030: ; %bb.0:
|
||||
; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
||||
; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0
|
||||
; GFX1030-NEXT: s_movk_i32 s9, 0x4600
|
||||
; GFX1030-NEXT: s_movk_i32 s8, 0x4700
|
||||
; GFX1030-NEXT: s_bfe_u32 s8, s8, 0x100000
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700
|
||||
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX1030-NEXT: s_movk_i32 s1, 0x4400
|
||||
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
|
||||
; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
|
||||
; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
|
||||
; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
|
||||
; GFX1030-NEXT: s_movk_i32 s2, 0x4200
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
|
||||
; GFX1030-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GFX1030-NEXT: flat_load_dword v1, v[2:3]
|
||||
; GFX1030-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX1030-NEXT: s_movk_i32 s3, 0x4800
|
||||
; GFX1030-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; GFX1030-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX1030-NEXT: s_movk_i32 s0, 0x4500
|
||||
; GFX1030-NEXT: s_or_b32 s1, s2, s1
|
||||
; GFX1030-NEXT: s_bfe_u32 s2, s9, 0x100000
|
||||
; GFX1030-NEXT: s_bfe_u32 s3, s3, 0x100000
|
||||
; GFX1030-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX1030-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX1030-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX1030-NEXT: s_or_b32 s0, s0, s2
|
||||
; GFX1030-NEXT: s_or_b32 s2, s8, s3
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v7, s2
|
||||
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
|
||||
; GFX1030-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -758,41 +751,23 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
|
||||
; GFX1013: ; %bb.0:
|
||||
; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
|
||||
; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0
|
||||
; GFX1013-NEXT: s_movk_i32 s9, 0x4600
|
||||
; GFX1013-NEXT: s_movk_i32 s8, 0x4700
|
||||
; GFX1013-NEXT: s_bfe_u32 s8, s8, 0x100000
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700
|
||||
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX1013-NEXT: s_movk_i32 s1, 0x4400
|
||||
; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6
|
||||
; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
|
||||
; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6
|
||||
; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
|
||||
; GFX1013-NEXT: s_movk_i32 s2, 0x4200
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500
|
||||
; GFX1013-NEXT: flat_load_dword v0, v[4:5]
|
||||
; GFX1013-NEXT: flat_load_dword v1, v[2:3]
|
||||
; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX1013-NEXT: s_movk_i32 s3, 0x4800
|
||||
; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; GFX1013-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX1013-NEXT: s_movk_i32 s0, 0x4500
|
||||
; GFX1013-NEXT: s_or_b32 s1, s2, s1
|
||||
; GFX1013-NEXT: s_bfe_u32 s2, s9, 0x100000
|
||||
; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000
|
||||
; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX1013-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX1013-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX1013-NEXT: s_or_b32 s0, s0, s2
|
||||
; GFX1013-NEXT: s_or_b32 s2, s8, s3
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v7, s2
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200
|
||||
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16
|
||||
; GFX1013-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -967,38 +942,20 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
|
||||
; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
|
||||
; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; GFX1030-NEXT: s_movk_i32 s6, 0x4200
|
||||
; GFX1030-NEXT: s_movk_i32 s7, 0x4800
|
||||
; GFX1030-NEXT: s_bfe_u32 s6, s6, 0x100000
|
||||
; GFX1030-NEXT: s_movk_i32 s9, 0x4600
|
||||
; GFX1030-NEXT: s_movk_i32 s8, 0x4700
|
||||
; GFX1030-NEXT: s_bfe_u32 s7, s7, 0x100000
|
||||
; GFX1030-NEXT: s_bfe_u32 s8, s8, 0x100000
|
||||
; GFX1030-NEXT: s_lshl_b32 s7, s7, 16
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
|
||||
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX1030-NEXT: s_movk_i32 s5, 0x4400
|
||||
; GFX1030-NEXT: s_movk_i32 s4, 0x4500
|
||||
; GFX1030-NEXT: s_bfe_u32 s5, s5, 0x100000
|
||||
; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
|
||||
; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
|
||||
; GFX1030-NEXT: s_lshl_b32 s5, s5, 16
|
||||
; GFX1030-NEXT: s_bfe_u32 s4, s4, 0x100000
|
||||
; GFX1030-NEXT: s_or_b32 s5, s6, s5
|
||||
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GFX1030-NEXT: s_bfe_u32 s6, s9, 0x100000
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
|
||||
; GFX1030-NEXT: s_lshl_b32 s6, s6, 16
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
|
||||
; GFX1030-NEXT: s_or_b32 s4, s4, s6
|
||||
; GFX1030-NEXT: s_or_b32 s6, s8, s7
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v6, s5
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v7, s4
|
||||
; GFX1030-NEXT: v_mov_b32_e32 v8, s6
|
||||
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
|
||||
; GFX1030-NEXT: s_waitcnt vmcnt(0)
|
||||
@@ -1011,38 +968,20 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
|
||||
; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
|
||||
; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
|
||||
; GFX1013-NEXT: s_movk_i32 s1, 0x4400
|
||||
; GFX1013-NEXT: s_movk_i32 s9, 0x4600
|
||||
; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX1013-NEXT: s_movk_i32 s0, 0x4500
|
||||
; GFX1013-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX1013-NEXT: s_movk_i32 s8, 0x4700
|
||||
; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX1013-NEXT: s_bfe_u32 s8, s8, 0x100000
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
|
||||
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX1013-NEXT: s_movk_i32 s2, 0x4200
|
||||
; GFX1013-NEXT: s_movk_i32 s3, 0x4800
|
||||
; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
|
||||
; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
|
||||
; GFX1013-NEXT: s_or_b32 s1, s2, s1
|
||||
; GFX1013-NEXT: s_bfe_u32 s2, s9, 0x100000
|
||||
; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000
|
||||
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GFX1013-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX1013-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX1013-NEXT: s_or_b32 s0, s0, s2
|
||||
; GFX1013-NEXT: s_or_b32 s2, s8, s3
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v6, s1
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v7, s0
|
||||
; GFX1013-NEXT: v_mov_b32_e32 v8, s2
|
||||
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
|
||||
; GFX1013-NEXT: s_waitcnt vmcnt(0)
|
||||
|
||||
@@ -124,10 +124,10 @@ define amdgpu_ps float @raw_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_vof
|
||||
; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -183,10 +183,10 @@ define amdgpu_ps void @raw_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__sgp
|
||||
; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -83,10 +83,10 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vgpr_
|
||||
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -146,10 +146,10 @@ define amdgpu_ps void @raw_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cmp__
|
||||
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -166,10 +166,10 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp
|
||||
; GFX908-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX908-NEXT: {{ $}}
|
||||
; GFX908-NEXT: bb.3:
|
||||
@@ -217,10 +217,10 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgp
|
||||
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: bb.3:
|
||||
@@ -272,10 +272,10 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v
|
||||
; GFX908-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
|
||||
; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX908-NEXT: {{ $}}
|
||||
; GFX908-NEXT: bb.3:
|
||||
@@ -321,10 +321,10 @@ define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_v
|
||||
; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
|
||||
; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: bb.3:
|
||||
|
||||
@@ -174,10 +174,10 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr
|
||||
; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
@@ -224,10 +224,10 @@ define amdgpu_ps half @raw_buffer_load_format_f16__vgpr_rsrc__sgpr_voffset__vgpr
|
||||
; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
|
||||
@@ -124,10 +124,10 @@ define amdgpu_ps float @raw_buffer_load_format_f32__vgpr_rsrc__sgpr_voffset__vgp
|
||||
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -73,7 +73,7 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__sgpr_soffs
|
||||
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -125,10 +125,10 @@ define amdgpu_ps float @raw_buffer_load_f32__vgpr_rsrc__vgpr_voffset__vgpr_soffs
|
||||
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -523,7 +523,7 @@ define amdgpu_ps half @raw_buffer_load_f16__vgpr_rsrc__vgpr_voffset__sgpr_soffse
|
||||
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -575,7 +575,7 @@ define amdgpu_ps float @raw_buffer_load_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffse
|
||||
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -709,7 +709,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
|
||||
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 7)
|
||||
; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
|
||||
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
|
||||
@@ -854,7 +854,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
|
||||
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -893,7 +893,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
|
||||
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.2:
|
||||
@@ -910,7 +910,7 @@ define amdgpu_ps float @raw_buffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffs
|
||||
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -183,7 +183,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; UNPACKED-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
@@ -231,7 +231,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
@@ -417,7 +417,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; UNPACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
|
||||
; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
|
||||
; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
|
||||
; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
|
||||
; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
|
||||
@@ -438,7 +438,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; PACKED-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; PACKED-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
|
||||
; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
|
||||
; PACKED-NEXT: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 7)
|
||||
; PACKED-NEXT: S_ENDPGM 0
|
||||
%voffset.add = add i32 %voffset, 4096
|
||||
@@ -465,7 +465,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; UNPACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; UNPACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; UNPACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
|
||||
; UNPACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
|
||||
; UNPACKED-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
|
||||
; UNPACKED-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
|
||||
; UNPACKED-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY4]], implicit $exec
|
||||
@@ -488,7 +488,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
@@ -522,7 +522,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; PACKED-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; PACKED-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; PACKED-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
|
||||
; PACKED-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
|
||||
; PACKED-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.2:
|
||||
@@ -539,7 +539,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; PACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
|
||||
@@ -139,7 +139,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; CHECK-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -263,7 +263,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec
|
||||
; CHECK-NEXT: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), align 1, addrspace 7)
|
||||
; CHECK-NEXT: S_ENDPGM 0
|
||||
%voffset.add = add i32 %voffset, 4096
|
||||
@@ -293,7 +293,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY8]], [[COPY10]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.2:
|
||||
@@ -310,7 +310,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
|
||||
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -75,7 +75,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
|
||||
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -170,10 +170,10 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr
|
||||
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -524,7 +524,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
|
||||
; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -638,7 +638,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
|
||||
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
|
||||
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 7)
|
||||
; CHECK-NEXT: S_ENDPGM 0
|
||||
%voffset.add = add i32 %voffset, 4096
|
||||
@@ -739,7 +739,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
|
||||
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
|
||||
; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s16>), align 1, addrspace 7)
|
||||
; CHECK-NEXT: S_ENDPGM 0
|
||||
%voffset.add = add i32 %voffset, 4096
|
||||
@@ -764,7 +764,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
|
||||
; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr2
|
||||
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
|
||||
; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
|
||||
; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.2:
|
||||
@@ -781,7 +781,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr
|
||||
; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -835,7 +835,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr
|
||||
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -172,10 +172,10 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs
|
||||
; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
@@ -222,10 +222,10 @@ define amdgpu_ps half @raw_tbuffer_load_f16__vgpr_rsrc__sgpr_voffset__vgpr_soffs
|
||||
; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
|
||||
@@ -123,10 +123,10 @@ define amdgpu_ps float @raw_tbuffer_load_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff
|
||||
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -151,7 +151,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff
|
||||
; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
@@ -197,7 +197,7 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__sgpr_soff
|
||||
; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
@@ -249,10 +249,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff
|
||||
; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
@@ -298,10 +298,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__vgpr_voffset__vgpr_soff
|
||||
; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
@@ -354,10 +354,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff
|
||||
; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
@@ -404,10 +404,10 @@ define amdgpu_ps void @raw_tbuffer_store_f16__vgpr_rsrc__sgpr_voffset__vgpr_soff
|
||||
; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
|
||||
@@ -66,7 +66,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs
|
||||
; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
@@ -112,7 +112,7 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__sgpr_soffs
|
||||
; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
@@ -164,10 +164,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs
|
||||
; UNPACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
@@ -213,10 +213,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__vgpr_voffset__vgpr_soffs
|
||||
; PACKED-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
@@ -269,10 +269,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs
|
||||
; UNPACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
@@ -319,10 +319,10 @@ define amdgpu_ps void @raw_tbuffer_store_i8__vgpr_rsrc__sgpr_voffset__vgpr_soffs
|
||||
; PACKED-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
|
||||
@@ -143,7 +143,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__sgpr_soff
|
||||
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -195,10 +195,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__vgpr_voffset__vgpr_soff
|
||||
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -251,10 +251,10 @@ define amdgpu_ps void @raw_tbuffer_store_f32__vgpr_rsrc__sgpr_voffset__vgpr_soff
|
||||
; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -613,7 +613,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
|
||||
; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -669,7 +669,7 @@ define amdgpu_ps void @raw_tbuffer_store_f32__sgpr_rsrc__vgpr_voffset__sgpr_soff
|
||||
; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -2713,7 +2713,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
|
||||
; GFX6-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -2760,7 +2760,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
|
||||
; GFX7-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -2807,7 +2807,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc(<4 x i32> %rsrc, i32 inreg %
|
||||
; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -2858,7 +2858,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
|
||||
; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -2903,7 +2903,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
|
||||
; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -2948,7 +2948,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4092(<4 x i32> %
|
||||
; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -3004,7 +3004,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
|
||||
; GFX6-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -3053,7 +3053,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
|
||||
; GFX7-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -3102,7 +3102,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_soffset_add_4096(<4 x i32> %
|
||||
; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -3154,7 +3154,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc)
|
||||
; GFX6-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -3199,7 +3199,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc)
|
||||
; GFX7-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -3244,7 +3244,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4095(<4 x i32> %rsrc)
|
||||
; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -3297,7 +3297,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc)
|
||||
; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -3344,7 +3344,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc)
|
||||
; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -3389,7 +3389,7 @@ define amdgpu_ps float @s_buffer_load_f32_vgpr_rsrc_offset_4096(<4 x i32> %rsrc)
|
||||
; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -3441,7 +3441,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
|
||||
; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -3503,7 +3503,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
|
||||
; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -3565,7 +3565,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4064(<4 x i32> %
|
||||
; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -3639,7 +3639,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
|
||||
; GFX6-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -3705,7 +3705,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
|
||||
; GFX7-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -3771,7 +3771,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4068(<4 x i32> %
|
||||
; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -3843,7 +3843,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
|
||||
; GFX6-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -3909,7 +3909,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
|
||||
; GFX7-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -3975,7 +3975,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_rsrc_add_4096(<4 x i32> %
|
||||
; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY9]], [[COPY7]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -4044,7 +4044,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
|
||||
; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -4107,7 +4107,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
|
||||
; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -4170,7 +4170,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_5000
|
||||
; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -4239,7 +4239,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
|
||||
; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -4302,7 +4302,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
|
||||
; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -4365,7 +4365,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4076
|
||||
; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -4434,7 +4434,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
|
||||
; GFX6-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -4497,7 +4497,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
|
||||
; GFX7-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -4560,7 +4560,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_add_4080
|
||||
; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY8]], [[COPY6]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -4628,7 +4628,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
|
||||
; GFX6-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec
|
||||
; GFX6-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX6-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX6-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX6-NEXT: {{ $}}
|
||||
; GFX6-NEXT: bb.3:
|
||||
@@ -4690,7 +4690,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
|
||||
; GFX7-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec
|
||||
; GFX7-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX7-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX7-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX7-NEXT: {{ $}}
|
||||
; GFX7-NEXT: bb.3:
|
||||
@@ -4752,7 +4752,7 @@ define amdgpu_ps <8 x float> @s_buffer_load_v8f32_vgpr_offset_vgpr_rsrc_offset_4
|
||||
; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY6]], [[COPY4]], implicit $exec
|
||||
; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY7]], [[COPY5]], implicit $exec
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: bb.3:
|
||||
@@ -4900,7 +4900,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
|
||||
; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
|
||||
; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
|
||||
; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
|
||||
@@ -4917,7 +4917,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
|
||||
; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
|
||||
; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
|
||||
; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
|
||||
@@ -4934,7 +4934,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_vgpr_sgpr_imm(<4 x i32> inr
|
||||
; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
|
||||
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
|
||||
@@ -4958,7 +4958,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
|
||||
; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
|
||||
; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
|
||||
; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
|
||||
; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
|
||||
; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
|
||||
@@ -4975,7 +4975,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
|
||||
; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
|
||||
; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
|
||||
; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
|
||||
; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
|
||||
; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
|
||||
@@ -4992,7 +4992,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_sgpr_vgpr_imm(<4 x i32> inr
|
||||
; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]]
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
|
||||
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
|
||||
@@ -5073,7 +5073,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
|
||||
; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
|
||||
; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
|
||||
; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
|
||||
; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
|
||||
@@ -5090,7 +5090,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
|
||||
; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
|
||||
; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
|
||||
; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
|
||||
; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
|
||||
@@ -5107,7 +5107,7 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr
|
||||
; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
|
||||
; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
|
||||
; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec
|
||||
; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
|
||||
; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
|
||||
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
|
||||
|
||||
@@ -14,7 +14,8 @@ define i32 @v_sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX908-LABEL: v_sdot2:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2:
|
||||
@@ -60,9 +61,9 @@ define amdgpu_ps float @v_sdot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inr
|
||||
;
|
||||
; GFX908-LABEL: v_sdot2_sgpr_sgpr_sgpr:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, s0, v0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v0, s0, v1
|
||||
; GFX908-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_sgpr_sgpr_sgpr:
|
||||
@@ -85,7 +86,8 @@ define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) {
|
||||
; GFX908-LABEL: v_sdot2_inline_literal_a:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v1, 0x40004, v0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_inline_literal_a:
|
||||
@@ -108,7 +110,8 @@ define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) {
|
||||
; GFX908-LABEL: v_sdot2_inline_literal_b:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v1, 0x40004, v0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_inline_literal_b:
|
||||
@@ -131,7 +134,9 @@ define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
|
||||
; GFX908-LABEL: v_sdot2_inline_literal_a_b:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 0x40004
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v0, 0x80008, v1
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_inline_literal_a_b:
|
||||
@@ -154,7 +159,9 @@ define i32 @v_sdot2_inline_literal_a_b_c() {
|
||||
; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
|
||||
; GFX908-NEXT: v_mov_b32_e32 v1, 0x40004
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, 8
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v0, 0x80008, v1
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:
|
||||
@@ -177,7 +184,9 @@ define i32 @v_sdot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
|
||||
; GFX908-LABEL: v_sdot2_inline_literal_c:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, 7
|
||||
; GFX908-NEXT: v_mov_b32_e32 v2, 7
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_inline_literal_c:
|
||||
@@ -200,7 +209,9 @@ define i32 @v_sdot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX908-LABEL: v_sdot2_fneg_a:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
|
||||
; GFX908-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_fneg_a:
|
||||
@@ -225,7 +236,9 @@ define i32 @v_sdot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
|
||||
; GFX908-LABEL: v_sdot2_fneg_b:
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
|
||||
; GFX908-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_fneg_b:
|
||||
@@ -252,7 +265,8 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_fnegf32_c:
|
||||
@@ -280,7 +294,8 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_fnegv2f16_c:
|
||||
@@ -308,7 +323,8 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_shuffle10_a:
|
||||
@@ -335,7 +351,8 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
|
||||
; GFX908: ; %bb.0:
|
||||
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16
|
||||
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
|
||||
; GFX908-NEXT: v_dot2c_i32_i16_e32 v2, v0, v1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX908-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: v_sdot2_shuffle10_b:
|
||||
|
||||
@@ -14,7 +14,8 @@ define i32 @v_sdot4(i32 %a, i32 %b, i32 %c) {
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
|
||||
; GFX10-NEXT: v_dot4c_i32_i8_e32 v2, v0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%r = call i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 false)
|
||||
ret i32 %r
|
||||
@@ -80,7 +81,8 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6
|
||||
; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2
|
||||
; GFX10-NEXT: v_or3_b32 v1, v3, v4, v5
|
||||
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v8
|
||||
; GFX10-NEXT: v_dot4c_i32_i8_e32 v8, v0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, v8
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%a.cast = bitcast <4 x i8> %a to i32
|
||||
%b.cast = bitcast <4 x i8> %b to i32
|
||||
@@ -101,7 +103,8 @@ define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) {
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
|
||||
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
|
||||
; GFX10-NEXT: v_dot4c_i32_i8_e32 v2, v0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg float %a
|
||||
%cast.neg.a = bitcast float %neg.a to i32
|
||||
@@ -122,7 +125,8 @@ define i32 @v_sdot4_fnegv2f16_a(<2 x half> %a, i32 %b, i32 %c) {
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX10-NEXT: v_dot4_i32_i8 v0, v0, v1, v2
|
||||
; GFX10-NEXT: v_dot4c_i32_i8_e32 v2, v0, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.a = fneg <2 x half> %a
|
||||
%cast.neg.a = bitcast <2 x half> %neg.a to i32
|
||||
|
||||
@@ -134,10 +134,10 @@ define amdgpu_ps float @struct_buffer_atomic_add_i32__sgpr_val__vgpr_rsrc__sgpr_
|
||||
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -196,10 +196,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_i32_noret__sgpr_val__vgpr_rsrc__
|
||||
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -90,10 +90,10 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__sgpr_val__sgpr_cmp__vg
|
||||
; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
@@ -156,10 +156,10 @@ define amdgpu_ps void @struct_buffer_atomic_cmpswap_i32_noret__sgpr_val__sgpr_cm
|
||||
; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY8]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -180,10 +180,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
|
||||
; GFX908-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
|
||||
; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
|
||||
; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX908-NEXT: {{ $}}
|
||||
; GFX908-NEXT: bb.3:
|
||||
@@ -234,10 +234,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
|
||||
; GFX90A-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
|
||||
; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
|
||||
; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: bb.3:
|
||||
@@ -292,10 +292,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
|
||||
; GFX908-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX908-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; GFX908-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; GFX908-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; GFX908-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX908-NEXT: {{ $}}
|
||||
; GFX908-NEXT: bb.3:
|
||||
@@ -343,10 +343,10 @@ define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__
|
||||
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; GFX90A-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; GFX90A-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; GFX90A-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; GFX90A-NEXT: {{ $}}
|
||||
; GFX90A-NEXT: bb.3:
|
||||
|
||||
@@ -187,10 +187,10 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin
|
||||
; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
@@ -261,10 +261,10 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin
|
||||
; PACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
|
||||
@@ -133,10 +133,10 @@ define amdgpu_ps <4 x float> @struct_buffer_load_format_v4f32__vpr_rsrc__sgpr_vi
|
||||
; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -204,10 +204,10 @@ define amdgpu_ps float @struct_buffer_load_f32__vgpr_rsrc__sgpr_vindex__sgpr_vof
|
||||
; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -166,10 +166,10 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr
|
||||
; UNPACKED-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
@@ -220,10 +220,10 @@ define amdgpu_ps void @struct_buffer_store_format_f16__sgpr_val__vgpr_rsrc__sgpr
|
||||
; PACKED-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
|
||||
@@ -128,10 +128,10 @@ define amdgpu_ps void @struct_buffer_store_format_f32__sgpr_val__vgpr_rsrc__sgpr
|
||||
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY13]], [[COPY11]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY14]], [[COPY12]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -133,10 +133,10 @@ define amdgpu_ps void @struct_buffer_store_v4f32_vgpr_rsrc__sgpr_val__sgpr_vinde
|
||||
; CHECK-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -227,10 +227,10 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__
|
||||
; PACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; PACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; PACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; PACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; PACKED-NEXT: {{ $}}
|
||||
; PACKED-NEXT: bb.3:
|
||||
@@ -283,10 +283,10 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__
|
||||
; UNPACKED-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; UNPACKED-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; UNPACKED-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; UNPACKED-NEXT: {{ $}}
|
||||
; UNPACKED-NEXT: bb.3:
|
||||
|
||||
@@ -156,10 +156,10 @@ define amdgpu_ps <4 x float> @struct_tbuffer_load_v4f32__vgpr_rsrc__sgpr_vindex_
|
||||
; CHECK-NEXT: [[COPY12:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY12]], [[COPY10]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc
|
||||
; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def $scc
|
||||
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: bb.3:
|
||||
|
||||
@@ -662,13 +662,6 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
|
||||
; SI-NEXT: s_mov_b64 s[0:1], exec
|
||||
; SI-NEXT: s_wqm_b64 exec, exec
|
||||
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
|
||||
; SI-NEXT: s_movk_i32 s2, 0x3c00
|
||||
; SI-NEXT: s_bfe_u32 s3, 0, 0x100000
|
||||
; SI-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; SI-NEXT: s_lshl_b32 s4, s3, 16
|
||||
; SI-NEXT: s_or_b32 s4, s2, s4
|
||||
; SI-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; SI-NEXT: s_or_b32 s5, s3, s2
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
||||
; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
|
||||
@@ -677,8 +670,8 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
|
||||
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB6_7
|
||||
; SI-NEXT: ; %bb.2: ; %.demote0
|
||||
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
|
||||
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
|
||||
; SI-NEXT: s_wqm_b64 s[4:5], s[0:1]
|
||||
; SI-NEXT: s_and_b64 exec, exec, s[4:5]
|
||||
; SI-NEXT: .LBB6_3: ; %.continue0
|
||||
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[0:1]
|
||||
@@ -693,8 +686,8 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
|
||||
; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
|
||||
; SI-NEXT: s_and_b64 s[2:3], s[0:1], vcc
|
||||
; SI-NEXT: s_xor_b64 s[2:3], s[2:3], -1
|
||||
; SI-NEXT: s_and_saveexec_b64 s[6:7], s[2:3]
|
||||
; SI-NEXT: s_xor_b64 s[2:3], exec, s[6:7]
|
||||
; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
|
||||
; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5]
|
||||
; SI-NEXT: s_cbranch_execz .LBB6_6
|
||||
; SI-NEXT: ; %bb.4: ; %.demote1
|
||||
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
|
||||
@@ -703,8 +696,8 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
|
||||
; SI-NEXT: s_mov_b64 exec, 0
|
||||
; SI-NEXT: .LBB6_6: ; %.continue1
|
||||
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; SI-NEXT: v_mov_b32_e32 v0, 0x3c00
|
||||
; SI-NEXT: v_bfrev_b32_e32 v1, 60
|
||||
; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
|
||||
; SI-NEXT: s_endpgm
|
||||
; SI-NEXT: .LBB6_7:
|
||||
@@ -892,13 +885,6 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
|
||||
; SI-NEXT: s_mov_b64 s[0:1], exec
|
||||
; SI-NEXT: s_wqm_b64 exec, exec
|
||||
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
|
||||
; SI-NEXT: s_movk_i32 s2, 0x3c00
|
||||
; SI-NEXT: s_bfe_u32 s3, 0, 0x100000
|
||||
; SI-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; SI-NEXT: s_lshl_b32 s4, s3, 16
|
||||
; SI-NEXT: s_or_b32 s6, s2, s4
|
||||
; SI-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; SI-NEXT: s_or_b32 s7, s3, s2
|
||||
; SI-NEXT: s_mov_b32 s4, 0
|
||||
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
|
||||
@@ -908,8 +894,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
|
||||
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB7_9
|
||||
; SI-NEXT: ; %bb.2: ; %.demote0
|
||||
; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
|
||||
; SI-NEXT: s_and_b64 exec, exec, s[8:9]
|
||||
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
|
||||
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
|
||||
; SI-NEXT: .LBB7_3: ; %.continue0.preheader
|
||||
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; SI-NEXT: s_mov_b64 s[2:3], 0
|
||||
@@ -936,8 +922,8 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
|
||||
; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
|
||||
; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc
|
||||
; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1
|
||||
; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
|
||||
; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
|
||||
; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
|
||||
; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
|
||||
; SI-NEXT: s_cbranch_execz .LBB7_4
|
||||
; SI-NEXT: ; %bb.6: ; %.demote1
|
||||
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
|
||||
@@ -945,14 +931,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB7_9
|
||||
; SI-NEXT: ; %bb.7: ; %.demote1
|
||||
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
|
||||
; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
|
||||
; SI-NEXT: s_and_b64 exec, exec, s[8:9]
|
||||
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
|
||||
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
|
||||
; SI-NEXT: s_branch .LBB7_4
|
||||
; SI-NEXT: .LBB7_8: ; %.return
|
||||
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
|
||||
; SI-NEXT: s_and_b64 exec, exec, s[0:1]
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; SI-NEXT: v_mov_b32_e32 v0, 0x3c00
|
||||
; SI-NEXT: v_bfrev_b32_e32 v1, 60
|
||||
; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
|
||||
; SI-NEXT: s_endpgm
|
||||
; SI-NEXT: .LBB7_9:
|
||||
|
||||
@@ -125,7 +125,7 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1
|
||||
; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%result = mul i16 %num, %den
|
||||
ret i16 %result
|
||||
|
||||
@@ -99,7 +99,7 @@ define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
|
||||
; GCN-LABEL: v_orn2_i32:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GCN-NEXT: v_not_b32_e32 v1, v1
|
||||
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@@ -107,7 +107,7 @@ define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%not.src1 = xor i32 %src1, -1
|
||||
@@ -118,13 +118,13 @@ define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
|
||||
define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) {
|
||||
; GCN-LABEL: v_orn2_i32_sv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GCN-NEXT: v_not_b32_e32 v0, v0
|
||||
; GCN-NEXT: v_or_b32_e32 v0, s2, v0
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_orn2_i32_sv:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
%not.src1 = xor i32 %src1, -1
|
||||
@@ -248,8 +248,8 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
|
||||
; GCN-LABEL: v_orn2_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GCN-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GCN-NEXT: v_not_b32_e32 v2, v2
|
||||
; GCN-NEXT: v_not_b32_e32 v3, v3
|
||||
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -258,8 +258,8 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -1, v3
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v2, v2
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v3, v3
|
||||
; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v3
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -271,16 +271,16 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
|
||||
define amdgpu_ps <2 x float> @v_orn2_i64_sv(i64 inreg %src0, i64 %src1) {
|
||||
; GCN-LABEL: v_orn2_i64_sv:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GCN-NEXT: v_not_b32_e32 v0, v0
|
||||
; GCN-NEXT: v_not_b32_e32 v1, v1
|
||||
; GCN-NEXT: v_or_b32_e32 v0, s2, v0
|
||||
; GCN-NEXT: v_or_b32_e32 v1, s3, v1
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_orn2_i64_sv:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX10PLUS-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0
|
||||
; GFX10PLUS-NEXT: v_or_b32_e32 v1, s3, v1
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
@@ -466,14 +466,14 @@ define amdgpu_ps float @v_orn2_i16_sv(i16 inreg %src0, i16 %src1) {
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GCN-NEXT: v_or_b32_e32 v0, s2, v0
|
||||
; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_orn2_i16_sv:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0
|
||||
; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
%not.src1 = xor i16 %src1, -1
|
||||
%or = or i16 %src0, %not.src1
|
||||
@@ -487,14 +487,14 @@ define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) {
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_xor_b32 s0, s2, -1
|
||||
; GCN-NEXT: v_or_b32_e32 v0, s0, v0
|
||||
; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_orn2_i16_vs:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1
|
||||
; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, v0
|
||||
; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
%not.src1 = xor i16 %src1, -1
|
||||
%or = or i16 %src0, %not.src1
|
||||
|
||||
@@ -75,31 +75,29 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
|
||||
;
|
||||
; GFX8-LABEL: s_saddsat_i7:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, 0
|
||||
; GFX8-NEXT: s_max_i32 s5, s3, s4
|
||||
; GFX8-NEXT: s_min_i32 s3, s3, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, 0
|
||||
; GFX8-NEXT: s_max_i32 s4, s2, s3
|
||||
; GFX8-NEXT: s_min_i32 s2, s2, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
|
||||
; GFX8-NEXT: s_max_i32 s1, s3, s1
|
||||
; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
|
||||
; GFX8-NEXT: s_max_i32 s1, s2, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s5
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s3
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s4
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s2
|
||||
; GFX8-NEXT: s_add_i32 s0, s0, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, s2
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 9
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_saddsat_i7:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
|
||||
; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
|
||||
@@ -108,9 +106,8 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_saddsat_i7:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp
|
||||
; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0
|
||||
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@@ -189,31 +186,29 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
|
||||
;
|
||||
; GFX8-LABEL: s_saddsat_i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, 0
|
||||
; GFX8-NEXT: s_max_i32 s5, s3, s4
|
||||
; GFX8-NEXT: s_min_i32 s3, s3, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, 0
|
||||
; GFX8-NEXT: s_max_i32 s4, s2, s3
|
||||
; GFX8-NEXT: s_min_i32 s2, s2, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
|
||||
; GFX8-NEXT: s_max_i32 s1, s3, s1
|
||||
; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
|
||||
; GFX8-NEXT: s_max_i32 s1, s2, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s5
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s3
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s4
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s2
|
||||
; GFX8-NEXT: s_add_i32 s0, s0, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, s2
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 8
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_saddsat_i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_add_i16 v0, s0, v0 clamp
|
||||
; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
|
||||
@@ -222,9 +217,8 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_saddsat_i8:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX10PLUS-NEXT: v_add_nc_i16 v0, s0, s1 clamp
|
||||
; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
|
||||
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@@ -300,9 +294,10 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4
|
||||
; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp
|
||||
@@ -318,9 +313,11 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp
|
||||
@@ -335,8 +332,10 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_add_i16 v0, v0, v1 clamp
|
||||
@@ -387,45 +386,44 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
|
||||
;
|
||||
; GFX8-LABEL: s_saddsat_v2i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s6, 0
|
||||
; GFX8-NEXT: s_max_i32 s7, s5, s6
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s6
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, 0
|
||||
; GFX8-NEXT: s_max_i32 s6, s4, s5
|
||||
; GFX8-NEXT: s_min_i32 s4, s4, s5
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s1, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s4
|
||||
; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s5
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7
|
||||
; GFX8-NEXT: s_max_i32 s1, s5, s1
|
||||
; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
|
||||
; GFX8-NEXT: s_max_i32 s1, s4, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s7
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s6
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s4
|
||||
; GFX8-NEXT: s_add_i32 s0, s0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s2, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s3, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s2, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s3, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s1
|
||||
; GFX8-NEXT: s_max_i32 s5, s3, s6
|
||||
; GFX8-NEXT: s_min_i32 s3, s3, s6
|
||||
; GFX8-NEXT: s_max_i32 s4, s3, s5
|
||||
; GFX8-NEXT: s_min_i32 s3, s3, s5
|
||||
; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5
|
||||
; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4
|
||||
; GFX8-NEXT: s_max_i32 s2, s3, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s4
|
||||
; GFX8-NEXT: s_min_i32 s2, s2, s3
|
||||
; GFX8-NEXT: s_add_i32 s1, s1, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s1, s1, s4
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, s4
|
||||
; GFX8-NEXT: s_ashr_i32 s1, s1, 8
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 8
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
|
||||
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@@ -628,14 +626,15 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX9-LABEL: v_saddsat_v4i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
||||
; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6
|
||||
; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
@@ -662,15 +661,17 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
||||
; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, 24
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_add_i16 v2, v2, v3 clamp
|
||||
@@ -692,12 +693,14 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
|
||||
; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
@@ -785,47 +788,46 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
|
||||
;
|
||||
; GFX8-LABEL: s_saddsat_v4i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s4, s0, 24
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s9, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s10, 0
|
||||
; GFX8-NEXT: s_max_i32 s11, s9, s10
|
||||
; GFX8-NEXT: s_min_i32 s9, s9, s10
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s8, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s9, 0
|
||||
; GFX8-NEXT: s_max_i32 s10, s8, s9
|
||||
; GFX8-NEXT: s_min_i32 s8, s8, s9
|
||||
; GFX8-NEXT: s_lshr_b32 s5, s1, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s6, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s7, s1, 24
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s8
|
||||
; GFX8-NEXT: s_sub_i32 s9, 0xffff8000, s9
|
||||
; GFX8-NEXT: s_sext_i32_i16 s9, s9
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s8, s8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sub_i32 s11, 0x7fff, s11
|
||||
; GFX8-NEXT: s_max_i32 s1, s9, s1
|
||||
; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10
|
||||
; GFX8-NEXT: s_max_i32 s1, s8, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s9, s11
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s9
|
||||
; GFX8-NEXT: s_sext_i32_i16 s8, s10
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s8
|
||||
; GFX8-NEXT: s_add_i32 s0, s0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s2, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s5, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s2, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s5, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s1
|
||||
; GFX8-NEXT: s_max_i32 s9, s5, s10
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s10
|
||||
; GFX8-NEXT: s_max_i32 s8, s5, s9
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s9
|
||||
; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9
|
||||
; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8
|
||||
; GFX8-NEXT: s_max_i32 s2, s5, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s9
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s8
|
||||
; GFX8-NEXT: s_min_i32 s2, s2, s5
|
||||
; GFX8-NEXT: s_add_i32 s1, s1, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s3, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s3, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s6, s8
|
||||
; GFX8-NEXT: s_max_i32 s6, s5, s10
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s10
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s6, 8
|
||||
; GFX8-NEXT: s_max_i32 s6, s5, s9
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s9
|
||||
; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
@@ -835,11 +837,11 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s6
|
||||
; GFX8-NEXT: s_min_i32 s3, s3, s5
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s4, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s4, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s3
|
||||
; GFX8-NEXT: s_max_i32 s6, s5, s10
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s10
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s7, s8
|
||||
; GFX8-NEXT: s_max_i32 s6, s5, s9
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s9
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s7, 8
|
||||
; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s4
|
||||
@@ -847,21 +849,21 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
|
||||
; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6
|
||||
; GFX8-NEXT: s_max_i32 s4, s5, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s1, s1, s8
|
||||
; GFX8-NEXT: s_ashr_i32 s1, s1, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s6
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, s8
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_min_i32 s4, s4, s5
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
|
||||
; GFX8-NEXT: s_ashr_i32 s2, s2, s8
|
||||
; GFX8-NEXT: s_ashr_i32 s2, s2, 8
|
||||
; GFX8-NEXT: s_add_i32 s3, s3, s4
|
||||
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s2, 0xff
|
||||
; GFX8-NEXT: s_ashr_i32 s3, s3, s8
|
||||
; GFX8-NEXT: s_ashr_i32 s3, s3, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s3, 0xff
|
||||
@@ -2911,8 +2913,8 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s4
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s3
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s1
|
||||
; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s1, 0xffff, s2
|
||||
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
@@ -3301,12 +3303,12 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX8-NEXT: s_min_i32 s2, s2, s3
|
||||
; GFX8-NEXT: s_add_i32 s5, s5, s2
|
||||
; GFX8-NEXT: s_bfe_u32 s2, s4, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s2, 0xffff, s4
|
||||
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s2
|
||||
; GFX8-NEXT: s_bfe_u32 s2, s5, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s2, 0xffff, s5
|
||||
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX8-NEXT: s_or_b32 s1, s1, s2
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
@@ -3658,16 +3660,16 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s4
|
||||
; GFX8-NEXT: s_min_i32 s3, s3, s4
|
||||
; GFX8-NEXT: s_add_i32 s8, s8, s3
|
||||
; GFX8-NEXT: s_bfe_u32 s3, s6, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s3, 0xffff, s6
|
||||
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s3
|
||||
; GFX8-NEXT: s_bfe_u32 s3, s7, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s3, 0xffff, s7
|
||||
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX8-NEXT: s_or_b32 s1, s1, s3
|
||||
; GFX8-NEXT: s_bfe_u32 s3, s8, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s3, 0xffff, s8
|
||||
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX8-NEXT: s_or_b32 s2, s2, s3
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
@@ -4105,20 +4107,20 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s5
|
||||
; GFX8-NEXT: s_min_i32 s4, s4, s5
|
||||
; GFX8-NEXT: s_add_i32 s11, s11, s4
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s8, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s8
|
||||
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s4
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s9, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s9
|
||||
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_or_b32 s1, s1, s4
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s10, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s10
|
||||
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_or_b32 s2, s2, s4
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s11, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s11
|
||||
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_or_b32 s3, s3, s4
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
|
||||
@@ -2905,19 +2905,20 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
|
||||
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
|
||||
; GFX9-NEXT: s_xor_b32 s4, s11, s7
|
||||
; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1
|
||||
; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
|
||||
; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1
|
||||
; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
|
||||
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
|
||||
; GFX9-NEXT: global_store_dword v2, v1, s[2:3]
|
||||
@@ -2938,7 +2939,6 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1
|
||||
; GFX10-NEXT: s_sub_i32 s6, 0, s2
|
||||
; GFX10-NEXT: s_sub_i32 s7, 0, s1
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
@@ -2946,14 +2946,15 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
|
||||
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1
|
||||
; GFX10-NEXT: s_sub_i32 s6, 0, s1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1
|
||||
; GFX10-NEXT: s_sext_i32_i16 s6, s0
|
||||
; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010
|
||||
; GFX10-NEXT: s_ashr_i32 s9, s6, 31
|
||||
; GFX10-NEXT: s_ashr_i32 s10, s0, 31
|
||||
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
|
||||
; GFX10-NEXT: s_add_i32 s6, s6, s9
|
||||
; GFX10-NEXT: s_add_i32 s0, s0, s10
|
||||
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
|
||||
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
|
||||
; GFX10-NEXT: s_xor_b32 s6, s6, s9
|
||||
; GFX10-NEXT: s_xor_b32 s0, s0, s10
|
||||
@@ -2962,43 +2963,45 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
|
||||
; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3
|
||||
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
|
||||
; GFX10-NEXT: s_xor_b32 s1, s9, s3
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
|
||||
; GFX10-NEXT: s_xor_b32 s0, s10, s8
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3
|
||||
; GFX10-NEXT: s_xor_b32 s0, s10, s8
|
||||
; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s10, v3
|
||||
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
|
||||
; GFX10-NEXT: global_store_dword v1, v2, s[6:7]
|
||||
|
||||
@@ -46,19 +46,20 @@ define <4 x half> @test_v4s16(<4 x half> %a) #0 {
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[4:5]
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[4:5]
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v2, v1, 0, vcc
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v1, v3, 0, s[4:5]
|
||||
; GCN-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GCN-NEXT: v_perm_b32 v0, v0, v4, s4
|
||||
; GCN-NEXT: v_perm_b32 v1, v1, v2, s4
|
||||
; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v4
|
||||
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v3
|
||||
; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v2
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
%fcmp = fcmp olt <4 x half> %a, zeroinitializer
|
||||
@@ -71,31 +72,34 @@ define <8 x half> @test_v8s16(<8 x half> %a) #0 {
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0
|
||||
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v0
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v8, v0, 0, vcc
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v1
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v1, v5, 0, s[4:5]
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v2
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v2, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v1, v5, 0, s[4:5]
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5]
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v2, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_cmp_gt_f16_e32 vcc, 0, v3
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v3, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5]
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v6, v3, 0, vcc
|
||||
; GCN-NEXT: v_cmp_lt_f16_sdwa s[4:5], v3, s6 src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v4
|
||||
; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v5
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v3, v7, 0, s[4:5]
|
||||
; GCN-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GCN-NEXT: v_perm_b32 v0, v0, v8, s4
|
||||
; GCN-NEXT: v_perm_b32 v1, v1, v4, s4
|
||||
; GCN-NEXT: v_perm_b32 v2, v2, v5, s4
|
||||
; GCN-NEXT: v_perm_b32 v3, v3, v6, s4
|
||||
; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8
|
||||
; GCN-NEXT: v_lshl_or_b32 v2, v2, 16, v4
|
||||
; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6
|
||||
; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v7
|
||||
; GCN-NEXT: v_lshl_or_b32 v3, v3, 16, v4
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
%fcmp = fcmp olt <8 x half> %a, zeroinitializer
|
||||
|
||||
@@ -49,24 +49,21 @@ define amdgpu_ps i8 @s_sext_inreg_i8(i8 inreg %value) {
|
||||
;
|
||||
; GFX8-LABEL: s_sext_inreg_i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s1, 3, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 3
|
||||
; GFX8-NEXT: s_sext_i32_i8 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 3
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_sext_inreg_i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s1, 3, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 3
|
||||
; GFX9-NEXT: s_sext_i32_i8 s0, s0
|
||||
; GFX9-NEXT: s_ashr_i32 s0, s0, 3
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_sext_inreg_i8:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s1, 3, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 3
|
||||
; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0
|
||||
; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 3
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
@@ -83,24 +80,21 @@ define amdgpu_ps i8 @s_sext_inreg_i8_6(i8 inreg %value) {
|
||||
;
|
||||
; GFX8-LABEL: s_sext_inreg_i8_6:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s1, 6, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 6
|
||||
; GFX8-NEXT: s_sext_i32_i8 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 6
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_sext_inreg_i8_6:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s1, 6, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 6
|
||||
; GFX9-NEXT: s_sext_i32_i8 s0, s0
|
||||
; GFX9-NEXT: s_ashr_i32 s0, s0, 6
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_sext_inreg_i8_6:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s1, 6, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 6
|
||||
; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0
|
||||
; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 6
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
@@ -612,24 +606,21 @@ define amdgpu_ps i16 @s_sext_inreg_i16_9(i16 inreg %value) {
|
||||
;
|
||||
; GFX8-LABEL: s_sext_inreg_i16_9:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s1, 9, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX8-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 9
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_sext_inreg_i16_9:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s1, 9, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX9-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX9-NEXT: s_ashr_i32 s0, s0, 9
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_sext_inreg_i16_9:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s1, 9, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 9
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
@@ -646,24 +637,21 @@ define amdgpu_ps i16 @s_sext_inreg_i16_15(i16 inreg %value) {
|
||||
;
|
||||
; GFX8-LABEL: s_sext_inreg_i16_15:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s1, 15, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 15
|
||||
; GFX8-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 15
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_sext_inreg_i16_15:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s1, 15, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 15
|
||||
; GFX9-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX9-NEXT: s_ashr_i32 s0, s0, 15
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_sext_inreg_i16_15:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s1, 15, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 15
|
||||
; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 15
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
@@ -762,9 +750,8 @@ define amdgpu_ps i32 @s_sext_inreg_v2i16_11(<2 x i16> inreg %value) {
|
||||
; GFX8-LABEL: s_sext_inreg_v2i16_11:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
|
||||
; GFX8-NEXT: s_bfe_u32 s2, 11, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 11
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 11
|
||||
; GFX8-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 11
|
||||
@@ -889,8 +876,8 @@ define amdgpu_ps <2 x i32> @s_sext_inreg_v4i16_14(<4 x i16> inreg %value) {
|
||||
;
|
||||
; GFX8-LABEL: s_sext_inreg_v4i16_14:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s0, -1, 0x100000
|
||||
; GFX8-NEXT: s_mov_b32 s1, s0
|
||||
; GFX8-NEXT: s_mov_b32 s0, 0xffff
|
||||
; GFX8-NEXT: s_mov_b32 s1, 0xffff
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_sext_inreg_v4i16_14:
|
||||
@@ -978,11 +965,10 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) {
|
||||
; GFX8-LABEL: v_sext_inreg_v8i16_11:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: s_bfe_u32 s4, -1, 0x100000
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 0xffff
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 0xffff
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_sext_inreg_v8i16_11:
|
||||
@@ -1041,32 +1027,31 @@ define amdgpu_ps <4 x i32> @s_sext_inreg_v8i16_5(<8 x i16> inreg %value) {
|
||||
; GFX8-LABEL: s_sext_inreg_v8i16_5:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
|
||||
; GFX8-NEXT: s_bfe_u32 s8, 10, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 10
|
||||
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s8
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s5, s5, s8
|
||||
; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 10
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s5, s5, 10
|
||||
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 10
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s4
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s6, s6, s8
|
||||
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
|
||||
; GFX8-NEXT: s_lshl_b32 s6, s6, 10
|
||||
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s7, s3, 16
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s2, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s2, 10
|
||||
; GFX8-NEXT: s_or_b32 s1, s1, s4
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s6, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s7, s7, s8
|
||||
; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
|
||||
; GFX8-NEXT: s_lshl_b32 s7, s7, 10
|
||||
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s3, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s3, 10
|
||||
; GFX8-NEXT: s_or_b32 s2, s2, s4
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s7
|
||||
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_or_b32 s3, s3, s4
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
@@ -1550,251 +1535,18 @@ define amdgpu_ps i65 @s_sext_inreg_i65_18(i65 inreg %value) {
|
||||
}
|
||||
|
||||
define amdgpu_ps i65 @s_sext_inreg_i65_33(i65 inreg %value) {
|
||||
; GFX6-LABEL: s_sext_inreg_i65_33:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_mov_b32 s0, 0
|
||||
; GFX6-NEXT: s_mov_b32 s1, 0
|
||||
; GFX6-NEXT: s_mov_b32 s2, 0
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-LABEL: s_sext_inreg_i65_33:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s0, 1, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s1, 2, 0x100000
|
||||
; GFX8-NEXT: s_lshr_b32 s0, 0, s0
|
||||
; GFX8-NEXT: s_lshr_b32 s1, 0, s1
|
||||
; GFX8-NEXT: s_bfe_u32 s2, 3, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s0, s0, 1
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s2, 0, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 17
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 18
|
||||
; GFX8-NEXT: s_bfe_u32 s3, 4, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s2, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s3, 0, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 19
|
||||
; GFX8-NEXT: s_bfe_u32 s4, 5, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s3, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s4, 0, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 20
|
||||
; GFX8-NEXT: s_bfe_u32 s5, 6, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s4, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s5, 0, s5
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 21
|
||||
; GFX8-NEXT: s_bfe_u32 s6, 7, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s5, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s6, 0, s6
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 22
|
||||
; GFX8-NEXT: s_bfe_u32 s7, 8, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s6, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s7, 0, s7
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 23
|
||||
; GFX8-NEXT: s_bfe_u32 s8, 9, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s7, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s8, 0, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 24
|
||||
; GFX8-NEXT: s_bfe_u32 s9, 10, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s8, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s9, 0, s9
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 25
|
||||
; GFX8-NEXT: s_bfe_u32 s10, 11, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s9, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s10, 0, s10
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 26
|
||||
; GFX8-NEXT: s_bfe_u32 s11, 12, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s10, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s11, 0, s11
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 27
|
||||
; GFX8-NEXT: s_bfe_u32 s12, 13, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s11, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s12, 0, s12
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 28
|
||||
; GFX8-NEXT: s_bfe_u32 s13, 14, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s12, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s13, 0, s13
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 29
|
||||
; GFX8-NEXT: s_bfe_u32 s14, 15, 0x100000
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s13, 1
|
||||
; GFX8-NEXT: s_lshr_b32 s14, 0, s14
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 30
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s14, 1
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 31
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_mov_b32 s1, s0
|
||||
; GFX8-NEXT: s_mov_b32 s2, 0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_sext_inreg_i65_33:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s0, 1, 0x100000
|
||||
; GFX9-NEXT: s_bfe_u32 s1, 2, 0x100000
|
||||
; GFX9-NEXT: s_lshr_b32 s0, 0, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s1, 0, s1
|
||||
; GFX9-NEXT: s_bfe_u32 s2, 3, 0x100000
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 1
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s2, 0, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 17
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 18
|
||||
; GFX9-NEXT: s_bfe_u32 s3, 4, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s2, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s3, 0, s3
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 19
|
||||
; GFX9-NEXT: s_bfe_u32 s4, 5, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s3, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s4, 0, s4
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 20
|
||||
; GFX9-NEXT: s_bfe_u32 s5, 6, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s4, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s5, 0, s5
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 21
|
||||
; GFX9-NEXT: s_bfe_u32 s6, 7, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s5, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s6, 0, s6
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 22
|
||||
; GFX9-NEXT: s_bfe_u32 s7, 8, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s6, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s7, 0, s7
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 23
|
||||
; GFX9-NEXT: s_bfe_u32 s8, 9, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s7, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s8, 0, s8
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 24
|
||||
; GFX9-NEXT: s_bfe_u32 s9, 10, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s8, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s9, 0, s9
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 25
|
||||
; GFX9-NEXT: s_bfe_u32 s10, 11, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s9, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s10, 0, s10
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 26
|
||||
; GFX9-NEXT: s_bfe_u32 s11, 12, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s10, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s11, 0, s11
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 27
|
||||
; GFX9-NEXT: s_bfe_u32 s12, 13, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s11, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s12, 0, s12
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 28
|
||||
; GFX9-NEXT: s_bfe_u32 s13, 14, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s12, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s13, 0, s13
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 29
|
||||
; GFX9-NEXT: s_bfe_u32 s14, 15, 0x100000
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s13, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s14, 0, s14
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 30
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, s14, 1
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 31
|
||||
; GFX9-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX9-NEXT: s_mov_b32 s1, s0
|
||||
; GFX9-NEXT: s_mov_b32 s2, 0
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
; GCN-LABEL: s_sext_inreg_i65_33:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_mov_b32 s0, 0
|
||||
; GCN-NEXT: s_mov_b32 s1, 0
|
||||
; GCN-NEXT: s_mov_b32 s2, 0
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_sext_inreg_i65_33:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s0, 1, 0x100000
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s1, 2, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s0, 0, s0
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s2, 3, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s1, 0, s1
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s2, 0, s2
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s3, 4, 0x100000
|
||||
; GFX10PLUS-NEXT: s_and_b32 s0, s0, 1
|
||||
; GFX10PLUS-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s4, 5, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s3, 0, s3
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 17
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 18
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s4, 0, s4
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s5, 6, 0x100000
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 19
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s3, 1
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s6, 7, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s5, 0, s5
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 20
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s4, 1
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s6, 0, s6
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s7, 8, 0x100000
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 21
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s5, 1
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s8, 9, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s7, 0, s7
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 22
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s6, 1
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s8, 0, s8
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s9, 10, 0x100000
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 23
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s7, 1
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s10, 11, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s9, 0, s9
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 24
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s8, 1
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s10, 0, s10
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s11, 12, 0x100000
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 25
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s9, 1
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s12, 13, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s11, 0, s11
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 26
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s10, 1
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s12, 0, s12
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s13, 14, 0x100000
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 27
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s11, 1
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s14, 15, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s13, 0, s13
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 28
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s12, 1
|
||||
; GFX10PLUS-NEXT: s_lshr_b32 s14, 0, s14
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 29
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s13, 1
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 30
|
||||
; GFX10PLUS-NEXT: s_and_b32 s2, s14, 1
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 31
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s0, 0
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s1, 0
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s2, 0
|
||||
; GFX10PLUS-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_mov_b32 s1, s0
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
%shl = shl i65 %value, 33
|
||||
%ashr = shl i65 %shl, 33
|
||||
|
||||
@@ -664,7 +664,7 @@ define i32 @v_shl_i32_zext_i16(i16 %x) {
|
||||
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x3fff, v0
|
||||
; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 2, v0
|
||||
; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
|
||||
%and = and i16 %x, 16383
|
||||
%ext = zext i16 %and to i32
|
||||
|
||||
@@ -93,27 +93,14 @@ define amdgpu_ps i8 @s_shl_i8(i8 inreg %value, i8 inreg %amount) {
|
||||
}
|
||||
|
||||
define amdgpu_ps i8 @s_shl_i8_7(i8 inreg %value) {
|
||||
; GFX6-LABEL: s_shl_i8_7:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_lshl_b32 s0, s0, 7
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-LABEL: s_shl_i8_7:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s1, 7, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_shl_i8_7:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s1, 7, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
; GCN-LABEL: s_shl_i8_7:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_lshl_b32 s0, s0, 7
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_shl_i8_7:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s1, 7, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 7
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
%result = shl i8 %value, 7
|
||||
ret i8 %result
|
||||
@@ -675,27 +662,14 @@ define amdgpu_ps i16 @s_shl_i16(i16 inreg %value, i16 inreg %amount) {
|
||||
}
|
||||
|
||||
define amdgpu_ps i16 @s_shl_i16_15(i16 inreg %value) {
|
||||
; GFX6-LABEL: s_shl_i16_15:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_lshl_b32 s0, s0, 15
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-LABEL: s_shl_i16_15:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s1, 15, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_shl_i16_15:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s1, 15, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
; GCN-LABEL: s_shl_i16_15:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_lshl_b32 s0, s0, 15
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_shl_i16_15:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s1, 15, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 15
|
||||
; GFX10PLUS-NEXT: ; return to shader part epilog
|
||||
%result = shl i16 %value, 15
|
||||
ret i16 %result
|
||||
|
||||
@@ -75,31 +75,29 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
|
||||
;
|
||||
; GFX8-LABEL: s_ssubsat_i7:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, -1
|
||||
; GFX8-NEXT: s_max_i32 s5, s3, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s3, s3, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s5
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, -1
|
||||
; GFX8-NEXT: s_max_i32 s4, s2, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s2, s2, s3
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s1, s4, s1
|
||||
; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s1, s3, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s3
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s2
|
||||
; GFX8-NEXT: s_sub_i32 s0, s0, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, s2
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 9
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_ssubsat_i7:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
|
||||
; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0
|
||||
@@ -108,9 +106,8 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_ssubsat_i7:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
|
||||
; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0
|
||||
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@@ -189,31 +186,29 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
|
||||
;
|
||||
; GFX8-LABEL: s_ssubsat_i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, -1
|
||||
; GFX8-NEXT: s_max_i32 s5, s3, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s3, s3, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s5
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, -1
|
||||
; GFX8-NEXT: s_max_i32 s4, s2, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s2, s2, s3
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s1, s4, s1
|
||||
; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s1, s3, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s3
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s2
|
||||
; GFX8-NEXT: s_sub_i32 s0, s0, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, s2
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 8
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: s_ssubsat_i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp
|
||||
; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
|
||||
@@ -222,9 +217,8 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_ssubsat_i8:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
|
||||
; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
|
||||
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@@ -300,9 +294,10 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4
|
||||
; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
|
||||
@@ -318,9 +313,11 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
|
||||
@@ -335,8 +332,10 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 clamp
|
||||
@@ -387,45 +386,44 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
|
||||
;
|
||||
; GFX8-LABEL: s_ssubsat_v2i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s6, -1
|
||||
; GFX8-NEXT: s_max_i32 s7, s5, s6
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, -1
|
||||
; GFX8-NEXT: s_max_i32 s6, s4, s5
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s1, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s4
|
||||
; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s6
|
||||
; GFX8-NEXT: s_sext_i32_i16 s7, s7
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s4, s4, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s6, s6
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s1, s7, s1
|
||||
; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s1, s6, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s5
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s4
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s4
|
||||
; GFX8-NEXT: s_sub_i32 s0, s0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s2, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s3, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s2, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s3, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s1
|
||||
; GFX8-NEXT: s_max_i32 s5, s3, s6
|
||||
; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s3, s3, s6
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s5
|
||||
; GFX8-NEXT: s_max_i32 s4, s3, s5
|
||||
; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s3, s3, s5
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s2, s5, s2
|
||||
; GFX8-NEXT: s_max_i32 s2, s4, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX8-NEXT: s_min_i32 s2, s2, s3
|
||||
; GFX8-NEXT: s_sub_i32 s1, s1, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s1, s1, s4
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, s4
|
||||
; GFX8-NEXT: s_ashr_i32 s1, s1, 8
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 8
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
|
||||
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@@ -628,14 +626,15 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX9-LABEL: v_ssubsat_v4i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
||||
; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6
|
||||
; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
@@ -662,15 +661,17 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
||||
; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, 24
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_sub_i16 v2, v2, v3 clamp
|
||||
@@ -692,12 +693,14 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
|
||||
; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
@@ -785,48 +788,47 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
|
||||
;
|
||||
; GFX8-LABEL: s_ssubsat_v4i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s4, s0, 24
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s9, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s10, -1
|
||||
; GFX8-NEXT: s_max_i32 s11, s9, s10
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s8, s0
|
||||
; GFX8-NEXT: s_sext_i32_i16 s9, -1
|
||||
; GFX8-NEXT: s_max_i32 s10, s8, s9
|
||||
; GFX8-NEXT: s_lshr_b32 s5, s1, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s6, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s7, s1, 24
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s8
|
||||
; GFX8-NEXT: s_sub_i32 s11, s11, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s9, s9, s10
|
||||
; GFX8-NEXT: s_sext_i32_i16 s11, s11
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s8, s8, s9
|
||||
; GFX8-NEXT: s_sext_i32_i16 s10, s10
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sub_i32 s9, s9, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s1, s11, s1
|
||||
; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s1, s10, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sext_i32_i16 s9, s9
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s9
|
||||
; GFX8-NEXT: s_sext_i32_i16 s8, s8
|
||||
; GFX8-NEXT: s_min_i32 s1, s1, s8
|
||||
; GFX8-NEXT: s_sub_i32 s0, s0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s2, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s5, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s2, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s5, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s1
|
||||
; GFX8-NEXT: s_max_i32 s9, s5, s10
|
||||
; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s10
|
||||
; GFX8-NEXT: s_sext_i32_i16 s9, s9
|
||||
; GFX8-NEXT: s_max_i32 s8, s5, s9
|
||||
; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s9
|
||||
; GFX8-NEXT: s_sext_i32_i16 s8, s8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s2, s9, s2
|
||||
; GFX8-NEXT: s_max_i32 s2, s8, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s5
|
||||
; GFX8-NEXT: s_min_i32 s2, s2, s5
|
||||
; GFX8-NEXT: s_sub_i32 s1, s1, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s3, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s3, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s6, s8
|
||||
; GFX8-NEXT: s_max_i32 s6, s5, s10
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s6, 8
|
||||
; GFX8-NEXT: s_max_i32 s6, s5, s9
|
||||
; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s10
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s9
|
||||
; GFX8-NEXT: s_sext_i32_i16 s6, s6
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000
|
||||
@@ -835,33 +837,33 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s5
|
||||
; GFX8-NEXT: s_min_i32 s3, s3, s5
|
||||
; GFX8-NEXT: s_sub_i32 s2, s2, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s4, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s4, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s3
|
||||
; GFX8-NEXT: s_max_i32 s6, s5, s10
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s7, s8
|
||||
; GFX8-NEXT: s_max_i32 s6, s5, s9
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s7, 8
|
||||
; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s10
|
||||
; GFX8-NEXT: s_min_i32 s5, s5, s9
|
||||
; GFX8-NEXT: s_sext_i32_i16 s6, s6
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s4, s6, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s1, s1, s8
|
||||
; GFX8-NEXT: s_ashr_i32 s1, s1, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s5
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, s8
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s0, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GFX8-NEXT: s_min_i32 s4, s4, s5
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
|
||||
; GFX8-NEXT: s_ashr_i32 s2, s2, s8
|
||||
; GFX8-NEXT: s_ashr_i32 s2, s2, 8
|
||||
; GFX8-NEXT: s_sub_i32 s3, s3, s4
|
||||
; GFX8-NEXT: s_and_b32 s0, s0, 0xff
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s2, 0xff
|
||||
; GFX8-NEXT: s_ashr_i32 s3, s3, s8
|
||||
; GFX8-NEXT: s_ashr_i32 s3, s3, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s3, 0xff
|
||||
@@ -2897,8 +2899,8 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
|
||||
; GFX8-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX8-NEXT: s_min_i32 s1, s3, s1
|
||||
; GFX8-NEXT: s_sub_i32 s1, s2, s1
|
||||
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
@@ -3286,13 +3288,13 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
|
||||
; GFX8-NEXT: s_sext_i32_i16 s4, s4
|
||||
; GFX8-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX8-NEXT: s_min_i32 s3, s4, s3
|
||||
; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
|
||||
; GFX8-NEXT: s_sub_i32 s3, s5, s3
|
||||
; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s2
|
||||
; GFX8-NEXT: s_bfe_u32 s2, s3, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s2, 0xffff, s3
|
||||
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX8-NEXT: s_or_b32 s1, s1, s2
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
@@ -3640,20 +3642,20 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
|
||||
; GFX8-NEXT: s_sext_i32_i16 s7, s11
|
||||
; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s6, s6, s7
|
||||
; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
|
||||
; GFX8-NEXT: s_sext_i32_i16 s6, s6
|
||||
; GFX8-NEXT: s_sext_i32_i16 s5, s5
|
||||
; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX8-NEXT: s_min_i32 s5, s6, s5
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s3
|
||||
; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s3, 0xffff, s4
|
||||
; GFX8-NEXT: s_sub_i32 s5, s8, s5
|
||||
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX8-NEXT: s_or_b32 s1, s1, s3
|
||||
; GFX8-NEXT: s_bfe_u32 s3, s5, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s3, 0xffff, s5
|
||||
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX8-NEXT: s_or_b32 s2, s2, s3
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
@@ -4081,30 +4083,30 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
|
||||
; GFX8-NEXT: s_sext_i32_i16 s7, s11
|
||||
; GFX8-NEXT: s_max_i32 s8, s7, s17
|
||||
; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
|
||||
; GFX8-NEXT: s_min_i32 s7, s7, s17
|
||||
; GFX8-NEXT: s_sext_i32_i16 s8, s8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s9, s15
|
||||
; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_sub_i32 s7, s7, 0xffff8000
|
||||
; GFX8-NEXT: s_max_i32 s8, s8, s9
|
||||
; GFX8-NEXT: s_or_b32 s0, s0, s4
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s5
|
||||
; GFX8-NEXT: s_sub_i32 s6, s10, s6
|
||||
; GFX8-NEXT: s_sext_i32_i16 s8, s8
|
||||
; GFX8-NEXT: s_sext_i32_i16 s7, s7
|
||||
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_min_i32 s7, s8, s7
|
||||
; GFX8-NEXT: s_or_b32 s1, s1, s4
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s6, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s6
|
||||
; GFX8-NEXT: s_sub_i32 s7, s11, s7
|
||||
; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s2, 0xffff, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_or_b32 s2, s2, s4
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s7, 0x100000
|
||||
; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
|
||||
; GFX8-NEXT: s_and_b32 s4, 0xffff, s7
|
||||
; GFX8-NEXT: s_and_b32 s3, 0xffff, s3
|
||||
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
|
||||
; GFX8-NEXT: s_or_b32 s3, s3, s4
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
|
||||
@@ -69,56 +69,55 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; GFX9-NEXT: s_bfe_u32 s0, 8, 0x100000
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_bfe_u32 s3, s4, 0x100000
|
||||
; GFX9-NEXT: s_and_b32 s1, 0xffff, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s3, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s4, 16
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s1, 8
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s4, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_bfe_u32 s2, s5, 0x100000
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s2, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s5, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_bfe_u32 s2, s6, 0x100000
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s2, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s6, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_bfe_u32 s2, s7, 0x100000
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:11
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s2, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s7, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:13
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:14
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, 0xffff, s5
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s1, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s5, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, 0xffff, s6
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s1, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s6, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, 0xffff, s7
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:11
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s1, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s7, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:13
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:15
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
@@ -180,51 +179,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
|
||||
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; GFX10-NEXT: s_bfe_u32 s0, 8, 0x100000
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_bfe_u32 s3, s4, 0x100000
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s4, 16
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s4, 16
|
||||
; GFX10-NEXT: s_and_b32 s1, 0xffff, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: s_lshr_b32 s2, s5, 16
|
||||
; GFX10-NEXT: s_bfe_u32 s4, s5, 0x100000
|
||||
; GFX10-NEXT: s_and_b32 s3, 0xffff, s5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX10-NEXT: s_lshr_b32 s3, s3, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s5, s6, 16
|
||||
; GFX10-NEXT: s_bfe_u32 s8, s6, 0x100000
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s1, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s4, s6, 16
|
||||
; GFX10-NEXT: s_and_b32 s5, 0xffff, s6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GFX10-NEXT: s_lshr_b32 s6, s1, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s4, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s4, s2, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, s3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, s6
|
||||
; GFX10-NEXT: s_lshr_b32 s6, s0, 8
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s3, 8
|
||||
; GFX10-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s2
|
||||
; GFX10-NEXT: s_lshr_b32 s2, s8, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, s1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, s1
|
||||
; GFX10-NEXT: s_lshr_b32 s2, s5, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, s6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, s3
|
||||
; GFX10-NEXT: ds_write_b8 v1, v0
|
||||
; GFX10-NEXT: ds_write_b8 v1, v2 offset:4
|
||||
; GFX10-NEXT: ds_write_b8 v1, v4 offset:2
|
||||
; GFX10-NEXT: ds_write_b8 v1, v5 offset:6
|
||||
; GFX10-NEXT: ds_write_b8 v1, v6 offset:1
|
||||
; GFX10-NEXT: ds_write_b8 v1, v7 offset:3
|
||||
; GFX10-NEXT: ds_write_b8 v1, v8 offset:5
|
||||
; GFX10-NEXT: ds_write_b8 v1, v5 offset:6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, s2
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s5, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s4, 8
|
||||
; GFX10-NEXT: ds_write_b8 v1, v9 offset:7
|
||||
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
|
||||
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
|
||||
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX10-NEXT: s_bfe_u32 s1, s7, 0x100000
|
||||
; GFX10-NEXT: s_lshr_b32 s2, s7, 16
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s1, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: s_and_b32 s0, 0xffff, s7
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s7, 16
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s0, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s7
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s2, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s1, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s0
|
||||
; GFX10-NEXT: ds_write_b8 v1, v0 offset:11
|
||||
; GFX10-NEXT: ds_write_b8 v1, v2 offset:12
|
||||
@@ -238,48 +236,46 @@ define amdgpu_kernel void @store_lds_v4i32_align1(ptr addrspace(3) %out, <4 x i3
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
|
||||
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
|
||||
; GFX11-NEXT: s_bfe_u32 s1, 8, 0x100000
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_bfe_u32 s3, s4, 0x100000
|
||||
; GFX11-NEXT: s_lshr_b32 s2, s4, 16
|
||||
; GFX11-NEXT: s_and_b32 s2, 0xffff, s4
|
||||
; GFX11-NEXT: s_lshr_b32 s1, s4, 16
|
||||
; GFX11-NEXT: s_lshr_b32 s2, s2, 8
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s5, 16
|
||||
; GFX11-NEXT: s_lshr_b32 s3, s3, s1
|
||||
; GFX11-NEXT: s_bfe_u32 s4, s5, 0x100000
|
||||
; GFX11-NEXT: s_bfe_u32 s8, s6, 0x100000
|
||||
; GFX11-NEXT: s_lshr_b32 s9, s2, s1
|
||||
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s2
|
||||
; GFX11-NEXT: s_lshr_b32 s5, s6, 16
|
||||
; GFX11-NEXT: s_lshr_b32 s2, s4, s1
|
||||
; GFX11-NEXT: s_lshr_b32 s4, s0, s1
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s8, s1
|
||||
; GFX11-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, s4
|
||||
; GFX11-NEXT: s_and_b32 s3, 0xffff, s5
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
|
||||
; GFX11-NEXT: s_lshr_b32 s4, s6, 16
|
||||
; GFX11-NEXT: s_and_b32 s5, 0xffff, s6
|
||||
; GFX11-NEXT: s_lshr_b32 s6, s1, 8
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s6
|
||||
; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
|
||||
; GFX11-NEXT: s_lshr_b32 s1, s3, 8
|
||||
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s5, 8
|
||||
; GFX11-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v9, s3
|
||||
; GFX11-NEXT: ds_store_b8 v1, v0
|
||||
; GFX11-NEXT: ds_store_b8 v1, v5 offset:1
|
||||
; GFX11-NEXT: ds_store_b8 v1, v3 offset:2
|
||||
; GFX11-NEXT: ds_store_b8 v1, v6 offset:3
|
||||
; GFX11-NEXT: ds_store_b8 v1, v6 offset:1
|
||||
; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
|
||||
; GFX11-NEXT: ds_store_b8 v1, v7 offset:3
|
||||
; GFX11-NEXT: ds_store_b8 v1, v2 offset:4
|
||||
; GFX11-NEXT: ds_store_b8 v1, v7 offset:5
|
||||
; GFX11-NEXT: ds_store_b8 v1, v4 offset:6
|
||||
; GFX11-NEXT: ds_store_b8 v1, v8 offset:7
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX11-NEXT: s_lshr_b32 s2, s7, 16
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v7, s2
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s5, s1
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v5, s7
|
||||
; GFX11-NEXT: ds_store_b8 v1, v8 offset:5
|
||||
; GFX11-NEXT: ds_store_b8 v1, v5 offset:6
|
||||
; GFX11-NEXT: ds_store_b8 v1, v9 offset:7
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v5, s7
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s4, 8
|
||||
; GFX11-NEXT: s_lshr_b32 s1, s7, 16
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX11-NEXT: s_bfe_u32 s0, s7, 0x100000
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s0, s1
|
||||
; GFX11-NEXT: s_and_b32 s0, 0xffff, s7
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v7, s1
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s0, 8
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s2, s1
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s1, 8
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GFX11-NEXT: ds_store_b8 v1, v0 offset:8
|
||||
; GFX11-NEXT: ds_store_b8 v1, v2 offset:9
|
||||
; GFX11-NEXT: ds_store_b8 v1, v3 offset:10
|
||||
; GFX11-NEXT: ds_store_b8 v1, v3 offset:8
|
||||
; GFX11-NEXT: ds_store_b8 v1, v0 offset:9
|
||||
; GFX11-NEXT: ds_store_b8 v1, v2 offset:10
|
||||
; GFX11-NEXT: ds_store_b8 v1, v4 offset:11
|
||||
; GFX11-NEXT: ds_store_b8 v1, v5 offset:12
|
||||
; GFX11-NEXT: ds_store_b8 v1, v6 offset:13
|
||||
|
||||
@@ -65,44 +65,43 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
|
||||
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; GFX9-NEXT: s_bfe_u32 s0, 8, 0x100000
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_bfe_u32 s3, s4, 0x100000
|
||||
; GFX9-NEXT: s_and_b32 s1, 0xffff, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s3, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s4, 16
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s1, 8
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s4, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:1
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_bfe_u32 s2, s5, 0x100000
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s2, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s5, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_bfe_u32 s2, s6, 0x100000
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s2, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s6, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, 0xffff, s5
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:3
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s1, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s5, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:5
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: s_and_b32 s1, 0xffff, s6
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:7
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s1, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s6, 16
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:9
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s0, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: ds_write_b8 v1, v0 offset:11
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
@@ -153,29 +152,30 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
|
||||
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; GFX10-NEXT: s_bfe_u32 s0, 8, 0x100000
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s4, 16
|
||||
; GFX10-NEXT: s_bfe_u32 s3, s4, 0x100000
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s4, 16
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: s_lshr_b32 s2, s5, 16
|
||||
; GFX10-NEXT: s_bfe_u32 s4, s5, 0x100000
|
||||
; GFX10-NEXT: s_and_b32 s3, 0xffff, s5
|
||||
; GFX10-NEXT: s_and_b32 s1, 0xffff, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX10-NEXT: s_lshr_b32 s5, s6, 16
|
||||
; GFX10-NEXT: s_bfe_u32 s7, s6, 0x100000
|
||||
; GFX10-NEXT: s_lshr_b32 s4, s6, 16
|
||||
; GFX10-NEXT: s_and_b32 s5, 0xffff, s6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GFX10-NEXT: s_lshr_b32 s6, s1, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s4, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s4, s2, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s3, s3, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s6, s0, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s3, 8
|
||||
; GFX10-NEXT: s_lshr_b32 s3, s2, 8
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s1, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s2
|
||||
; GFX10-NEXT: s_lshr_b32 s2, s7, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, s3
|
||||
; GFX10-NEXT: s_lshr_b32 s2, s5, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, s3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, s1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, s2
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s4, 8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, s6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, s1
|
||||
; GFX10-NEXT: ds_write_b8 v1, v0
|
||||
; GFX10-NEXT: ds_write_b8 v1, v2 offset:4
|
||||
; GFX10-NEXT: ds_write_b8 v1, v4 offset:2
|
||||
@@ -183,15 +183,13 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
|
||||
; GFX10-NEXT: ds_write_b8 v1, v6 offset:1
|
||||
; GFX10-NEXT: ds_write_b8 v1, v7 offset:3
|
||||
; GFX10-NEXT: ds_write_b8 v1, v8 offset:5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s5, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX10-NEXT: ds_write_b8 v1, v9 offset:7
|
||||
; GFX10-NEXT: ds_write_b8 v1, v3 offset:8
|
||||
; GFX10-NEXT: ds_write_b8 v1, v0 offset:9
|
||||
; GFX10-NEXT: ds_write_b8 v1, v2 offset:10
|
||||
; GFX10-NEXT: ds_write_b8 v1, v4 offset:11
|
||||
; GFX10-NEXT: ds_write_b8 v1, v10 offset:9
|
||||
; GFX10-NEXT: ds_write_b8 v1, v0 offset:10
|
||||
; GFX10-NEXT: ds_write_b8 v1, v2 offset:11
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: store_lds_v3i32_align1:
|
||||
@@ -199,27 +197,26 @@ define amdgpu_kernel void @store_lds_v3i32_align1(ptr addrspace(3) %out, <3 x i3
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10
|
||||
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
|
||||
; GFX11-NEXT: s_bfe_u32 s1, 8, 0x100000
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_bfe_u32 s3, s4, 0x100000
|
||||
; GFX11-NEXT: s_lshr_b32 s2, s4, 16
|
||||
; GFX11-NEXT: s_and_b32 s2, 0xffff, s4
|
||||
; GFX11-NEXT: s_lshr_b32 s1, s4, 16
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
|
||||
; GFX11-NEXT: s_lshr_b32 s4, s6, 16
|
||||
; GFX11-NEXT: s_lshr_b32 s2, s2, 8
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s5, 16
|
||||
; GFX11-NEXT: s_bfe_u32 s4, s5, 0x100000
|
||||
; GFX11-NEXT: s_and_b32 s3, 0xffff, s5
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6
|
||||
; GFX11-NEXT: s_lshr_b32 s5, s6, 16
|
||||
; GFX11-NEXT: s_lshr_b32 s3, s3, s1
|
||||
; GFX11-NEXT: s_bfe_u32 s7, s6, 0x100000
|
||||
; GFX11-NEXT: s_lshr_b32 s6, s2, s1
|
||||
; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v7, s3
|
||||
; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s0
|
||||
; GFX11-NEXT: s_lshr_b32 s2, s4, s1
|
||||
; GFX11-NEXT: s_lshr_b32 s4, s0, s1
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s7, s1
|
||||
; GFX11-NEXT: s_lshr_b32 s1, s5, s1
|
||||
; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s2
|
||||
; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v12, s1
|
||||
; GFX11-NEXT: s_and_b32 s5, 0xffff, s6
|
||||
; GFX11-NEXT: s_lshr_b32 s6, s1, 8
|
||||
; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s2
|
||||
; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
|
||||
; GFX11-NEXT: s_lshr_b32 s1, s3, 8
|
||||
; GFX11-NEXT: s_lshr_b32 s3, s0, 8
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s5, 8
|
||||
; GFX11-NEXT: s_lshr_b32 s5, s4, 8
|
||||
; GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s1
|
||||
; GFX11-NEXT: v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v11, s0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v12, s5
|
||||
; GFX11-NEXT: ds_store_b8 v1, v0
|
||||
; GFX11-NEXT: ds_store_b8 v1, v7 offset:1
|
||||
; GFX11-NEXT: ds_store_b8 v1, v4 offset:2
|
||||
|
||||
@@ -27,9 +27,8 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x ha
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
||||
; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
|
||||
; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
%val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
|
||||
ret <2 x half> %val
|
||||
@@ -51,11 +50,9 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4
|
||||
; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4
|
||||
; GFX8-NEXT: v_fma_f16 v2, v6, v7, v8
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
%val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
|
||||
ret <3 x half> %val
|
||||
@@ -80,13 +77,12 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v5
|
||||
; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4
|
||||
; GFX8-NEXT: v_fma_f16 v2, v6, v8, v10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 16
|
||||
; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5
|
||||
; GFX8-NEXT: v_fma_f16 v3, v7, v9, v11
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
%val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
|
||||
ret <4 x half> %val
|
||||
@@ -144,9 +140,8 @@ define <2 x half> @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg(<2 x half> %
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
||||
; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
|
||||
; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.x = fneg <2 x half> %x
|
||||
%neg.y = fneg <2 x half> %y
|
||||
|
||||
@@ -11,7 +11,7 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1
|
||||
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v2, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, v2, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 25, v0
|
||||
@@ -61,9 +61,8 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
|
||||
;
|
||||
; GFX8-LABEL: s_uaddsat_i7:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
|
||||
@@ -72,9 +71,8 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
|
||||
;
|
||||
; GFX9-LABEL: s_uaddsat_i7:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
|
||||
@@ -83,9 +81,8 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_uaddsat_i7:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp
|
||||
; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
|
||||
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@@ -100,7 +97,7 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
|
||||
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v2, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, v2, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0
|
||||
@@ -150,9 +147,8 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
|
||||
;
|
||||
; GFX8-LABEL: s_uaddsat_i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
|
||||
@@ -161,9 +157,8 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
|
||||
;
|
||||
; GFX9-LABEL: s_uaddsat_i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
|
||||
@@ -172,9 +167,8 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_uaddsat_i8:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX10PLUS-NEXT: v_add_nc_u16 v0, s0, s1 clamp
|
||||
; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
|
||||
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@@ -191,12 +185,12 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
|
||||
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v4, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, v4, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
|
||||
; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v3, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v2, v3, v2
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1
|
||||
@@ -225,9 +219,10 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4
|
||||
; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 clamp
|
||||
@@ -243,9 +238,11 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
|
||||
@@ -260,8 +257,10 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_add_u16 v0, v0, v1 clamp
|
||||
@@ -302,15 +301,14 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
|
||||
;
|
||||
; GFX8-LABEL: s_uaddsat_v2i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s1, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s3, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s3, 8
|
||||
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s2, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s2, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp
|
||||
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
|
||||
@@ -406,22 +404,22 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
|
||||
; GFX6-NEXT: v_xor_b32_e32 v8, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v8, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, v8, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5
|
||||
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v5, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v2, v5, v2
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6
|
||||
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v2
|
||||
; GFX6-NEXT: v_not_b32_e32 v5, v2
|
||||
; GFX6-NEXT: v_min_u32_e32 v3, v5, v3
|
||||
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7
|
||||
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3
|
||||
; GFX6-NEXT: v_not_b32_e32 v5, v3
|
||||
; GFX6-NEXT: v_min_u32_e32 v4, v5, v4
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2
|
||||
@@ -468,14 +466,15 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX9-LABEL: v_uaddsat_v4i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
||||
; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6
|
||||
; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
@@ -502,15 +501,17 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
||||
; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, 24
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_add_u16 v2, v2, v3 clamp
|
||||
@@ -532,12 +533,14 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
|
||||
; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
@@ -605,29 +608,28 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
|
||||
;
|
||||
; GFX8-LABEL: s_uaddsat_v4i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshr_b32 s5, s1, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s6, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s7, s1, 24
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s4, s0, 24
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s5, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s5, 8
|
||||
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s2, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s2, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: v_add_u16_e64 v1, s0, v1 clamp
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s6, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s6, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s3, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s7, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
|
||||
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_add_u16_e64 v2, s0, v2 clamp
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s4, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
|
||||
@@ -782,7 +784,7 @@ define i24 @v_uaddsat_i24(i24 %lhs, i24 %rhs) {
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
|
||||
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v2, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, v2, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v0
|
||||
@@ -866,7 +868,7 @@ define i32 @v_uaddsat_i32(i32 %lhs, i32 %rhs) {
|
||||
; GFX6-LABEL: v_uaddsat_i32:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v2, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, v2, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -954,7 +956,7 @@ define amdgpu_ps float @uaddsat_i32_sv(i32 inreg %lhs, i32 %rhs) {
|
||||
define amdgpu_ps float @uaddsat_i32_vs(i32 %lhs, i32 inreg %rhs) {
|
||||
; GFX6-LABEL: uaddsat_i32_vs:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v1, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, s0, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
||||
; GFX6-NEXT: ; return to shader part epilog
|
||||
@@ -982,10 +984,10 @@ define <2 x i32> @v_uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
|
||||
; GFX6-LABEL: v_uaddsat_v2i32:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v4, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v2, v4, v2
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
|
||||
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v2, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v2, v2, v3
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -1061,13 +1063,13 @@ define <3 x i32> @v_uaddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
|
||||
; GFX6-LABEL: v_uaddsat_v3i32:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_xor_b32_e32 v6, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v6, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v3, v6, v3
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
|
||||
; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v3, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v3, v3, v4
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
|
||||
; GFX6-NEXT: v_xor_b32_e32 v3, -1, v2
|
||||
; GFX6-NEXT: v_not_b32_e32 v3, v2
|
||||
; GFX6-NEXT: v_min_u32_e32 v3, v3, v5
|
||||
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -1157,16 +1159,16 @@ define <4 x i32> @v_uaddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
|
||||
; GFX6-LABEL: v_uaddsat_v4i32:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_xor_b32_e32 v8, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v8, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v4, v8, v4
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
|
||||
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v4, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v4, v4, v5
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
|
||||
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v2
|
||||
; GFX6-NEXT: v_not_b32_e32 v4, v2
|
||||
; GFX6-NEXT: v_min_u32_e32 v4, v4, v6
|
||||
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
|
||||
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v3
|
||||
; GFX6-NEXT: v_not_b32_e32 v4, v3
|
||||
; GFX6-NEXT: v_min_u32_e32 v4, v4, v7
|
||||
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -1270,19 +1272,19 @@ define <5 x i32> @v_uaddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
|
||||
; GFX6-LABEL: v_uaddsat_v5i32:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_xor_b32_e32 v10, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v10, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v5, v10, v5
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v5
|
||||
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v5, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v5, v5, v6
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5
|
||||
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v2
|
||||
; GFX6-NEXT: v_not_b32_e32 v5, v2
|
||||
; GFX6-NEXT: v_min_u32_e32 v5, v5, v7
|
||||
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
|
||||
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3
|
||||
; GFX6-NEXT: v_not_b32_e32 v5, v3
|
||||
; GFX6-NEXT: v_min_u32_e32 v5, v5, v8
|
||||
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
|
||||
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v4
|
||||
; GFX6-NEXT: v_not_b32_e32 v5, v4
|
||||
; GFX6-NEXT: v_min_u32_e32 v5, v5, v9
|
||||
; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
|
||||
; GFX6-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -1400,53 +1402,53 @@ define <16 x i32> @v_uaddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
|
||||
; GFX6-LABEL: v_uaddsat_v16i32:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_xor_b32_e32 v31, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v31, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v31, v16
|
||||
; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v17
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v2
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v2
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v18
|
||||
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v3
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v3
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v19
|
||||
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v4
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v4
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v20
|
||||
; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v5
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v5
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v21
|
||||
; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v6
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v6
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v22
|
||||
; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v7
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v7
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v23
|
||||
; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v8
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v8
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v24
|
||||
; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v9
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v9
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v25
|
||||
; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v10
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v10
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v26
|
||||
; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v11
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v11
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v27
|
||||
; GFX6-NEXT: v_add_i32_e32 v11, vcc, v11, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v12
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v12
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v28
|
||||
; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v13
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v13
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v29
|
||||
; GFX6-NEXT: v_add_i32_e32 v13, vcc, v13, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v14
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v14
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v30
|
||||
; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v15
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v15
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX6-NEXT: v_min_u32_e32 v16, v16, v31
|
||||
; GFX6-NEXT: v_add_i32_e32 v15, vcc, v15, v16
|
||||
@@ -1751,7 +1753,7 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v2, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, v2, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
@@ -1848,7 +1850,7 @@ define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v1, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v1, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v1, s0, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
@@ -1879,12 +1881,12 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX6-NEXT: v_xor_b32_e32 v4, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v4, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v2, v4, v2
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
|
||||
; GFX6-NEXT: v_xor_b32_e32 v3, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v3, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v2, v3, v2
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
@@ -1895,10 +1897,8 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_add_u16_e64 v2, v0, v1 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_uaddsat_v2i16:
|
||||
@@ -1938,15 +1938,14 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
|
||||
;
|
||||
; GFX8-LABEL: s_uaddsat_v2i16:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: v_add_u16_e64 v1, s2, v1 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@@ -1989,10 +1988,8 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
|
||||
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_add_u16_e64 v1, s0, v0 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: uaddsat_v2i16_sv:
|
||||
@@ -2014,12 +2011,12 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX6-NEXT: s_lshl_b32 s0, s0, 16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v2, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v2, s0, v2
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
|
||||
; GFX6-NEXT: s_lshl_b32 s0, s1, 16
|
||||
; GFX6-NEXT: v_xor_b32_e32 v2, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v2, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v2, s0, v2
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
@@ -2031,10 +2028,8 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
|
||||
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_add_u16_e64 v1, v0, s0 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: uaddsat_v2i16_vs:
|
||||
@@ -2068,22 +2063,22 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
|
||||
; GFX6-NEXT: v_xor_b32_e32 v8, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v8, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v4, v8, v4
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
|
||||
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v5, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v4, v5, v4
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6
|
||||
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v2
|
||||
; GFX6-NEXT: v_not_b32_e32 v5, v2
|
||||
; GFX6-NEXT: v_min_u32_e32 v4, v5, v4
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
||||
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7
|
||||
; GFX6-NEXT: v_xor_b32_e32 v5, -1, v3
|
||||
; GFX6-NEXT: v_not_b32_e32 v5, v3
|
||||
; GFX6-NEXT: v_min_u32_e32 v4, v5, v4
|
||||
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
@@ -2096,14 +2091,11 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_add_u16_e64 v4, v0, v2 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_e64 v2, v1, v3 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_uaddsat_v4i16:
|
||||
@@ -2160,23 +2152,22 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
|
||||
;
|
||||
; GFX8-LABEL: s_uaddsat_v4i16:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s7, s3, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: v_add_u16_e64 v1, s4, v1 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 16
|
||||
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX8-NEXT: v_add_u16_e64 v3, s5, v3 clamp
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s5
|
||||
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
@@ -2220,32 +2211,32 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
|
||||
; GFX6-NEXT: v_xor_b32_e32 v12, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v12, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v6, v12, v6
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7
|
||||
; GFX6-NEXT: v_xor_b32_e32 v7, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v7, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v6, v7, v6
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v6
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8
|
||||
; GFX6-NEXT: v_xor_b32_e32 v7, -1, v2
|
||||
; GFX6-NEXT: v_not_b32_e32 v7, v2
|
||||
; GFX6-NEXT: v_min_u32_e32 v6, v7, v6
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
||||
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9
|
||||
; GFX6-NEXT: v_xor_b32_e32 v7, -1, v3
|
||||
; GFX6-NEXT: v_not_b32_e32 v7, v3
|
||||
; GFX6-NEXT: v_min_u32_e32 v6, v7, v6
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
|
||||
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10
|
||||
; GFX6-NEXT: v_xor_b32_e32 v7, -1, v4
|
||||
; GFX6-NEXT: v_not_b32_e32 v7, v4
|
||||
; GFX6-NEXT: v_min_u32_e32 v6, v7, v6
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
|
||||
; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11
|
||||
; GFX6-NEXT: v_xor_b32_e32 v7, -1, v5
|
||||
; GFX6-NEXT: v_not_b32_e32 v7, v5
|
||||
; GFX6-NEXT: v_min_u32_e32 v6, v7, v6
|
||||
; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
@@ -2260,19 +2251,14 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_add_u16_e64 v6, v0, v3 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_e64 v3, v1, v4 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_e64 v4, v2, v5 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_uaddsat_v6i16:
|
||||
@@ -2345,31 +2331,30 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
|
||||
;
|
||||
; GFX8-LABEL: s_uaddsat_v6i16:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_lshr_b32 s9, s3, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s10, s4, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s11, s5, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_add_u16_e64 v1, s6, v1 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v6, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s9, s3, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s10, s4, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s8, s2, 16
|
||||
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: s_lshr_b32 s11, s5, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_add_u16_e64 v3, s7, v3 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s11
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s5
|
||||
; GFX8-NEXT: v_add_u16_e64 v5, s8, v5 clamp
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s11
|
||||
; GFX8-NEXT: v_mov_b32_e32 v6, s8
|
||||
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
|
||||
; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_add_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s2, v2
|
||||
@@ -2408,42 +2393,42 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
|
||||
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8
|
||||
; GFX6-NEXT: v_xor_b32_e32 v16, -1, v0
|
||||
; GFX6-NEXT: v_not_b32_e32 v16, v0
|
||||
; GFX6-NEXT: v_min_u32_e32 v8, v16, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9
|
||||
; GFX6-NEXT: v_xor_b32_e32 v9, -1, v1
|
||||
; GFX6-NEXT: v_not_b32_e32 v9, v1
|
||||
; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10
|
||||
; GFX6-NEXT: v_xor_b32_e32 v9, -1, v2
|
||||
; GFX6-NEXT: v_not_b32_e32 v9, v2
|
||||
; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
||||
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11
|
||||
; GFX6-NEXT: v_xor_b32_e32 v9, -1, v3
|
||||
; GFX6-NEXT: v_not_b32_e32 v9, v3
|
||||
; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
|
||||
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12
|
||||
; GFX6-NEXT: v_xor_b32_e32 v9, -1, v4
|
||||
; GFX6-NEXT: v_not_b32_e32 v9, v4
|
||||
; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
|
||||
; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13
|
||||
; GFX6-NEXT: v_xor_b32_e32 v9, -1, v5
|
||||
; GFX6-NEXT: v_not_b32_e32 v9, v5
|
||||
; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6
|
||||
; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14
|
||||
; GFX6-NEXT: v_xor_b32_e32 v9, -1, v6
|
||||
; GFX6-NEXT: v_not_b32_e32 v9, v6
|
||||
; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
|
||||
; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8
|
||||
; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15
|
||||
; GFX6-NEXT: v_xor_b32_e32 v9, -1, v7
|
||||
; GFX6-NEXT: v_not_b32_e32 v9, v7
|
||||
; GFX6-NEXT: v_min_u32_e32 v8, v9, v8
|
||||
; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v8
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
@@ -2460,23 +2445,17 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_add_u16_e64 v8, v0, v4 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_e64 v4, v1, v5 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_e64 v5, v2, v6 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_add_u16_e64 v6, v3, v7 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
|
||||
; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_uaddsat_v8i16:
|
||||
@@ -2565,39 +2544,38 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
|
||||
;
|
||||
; GFX8-LABEL: s_uaddsat_v8i16:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s8, s0, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s12
|
||||
; GFX8-NEXT: s_lshr_b32 s9, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: v_add_u16_e64 v1, s8, v1 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s13
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s11, s3, 16
|
||||
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX8-NEXT: v_add_u16_e64 v3, s9, v3 clamp
|
||||
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s12
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s13
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s9
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, s15
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v6, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX8-NEXT: v_add_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s6
|
||||
; GFX8-NEXT: v_add_u16_e64 v5, s10, v5 clamp
|
||||
; GFX8-NEXT: v_add_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v6, s7
|
||||
; GFX8-NEXT: v_add_u16_e64 v7, s11, v7 clamp
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, s15
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, s11
|
||||
; GFX8-NEXT: v_add_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_add_u16_e64 v2, s1, v2 clamp
|
||||
; GFX8-NEXT: v_add_u16_e64 v4, s2, v4 clamp
|
||||
; GFX8-NEXT: v_add_u16_e64 v6, s3, v6 clamp
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_add_u16_sdwa v7, v8, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
|
||||
; GFX8-NEXT: v_or_b32_e32 v3, v6, v7
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s2, v2
|
||||
|
||||
@@ -2320,11 +2320,12 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
|
||||
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
|
||||
; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
|
||||
; GFX9-NEXT: s_mov_b32 s0, 0x5040100
|
||||
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
|
||||
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s0
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
|
||||
; GFX9-NEXT: global_store_dword v2, v1, s[6:7]
|
||||
@@ -2339,7 +2340,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1
|
||||
; GFX10-NEXT: s_sub_i32 s3, 0, s2
|
||||
; GFX10-NEXT: s_sub_i32 s6, 0, s1
|
||||
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
@@ -2347,10 +2348,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
|
||||
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1
|
||||
; GFX10-NEXT: s_sub_i32 s3, 0, s1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1
|
||||
; GFX10-NEXT: s_and_b32 s3, s0, 0xffff
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
|
||||
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
|
||||
@@ -2358,32 +2359,34 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
|
||||
; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2
|
||||
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s2, v2
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
|
||||
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
|
||||
; GFX10-NEXT: global_store_dword v1, v2, s[6:7]
|
||||
|
||||
@@ -59,9 +59,8 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
|
||||
;
|
||||
; GFX8-LABEL: s_usubsat_i7:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0
|
||||
@@ -70,9 +69,8 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
|
||||
;
|
||||
; GFX9-LABEL: s_usubsat_i7:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0
|
||||
@@ -81,9 +79,8 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_usubsat_i7:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s2, 9, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9
|
||||
; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
|
||||
; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0
|
||||
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@@ -146,9 +143,8 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
|
||||
;
|
||||
; GFX8-LABEL: s_usubsat_i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
|
||||
@@ -157,9 +153,8 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
|
||||
;
|
||||
; GFX9-LABEL: s_usubsat_i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX9-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX9-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0
|
||||
@@ -168,9 +163,8 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_usubsat_i8:
|
||||
; GFX10PLUS: ; %bb.0:
|
||||
; GFX10PLUS-NEXT: s_bfe_u32 s2, 8, 0x100000
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s2
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
|
||||
; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0
|
||||
; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@@ -219,9 +213,10 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4
|
||||
; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
|
||||
@@ -237,9 +232,11 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10-NEXT: s_movk_i32 s4, 0xff
|
||||
; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
|
||||
; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
|
||||
@@ -254,8 +251,10 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
|
||||
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
|
||||
@@ -294,15 +293,14 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
|
||||
;
|
||||
; GFX8-LABEL: s_usubsat_v2i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s1, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s3, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s3, 8
|
||||
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s2, s4
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s2, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp
|
||||
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
|
||||
@@ -456,14 +454,15 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX9-LABEL: v_usubsat_v4i8:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX9-NEXT: v_perm_b32 v2, v2, v0, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
||||
; GFX9-NEXT: v_perm_b32 v3, v4, v1, s4
|
||||
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6
|
||||
; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
@@ -490,15 +489,17 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX10-NEXT: v_perm_b32 v2, v2, v0, 0x5040100
|
||||
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
||||
; GFX10-NEXT: v_perm_b32 v3, v4, v1, 0x5040100
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, 24
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
|
||||
; GFX10-NEXT: v_pk_sub_u16 v2, v2, v3 clamp
|
||||
@@ -520,12 +521,14 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GFX11-NEXT: v_perm_b32 v2, v2, v0, 0x5040100
|
||||
; GFX11-NEXT: v_perm_b32 v3, v3, v1, 0x5040100
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, v4, v0, 16
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
||||
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0
|
||||
; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4
|
||||
; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
|
||||
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
|
||||
@@ -589,29 +592,28 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
|
||||
;
|
||||
; GFX8-LABEL: s_usubsat_v4i8:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_bfe_u32 s8, 8, 0x100000
|
||||
; GFX8-NEXT: s_lshr_b32 s5, s1, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s6, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s7, s1, 24
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 8
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s4, s0, 24
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s5, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s5, 8
|
||||
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s2, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s2, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s6, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s6, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s3, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s3, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s7, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s7, 8
|
||||
; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s4, s8
|
||||
; GFX8-NEXT: s_lshl_b32 s0, s4, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
|
||||
@@ -1807,10 +1809,8 @@ define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_sub_u16_e64 v2, v0, v1 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_usubsat_v2i16:
|
||||
@@ -1848,15 +1848,14 @@ define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
|
||||
;
|
||||
; GFX8-LABEL: s_usubsat_v2i16:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v1, s2, v1 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@@ -1897,10 +1896,8 @@ define amdgpu_ps float @usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
|
||||
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v1, s0, v0 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: usubsat_v2i16_sv:
|
||||
@@ -1937,10 +1934,8 @@ define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
|
||||
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v1, v0, s0 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX9-LABEL: usubsat_v2i16_vs:
|
||||
@@ -1998,14 +1993,11 @@ define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_sub_u16_e64 v4, v0, v2 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v2, v1, v3 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_usubsat_v4i16:
|
||||
@@ -2058,23 +2050,22 @@ define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
|
||||
;
|
||||
; GFX8-LABEL: s_usubsat_v4i16:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s7, s3, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: v_sub_u16_e64 v1, s4, v1 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 16
|
||||
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX8-NEXT: v_sub_u16_e64 v3, s5, v3 clamp
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s5
|
||||
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
@@ -2152,19 +2143,14 @@ define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_sub_u16_e64 v6, v0, v3 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v3, v1, v4 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v4, v2, v5 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v6, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v4, v2
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_usubsat_v6i16:
|
||||
@@ -2231,31 +2217,30 @@ define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
|
||||
;
|
||||
; GFX8-LABEL: s_usubsat_v6i16:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_lshr_b32 s9, s3, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s6, s0, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s10, s4, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s11, s5, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_sub_u16_e64 v1, s6, v1 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v6, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s9, s3, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s10, s4, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s8, s2, 16
|
||||
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: s_lshr_b32 s11, s5, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_sub_u16_e64 v3, s7, v3 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s11
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s5
|
||||
; GFX8-NEXT: v_sub_u16_e64 v5, s8, v5 clamp
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s11
|
||||
; GFX8-NEXT: v_mov_b32_e32 v6, s8
|
||||
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
|
||||
; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s2, v2
|
||||
@@ -2338,23 +2323,17 @@ define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_sub_u16_e64 v8, v0, v4 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v4, v1, v5 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v5, v2, v6 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_sub_u16_e64 v6, v3, v7 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
|
||||
; GFX8-NEXT: v_or_b32_e32 v3, v6, v3
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_usubsat_v8i16:
|
||||
@@ -2435,39 +2414,38 @@ define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
|
||||
;
|
||||
; GFX8-LABEL: s_usubsat_v8i16:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s8, s0, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s12
|
||||
; GFX8-NEXT: s_lshr_b32 s9, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: v_sub_u16_e64 v1, s8, v1 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s13
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
|
||||
; GFX8-NEXT: s_lshr_b32 s11, s3, 16
|
||||
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX8-NEXT: v_sub_u16_e64 v3, s9, v3 clamp
|
||||
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s12
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s13
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s9
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, s15
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
|
||||
; GFX8-NEXT: v_mov_b32_e32 v6, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s6
|
||||
; GFX8-NEXT: v_sub_u16_e64 v5, s10, v5 clamp
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mov_b32_e32 v6, s7
|
||||
; GFX8-NEXT: v_sub_u16_e64 v7, s11, v7 clamp
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v7, s15
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, s11
|
||||
; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp
|
||||
; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp
|
||||
; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp
|
||||
; GFX8-NEXT: v_sub_u16_e64 v6, s3, v6 clamp
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_sub_u16_sdwa v7, v8, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v2, v3
|
||||
; GFX8-NEXT: v_or_b32_e32 v2, v4, v5
|
||||
; GFX8-NEXT: v_or_b32_e32 v3, v6, v7
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s2, v2
|
||||
|
||||
@@ -212,38 +212,19 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b)
|
||||
}
|
||||
|
||||
define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) {
|
||||
; GFX7-LABEL: vector_xnor_i32_one_use:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX8-LABEL: vector_xnor_i32_one_use:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX900-LABEL: vector_xnor_i32_one_use:
|
||||
; GFX900: ; %bb.0: ; %entry
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX900-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX906-LABEL: vector_xnor_i32_one_use:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_xnor_b32_e32 v0, v0, v1
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
; GCN-LABEL: vector_xnor_i32_one_use:
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GCN-NEXT: v_not_b32_e32 v0, v0
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: vector_xnor_i32_one_use:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_xor3_b32 v0, v0, v1, -1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX10-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
%xor = xor i32 %a, %b
|
||||
@@ -252,46 +233,23 @@ entry:
|
||||
}
|
||||
|
||||
define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) {
|
||||
; GFX7-LABEL: vector_xnor_i64_one_use:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX8-LABEL: vector_xnor_i64_one_use:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX900-LABEL: vector_xnor_i64_one_use:
|
||||
; GFX900: ; %bb.0: ; %entry
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, v0, v2
|
||||
; GFX900-NEXT: v_xor_b32_e32 v1, v1, v3
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX900-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX900-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX906-LABEL: vector_xnor_i64_one_use:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX906-NEXT: v_xnor_b32_e32 v0, v0, v2
|
||||
; GFX906-NEXT: v_xnor_b32_e32 v1, v1, v3
|
||||
; GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
; GCN-LABEL: vector_xnor_i64_one_use:
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
|
||||
; GCN-NEXT: v_xor_b32_e32 v1, v1, v3
|
||||
; GCN-NEXT: v_not_b32_e32 v0, v0
|
||||
; GCN-NEXT: v_not_b32_e32 v1, v1
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-LABEL: vector_xnor_i64_one_use:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_xor3_b32 v0, v0, v2, -1
|
||||
; GFX10-NEXT: v_xor3_b32 v1, v1, v3, -1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
|
||||
; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3
|
||||
; GFX10-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX10-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
%xor = xor i64 %a, %b
|
||||
@@ -300,32 +258,16 @@ entry:
|
||||
}
|
||||
|
||||
define amdgpu_ps float @xnor_s_v_i32_one_use(i32 inreg %s, i32 %v) {
|
||||
; GFX7-LABEL: xnor_s_v_i32_one_use:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-LABEL: xnor_s_v_i32_one_use:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX900-LABEL: xnor_s_v_i32_one_use:
|
||||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX900-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX906-LABEL: xnor_s_v_i32_one_use:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: v_xnor_b32_e32 v0, s0, v0
|
||||
; GFX906-NEXT: ; return to shader part epilog
|
||||
; GCN-LABEL: xnor_s_v_i32_one_use:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GCN-NEXT: v_not_b32_e32 v0, v0
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: xnor_s_v_i32_one_use:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_xor3_b32 v0, s0, v0, -1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX10-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%xor = xor i32 %s, %v
|
||||
%d = xor i32 %xor, -1
|
||||
@@ -334,32 +276,16 @@ define amdgpu_ps float @xnor_s_v_i32_one_use(i32 inreg %s, i32 %v) {
|
||||
}
|
||||
|
||||
define amdgpu_ps float @xnor_v_s_i32_one_use(i32 inreg %s, i32 %v) {
|
||||
; GFX7-LABEL: xnor_v_s_i32_one_use:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-LABEL: xnor_v_s_i32_one_use:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX900-LABEL: xnor_v_s_i32_one_use:
|
||||
; GFX900: ; %bb.0:
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX900-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX906-LABEL: xnor_v_s_i32_one_use:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: v_xnor_b32_e64 v0, v0, s0
|
||||
; GFX906-NEXT: ; return to shader part epilog
|
||||
; GCN-LABEL: xnor_v_s_i32_one_use:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GCN-NEXT: v_not_b32_e32 v0, v0
|
||||
; GCN-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: xnor_v_s_i32_one_use:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_xor3_b32 v0, v0, s0, -1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX10-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%xor = xor i32 %v, %s
|
||||
%d = xor i32 %xor, -1
|
||||
@@ -373,8 +299,8 @@ define amdgpu_ps <2 x float> @xnor_i64_s_v_one_use(i64 inreg %a, i64 %b64) {
|
||||
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 29
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-LABEL: xnor_i64_s_v_one_use:
|
||||
@@ -382,8 +308,8 @@ define amdgpu_ps <2 x float> @xnor_i64_s_v_one_use(i64 inreg %a, i64 %b64) {
|
||||
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX900-LABEL: xnor_i64_s_v_one_use:
|
||||
@@ -391,22 +317,26 @@ define amdgpu_ps <2 x float> @xnor_i64_s_v_one_use(i64 inreg %a, i64 %b64) {
|
||||
; GFX900-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX900-NEXT: v_xor_b32_e32 v1, s1, v1
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX900-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX900-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX900-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX900-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX906-LABEL: xnor_i64_s_v_one_use:
|
||||
; GFX906: ; %bb.0: ; %entry
|
||||
; GFX906-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
|
||||
; GFX906-NEXT: v_xnor_b32_e32 v0, s0, v0
|
||||
; GFX906-NEXT: v_xnor_b32_e32 v1, s1, v1
|
||||
; GFX906-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX906-NEXT: v_xor_b32_e32 v1, s1, v1
|
||||
; GFX906-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX906-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX906-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: xnor_i64_s_v_one_use:
|
||||
; GFX10: ; %bb.0: ; %entry
|
||||
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
|
||||
; GFX10-NEXT: v_xor3_b32 v0, s0, v0, -1
|
||||
; GFX10-NEXT: v_xor3_b32 v1, s1, v1, -1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1
|
||||
; GFX10-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX10-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
entry:
|
||||
%b = shl i64 %b64, 29
|
||||
@@ -422,8 +352,8 @@ define amdgpu_ps <2 x float> @xnor_i64_v_s_one_use(i64 inreg %a, i64 %b64) {
|
||||
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 29
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX8-LABEL: xnor_i64_v_s_one_use:
|
||||
@@ -431,8 +361,8 @@ define amdgpu_ps <2 x float> @xnor_i64_v_s_one_use(i64 inreg %a, i64 %b64) {
|
||||
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX900-LABEL: xnor_i64_v_s_one_use:
|
||||
@@ -440,22 +370,26 @@ define amdgpu_ps <2 x float> @xnor_i64_v_s_one_use(i64 inreg %a, i64 %b64) {
|
||||
; GFX900-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX900-NEXT: v_xor_b32_e32 v1, s1, v1
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX900-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX900-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX900-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX900-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX906-LABEL: xnor_i64_v_s_one_use:
|
||||
; GFX906: ; %bb.0:
|
||||
; GFX906-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
|
||||
; GFX906-NEXT: v_xnor_b32_e64 v0, v0, s0
|
||||
; GFX906-NEXT: v_xnor_b32_e64 v1, v1, s1
|
||||
; GFX906-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX906-NEXT: v_xor_b32_e32 v1, s1, v1
|
||||
; GFX906-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX906-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX906-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-LABEL: xnor_i64_v_s_one_use:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
|
||||
; GFX10-NEXT: v_xor3_b32 v0, v0, s0, -1
|
||||
; GFX10-NEXT: v_xor3_b32 v1, v1, s1, -1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1
|
||||
; GFX10-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX10-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%b = shl i64 %b64, 29
|
||||
%xor = xor i64 %b, %a
|
||||
@@ -468,21 +402,21 @@ define i32 @vector_xor_na_b_i32_one_use(i32 %a, i32 %b) {
|
||||
; GFX7-LABEL: vector_xor_na_b_i32_one_use:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX7-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX8-LABEL: vector_xor_na_b_i32_one_use:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX8-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX900-LABEL: vector_xor_na_b_i32_one_use:
|
||||
; GFX900: ; %bb.0: ; %entry
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0
|
||||
; GFX900-NEXT: v_not_b32_e32 v0, v0
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX900-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@@ -508,21 +442,21 @@ define i32 @vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) {
|
||||
; GFX7-LABEL: vector_xor_a_nb_i32_one_use:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX7-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX8-LABEL: vector_xor_a_nb_i32_one_use:
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX8-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX900-LABEL: vector_xor_a_nb_i32_one_use:
|
||||
; GFX900: ; %bb.0: ; %entry
|
||||
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX900-NEXT: v_xor_b32_e32 v1, -1, v1
|
||||
; GFX900-NEXT: v_not_b32_e32 v1, v1
|
||||
; GFX900-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX900-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
||||
@@ -139,6 +139,10 @@ define <4 x i32> @csh_v4i32(<4 x i32> %a, <4 x i32> %b) {
|
||||
; GISEL-LABEL: csh_v4i32:
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-NEXT: v_and_b32_e32 v4, 31, v4
|
||||
; GISEL-NEXT: v_and_b32_e32 v5, 31, v5
|
||||
; GISEL-NEXT: v_and_b32_e32 v6, 31, v6
|
||||
; GISEL-NEXT: v_and_b32_e32 v7, 31, v7
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v8, v4, v0
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v9, v5, v1
|
||||
; GISEL-NEXT: v_lshlrev_b32_e32 v10, v6, v2
|
||||
|
||||
@@ -31,7 +31,8 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_d_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v4, v5, v4, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v5, 16, v4
|
||||
; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -58,8 +59,9 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
|
||||
; GFX10GISEL-LABEL: sample_d_3d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6
|
||||
; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -95,7 +97,8 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_c_d_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v6, 16, v5
|
||||
; GFX10GISEL-NEXT: image_sample_c_d v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -114,7 +117,8 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_d_cl_1d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -139,8 +143,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
; GFX10GISEL-LABEL: sample_d_cl_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v5
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v5, v6
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v7, 16, v4
|
||||
; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -159,7 +164,8 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_c_d_cl_1d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -185,8 +191,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX10GISEL-LABEL: sample_c_d_cl_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v6, v7
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v5, v8, v5, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v8, 16, v5
|
||||
; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -222,7 +229,8 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_cd_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v4, v5, v4, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v5, 16, v4
|
||||
; GFX10GISEL-NEXT: image_sample_cd v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -258,7 +266,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_c_cd_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v6, 16, v5
|
||||
; GFX10GISEL-NEXT: image_sample_c_cd v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -277,7 +286,8 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_cd_cl_1d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -302,8 +312,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX10GISEL-LABEL: sample_cd_cl_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v5
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v5, v6
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v7, 16, v4
|
||||
; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -322,7 +333,8 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_c_cd_cl_1d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -348,8 +360,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX10GISEL-LABEL: sample_c_cd_cl_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v5
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v6, v7
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v5, v8, v5, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v8, 16, v5
|
||||
; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -376,8 +389,9 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX10GISEL-LABEL: sample_c_d_o_2darray_V1:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6
|
||||
; GFX10GISEL-NEXT: image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -404,8 +418,9 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
|
||||
; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v9, 16, v6
|
||||
; GFX10GISEL-NEXT: image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -464,8 +479,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_g16_noa16_d_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -488,9 +505,11 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x
|
||||
; GFX10GISEL-LABEL: sample_g16_noa16_d_3d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v2, v1, v0, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v9, 0xffff, v9
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v1, 16, v0
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v9
|
||||
; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -527,8 +546,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_g16_noa16_c_d_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
|
||||
; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -565,8 +586,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -606,10 +629,11 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc,
|
||||
; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v3, v8, v1, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v8, 16, v0
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v1
|
||||
; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -646,8 +670,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_g16_noa16_cd_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -684,8 +710,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
|
||||
; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -722,8 +750,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> inreg %rsrc, <
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -763,10 +793,11 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc,
|
||||
; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_2d:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v3, v8, v1, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v3
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v3, v8, 16, v0
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v1
|
||||
; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -792,11 +823,12 @@ define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc,
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v4, v10, v9, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v5, v5, v11, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v9
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v5, 16, v1
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v10, 16, v0
|
||||
; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -822,11 +854,12 @@ define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v4, v10, v9, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v5, v5, v11, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v4
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v9
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v5, v5, 16, v1
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v10, 16, v0
|
||||
; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -886,9 +919,12 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
|
||||
;
|
||||
; GFX10GISEL-LABEL: sample_d_2d_g16_a16:
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4
|
||||
; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
@@ -914,11 +950,14 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3
|
||||
; GFX10GISEL: ; %bb.0: ; %main_body
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v7
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, v8
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v2, v1, v0, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v4, v4, v9, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_perm_b32 v6, v10, v6, 0x5040100
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; GFX10GISEL-NEXT: v_and_b32_e32 v8, 0xffff, v9
|
||||
; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v2, v1, 16, v0
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v6, v10, 16, v6
|
||||
; GFX10GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v8
|
||||
; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
|
||||
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10GISEL-NEXT: ; return to shader part epilog
|
||||
|
||||
@@ -32,7 +32,7 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
|
||||
; GFX7GLISEL-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7GLISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: s_and_b32 s3, s3, 0x7fff
|
||||
; GFX7GLISEL-NEXT: s_bfe_u32 s3, s3, 0x100000
|
||||
; GFX7GLISEL-NEXT: s_and_b32 s3, 0xffff, s3
|
||||
; GFX7GLISEL-NEXT: s_cmpk_gt_u32 s3, 0x7c00
|
||||
; GFX7GLISEL-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX7GLISEL-NEXT: s_bfe_i32 s3, s3, 0x10000
|
||||
@@ -217,7 +217,7 @@ define i1 @snan_f16(half %x) nounwind {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7e00
|
||||
@@ -274,7 +274,7 @@ define i1 @qnan_f16(half %x) nounwind {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7e00
|
||||
; GFX7GLISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
@@ -440,7 +440,7 @@ define i1 @posnormal_f16(half %x) nounwind {
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v2, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2
|
||||
; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
@@ -506,7 +506,7 @@ define i1 @negnormal_f16(half %x) nounwind {
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v2, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, v2
|
||||
; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
@@ -626,7 +626,7 @@ define i1 @negsubnormal_f16(half %x) nounwind {
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v2, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v2
|
||||
; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[4:5], 1, v1
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
@@ -844,7 +844,7 @@ define i1 @negfinite_f16(half %x) nounwind {
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v0, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v1, v0
|
||||
@@ -900,7 +900,7 @@ define i1 @isnan_f16(half %x) nounwind {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
@@ -954,7 +954,7 @@ define i1 @not_isnan_f16(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v0
|
||||
@@ -1016,10 +1016,10 @@ define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1
|
||||
@@ -1096,13 +1096,13 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0x7fff, v2
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v2, v2, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
|
||||
@@ -1248,17 +1248,17 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0x7fff, v2
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v2, v2, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v3, 0x7fff, v3
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v3, v3, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
|
||||
@@ -1397,7 +1397,7 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
@@ -1451,7 +1451,7 @@ define i1 @isinf_f16(half %x) nounwind {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
@@ -1507,7 +1507,7 @@ define i1 @isfinite_f16(half %x) nounwind {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
@@ -1566,7 +1566,7 @@ define i1 @issubnormal_or_zero_f16(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0: ; %entry
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[4:5], 1, v0
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
@@ -1632,7 +1632,7 @@ define i1 @not_issubnormal_or_zero_f16(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0: ; %entry
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v1
|
||||
@@ -1757,7 +1757,7 @@ define i1 @not_isnormal_f16(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[4:5], 1, v0
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff
|
||||
@@ -1828,7 +1828,7 @@ define i1 @not_is_plus_normal_f16(half %x) {
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v2, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, v2
|
||||
; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[6:7], 1, v1
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
@@ -1906,7 +1906,7 @@ define i1 @not_is_neg_normal_f16(half %x) {
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v2, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2
|
||||
; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[6:7], 1, v1
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
@@ -2037,7 +2037,7 @@ define i1 @not_issubnormal_f16(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v1
|
||||
@@ -2101,7 +2101,7 @@ define i1 @iszero_f16(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0:
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -2159,7 +2159,7 @@ define i1 @not_iszero_f16(half %x) {
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v1
|
||||
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
@@ -2288,7 +2288,7 @@ define i1 @not_ispositive_f16(half %x) {
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
|
||||
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], s6, v1
|
||||
@@ -2356,7 +2356,7 @@ define i1 @isnegative_f16(half %x) {
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v1
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v1, v2
|
||||
@@ -2422,7 +2422,7 @@ define i1 @not_isnegative_f16(half %x) {
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s6, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v1, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0
|
||||
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc
|
||||
@@ -2481,7 +2481,7 @@ define i1 @iszero_or_nan_f16(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0: ; %entry
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1
|
||||
@@ -2542,7 +2542,7 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 {
|
||||
; GFX7GLISEL: ; %bb.0: ; %entry
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1
|
||||
@@ -2603,7 +2603,7 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 {
|
||||
; GFX7GLISEL: ; %bb.0: ; %entry
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1
|
||||
@@ -2675,7 +2675,7 @@ define i1 @not_iszero_or_nan_f16(half %x) {
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2
|
||||
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
@@ -2751,7 +2751,7 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 {
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2
|
||||
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
@@ -2827,7 +2827,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 {
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2
|
||||
; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
||||
@@ -2892,7 +2892,7 @@ define i1 @iszero_or_qnan_f16(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0: ; %entry
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7e00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GFX7GLISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1
|
||||
@@ -2956,7 +2956,7 @@ define i1 @iszero_or_snan_f16(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0: ; %entry
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], v0, v1
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7e00
|
||||
@@ -3036,7 +3036,7 @@ define i1 @not_iszero_or_qnan_f16(half %x) {
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s8, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s8, v1
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
|
||||
@@ -3120,7 +3120,7 @@ define i1 @not_iszero_or_snan_f16(half %x) {
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v1, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7e00
|
||||
@@ -3186,7 +3186,7 @@ define i1 @isinf_or_nan_f16(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0: ; %entry
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0
|
||||
@@ -3245,7 +3245,7 @@ define i1 @not_isinf_or_nan_f16(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0: ; %entry
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
@@ -3302,7 +3302,7 @@ define i1 @isfinite_or_nan_f(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0: ; %entry
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: s_movk_i32 s4, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
|
||||
; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], s4, v0
|
||||
@@ -3361,7 +3361,7 @@ define i1 @not_isfinite_or_nan_f(half %x) {
|
||||
; GFX7GLISEL: ; %bb.0: ; %entry
|
||||
; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0
|
||||
; GFX7GLISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7c00
|
||||
; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
|
||||
; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
|
||||
@@ -14,15 +14,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %s
|
||||
; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
|
||||
; SDAG-VI: ; %bb.0:
|
||||
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
|
||||
; SDAG-CI: ; %bb.0:
|
||||
@@ -32,18 +32,6 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %s
|
||||
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
|
||||
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
|
||||
; GISEL-VI: ; %bb.0:
|
||||
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
|
||||
; GISEL-CI: ; %bb.0:
|
||||
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
@@ -71,16 +59,16 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
|
||||
; SDAG-VI: ; %bb.0:
|
||||
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; SDAG-VI-NEXT: v_or_b32_e32 v0, 0x3c00, v0
|
||||
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_e32 v0, 0x3c00, v0
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
|
||||
; SDAG-CI: ; %bb.0:
|
||||
@@ -91,21 +79,6 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
|
||||
; SDAG-CI-NEXT: v_mov_b32_e32 v0, 1.0
|
||||
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
|
||||
; GISEL-VI: ; %bb.0:
|
||||
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GISEL-VI-NEXT: s_movk_i32 s4, 0x3c00
|
||||
; GISEL-VI-NEXT: s_bfe_u32 s4, s4, 0x100000
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, s4, v0
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
|
||||
; GISEL-CI: ; %bb.0:
|
||||
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
@@ -133,16 +106,16 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
|
||||
; SDAG-VI: ; %bb.0:
|
||||
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; SDAG-VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
|
||||
; SDAG-CI: ; %bb.0:
|
||||
@@ -153,19 +126,6 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
|
||||
; SDAG-CI-NEXT: v_mov_b32_e32 v0, v3
|
||||
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
|
||||
; GISEL-VI: ; %bb.0:
|
||||
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
|
||||
; GISEL-CI: ; %bb.0:
|
||||
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
@@ -194,15 +154,15 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
|
||||
; SDAG-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
|
||||
; SDAG-VI: ; %bb.0:
|
||||
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
|
||||
; SDAG-CI: ; %bb.0:
|
||||
@@ -216,22 +176,10 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
|
||||
; GISEL-GFX9: ; %bb.0:
|
||||
; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
|
||||
; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GISEL-GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
|
||||
; GISEL-VI: ; %bb.0:
|
||||
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
|
||||
; GISEL-CI: ; %bb.0:
|
||||
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
@@ -240,7 +188,6 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
|
||||
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; GISEL-CI-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
|
||||
%src0.ext = fpext half %src0 to float
|
||||
@@ -330,15 +277,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %
|
||||
; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
|
||||
; SDAG-VI: ; %bb.0:
|
||||
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SDAG-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
|
||||
; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
|
||||
; VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
|
||||
; SDAG-CI: ; %bb.0:
|
||||
@@ -348,18 +295,6 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %
|
||||
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
|
||||
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
|
||||
; GISEL-VI: ; %bb.0:
|
||||
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GISEL-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
|
||||
; GISEL-CI: ; %bb.0:
|
||||
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
@@ -387,15 +322,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
|
||||
; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
|
||||
; SDAG-VI: ; %bb.0:
|
||||
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
|
||||
; SDAG-CI: ; %bb.0:
|
||||
@@ -405,18 +340,6 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
|
||||
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp
|
||||
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
|
||||
; GISEL-VI: ; %bb.0:
|
||||
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v2 clamp
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
|
||||
; GISEL-CI: ; %bb.0:
|
||||
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
@@ -455,18 +378,18 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
|
||||
; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
|
||||
; SDAG-VI: ; %bb.0:
|
||||
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; SDAG-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; SDAG-VI-NEXT: flat_store_short v[0:1], v0
|
||||
; SDAG-VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; VI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; VI-NEXT: flat_store_short v[0:1], v0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
|
||||
; SDAG-CI: ; %bb.0:
|
||||
@@ -480,21 +403,6 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
|
||||
; SDAG-CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
|
||||
; GISEL-VI: ; %bb.0:
|
||||
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; GISEL-VI-NEXT: flat_store_short v[0:1], v0
|
||||
; GISEL-VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; GISEL-VI-NEXT: v_max_f16_e64 v0, v0, v0 clamp
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
|
||||
; GISEL-CI: ; %bb.0:
|
||||
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
@@ -539,4 +447,3 @@ attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign"
|
||||
attributes #1 = { nounwind readnone speculatable }
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; CI: {{.*}}
|
||||
; VI: {{.*}}
|
||||
|
||||
@@ -315,11 +315,9 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v5
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v5
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mix_v2f32:
|
||||
@@ -446,16 +444,13 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v8, v6, v7
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v4
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v8
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v8
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v5
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_bfe_u32 v1, v2, 0, 16
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v5
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mix_v3f32:
|
||||
@@ -603,29 +598,26 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
|
||||
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v6, v0
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v2
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v10, v4
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v7, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v2
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v9, v3
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v10, v4
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v11, v5
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v10, v6, v8
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v11, v7, v9
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v4
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v10
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v3, v5
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v11
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v4, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v3
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mix_v4f32:
|
||||
@@ -729,11 +721,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v2 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v5 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
|
||||
@@ -885,16 +875,13 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v8, v6, v7
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v4 clamp
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v8 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v2, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v8 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v2, v5 clamp
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_bfe_u32 v1, v2, 0, 16
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v2
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
|
||||
@@ -1042,29 +1029,26 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
|
||||
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v6, v0
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v2
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v10, v4
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v7, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v8, v2
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v9, v3
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v10, v4
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v11, v5
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v10, v6, v8
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v4, v0, v2
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v11, v7, v9
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v5, v1, v3
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v1, v4 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v0, v10 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e64 v2, v11 clamp
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v4, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v3, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v3
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
|
||||
@@ -1192,9 +1176,8 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
|
||||
; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
|
||||
; GISEL-GFX900-NEXT: v_max_f16_e64 v4, v3, v3 clamp
|
||||
; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
|
||||
; GISEL-GFX900-NEXT: v_bfe_u32 v0, v4, 0, 16
|
||||
; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, 0xffff0000
|
||||
; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v1, v0
|
||||
; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, 0xffff0000
|
||||
; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v0, v4
|
||||
; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
|
||||
@@ -1203,9 +1186,8 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
|
||||
; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
|
||||
; GISEL-GFX906-NEXT: v_max_f16_e64 v4, v3, v3 clamp
|
||||
; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
|
||||
; GISEL-GFX906-NEXT: v_bfe_u32 v0, v4, 0, 16
|
||||
; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, 0xffff0000
|
||||
; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v1, v0
|
||||
; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, 0xffff0000
|
||||
; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v0, v4
|
||||
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
|
||||
@@ -1219,14 +1201,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v5
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v5
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-VI-NEXT: v_max_f16_e64 v1, v0, v0 clamp
|
||||
; GISEL-VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
|
||||
@@ -1239,14 +1219,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
|
||||
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
|
||||
; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3
|
||||
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5
|
||||
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
|
||||
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4
|
||||
; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
|
||||
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
|
||||
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
|
||||
; GISEL-CI-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
|
||||
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
@@ -1255,7 +1233,6 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
|
||||
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -1333,8 +1310,8 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
|
||||
; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
|
||||
; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
|
||||
; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
|
||||
; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, 16
|
||||
; GISEL-GFX900-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v4
|
||||
; GISEL-GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; GISEL-GFX900-NEXT: v_and_or_b32 v0, v3, v1, v0
|
||||
; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -1345,8 +1322,8 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
|
||||
; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
|
||||
; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
|
||||
; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
|
||||
; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, 16
|
||||
; GISEL-GFX906-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v4
|
||||
; GISEL-GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, 0xffff
|
||||
; GISEL-GFX906-NEXT: v_and_or_b32 v0, v3, v1, v0
|
||||
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
|
||||
@@ -1362,14 +1339,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v5, v3, v4
|
||||
; GISEL-VI-NEXT: v_mac_f32_e32 v2, v0, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v2
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v5
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_max_f16_sdwa v1, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v5
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-VI-NEXT: v_max_f16_sdwa v1, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@@ -1383,14 +1356,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
|
||||
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
|
||||
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
|
||||
; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3
|
||||
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5
|
||||
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
|
||||
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4
|
||||
; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
|
||||
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
|
||||
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
|
||||
; GISEL-CI-NEXT: v_bfe_u32 v0, v0, 0, 16
|
||||
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
|
||||
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
@@ -1400,7 +1371,6 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
|
||||
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-CI-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
@@ -1510,11 +1480,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GISEL-VI-NEXT: v_mad_f32 v3, v3, v4, v5 clamp
|
||||
; GISEL-VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v3
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mix_v2f32_clamp_precvt:
|
||||
@@ -1657,16 +1625,13 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
|
||||
; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
|
||||
; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp
|
||||
; GISEL-VI-NEXT: v_mad_f32 v6, v6, v7, v8 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp
|
||||
; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v6
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mix_v3f32_clamp_precvt:
|
||||
@@ -1846,15 +1811,12 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
|
||||
; GISEL-VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp
|
||||
; GISEL-VI-NEXT: v_mad_f32 v2, v7, v9, v11 clamp
|
||||
; GISEL-VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v3, v6
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v4, 16
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v3, v0
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v1, v2, v1
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-CI-LABEL: v_mad_mix_v4f32_clamp_precvt:
|
||||
|
||||
@@ -437,10 +437,8 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_rndne_f16_e32 v1, v0
|
||||
; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_roundeven_v2f16:
|
||||
@@ -568,10 +566,8 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
|
||||
; GFX8-NEXT: v_rndne_f16_e32 v1, v0
|
||||
; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_roundeven_v2f16_fneg:
|
||||
@@ -712,14 +708,11 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-NEXT: v_rndne_f16_e32 v2, v0
|
||||
; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_rndne_f16_e32 v3, v1
|
||||
; GFX8-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, 16
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
|
||||
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: v_roundeven_v4f16:
|
||||
|
||||
@@ -84,10 +84,8 @@ define <2 x half> @v_constained_fmul_v2f16_fpexcept_strict(<2 x half> %x, <2 x h
|
||||
; GFX8-GISEL: ; %bb.0:
|
||||
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_e32 v2, v0, v1
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_strict:
|
||||
@@ -119,10 +117,8 @@ define <2 x half> @v_constained_fmul_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h
|
||||
; GFX8-GISEL: ; %bb.0:
|
||||
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_e32 v2, v0, v1
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_ignore:
|
||||
@@ -154,10 +150,8 @@ define <2 x half> @v_constained_fmul_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x
|
||||
; GFX8-GISEL: ; %bb.0:
|
||||
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_e32 v2, v0, v1
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap:
|
||||
@@ -198,12 +192,9 @@ define <3 x half> @v_constained_fmul_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
|
||||
; GFX8-GISEL: ; %bb.0:
|
||||
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_e32 v1, v1, v3
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-SDAG-LABEL: v_constained_fmul_v3f16_fpexcept_strict:
|
||||
@@ -277,14 +268,11 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
|
||||
; GFX8-GISEL: ; %bb.0:
|
||||
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_e32 v4, v0, v2
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_e32 v2, v1, v3
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v2, v1
|
||||
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
|
||||
@@ -370,15 +358,14 @@ define amdgpu_ps <2 x half> @s_constained_fmul_v2f16_fpexcept_strict(<2 x half>
|
||||
;
|
||||
; GFX8-GISEL-LABEL: s_constained_fmul_v2f16_fpexcept_strict:
|
||||
; GFX8-GISEL: ; %bb.0:
|
||||
; GFX8-GISEL-NEXT: s_lshr_b32 s1, s3, 16
|
||||
; GFX8-GISEL-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-GISEL-NEXT: s_lshr_b32 s1, s3, 16
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_e32 v1, s0, v1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_e32 v0, s2, v0
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_mul_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-GISEL-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10PLUS-LABEL: s_constained_fmul_v2f16_fpexcept_strict:
|
||||
|
||||
@@ -93,10 +93,8 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x h
|
||||
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX8-GISEL-NEXT: v_add_f16_e32 v2, v0, v1
|
||||
; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_strict:
|
||||
@@ -165,10 +163,8 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h
|
||||
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX8-GISEL-NEXT: v_add_f16_e32 v2, v0, v1
|
||||
; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_ignore:
|
||||
@@ -237,10 +233,8 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x
|
||||
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
|
||||
; GFX8-GISEL-NEXT: v_add_f16_e32 v2, v0, v1
|
||||
; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 16
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v2, v0
|
||||
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap:
|
||||
@@ -296,9 +290,8 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
|
||||
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2
|
||||
; GFX9-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX9-GISEL-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3
|
||||
; GFX9-GISEL-NEXT: v_perm_b32 v0, v0, v4, s4
|
||||
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v4
|
||||
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX8-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
|
||||
@@ -314,12 +307,9 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
|
||||
; GFX8-GISEL: ; %bb.0:
|
||||
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2
|
||||
; GFX8-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_bfe_u32 v1, v1, 0, 16
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
|
||||
@@ -339,7 +329,8 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
|
||||
; GFX10-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2
|
||||
; GFX10-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX10-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3
|
||||
; GFX10-GISEL-NEXT: v_perm_b32 v0, v0, v4, 0x5040100
|
||||
; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4
|
||||
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
|
||||
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict:
|
||||
@@ -363,7 +354,8 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
|
||||
; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v0, v0, v2
|
||||
; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3
|
||||
; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v2, v4, v5
|
||||
; GFX10PLUS-GISEL-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
|
||||
; GFX10PLUS-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10PLUS-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX10PLUS-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
%val = call <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
|
||||
ret <3 x half> %val
|
||||
@@ -390,9 +382,8 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
|
||||
; GFX9-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX9-GISEL-NEXT: v_sub_f16_e32 v2, v1, v3
|
||||
; GFX9-GISEL-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX9-GISEL-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-GISEL-NEXT: v_perm_b32 v0, v0, v4, s4
|
||||
; GFX9-GISEL-NEXT: v_perm_b32 v1, v1, v2, s4
|
||||
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v4
|
||||
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v2
|
||||
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX8-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
|
||||
@@ -410,14 +401,11 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
|
||||
; GFX8-GISEL: ; %bb.0:
|
||||
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX8-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2
|
||||
; GFX8-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_sub_f16_e32 v2, v1, v3
|
||||
; GFX8-GISEL-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, 16
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v4, v0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v2, v1
|
||||
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
|
||||
@@ -437,11 +425,13 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
|
||||
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-GISEL-NEXT: v_sub_f16_e32 v4, v0, v2
|
||||
; GFX10-GISEL-NEXT: v_sub_f16_e32 v5, v1, v3
|
||||
; GFX10-GISEL-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX10-GISEL-NEXT: v_sub_f16_e32 v2, v1, v3
|
||||
; GFX10-GISEL-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; GFX10-GISEL-NEXT: v_perm_b32 v0, v0, v4, 0x5040100
|
||||
; GFX10-GISEL-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
|
||||
; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v4
|
||||
; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v5
|
||||
; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
|
||||
; GFX10-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v3
|
||||
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict:
|
||||
@@ -472,8 +462,10 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
|
||||
; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v1, v1, v3
|
||||
; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v2, v4, v6
|
||||
; GFX10PLUS-GISEL-NEXT: v_sub_f16_e32 v3, v5, v7
|
||||
; GFX10PLUS-GISEL-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
|
||||
; GFX10PLUS-GISEL-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
|
||||
; GFX10PLUS-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GFX10PLUS-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
||||
; GFX10PLUS-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
|
||||
; GFX10PLUS-GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v1
|
||||
; GFX10PLUS-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
%val = call <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
|
||||
ret <4 x half> %val
|
||||
@@ -528,15 +520,14 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half>
|
||||
; GFX8-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
|
||||
; GFX8-GISEL: ; %bb.0:
|
||||
; GFX8-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000
|
||||
; GFX8-GISEL-NEXT: s_lshr_b32 s3, s0, 16
|
||||
; GFX8-GISEL-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-GISEL-NEXT: s_lshr_b32 s3, s0, 16
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-GISEL-NEXT: v_add_f16_e32 v1, s1, v1
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 16
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-GISEL-NEXT: v_add_f16_e32 v0, s2, v0
|
||||
; GFX8-GISEL-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GFX8-GISEL-NEXT: ; return to shader part epilog
|
||||
;
|
||||
; GFX10-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict:
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s
|
||||
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX9,GISEL-GFX9 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s
|
||||
|
||||
; <GFX9 has no V_SAT_PK, GFX9+ has V_SAT_PK, GFX11 has V_SAT_PK with t16
|
||||
@@ -27,25 +27,25 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) {
|
||||
; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: basic_smax_smin:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
|
||||
; GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
|
||||
; GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
; SDAG-GFX9-LABEL: basic_smax_smin:
|
||||
; SDAG-GFX9: ; %bb.0:
|
||||
; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff
|
||||
; SDAG-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
|
||||
; SDAG-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
|
||||
; SDAG-GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
|
||||
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: basic_smax_smin:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
|
||||
; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
; SDAG-GFX11-LABEL: basic_smax_smin:
|
||||
; SDAG-GFX11: ; %bb.0:
|
||||
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
|
||||
; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
|
||||
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-VI-LABEL: basic_smax_smin:
|
||||
; GISEL-VI: ; %bb.0:
|
||||
@@ -57,6 +57,27 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) {
|
||||
; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-GFX9-LABEL: basic_smax_smin:
|
||||
; GISEL-GFX9: ; %bb.0:
|
||||
; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff
|
||||
; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
|
||||
; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
|
||||
; GISEL-GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-GFX11-LABEL: basic_smax_smin:
|
||||
; GISEL-GFX11: ; %bb.0:
|
||||
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
|
||||
; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
|
||||
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0)
|
||||
%src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255)
|
||||
%src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0)
|
||||
@@ -124,8 +145,8 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg
|
||||
; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GISEL-VI-NEXT: s_min_i32 s3, s3, s5
|
||||
; GISEL-VI-NEXT: s_min_i32 s2, s2, s5
|
||||
; GISEL-VI-NEXT: s_bfe_u32 s3, s3, 0x100000
|
||||
; GISEL-VI-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3
|
||||
; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2
|
||||
; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GISEL-VI-NEXT: s_or_b32 s2, s2, s3
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
@@ -199,25 +220,25 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) {
|
||||
; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: basic_smin_smax:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
|
||||
; GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
|
||||
; GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
; SDAG-GFX9-LABEL: basic_smin_smax:
|
||||
; SDAG-GFX9: ; %bb.0:
|
||||
; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff
|
||||
; SDAG-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
|
||||
; SDAG-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
|
||||
; SDAG-GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
|
||||
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: basic_smin_smax:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
|
||||
; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
; SDAG-GFX11-LABEL: basic_smin_smax:
|
||||
; SDAG-GFX11: ; %bb.0:
|
||||
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
|
||||
; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
|
||||
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-VI-LABEL: basic_smin_smax:
|
||||
; GISEL-VI: ; %bb.0:
|
||||
@@ -229,6 +250,27 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) {
|
||||
; GISEL-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-GFX9-LABEL: basic_smin_smax:
|
||||
; GISEL-GFX9: ; %bb.0:
|
||||
; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff
|
||||
; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
|
||||
; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
|
||||
; GISEL-GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-GFX11-LABEL: basic_smin_smax:
|
||||
; GISEL-GFX11: ; %bb.0:
|
||||
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
|
||||
; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
|
||||
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255)
|
||||
%src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0)
|
||||
%src1.min = call i16 @llvm.smin.i16(i16 %src1, i16 255)
|
||||
@@ -250,25 +292,25 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) {
|
||||
; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: basic_smin_smax_combined:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0xff
|
||||
; GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
|
||||
; GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
|
||||
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
|
||||
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
; SDAG-GFX9-LABEL: basic_smin_smax_combined:
|
||||
; SDAG-GFX9: ; %bb.0:
|
||||
; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff
|
||||
; SDAG-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
|
||||
; SDAG-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
|
||||
; SDAG-GFX9-NEXT: s_mov_b32 s4, 0x5040100
|
||||
; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
|
||||
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX11-LABEL: basic_smin_smax_combined:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
|
||||
; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
; SDAG-GFX11-LABEL: basic_smin_smax_combined:
|
||||
; SDAG-GFX11: ; %bb.0:
|
||||
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
|
||||
; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
|
||||
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
|
||||
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-VI-LABEL: basic_smin_smax_combined:
|
||||
; GISEL-VI: ; %bb.0:
|
||||
@@ -280,6 +322,27 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) {
|
||||
; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1
|
||||
; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-GFX9-LABEL: basic_smin_smax_combined:
|
||||
; GISEL-GFX9: ; %bb.0:
|
||||
; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff
|
||||
; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2
|
||||
; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2
|
||||
; GISEL-GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GISEL-GFX11-LABEL: basic_smin_smax_combined:
|
||||
; GISEL-GFX11: ; %bb.0:
|
||||
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff
|
||||
; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff
|
||||
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
%src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255)
|
||||
%src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0)
|
||||
%src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0)
|
||||
@@ -404,8 +467,8 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
|
||||
; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
|
||||
; GISEL-VI-NEXT: s_min_i32 s3, s3, s4
|
||||
; GISEL-VI-NEXT: s_min_i32 s2, s2, s4
|
||||
; GISEL-VI-NEXT: s_bfe_u32 s3, s3, 0x100000
|
||||
; GISEL-VI-NEXT: s_bfe_u32 s2, s2, 0x100000
|
||||
; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3
|
||||
; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2
|
||||
; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GISEL-VI-NEXT: s_or_b32 s2, s2, s3
|
||||
; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
|
||||
@@ -3,10 +3,17 @@
|
||||
include "llvm/Target/Target.td"
|
||||
include "GlobalISelEmitterCommon.td"
|
||||
|
||||
// No rule will be added to the match table.
|
||||
// CHECK: constexpr static int64_t MatchTable0[] = {
|
||||
// CHECK-NEXT: GIM_Reject
|
||||
// CHECK-NEXT: };
|
||||
// CHECK-NEXT: GIM_Try,
|
||||
// CHECK-NEXT: GIM_CheckOpcode{{.*}}TargetOpcode::G_ANYEXT,
|
||||
// CHECK-NEXT: GIM_CheckType{{.*}}/*Type*/GILLT_s32,
|
||||
// CHECK-NEXT: GIM_CheckType{{.*}}/*Type*/GILLT_s8,
|
||||
// CHECK-NEXT: GIM_CheckRegBankForClass{{.*}}/*RC*/MyTarget::GPR32RegClassID,
|
||||
// CHECK-NEXT: // (anyext:{{.*}}=>{{.*}}(SELECT_I4:
|
||||
// CHECK: GIR_Done,
|
||||
// CHECK-NEXT: // Label 0:
|
||||
// CHECK-NEXT: GIM_Reject,
|
||||
// CHECK-NEXT: };
|
||||
|
||||
def SELECT_I4 : I<(outs GPR32:$dst), (ins GPR8:$cond, GPR32:$T, GPR32:$F), []>;
|
||||
def LI : I<(outs GPR32:$dst), (ins i32imm:$src), []>;
|
||||
|
||||
@@ -3696,7 +3696,7 @@ private:
|
||||
const TreePatternNode *Src, const TreePatternNode *Dst);
|
||||
Expected<action_iterator> createAndImportSubInstructionRenderer(
|
||||
action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst,
|
||||
unsigned TempReg);
|
||||
const TreePatternNode *Src, unsigned TempReg);
|
||||
Expected<action_iterator>
|
||||
createInstructionRenderer(action_iterator InsertPt, RuleMatcher &M,
|
||||
const TreePatternNode *Dst);
|
||||
@@ -3705,14 +3705,12 @@ private:
|
||||
action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
|
||||
const TreePatternNode *Src, const TreePatternNode *Dst);
|
||||
|
||||
Expected<action_iterator>
|
||||
importExplicitUseRenderers(action_iterator InsertPt, RuleMatcher &M,
|
||||
BuildMIAction &DstMIBuilder,
|
||||
const llvm::TreePatternNode *Dst);
|
||||
Expected<action_iterator>
|
||||
importExplicitUseRenderer(action_iterator InsertPt, RuleMatcher &Rule,
|
||||
BuildMIAction &DstMIBuilder,
|
||||
const TreePatternNode *DstChild);
|
||||
Expected<action_iterator> importExplicitUseRenderers(
|
||||
action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
|
||||
const llvm::TreePatternNode *Dst, const TreePatternNode *Src);
|
||||
Expected<action_iterator> importExplicitUseRenderer(
|
||||
action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder,
|
||||
const TreePatternNode *DstChild, const TreePatternNode *Src);
|
||||
Error importDefaultOperandRenderers(action_iterator InsertPt, RuleMatcher &M,
|
||||
BuildMIAction &DstMIBuilder,
|
||||
DagInit *DefaultOps) const;
|
||||
@@ -4510,7 +4508,7 @@ Error GlobalISelEmitter::importChildMatcher(
|
||||
|
||||
Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
|
||||
action_iterator InsertPt, RuleMatcher &Rule, BuildMIAction &DstMIBuilder,
|
||||
const TreePatternNode *DstChild) {
|
||||
const TreePatternNode *DstChild, const TreePatternNode *Src) {
|
||||
|
||||
const auto &SubOperand = Rule.getComplexSubOperand(DstChild->getName());
|
||||
if (SubOperand) {
|
||||
@@ -4580,7 +4578,7 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
|
||||
DstMIBuilder.addRenderer<TempRegRenderer>(TempRegID);
|
||||
|
||||
auto InsertPtOrError = createAndImportSubInstructionRenderer(
|
||||
++InsertPt, Rule, DstChild, TempRegID);
|
||||
++InsertPt, Rule, DstChild, Src, TempRegID);
|
||||
if (auto Error = InsertPtOrError.takeError())
|
||||
return std::move(Error);
|
||||
return InsertPtOrError.get();
|
||||
@@ -4652,6 +4650,16 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderer(
|
||||
return failedImport(
|
||||
"Dst pattern child def is an unsupported tablegen class");
|
||||
}
|
||||
|
||||
// Handle the case where the MVT/register class is omitted in the dest pattern
|
||||
// but MVT exists in the source pattern.
|
||||
if (isa<UnsetInit>(DstChild->getLeafValue())) {
|
||||
for (unsigned NumOp = 0; NumOp < Src->getNumChildren(); NumOp++)
|
||||
if (Src->getChild(NumOp)->getName() == DstChild->getName()) {
|
||||
DstMIBuilder.addRenderer<CopyRenderer>(Src->getChild(NumOp)->getName());
|
||||
return InsertPt;
|
||||
}
|
||||
}
|
||||
return failedImport("Dst pattern child is an unsupported kind");
|
||||
}
|
||||
|
||||
@@ -4682,8 +4690,9 @@ Expected<BuildMIAction &> GlobalISelEmitter::createAndImportInstructionRenderer(
|
||||
.takeError())
|
||||
return std::move(Error);
|
||||
|
||||
if (auto Error = importExplicitUseRenderers(InsertPt, M, DstMIBuilder, Dst)
|
||||
.takeError())
|
||||
if (auto Error =
|
||||
importExplicitUseRenderers(InsertPt, M, DstMIBuilder, Dst, Src)
|
||||
.takeError())
|
||||
return std::move(Error);
|
||||
|
||||
return DstMIBuilder;
|
||||
@@ -4692,7 +4701,7 @@ Expected<BuildMIAction &> GlobalISelEmitter::createAndImportInstructionRenderer(
|
||||
Expected<action_iterator>
|
||||
GlobalISelEmitter::createAndImportSubInstructionRenderer(
|
||||
const action_iterator InsertPt, RuleMatcher &M, const TreePatternNode *Dst,
|
||||
unsigned TempRegID) {
|
||||
const TreePatternNode *Src, unsigned TempRegID) {
|
||||
auto InsertPtOrError = createInstructionRenderer(InsertPt, M, Dst);
|
||||
|
||||
// TODO: Assert there's exactly one result.
|
||||
@@ -4706,8 +4715,8 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
|
||||
// Assign the result to TempReg.
|
||||
DstMIBuilder.addRenderer<TempRegRenderer>(TempRegID, true);
|
||||
|
||||
InsertPtOrError =
|
||||
importExplicitUseRenderers(InsertPtOrError.get(), M, DstMIBuilder, Dst);
|
||||
InsertPtOrError = importExplicitUseRenderers(InsertPtOrError.get(), M,
|
||||
DstMIBuilder, Dst, Src);
|
||||
if (auto Error = InsertPtOrError.takeError())
|
||||
return std::move(Error);
|
||||
|
||||
@@ -4869,7 +4878,7 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitDefRenderers(
|
||||
|
||||
Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
|
||||
action_iterator InsertPt, RuleMatcher &M, BuildMIAction &DstMIBuilder,
|
||||
const llvm::TreePatternNode *Dst) {
|
||||
const llvm::TreePatternNode *Dst, const llvm::TreePatternNode *Src) {
|
||||
const CodeGenInstruction *DstI = DstMIBuilder.getCGI();
|
||||
CodeGenInstruction *OrigDstI = &Target.getInstruction(Dst->getOperator());
|
||||
|
||||
@@ -4898,7 +4907,7 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
|
||||
InsertPt, *ExtractSrcTy, TempRegID);
|
||||
|
||||
auto InsertPtOrError = createAndImportSubInstructionRenderer(
|
||||
++InsertPt, M, ValChild, TempRegID);
|
||||
++InsertPt, M, ValChild, Src, TempRegID);
|
||||
if (auto Error = InsertPtOrError.takeError())
|
||||
return std::move(Error);
|
||||
|
||||
@@ -4955,7 +4964,7 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
|
||||
CodeGenSubRegIndex *SubIdx = CGRegs.getSubRegIdx(SubRegInit->getDef());
|
||||
|
||||
auto InsertPtOrError =
|
||||
importExplicitUseRenderer(InsertPt, M, DstMIBuilder, ValChild);
|
||||
importExplicitUseRenderer(InsertPt, M, DstMIBuilder, ValChild, Src);
|
||||
if (auto Error = InsertPtOrError.takeError())
|
||||
return std::move(Error);
|
||||
InsertPt = InsertPtOrError.get();
|
||||
@@ -5024,7 +5033,7 @@ Expected<action_iterator> GlobalISelEmitter::importExplicitUseRenderers(
|
||||
}
|
||||
|
||||
auto InsertPtOrError = importExplicitUseRenderer(InsertPt, M, DstMIBuilder,
|
||||
Dst->getChild(Child));
|
||||
Dst->getChild(Child), Src);
|
||||
if (auto Error = InsertPtOrError.takeError())
|
||||
return std::move(Error);
|
||||
InsertPt = InsertPtOrError.get();
|
||||
|
||||
Reference in New Issue
Block a user