Files
llvm/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
Anshil Gandhi fbdf8ab590 [LSV] Merge contiguous chains across scalar types (#154069)
This change enables the LoadStoreVectorizer to merge and vectorize
contiguous chains even when their scalar element types differ, as long
as the total bitwidth matches. To do so, we rebase offsets between
chains, normalize value types to a common integer type, and insert the
necessary casts around loads and stores. This uncovers more
vectorization opportunities and explains the expected codegen updates
across AMDGPU tests.

Key changes:
- Chain merging
  - Build contiguous subchains and then merge adjacent ones when:
- They refer to the same underlying pointer object and address space.
    - They are either all loads or all stores.
    - A constant leader-to-leader delta exists.
- Rebasing one chain into the other's coordinate space does not overlap.
    - All elements have equal total bit width.
- Rebase the second chain by the computed delta and append it to the
first.

- Type normalization and casting
- Normalize merged chains to a common integer type sized to the total
bits.
- For loads: create a new load of the normalized type, copy metadata,
and cast back to the original type for uses if needed.
  - For stores: bitcast the value to the normalized type and store that.
- Insert zext/trunc for integer size changes; use bit-or-pointer casts
when sizes match.

- Cleanups
  - Erase replaced instructions and DCE pointer operands when safe.
- New helpers: computeLeaderDelta, chainsOverlapAfterRebase,
rebaseChain, normalizeChainToType, and allElemsMatchTotalBits.

Impact:
- Increases vectorization opportunities across mixed-typed but
size-compatible access chains.
- Large set of expected AMDGPU codegen diffs due to more/changed
vectorization.

This PR resolves #97715.
2025-12-01 23:05:17 -05:00

1233 lines
50 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn < %s | FileCheck %s -check-prefixes=SI
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck %s -check-prefixes=VI
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG
declare float @llvm.fabs.f32(float) #1
define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float %in) {
; SI-LABEL: fp_to_uint_f32_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_u32_f32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_f32_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_f32_to_i32:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v1, s2
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fp_to_uint_f32_to_i32:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, s2
; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: TRUNC * T0.W, KC0[2].Z,
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: FLT_TO_UINT * T1.X, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%conv = fptoui float %in to i32
store i32 %conv, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x float> %in) {
; SI-LABEL: fp_to_uint_v2f32_to_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_cvt_u32_f32_e32 v1, s3
; SI-NEXT: v_cvt_u32_f32_e32 v0, s2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_v2f32_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v1, s3
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i32:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v1, s3
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, s2
; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fp_to_uint_v2f32_to_v2i32:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, s2
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v1, s3
; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v2f32_to_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: TRUNC T0.W, KC0[3].X,
; EG-NEXT: TRUNC * T1.W, KC0[2].W,
; EG-NEXT: FLT_TO_UINT * T0.Y, PV.W,
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: FLT_TO_UINT * T0.X, T1.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = fptoui <2 x float> %in to <2 x i32>
store <2 x i32> %result, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: fp_to_uint_v4f32_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_u32_f32_e32 v3, s7
; SI-NEXT: v_cvt_u32_f32_e32 v2, s6
; SI-NEXT: v_cvt_u32_f32_e32 v1, s5
; SI-NEXT: v_cvt_u32_f32_e32 v0, s4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_v4f32_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v3, s7
; VI-NEXT: v_cvt_u32_f32_e32 v2, s6
; VI-NEXT: v_cvt_u32_f32_e32 v1, s5
; VI-NEXT: v_cvt_u32_f32_e32 v0, s4
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_v4f32_to_v4i32:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v3, s7
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v2, s6
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v1, s5
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, s4
; GFX11-SDAG-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fp_to_uint_v4f32_to_v4i32:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, s4
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v1, s5
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v2, s6
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v3, s7
; GFX11-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v4f32_to_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: TRUNC T0.W, T0.W,
; EG-NEXT: TRUNC * T1.W, T0.Z,
; EG-NEXT: FLT_TO_UINT * T0.W, PV.W,
; EG-NEXT: TRUNC T2.W, T0.Y,
; EG-NEXT: FLT_TO_UINT * T0.Z, T1.W,
; EG-NEXT: TRUNC T1.W, T0.X,
; EG-NEXT: FLT_TO_UINT * T0.Y, PV.W,
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: FLT_TO_UINT * T0.X, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%value = load <4 x float>, ptr addrspace(1) %in
%result = fptoui <4 x float> %value to <4 x i32>
store <4 x i32> %result, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x) {
; SI-LABEL: fp_to_uint_f32_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s4, s[4:5], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s5, 0xcf800000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_trunc_f32_e32 v0, s4
; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; SI-NEXT: v_floor_f32_e32 v2, v1
; SI-NEXT: v_cvt_u32_f32_e32 v1, v2
; SI-NEXT: v_fma_f32 v0, v2, s5, v0
; SI-NEXT: v_cvt_u32_f32_e32 v0, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_f32_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xcf800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s2
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_floor_f32_e32 v2, v1
; VI-NEXT: v_fma_f32 v0, v2, s3, v0
; VI-NEXT: v_cvt_u32_f32_e32 v1, v2
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_f32_to_i64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_trunc_f32_e32 v0, s0
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX11-SDAG-NEXT: v_floor_f32_e32 v1, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fp_to_uint_f32_to_i64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_trunc_f32_e32 v0, s0
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX11-GISEL-NEXT: v_floor_f32_e32 v1, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_fmac_f32_e32 v0, 0xcf800000, v1
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 40, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T0.W, KC0[2].Z, literal.x, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Z, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
; EG-NEXT: OR_INT T1.W, PS, literal.x,
; EG-NEXT: ADD_INT * T2.W, PV.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), -150(nan)
; EG-NEXT: ADD_INT T0.X, T0.W, literal.x,
; EG-NEXT: AND_INT T0.Y, PS, literal.y,
; EG-NEXT: SUB_INT T0.Z, literal.z, T0.W,
; EG-NEXT: NOT_INT T0.W, PS,
; EG-NEXT: LSHR * T3.W, PV.W, 1,
; EG-NEXT: -127(nan), 31(4.344025e-44)
; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
; EG-NEXT: AND_INT T1.Y, PV.Z, literal.x,
; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.W, PV.Z,
; EG-NEXT: LSHL T0.W, T1.W, PV.Y,
; EG-NEXT: AND_INT * T1.W, T2.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
; EG-NEXT: CNDE_INT T0.W, PS, PV.X, PV.W,
; EG-NEXT: SETGT_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T1.Z, PS, 0.0, PV.W,
; EG-NEXT: CNDE_INT T0.W, PS, PV.Z, PV.Y,
; EG-NEXT: ASHR * T1.W, KC0[2].Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.W, PV.W, PS,
; EG-NEXT: XOR_INT * T2.W, PV.Z, PS,
; EG-NEXT: SUB_INT T2.W, PS, T1.W,
; EG-NEXT: SUBB_UINT * T3.W, PV.W, T1.W,
; EG-NEXT: SUB_INT T2.W, PV.W, PS,
; EG-NEXT: SETGT_INT * T3.W, 0.0, T0.X,
; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0,
; EG-NEXT: SUB_INT * T0.W, T0.W, T1.W,
; EG-NEXT: CNDE_INT T0.X, T3.W, PV.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%conv = fptoui float %x to i64
store i64 %conv, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x float> %x) {
; SI-LABEL: fp_to_uint_v2f32_to_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s8, 0xcf800000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_trunc_f32_e32 v0, s3
; SI-NEXT: v_trunc_f32_e32 v2, s2
; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; SI-NEXT: v_floor_f32_e32 v4, v1
; SI-NEXT: v_floor_f32_e32 v5, v3
; SI-NEXT: v_cvt_u32_f32_e32 v3, v4
; SI-NEXT: v_cvt_u32_f32_e32 v1, v5
; SI-NEXT: v_fma_f32 v0, v4, s8, v0
; SI-NEXT: v_fma_f32 v4, v5, s8, v2
; SI-NEXT: v_cvt_u32_f32_e32 v2, v0
; SI-NEXT: v_cvt_u32_f32_e32 v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_v2f32_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s3
; VI-NEXT: v_trunc_f32_e32 v4, s2
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4
; VI-NEXT: v_floor_f32_e32 v5, v1
; VI-NEXT: s_mov_b32 s2, 0xcf800000
; VI-NEXT: v_floor_f32_e32 v6, v2
; VI-NEXT: v_fma_f32 v0, v5, s2, v0
; VI-NEXT: v_cvt_u32_f32_e32 v2, v0
; VI-NEXT: v_fma_f32 v0, v6, s2, v4
; VI-NEXT: v_cvt_u32_f32_e32 v3, v5
; VI-NEXT: v_cvt_u32_f32_e32 v1, v6
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_trunc_f32_e32 v0, s3
; GFX11-SDAG-NEXT: v_trunc_f32_e32 v4, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX11-SDAG-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_floor_f32_e32 v1, v1
; GFX11-SDAG-NEXT: v_floor_f32_e32 v2, v2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_fmac_f32_e32 v0, 0xcf800000, v1
; GFX11-SDAG-NEXT: v_fmac_f32_e32 v4, 0xcf800000, v2
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v1
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v1, v2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v2, v0
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, v4
; GFX11-SDAG-NEXT: global_store_b128 v5, v[0:3], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fp_to_uint_v2f32_to_v2i64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_trunc_f32_e32 v0, s2
; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, s3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX11-GISEL-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_floor_f32_e32 v1, v1
; GFX11-GISEL-NEXT: v_floor_f32_e32 v3, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_fmac_f32_e32 v0, 0xcf800000, v1
; GFX11-GISEL-NEXT: v_fmac_f32_e32 v2, 0xcf800000, v3
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX11-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v2f32_to_v2i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 74, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T0.Z, KC0[3].X, literal.x, PV.W,
; EG-NEXT: BFE_UINT T0.W, KC0[2].W, literal.x, PV.W,
; EG-NEXT: AND_INT * T1.Z, KC0[2].W, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
; EG-NEXT: ADD_INT T1.W, PV.W, literal.x,
; EG-NEXT: ADD_INT * T2.W, PV.Z, literal.x,
; EG-NEXT: -150(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.X, PS, literal.x,
; EG-NEXT: AND_INT T0.Y, PV.W, literal.x,
; EG-NEXT: OR_INT T1.Z, T1.Z, literal.y,
; EG-NEXT: SUB_INT T3.W, literal.z, T0.W,
; EG-NEXT: AND_INT * T4.W, KC0[3].X, literal.w,
; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38)
; EG-NEXT: 150(2.101948e-43), 8388607(1.175494e-38)
; EG-NEXT: OR_INT T1.X, PS, literal.x,
; EG-NEXT: AND_INT T1.Y, PV.W, literal.y,
; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.Z, PV.W,
; EG-NEXT: LSHL T3.W, PV.Z, PV.Y,
; EG-NEXT: AND_INT * T4.W, T1.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T2.Z, PV.Y, PV.Z, 0.0,
; EG-NEXT: LSHL T5.W, PV.X, T0.X,
; EG-NEXT: AND_INT * T6.W, T2.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
; EG-NEXT: NOT_INT T1.Y, T1.W,
; EG-NEXT: SUB_INT T3.Z, literal.x, T0.Z,
; EG-NEXT: NOT_INT T1.W, T2.W, BS:VEC_120/SCL_212
; EG-NEXT: LSHR * T2.W, T1.X, 1,
; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T2.X, T1.Z, 1,
; EG-NEXT: ADD_INT T2.Y, T0.Z, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, PS, PV.W,
; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.X, PV.Z,
; EG-NEXT: AND_INT * T2.W, PV.Z, literal.y,
; EG-NEXT: -127(nan), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T3.Y, T6.W, PV.Z, T5.W, BS:VEC_021/SCL_122
; EG-NEXT: SETGT_INT T0.Z, PV.Y, literal.x,
; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, PV.X, T1.Y,
; EG-NEXT: ADD_INT * T0.W, T0.W, literal.y,
; EG-NEXT: 23(3.222986e-44), -127(nan)
; EG-NEXT: CNDE_INT T2.X, T4.W, PV.W, T3.W,
; EG-NEXT: SETGT_INT T1.Y, PS, literal.x,
; EG-NEXT: CNDE_INT T1.Z, PV.Z, 0.0, PV.Y,
; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T0.X,
; EG-NEXT: ASHR * T2.W, KC0[3].X, literal.y,
; EG-NEXT: 23(3.222986e-44), 31(4.344025e-44)
; EG-NEXT: XOR_INT T0.X, PV.W, PS,
; EG-NEXT: XOR_INT T3.Y, PV.Z, PS,
; EG-NEXT: CNDE_INT T0.Z, PV.Y, 0.0, PV.X,
; EG-NEXT: CNDE_INT T1.W, PV.Y, T2.Z, T0.Y,
; EG-NEXT: ASHR * T3.W, KC0[2].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
; EG-NEXT: SUB_INT T1.W, PV.Y, T2.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W,
; EG-NEXT: SUB_INT T1.Y, PV.W, PS,
; EG-NEXT: SETGT_INT T1.Z, 0.0, T2.Y,
; EG-NEXT: SUB_INT T1.W, PV.Z, T3.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T3.W,
; EG-NEXT: SUB_INT T0.Z, PV.W, PS,
; EG-NEXT: SETGT_INT T0.W, 0.0, T0.W,
; EG-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, 0.0,
; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, 0.0,
; EG-NEXT: SUB_INT * T2.W, T0.X, T2.W,
; EG-NEXT: CNDE_INT T1.Z, T1.Z, PV.W, 0.0,
; EG-NEXT: SUB_INT * T2.W, T0.Y, T3.W,
; EG-NEXT: CNDE_INT T1.X, T0.W, PV.W, 0.0,
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%conv = fptoui <2 x float> %x to <2 x i64>
store <2 x i64> %conv, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x float> %x) {
; SI-LABEL: fp_to_uint_v4f32_to_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s8, 0xcf800000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_trunc_f32_e32 v0, s5
; SI-NEXT: v_trunc_f32_e32 v2, s4
; SI-NEXT: v_trunc_f32_e32 v4, s7
; SI-NEXT: v_trunc_f32_e32 v6, s6
; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; SI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; SI-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6
; SI-NEXT: v_floor_f32_e32 v8, v1
; SI-NEXT: v_floor_f32_e32 v9, v3
; SI-NEXT: v_floor_f32_e32 v10, v5
; SI-NEXT: v_floor_f32_e32 v11, v7
; SI-NEXT: v_cvt_u32_f32_e32 v3, v8
; SI-NEXT: v_cvt_u32_f32_e32 v1, v9
; SI-NEXT: v_fma_f32 v0, v8, s8, v0
; SI-NEXT: v_fma_f32 v8, v9, s8, v2
; SI-NEXT: v_cvt_u32_f32_e32 v7, v10
; SI-NEXT: v_cvt_u32_f32_e32 v5, v11
; SI-NEXT: v_fma_f32 v4, v10, s8, v4
; SI-NEXT: v_fma_f32 v9, v11, s8, v6
; SI-NEXT: v_cvt_u32_f32_e32 v2, v0
; SI-NEXT: v_cvt_u32_f32_e32 v0, v8
; SI-NEXT: v_cvt_u32_f32_e32 v6, v4
; SI-NEXT: v_cvt_u32_f32_e32 v4, v9
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_v4f32_to_v4i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s2, 0xcf800000
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s9
; VI-NEXT: v_trunc_f32_e32 v4, s8
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4
; VI-NEXT: v_floor_f32_e32 v5, v1
; VI-NEXT: v_floor_f32_e32 v6, v2
; VI-NEXT: v_fma_f32 v0, v5, s2, v0
; VI-NEXT: v_cvt_u32_f32_e32 v2, v0
; VI-NEXT: v_fma_f32 v0, v6, s2, v4
; VI-NEXT: v_trunc_f32_e32 v4, s11
; VI-NEXT: v_cvt_u32_f32_e32 v3, v5
; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; VI-NEXT: v_trunc_f32_e32 v8, s10
; VI-NEXT: v_cvt_u32_f32_e32 v1, v6
; VI-NEXT: v_floor_f32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v8
; VI-NEXT: v_floor_f32_e32 v9, v5
; VI-NEXT: v_fma_f32 v4, v6, s2, v4
; VI-NEXT: v_cvt_u32_f32_e32 v7, v6
; VI-NEXT: v_cvt_u32_f32_e32 v6, v4
; VI-NEXT: v_fma_f32 v4, v9, s2, v8
; VI-NEXT: v_cvt_u32_f32_e32 v5, v9
; VI-NEXT: v_cvt_u32_f32_e32 v4, v4
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_v4f32_to_v4i64:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-SDAG-NEXT: v_mov_b32_e32 v10, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_trunc_f32_e32 v0, s1
; GFX11-SDAG-NEXT: v_trunc_f32_e32 v8, s0
; GFX11-SDAG-NEXT: v_trunc_f32_e32 v4, s3
; GFX11-SDAG-NEXT: v_trunc_f32_e32 v9, s2
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX11-SDAG-NEXT: v_mul_f32_e32 v2, 0x2f800000, v8
; GFX11-SDAG-NEXT: v_mul_f32_e32 v3, 0x2f800000, v4
; GFX11-SDAG-NEXT: v_mul_f32_e32 v5, 0x2f800000, v9
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-NEXT: v_floor_f32_e32 v6, v1
; GFX11-SDAG-NEXT: v_floor_f32_e32 v2, v2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-NEXT: v_floor_f32_e32 v7, v3
; GFX11-SDAG-NEXT: v_floor_f32_e32 v5, v5
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-SDAG-NEXT: v_fmac_f32_e32 v0, 0xcf800000, v6
; GFX11-SDAG-NEXT: v_fmac_f32_e32 v8, 0xcf800000, v2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-SDAG-NEXT: v_dual_fmac_f32 v4, 0xcf800000, v7 :: v_dual_fmac_f32 v9, 0xcf800000, v5
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v3, v6
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v5, v5
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v6, v4
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v4, v9
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v1, v2
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v2, v0
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, v8
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: global_store_b128 v10, v[4:7], s[0:1] offset:16
; GFX11-SDAG-NEXT: global_store_b128 v10, v[0:3], s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fp_to_uint_v4f32_to_v4i64:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-GISEL-NEXT: v_mov_b32_e32 v8, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_trunc_f32_e32 v0, s0
; GFX11-GISEL-NEXT: v_trunc_f32_e32 v2, s1
; GFX11-GISEL-NEXT: v_trunc_f32_e32 v4, s2
; GFX11-GISEL-NEXT: v_trunc_f32_e32 v6, s3
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX11-GISEL-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX11-GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; GFX11-GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-GISEL-NEXT: v_floor_f32_e32 v1, v1
; GFX11-GISEL-NEXT: v_floor_f32_e32 v3, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-GISEL-NEXT: v_floor_f32_e32 v5, v5
; GFX11-GISEL-NEXT: v_floor_f32_e32 v7, v7
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-GISEL-NEXT: v_fmac_f32_e32 v0, 0xcf800000, v1
; GFX11-GISEL-NEXT: v_fmac_f32_e32 v2, 0xcf800000, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-GISEL-NEXT: v_fmac_f32_e32 v4, 0xcf800000, v5
; GFX11-GISEL-NEXT: v_fmac_f32_e32 v6, 0xcf800000, v7
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX11-GISEL-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
; GFX11-GISEL-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v4f32_to_v4i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 54, @106, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T1.W, KC0[3].Z, literal.x, PV.W,
; EG-NEXT: AND_INT * T2.W, KC0[3].Z, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
; EG-NEXT: OR_INT T2.W, PS, literal.x,
; EG-NEXT: ADD_INT * T3.W, PV.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), -150(nan)
; EG-NEXT: ADD_INT T0.X, T1.W, literal.x,
; EG-NEXT: BFE_UINT T0.Y, KC0[4].X, literal.y, T0.W,
; EG-NEXT: AND_INT T0.Z, PS, literal.z,
; EG-NEXT: NOT_INT T4.W, PS,
; EG-NEXT: LSHR * T5.W, PV.W, 1,
; EG-NEXT: -127(nan), 23(3.222986e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
; EG-NEXT: AND_INT T1.Y, T3.W, literal.x,
; EG-NEXT: LSHL T0.Z, T2.W, PV.Z, BS:VEC_120/SCL_212
; EG-NEXT: AND_INT T3.W, KC0[4].X, literal.y,
; EG-NEXT: ADD_INT * T4.W, PV.Y, literal.z,
; EG-NEXT: 32(4.484155e-44), 8388607(1.175494e-38)
; EG-NEXT: -150(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.Y, PS, literal.x,
; EG-NEXT: OR_INT T1.Z, PV.W, literal.y,
; EG-NEXT: CNDE_INT T3.W, PV.Y, PV.X, PV.Z,
; EG-NEXT: SETGT_INT * T5.W, T0.X, literal.z,
; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38)
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T3.Y, PS, 0.0, PV.W,
; EG-NEXT: SUB_INT T2.Z, literal.x, T1.W,
; EG-NEXT: LSHL T1.W, PV.Z, PV.Y,
; EG-NEXT: AND_INT * T3.W, T4.W, literal.y,
; EG-NEXT: 150(2.101948e-43), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
; EG-NEXT: AND_INT T2.Y, PV.Z, literal.x,
; EG-NEXT: SUB_INT T3.Z, literal.y, T0.Y,
; EG-NEXT: NOT_INT T4.W, T4.W,
; EG-NEXT: LSHR * T6.W, T1.Z, 1,
; EG-NEXT: 32(4.484155e-44), 150(2.101948e-43)
; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, T2.W, T2.Z,
; EG-NEXT: ADD_INT T0.Y, T0.Y, literal.x,
; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
; EG-NEXT: BIT_ALIGN_INT T2.W, 0.0, T1.Z, PV.Z,
; EG-NEXT: AND_INT * T4.W, PV.Z, literal.y,
; EG-NEXT: -127(nan), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T4.Y, T3.W, PV.Z, T1.W,
; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
; EG-NEXT: CNDE_INT T1.W, T1.Y, T0.Z, 0.0,
; EG-NEXT: CNDE_INT * T2.W, T2.Y, PV.X, 0.0,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T2.X, T5.W, PS, PV.W,
; EG-NEXT: ASHR T1.Y, KC0[3].Z, literal.x,
; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T1.X,
; EG-NEXT: ASHR * T2.W, KC0[4].X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T2.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
; EG-NEXT: XOR_INT T1.W, PV.X, PV.Y,
; EG-NEXT: XOR_INT * T3.W, T3.Y, PV.Y,
; EG-NEXT: SUB_INT T3.Y, PS, T1.Y,
; EG-NEXT: SUBB_UINT T1.Z, PV.W, T1.Y,
; EG-NEXT: SUB_INT T3.W, PV.Z, T2.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T2.W,
; EG-NEXT: SUB_INT T4.Y, PV.W, PS,
; EG-NEXT: SUB_INT T0.Z, PV.Y, PV.Z,
; EG-NEXT: BFE_UINT T3.W, KC0[3].Y, literal.x, T0.W,
; EG-NEXT: AND_INT * T4.W, KC0[3].Y, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X,
; EG-NEXT: ADD_INT T3.Y, PV.W, literal.x,
; EG-NEXT: OR_INT T1.Z, PS, literal.y,
; EG-NEXT: BFE_UINT T0.W, KC0[3].W, literal.z, T0.W,
; EG-NEXT: ADD_INT * T4.W, PV.W, literal.w,
; EG-NEXT: -127(nan), 8388608(1.175494e-38)
; EG-NEXT: 23(3.222986e-44), -150(nan)
; EG-NEXT: AND_INT T1.X, KC0[3].W, literal.x,
; EG-NEXT: ADD_INT T5.Y, PV.W, literal.y,
; EG-NEXT: SUB_INT T2.Z, literal.z, T3.W,
; EG-NEXT: NOT_INT T3.W, PS,
; EG-NEXT: LSHR * T5.W, PV.Z, 1,
; EG-NEXT: 8388607(1.175494e-38), -150(nan)
; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, PS, PV.W,
; EG-NEXT: AND_INT T6.Y, PV.Z, literal.x,
; EG-NEXT: AND_INT T3.Z, PV.Y, literal.y,
; EG-NEXT: OR_INT T3.W, PV.X, literal.z,
; EG-NEXT: AND_INT * T5.W, T4.W, literal.y,
; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, T1.Z, T2.Z,
; EG-NEXT: LSHL T7.Y, T1.Z, PS,
; EG-NEXT: AND_INT T1.Z, T4.W, literal.x,
; EG-NEXT: LSHL T4.W, PV.W, PV.Z,
; EG-NEXT: AND_INT * T5.W, T5.Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T8.Y, PV.Z, PV.Y, 0.0,
; EG-NEXT: CNDE_INT * T2.Z, T6.Y, PV.X, 0.0,
; EG-NEXT: ALU clause starting at 106:
; EG-NEXT: CNDE_INT T6.W, T1.Z, T2.X, T7.Y, BS:VEC_021/SCL_122
; EG-NEXT: SETGT_INT * T7.W, T3.Y, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T1.X, PS, 0.0, PV.W,
; EG-NEXT: CNDE_INT T6.Y, PS, T2.Z, T8.Y,
; EG-NEXT: SUB_INT T1.Z, literal.x, T0.W,
; EG-NEXT: NOT_INT T6.W, T5.Y,
; EG-NEXT: LSHR * T7.W, T3.W, 1,
; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
; EG-NEXT: ASHR T2.X, KC0[3].Y, literal.x,
; EG-NEXT: ADD_INT T5.Y, T0.W, literal.y,
; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T3.W, PV.Z,
; EG-NEXT: AND_INT * T3.W, PV.Z, literal.z,
; EG-NEXT: 31(4.344025e-44), -127(nan)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T4.X, PS, PV.W, 0.0,
; EG-NEXT: CNDE_INT T7.Y, T5.W, PV.Z, T4.W,
; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
; EG-NEXT: XOR_INT T0.W, T6.Y, PV.X,
; EG-NEXT: XOR_INT * T3.W, T1.X, PV.X,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; EG-NEXT: SUB_INT T1.X, PS, T2.X,
; EG-NEXT: SUBB_UINT T6.Y, PV.W, T2.X,
; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y,
; EG-NEXT: CNDE_INT T3.W, PV.Z, PV.X, T3.X,
; EG-NEXT: ASHR * T4.W, KC0[3].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T3.X, PV.W, PS,
; EG-NEXT: XOR_INT T7.Y, PV.Z, PS,
; EG-NEXT: SUB_INT T1.Z, PV.X, PV.Y,
; EG-NEXT: SETGT_INT T3.W, 0.0, T3.Y,
; EG-NEXT: CNDE_INT * T6.W, T0.X, T0.Z, 0.0,
; EG-NEXT: SETGT_INT T1.X, 0.0, T0.Y,
; EG-NEXT: CNDE_INT T6.Y, PV.W, PV.Z, 0.0,
; EG-NEXT: SUB_INT T0.Z, T1.W, T1.Y, BS:VEC_021/SCL_122
; EG-NEXT: SUB_INT T1.W, PV.Y, T4.W,
; EG-NEXT: SUBB_UINT * T5.W, PV.X, T4.W,
; EG-NEXT: SUB_INT T4.X, PV.W, PS,
; EG-NEXT: SETGT_INT T0.Y, 0.0, T5.Y, BS:VEC_021/SCL_122
; EG-NEXT: CNDE_INT T6.Z, T0.X, PV.Z, 0.0,
; EG-NEXT: SUB_INT T0.W, T0.W, T2.X,
; EG-NEXT: CNDE_INT * T1.W, PV.X, T4.Y, 0.0,
; EG-NEXT: CNDE_INT T6.X, T3.W, PV.W, 0.0,
; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.X, 0.0,
; EG-NEXT: SUB_INT T0.W, T2.Y, T2.W,
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T1.Z, T1.X, PV.W, 0.0,
; EG-NEXT: SUB_INT * T0.W, T3.X, T4.W, BS:VEC_120/SCL_212
; EG-NEXT: CNDE_INT T1.X, T0.Y, PV.W, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%conv = fptoui <4 x float> %x to <4 x i64>
store <4 x i64> %conv, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in) #0 {
; SI-LABEL: fp_to_uint_f32_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s6
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_f32_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s6, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s6
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_f32_to_i1:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cmp_eq_f32_e64 s2, 1.0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-SDAG-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fp_to_uint_f32_to_i1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, s2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-GISEL-NEXT: global_store_b8 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: SETE_DX10 * T1.W, KC0[2].Z, 1.0,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PS, 1,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%conv = fptoui float %in to i1
store i1 %conv, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, float %in) #0 {
; SI-LABEL: fp_to_uint_fabs_f32_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s6|
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_fabs_f32_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s6, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s6|
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_fabs_f32_to_i1:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cmp_eq_f32_e64 s2, 1.0, |s2|
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-SDAG-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fp_to_uint_fabs_f32_to_i1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e64 v0, |s2|
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-GISEL-NEXT: global_store_b8 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_fabs_f32_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: SETE_DX10 * T1.W, |KC0[2].Z|, 1.0,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PS, 1,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%in.fabs = call float @llvm.fabs.f32(float %in)
%conv = fptoui float %in.fabs to i1
store i1 %conv, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %in) #0 {
; SI-LABEL: fp_to_uint_f32_to_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_u32_f32_e32 v0, s6
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_f32_to_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_f32_to_i16:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v1, s2
; GFX11-SDAG-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fp_to_uint_f32_to_i16:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, s2
; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: TRUNC T0.W, KC0[2].Z,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, PS, literal.x,
; EG-NEXT: FLT_TO_UINT * T0.X, PV.W,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, PS, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%uint = fptoui float %in to i16
store i16 %uint, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i16(ptr addrspace(1) %out, <2 x float> %in) {
; SI-LABEL: fp_to_uint_v2f32_to_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_cvt_u32_f32_e32 v0, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_u32_f32_e32 v1, s2
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_v2f32_to_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_cvt_u32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_u32_f32_e32 v1, s2
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_v2f32_to_v2i16:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, s3
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v2, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX11-SDAG-NEXT: global_store_b32 v1, v2, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fp_to_uint_v2f32_to_v2i16:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, s3
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v1, s2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v2f32_to_v2i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: TRUNC T0.W, KC0[3].X,
; EG-NEXT: TRUNC * T1.W, KC0[2].W,
; EG-NEXT: FLT_TO_UINT * T0.Y, PV.W,
; EG-NEXT: LSHL T0.W, PS, literal.x,
; EG-NEXT: FLT_TO_UINT * T0.Y, T1.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T4.X, PS, PV.W,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%uint = fptoui <2 x float> %in to <2 x i16>
store <2 x i16> %uint, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @fp_to_uint_f32_to_v2i16(ptr addrspace(1) %out, float %in0, float %in1) {
; SI-LABEL: fp_to_uint_f32_to_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_cvt_u32_f32_e32 v0, s4
; SI-NEXT: v_cvt_u32_f32_e32 v1, s5
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fp_to_uint_f32_to_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
; VI-NEXT: v_cvt_u32_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fp_to_uint_f32_to_v2i16:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, s3
; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v2, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_mov_b16_e32 v2.h, v0.l
; GFX11-SDAG-NEXT: global_store_b32 v1, v2, s[0:1]
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fp_to_uint_f32_to_v2i16:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, s3
; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v1, s2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_mov_b16_e32 v1.h, v0.l
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-GISEL-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_v2i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: TRUNC T0.W, KC0[2].W,
; EG-NEXT: TRUNC * T1.W, KC0[2].Z,
; EG-NEXT: FLT_TO_UINT * T0.Y, PV.W,
; EG-NEXT: LSHL T0.W, PS, literal.x,
; EG-NEXT: FLT_TO_UINT * T0.Y, T1.W,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T4.X, PV.W, PS,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%uint0 = fptoui float %in0 to i16
%uint1 = fptoui float %in1 to i16
%res0 = insertelement <2 x i16> poison, i16 %uint0, i32 0
%res1 = insertelement <2 x i16> %res0, i16 %uint1, i32 1
store <2 x i16> %res1, ptr addrspace(1) %out
ret void
}
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX11: {{.*}}