[clang][NVPTX] Add support for mixed-precision FP arithmetic (#168359)

This change adds support for mixed precision floating point 
arithmetic for `f16` and `bf16` where the following patterns:
```
%fh = fpext half %h to float
%resfh = fp-operation(%fh, ...)
...
%fb = fpext bfloat %b to float
%resfb = fp-operation(%fb, ...)

where the fp-operation can be any of:
- fadd
- fsub
- llvm.fma.f32
- llvm.nvvm.add(/fma).*
```
are lowered to the corresponding mixed precision instructions which 
combine the conversion and operation into one instruction from 
`sm_100` onwards.

This also adds the following intrinsics to complete support for 
all variants of the floating point `add/fma` operations in order 
to support the corresponding mixed-precision instructions:
- `llvm.nvvm.add.(rn/rz/rm/rp){.ftz}.sat.f`
- `llvm.nvvm.fma.(rn/rz/rm/rp){.ftz}.sat.f`

We lower `fneg` followed by one of the above addition
intrinsics to the corresponding `sub` instruction.

Tests are added in `fp-arith-sat.ll` , `fp-fold-sub.ll`, and
`bultins-nvptx.c`
for the newly added intrinsics and builtins, and in
`mixed-precision-fp.ll`
for the mixed precision instructions.

PTX spec reference for mixed precision instructions:
https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions
This commit is contained in:
Srinivasa Ravi
2025-12-15 16:28:23 +05:30
committed by GitHub
parent 0636225b93
commit 7d0865122e
7 changed files with 813 additions and 15 deletions

View File

@@ -393,13 +393,21 @@ def __nvvm_fma_rn_relu_bf16 : NVPTXBuiltinSMAndPTX<"__bf16(__bf16, __bf16, __bf1
def __nvvm_fma_rn_bf16x2 : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(_Vector<2, __bf16>, _Vector<2, __bf16>, _Vector<2, __bf16>)", SM_80, PTX70>;
def __nvvm_fma_rn_relu_bf16x2 : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(_Vector<2, __bf16>, _Vector<2, __bf16>, _Vector<2, __bf16>)", SM_80, PTX70>;
def __nvvm_fma_rn_ftz_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rn_ftz_sat_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rn_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rn_sat_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rz_ftz_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rz_ftz_sat_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rz_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rz_sat_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rm_ftz_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rm_ftz_sat_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rm_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rm_sat_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rp_ftz_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rp_ftz_sat_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rp_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rp_sat_f : NVPTXBuiltin<"float(float, float, float)">;
def __nvvm_fma_rn_d : NVPTXBuiltin<"double(double, double, double)">;
def __nvvm_fma_rz_d : NVPTXBuiltin<"double(double, double, double)">;
def __nvvm_fma_rm_d : NVPTXBuiltin<"double(double, double, double)">;
@@ -451,13 +459,21 @@ def __nvvm_rsqrt_approx_d : NVPTXBuiltin<"double(double)">;
// Add
def __nvvm_add_rn_ftz_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rn_ftz_sat_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rn_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rn_sat_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rz_ftz_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rz_ftz_sat_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rz_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rz_sat_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rm_ftz_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rm_ftz_sat_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rm_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rm_sat_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rp_ftz_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rp_ftz_sat_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rp_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rp_sat_f : NVPTXBuiltin<"float(float, float)">;
def __nvvm_add_rn_d : NVPTXBuiltin<"double(double, double)">;
def __nvvm_add_rz_d : NVPTXBuiltin<"double(double, double)">;

View File

@@ -1519,3 +1519,42 @@ __device__ void nvvm_min_max_sm86() {
#endif
// CHECK: ret void
}
// CHECK-LABEL: nvvm_add_fma_f32_sat
__device__ void nvvm_add_fma_f32_sat() {
// CHECK: call float @llvm.nvvm.add.rn.sat.f
__nvvm_add_rn_sat_f(1.0f, 2.0f);
// CHECK: call float @llvm.nvvm.add.rn.ftz.sat.f
__nvvm_add_rn_ftz_sat_f(1.0f, 2.0f);
// CHECK: call float @llvm.nvvm.add.rz.sat.f
__nvvm_add_rz_sat_f(1.0f, 2.0f);
// CHECK: call float @llvm.nvvm.add.rz.ftz.sat.f
__nvvm_add_rz_ftz_sat_f(1.0f, 2.0f);
// CHECK: call float @llvm.nvvm.add.rm.sat.f
__nvvm_add_rm_sat_f(1.0f, 2.0f);
// CHECK: call float @llvm.nvvm.add.rm.ftz.sat.f
__nvvm_add_rm_ftz_sat_f(1.0f, 2.0f);
// CHECK: call float @llvm.nvvm.add.rp.sat.f
__nvvm_add_rp_sat_f(1.0f, 2.0f);
// CHECK: call float @llvm.nvvm.add.rp.ftz.sat.f
__nvvm_add_rp_ftz_sat_f(1.0f, 2.0f);
// CHECK: call float @llvm.nvvm.fma.rn.sat.f
__nvvm_fma_rn_sat_f(1.0f, 2.0f, 3.0f);
// CHECK: call float @llvm.nvvm.fma.rn.ftz.sat.f
__nvvm_fma_rn_ftz_sat_f(1.0f, 2.0f, 3.0f);
// CHECK: call float @llvm.nvvm.fma.rz.sat.f
__nvvm_fma_rz_sat_f(1.0f, 2.0f, 3.0f);
// CHECK: call float @llvm.nvvm.fma.rz.ftz.sat.f
__nvvm_fma_rz_ftz_sat_f(1.0f, 2.0f, 3.0f);
// CHECK: call float @llvm.nvvm.fma.rm.sat.f
__nvvm_fma_rm_sat_f(1.0f, 2.0f, 3.0f);
// CHECK: call float @llvm.nvvm.fma.rm.ftz.sat.f
__nvvm_fma_rm_ftz_sat_f(1.0f, 2.0f, 3.0f);
// CHECK: call float @llvm.nvvm.fma.rp.sat.f
__nvvm_fma_rp_sat_f(1.0f, 2.0f, 3.0f);
// CHECK: call float @llvm.nvvm.fma.rp.ftz.sat.f
__nvvm_fma_rp_ftz_sat_f(1.0f, 2.0f, 3.0f);
// CHECK: ret void
}

View File

@@ -1501,16 +1501,18 @@ let TargetPrefix = "nvvm" in {
} // ftz
} // variant
foreach rnd = ["rn", "rz", "rm", "rp"] in {
foreach ftz = ["", "_ftz"] in
def int_nvvm_fma_ # rnd # ftz # _f : NVVMBuiltin,
PureIntrinsic<[llvm_float_ty],
[llvm_float_ty, llvm_float_ty, llvm_float_ty]>;
def int_nvvm_fma_ # rnd # _d : NVVMBuiltin,
foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in {
foreach ftz = ["", "_ftz"] in {
foreach sat = ["", "_sat"] in {
def int_nvvm_fma # rnd # ftz # sat # _f : NVVMBuiltin,
PureIntrinsic<[llvm_float_ty],
[llvm_float_ty, llvm_float_ty, llvm_float_ty]>;
} // sat
} // ftz
def int_nvvm_fma # rnd # _d : NVVMBuiltin,
PureIntrinsic<[llvm_double_ty],
[llvm_double_ty, llvm_double_ty, llvm_double_ty]>;
}
} // rnd
//
// Rcp
@@ -1568,14 +1570,16 @@ let TargetPrefix = "nvvm" in {
// Add
//
let IntrProperties = [IntrNoMem, IntrSpeculatable, Commutative] in {
foreach rnd = ["rn", "rz", "rm", "rp"] in {
foreach ftz = ["", "_ftz"] in
def int_nvvm_add_ # rnd # ftz # _f : NVVMBuiltin,
DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]>;
def int_nvvm_add_ # rnd # _d : NVVMBuiltin,
foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in {
foreach ftz = ["", "_ftz"] in {
foreach sat = ["", "_sat"] in {
def int_nvvm_add # rnd # ftz # sat # _f : NVVMBuiltin,
DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]>;
} // sat
} // ftz
def int_nvvm_add # rnd # _d : NVVMBuiltin,
DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty]>;
}
} // rnd
}
//

View File

@@ -1672,13 +1672,21 @@ multiclass FMA_INST {
FMA_TUPLE<"_rp_f64", int_nvvm_fma_rp_d, B64>,
FMA_TUPLE<"_rn_ftz_f32", int_nvvm_fma_rn_ftz_f, B32>,
FMA_TUPLE<"_rn_ftz_sat_f32", int_nvvm_fma_rn_ftz_sat_f, B32>,
FMA_TUPLE<"_rn_f32", int_nvvm_fma_rn_f, B32>,
FMA_TUPLE<"_rn_sat_f32", int_nvvm_fma_rn_sat_f, B32>,
FMA_TUPLE<"_rz_ftz_f32", int_nvvm_fma_rz_ftz_f, B32>,
FMA_TUPLE<"_rz_ftz_sat_f32", int_nvvm_fma_rz_ftz_sat_f, B32>,
FMA_TUPLE<"_rz_f32", int_nvvm_fma_rz_f, B32>,
FMA_TUPLE<"_rz_sat_f32", int_nvvm_fma_rz_sat_f, B32>,
FMA_TUPLE<"_rm_f32", int_nvvm_fma_rm_f, B32>,
FMA_TUPLE<"_rm_sat_f32", int_nvvm_fma_rm_sat_f, B32>,
FMA_TUPLE<"_rm_ftz_f32", int_nvvm_fma_rm_ftz_f, B32>,
FMA_TUPLE<"_rm_ftz_sat_f32", int_nvvm_fma_rm_ftz_sat_f, B32>,
FMA_TUPLE<"_rp_f32", int_nvvm_fma_rp_f, B32>,
FMA_TUPLE<"_rp_sat_f32", int_nvvm_fma_rp_sat_f, B32>,
FMA_TUPLE<"_rp_ftz_f32", int_nvvm_fma_rp_ftz_f, B32>,
FMA_TUPLE<"_rp_ftz_sat_f32", int_nvvm_fma_rp_ftz_sat_f, B32>,
FMA_TUPLE<"_rn_f16", int_nvvm_fma_rn_f16, B16, [hasPTX<42>, hasSM<53>]>,
FMA_TUPLE<"_rn_ftz_f16", int_nvvm_fma_rn_ftz_f16, B16,
@@ -1729,6 +1737,32 @@ multiclass FMA_INST {
defm INT_NVVM_FMA : FMA_INST;
foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in {
foreach sat = ["", "_sat"] in {
foreach type = [f16, bf16] in {
def INT_NVVM_MIXED_FMA # rnd # sat # _f32_ # type :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B16:$b, B32:$c),
!subst("_", ".", "fma" # rnd # sat # "_f32_" # type),
[(set f32:$dst,
(!cast<Intrinsic>("int_nvvm_fma" # rnd # sat # "_f")
(f32 (fpextend type:$a)),
(f32 (fpextend type:$b)),
f32:$c))]>,
Requires<[hasSM<100>, hasPTX<86>]>;
}
}
}
// Pattern for llvm.fma.f32 intrinsic when there is no FTZ flag
let Predicates = [hasSM<100>, hasPTX<86>, doNoF32FTZ] in {
def : Pat<(f32 (fma (f32 (fpextend f16:$a)),
(f32 (fpextend f16:$b)), f32:$c)),
(INT_NVVM_MIXED_FMA_rn_f32_f16 B16:$a, B16:$b, B32:$c)>;
def : Pat<(f32 (fma (f32 (fpextend bf16:$a)),
(f32 (fpextend bf16:$b)), f32:$c)),
(INT_NVVM_MIXED_FMA_rn_f32_bf16 B16:$a, B16:$b, B32:$c)>;
}
//
// Rcp
//
@@ -1828,19 +1862,95 @@ let Predicates = [doRsqrtOpt] in {
//
def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32", B32, B32, B32, int_nvvm_add_rn_ftz_f>;
def INT_NVVM_ADD_RN_SAT_FTZ_F : F_MATH_2<"add.rn.sat.ftz.f32", B32, B32, B32, int_nvvm_add_rn_ftz_sat_f>;
def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32", B32, B32, B32, int_nvvm_add_rn_f>;
def INT_NVVM_ADD_RN_SAT_F : F_MATH_2<"add.rn.sat.f32", B32, B32, B32, int_nvvm_add_rn_sat_f>;
def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32", B32, B32, B32, int_nvvm_add_rz_ftz_f>;
def INT_NVVM_ADD_RZ_SAT_FTZ_F : F_MATH_2<"add.rz.sat.ftz.f32", B32, B32, B32, int_nvvm_add_rz_ftz_sat_f>;
def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32", B32, B32, B32, int_nvvm_add_rz_f>;
def INT_NVVM_ADD_RZ_SAT_F : F_MATH_2<"add.rz.sat.f32", B32, B32, B32, int_nvvm_add_rz_sat_f>;
def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32", B32, B32, B32, int_nvvm_add_rm_ftz_f>;
def INT_NVVM_ADD_RM_SAT_FTZ_F : F_MATH_2<"add.rm.sat.ftz.f32", B32, B32, B32, int_nvvm_add_rm_ftz_sat_f>;
def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32", B32, B32, B32, int_nvvm_add_rm_f>;
def INT_NVVM_ADD_RM_SAT_F : F_MATH_2<"add.rm.sat.f32", B32, B32, B32, int_nvvm_add_rm_sat_f>;
def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32", B32, B32, B32, int_nvvm_add_rp_ftz_f>;
def INT_NVVM_ADD_RP_SAT_FTZ_F : F_MATH_2<"add.rp.sat.ftz.f32", B32, B32, B32, int_nvvm_add_rp_ftz_sat_f>;
def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32", B32, B32, B32, int_nvvm_add_rp_f>;
def INT_NVVM_ADD_RP_SAT_F : F_MATH_2<"add.rp.sat.f32", B32, B32, B32, int_nvvm_add_rp_sat_f>;
def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64", B64, B64, B64, int_nvvm_add_rn_d>;
def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64", B64, B64, B64, int_nvvm_add_rz_d>;
def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64", B64, B64, B64, int_nvvm_add_rm_d>;
def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64", B64, B64, B64, int_nvvm_add_rp_d>;
foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in {
foreach sat = ["", "_sat"] in {
foreach type = [f16, bf16] in {
def INT_NVVM_MIXED_ADD # rnd # sat # _f32_ # type :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B32:$b),
!subst("_", ".", "add" # rnd # sat # "_f32_" # type),
[(set f32:$dst,
(!cast<Intrinsic>("int_nvvm_add" # rnd # sat # "_f")
(f32 (fpextend type:$a)),
f32:$b))]>,
Requires<[hasSM<100>, hasPTX<86>]>;
}
}
}
// Pattern for fadd when there is no FTZ flag
let Predicates = [hasSM<100>, hasPTX<86>, doNoF32FTZ] in {
def : Pat<(f32 (fadd (f32 (fpextend f16:$a)), f32:$b)),
(INT_NVVM_MIXED_ADD_rn_f32_f16 B16:$a, B32:$b)>;
def : Pat<(f32 (fadd (f32 (fpextend bf16:$a)), f32:$b)),
(INT_NVVM_MIXED_ADD_rn_f32_bf16 B16:$a, B32:$b)>;
}
//
// Sub
//
foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in {
foreach ftz = ["", "_ftz"] in {
foreach sat = ["", "_sat"] in {
defvar add_intrin = !cast<Intrinsic>("int_nvvm_add" # rnd # ftz # sat # "_f");
def INT_NVVM_SUB # rnd # ftz # sat # _F :
BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b),
!subst("_", ".", "sub" # rnd # sat # ftz # "_f32"),
[(set f32:$dst, (add_intrin f32:$a, (f32 (fneg f32:$b))))]>;
}
}
defvar add_intrin = !cast<Intrinsic>("int_nvvm_add" # rnd # "_d");
def INT_NVVM_SUB # rnd # _D :
BasicNVPTXInst<(outs B64:$dst), (ins B64:$a, B64:$b),
!subst("_", ".", "sub" # rnd # "_f64"),
[(set f64:$dst, (add_intrin f64:$a, (f64 (fneg f64:$b))))]>;
}
foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in {
foreach sat = ["", "_sat"] in {
foreach type = [f16, bf16] in {
def INT_NVVM_MIXED_SUB # rnd # sat # _f32_ # type :
BasicNVPTXInst<(outs B32:$dst), (ins B16:$a, B32:$b),
!subst("_", ".", "sub" # rnd # sat # "_f32_" # type),
[(set f32:$dst,
(!cast<Intrinsic>("int_nvvm_add" # rnd # sat # "_f")
(f32 (fpextend type:$a)),
(f32 (fneg f32:$b))))]>,
Requires<[hasSM<100>, hasPTX<86>]>;
}
}
}
// Pattern for fsub when there is no FTZ flag
let Predicates = [hasSM<100>, hasPTX<86>, doNoF32FTZ] in {
def : Pat<(f32 (fsub (f32 (fpextend f16:$a)), f32:$b)),
(INT_NVVM_MIXED_SUB_rn_f32_f16 B16:$a, B32:$b)>;
def : Pat<(f32 (fsub (f32 (fpextend bf16:$a)), f32:$b)),
(INT_NVVM_MIXED_SUB_rn_f32_bf16 B16:$a, B32:$b)>;
}
//
// BFIND
//
@@ -6122,3 +6232,4 @@ foreach sp = [0, 1] in {
}
}
}

View File

@@ -0,0 +1,115 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx42 | FileCheck %s
; RUN: %if ptxas-sm_20 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify -arch=sm_20 %}
define float @add_sat_f32(float %a, float %b) {
; CHECK-LABEL: add_sat_f32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<11>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [add_sat_f32_param_0];
; CHECK-NEXT: ld.param.b32 %r2, [add_sat_f32_param_1];
; CHECK-NEXT: add.rn.sat.f32 %r3, %r1, %r2;
; CHECK-NEXT: add.rn.sat.ftz.f32 %r4, %r1, %r3;
; CHECK-NEXT: add.rz.sat.f32 %r5, %r1, %r4;
; CHECK-NEXT: add.rz.sat.ftz.f32 %r6, %r1, %r5;
; CHECK-NEXT: add.rm.sat.f32 %r7, %r1, %r6;
; CHECK-NEXT: add.rm.sat.ftz.f32 %r8, %r1, %r7;
; CHECK-NEXT: add.rp.sat.f32 %r9, %r1, %r8;
; CHECK-NEXT: add.rp.sat.ftz.f32 %r10, %r1, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r10;
; CHECK-NEXT: ret;
%r1 = call float @llvm.nvvm.add.rn.sat.f(float %a, float %b)
%r2 = call float @llvm.nvvm.add.rn.ftz.sat.f(float %a, float %r1)
%r3 = call float @llvm.nvvm.add.rz.sat.f(float %a, float %r2)
%r4 = call float @llvm.nvvm.add.rz.ftz.sat.f(float %a, float %r3)
%r5 = call float @llvm.nvvm.add.rm.sat.f(float %a, float %r4)
%r6 = call float @llvm.nvvm.add.rm.ftz.sat.f(float %a, float %r5)
%r7 = call float @llvm.nvvm.add.rp.sat.f(float %a, float %r6)
%r8 = call float @llvm.nvvm.add.rp.ftz.sat.f(float %a, float %r7)
ret float %r8
}
define float @sub_sat_f32(float %a, float %b) {
; CHECK-LABEL: sub_sat_f32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<11>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [sub_sat_f32_param_0];
; CHECK-NEXT: ld.param.b32 %r2, [sub_sat_f32_param_1];
; CHECK-NEXT: sub.rn.sat.f32 %r3, %r1, %r2;
; CHECK-NEXT: sub.rn.sat.ftz.f32 %r4, %r1, %r3;
; CHECK-NEXT: sub.rz.sat.f32 %r5, %r1, %r4;
; CHECK-NEXT: sub.rz.sat.ftz.f32 %r6, %r1, %r5;
; CHECK-NEXT: sub.rm.sat.f32 %r7, %r1, %r6;
; CHECK-NEXT: sub.rm.sat.ftz.f32 %r8, %r1, %r7;
; CHECK-NEXT: sub.rp.sat.f32 %r9, %r1, %r8;
; CHECK-NEXT: sub.rp.sat.ftz.f32 %r10, %r1, %r9;
; CHECK-NEXT: st.param.b32 [func_retval0], %r10;
; CHECK-NEXT: ret;
%f0 = fneg float %b
%r1 = call float @llvm.nvvm.add.rn.sat.f(float %a, float %f0)
%f1 = fneg float %r1
%r2 = call float @llvm.nvvm.add.rn.ftz.sat.f(float %a, float %f1)
%f2 = fneg float %r2
%r3 = call float @llvm.nvvm.add.rz.sat.f(float %a, float %f2)
%f3 = fneg float %r3
%r4 = call float @llvm.nvvm.add.rz.ftz.sat.f(float %a, float %f3)
%f4 = fneg float %r4
%r5 = call float @llvm.nvvm.add.rm.sat.f(float %a, float %f4)
%f5 = fneg float %r5
%r6 = call float @llvm.nvvm.add.rm.ftz.sat.f(float %a, float %f5)
%f6 = fneg float %r6
%r7 = call float @llvm.nvvm.add.rp.sat.f(float %a, float %f6)
%f7 = fneg float %r7
%r8 = call float @llvm.nvvm.add.rp.ftz.sat.f(float %a, float %f7)
ret float %r8
}
define float @fma_sat_f32(float %a, float %b, float %c) {
; CHECK-LABEL: fma_sat_f32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<12>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [fma_sat_f32_param_0];
; CHECK-NEXT: ld.param.b32 %r2, [fma_sat_f32_param_1];
; CHECK-NEXT: ld.param.b32 %r3, [fma_sat_f32_param_2];
; CHECK-NEXT: fma.rn.sat.f32 %r4, %r1, %r2, %r3;
; CHECK-NEXT: fma.rn.ftz.sat.f32 %r5, %r1, %r2, %r4;
; CHECK-NEXT: fma.rz.sat.f32 %r6, %r1, %r2, %r5;
; CHECK-NEXT: fma.rz.ftz.sat.f32 %r7, %r1, %r2, %r6;
; CHECK-NEXT: fma.rm.sat.f32 %r8, %r1, %r2, %r7;
; CHECK-NEXT: fma.rm.ftz.sat.f32 %r9, %r1, %r2, %r8;
; CHECK-NEXT: fma.rp.sat.f32 %r10, %r1, %r2, %r9;
; CHECK-NEXT: fma.rp.ftz.sat.f32 %r11, %r1, %r2, %r10;
; CHECK-NEXT: st.param.b32 [func_retval0], %r11;
; CHECK-NEXT: ret;
%r1 = call float @llvm.nvvm.fma.rn.sat.f(float %a, float %b, float %c)
%r2 = call float @llvm.nvvm.fma.rn.ftz.sat.f(float %a, float %b, float %r1)
%r3 = call float @llvm.nvvm.fma.rz.sat.f(float %a, float %b, float %r2)
%r4 = call float @llvm.nvvm.fma.rz.ftz.sat.f(float %a, float %b, float %r3)
%r5 = call float @llvm.nvvm.fma.rm.sat.f(float %a, float %b, float %r4)
%r6 = call float @llvm.nvvm.fma.rm.ftz.sat.f(float %a, float %b, float %r5)
%r7 = call float @llvm.nvvm.fma.rp.sat.f(float %a, float %b, float %r6)
%r8 = call float @llvm.nvvm.fma.rp.ftz.sat.f(float %a, float %b, float %r7)
ret float %r8
}

View File

@@ -0,0 +1,70 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx60 | FileCheck %s
; RUN: %if ptxas-sm_20 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify -arch=sm_20 %}
define float @sub_f32(float %a, float %b) {
; CHECK-LABEL: sub_f32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [sub_f32_param_0];
; CHECK-NEXT: ld.param.b32 %r2, [sub_f32_param_1];
; CHECK-NEXT: sub.rn.f32 %r3, %r1, %r2;
; CHECK-NEXT: sub.rn.ftz.f32 %r4, %r1, %r3;
; CHECK-NEXT: sub.rz.f32 %r5, %r1, %r4;
; CHECK-NEXT: sub.rz.ftz.f32 %r6, %r1, %r5;
; CHECK-NEXT: sub.rm.f32 %r7, %r1, %r6;
; CHECK-NEXT: sub.rm.ftz.f32 %r8, %r1, %r7;
; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-NEXT: ret;
%f0 = fneg float %b
%r1 = call float @llvm.nvvm.add.rn.f(float %a, float %f0)
%f1 = fneg float %r1
%r2 = call float @llvm.nvvm.add.rn.ftz.f(float %a, float %f1)
%f2 = fneg float %r2
%r3 = call float @llvm.nvvm.add.rz.f(float %a, float %f2)
%f3 = fneg float %r3
%r4 = call float @llvm.nvvm.add.rz.ftz.f(float %a, float %f3)
%f4 = fneg float %r4
%r5 = call float @llvm.nvvm.add.rm.f(float %a, float %f4)
%f5 = fneg float %r5
%r6 = call float @llvm.nvvm.add.rm.ftz.f(float %a, float %f5)
ret float %r6
}
define double @sub_f64(double %a, double %b) {
; CHECK-LABEL: sub_f64(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [sub_f64_param_0];
; CHECK-NEXT: ld.param.b64 %rd2, [sub_f64_param_1];
; CHECK-NEXT: sub.rn.f64 %rd3, %rd1, %rd2;
; CHECK-NEXT: sub.rz.f64 %rd4, %rd1, %rd3;
; CHECK-NEXT: sub.rm.f64 %rd5, %rd1, %rd4;
; CHECK-NEXT: sub.rp.f64 %rd6, %rd1, %rd5;
; CHECK-NEXT: st.param.b64 [func_retval0], %rd6;
; CHECK-NEXT: ret;
%f0 = fneg double %b
%r1 = call double @llvm.nvvm.add.rn.d(double %a, double %f0)
%f1 = fneg double %r1
%r2 = call double @llvm.nvvm.add.rz.d(double %a, double %f1)
%f2 = fneg double %r2
%r3 = call double @llvm.nvvm.add.rm.d(double %a, double %f2)
%f3 = fneg double %r3
%r4 = call double @llvm.nvvm.add.rp.d(double %a, double %f3)
ret double %r4
}

View File

@@ -0,0 +1,443 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK,CHECK-NOF32FTZ %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 -denormal-fp-math=preserve-sign | FileCheck --check-prefixes=CHECK,CHECK-F32FTZ %s
; RUN: %if ptxas-sm_100 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 | %ptxas-verify -arch=sm_100 %}
; RUN: %if ptxas-sm_100 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 -denormal-fp-math=preserve-sign | %ptxas-verify -arch=sm_100 %}
; ADD
define float @test_add_f32_f16_1(half %a, float %b) {
; CHECK-LABEL: test_add_f32_f16_1(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [test_add_f32_f16_1_param_0];
; CHECK-NEXT: ld.param.b32 %r1, [test_add_f32_f16_1_param_1];
; CHECK-NEXT: add.rn.f32.f16 %r2, %rs1, %r1;
; CHECK-NEXT: add.rz.f32.f16 %r3, %rs1, %r2;
; CHECK-NEXT: add.rm.f32.f16 %r4, %rs1, %r3;
; CHECK-NEXT: add.rp.f32.f16 %r5, %rs1, %r4;
; CHECK-NEXT: add.rn.sat.f32.f16 %r6, %rs1, %r5;
; CHECK-NEXT: add.rz.sat.f32.f16 %r7, %rs1, %r6;
; CHECK-NEXT: add.rm.sat.f32.f16 %r8, %rs1, %r7;
; CHECK-NEXT: add.rp.sat.f32.f16 %r9, %rs1, %r8;
; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
; CHECK-NEXT: ret;
%r0 = fpext half %a to float
%r1 = call float @llvm.nvvm.add.rn.f(float %r0, float %b)
%r2 = call float @llvm.nvvm.add.rz.f(float %r0, float %r1)
%r3 = call float @llvm.nvvm.add.rm.f(float %r0, float %r2)
%r4 = call float @llvm.nvvm.add.rp.f(float %r0, float %r3)
; SAT
%r5 = call float @llvm.nvvm.add.rn.sat.f(float %r0, float %r4)
%r6 = call float @llvm.nvvm.add.rz.sat.f(float %r0, float %r5)
%r7 = call float @llvm.nvvm.add.rm.sat.f(float %r0, float %r6)
%r8 = call float @llvm.nvvm.add.rp.sat.f(float %r0, float %r7)
ret float %r8
}
define float @test_add_f32_f16_2(half %a, float %b) {
; CHECK-NOF32FTZ-LABEL: test_add_f32_f16_2(
; CHECK-NOF32FTZ: {
; CHECK-NOF32FTZ-NEXT: .reg .b16 %rs<2>;
; CHECK-NOF32FTZ-NEXT: .reg .b32 %r<3>;
; CHECK-NOF32FTZ-EMPTY:
; CHECK-NOF32FTZ-NEXT: // %bb.0:
; CHECK-NOF32FTZ-NEXT: ld.param.b16 %rs1, [test_add_f32_f16_2_param_0];
; CHECK-NOF32FTZ-NEXT: ld.param.b32 %r1, [test_add_f32_f16_2_param_1];
; CHECK-NOF32FTZ-NEXT: add.rn.f32.f16 %r2, %rs1, %r1;
; CHECK-NOF32FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NOF32FTZ-NEXT: ret;
;
; CHECK-F32FTZ-LABEL: test_add_f32_f16_2(
; CHECK-F32FTZ: {
; CHECK-F32FTZ-NEXT: .reg .b16 %rs<2>;
; CHECK-F32FTZ-NEXT: .reg .b32 %r<4>;
; CHECK-F32FTZ-EMPTY:
; CHECK-F32FTZ-NEXT: // %bb.0:
; CHECK-F32FTZ-NEXT: ld.param.b16 %rs1, [test_add_f32_f16_2_param_0];
; CHECK-F32FTZ-NEXT: cvt.ftz.f32.f16 %r1, %rs1;
; CHECK-F32FTZ-NEXT: ld.param.b32 %r2, [test_add_f32_f16_2_param_1];
; CHECK-F32FTZ-NEXT: add.rn.ftz.f32 %r3, %r1, %r2;
; CHECK-F32FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-F32FTZ-NEXT: ret;
%r0 = fpext half %a to float
%r1 = fadd float %r0, %b
ret float %r1
}
define float @test_add_f32_bf16_1(bfloat %a, float %b) {
; CHECK-LABEL: test_add_f32_bf16_1(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [test_add_f32_bf16_1_param_0];
; CHECK-NEXT: ld.param.b32 %r1, [test_add_f32_bf16_1_param_1];
; CHECK-NEXT: add.rn.f32.bf16 %r2, %rs1, %r1;
; CHECK-NEXT: add.rz.f32.bf16 %r3, %rs1, %r2;
; CHECK-NEXT: add.rm.f32.bf16 %r4, %rs1, %r3;
; CHECK-NEXT: add.rp.f32.bf16 %r5, %rs1, %r4;
; CHECK-NEXT: add.rn.sat.f32.bf16 %r6, %rs1, %r5;
; CHECK-NEXT: add.rz.sat.f32.bf16 %r7, %rs1, %r6;
; CHECK-NEXT: add.rm.sat.f32.bf16 %r8, %rs1, %r7;
; CHECK-NEXT: add.rp.sat.f32.bf16 %r9, %rs1, %r8;
; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
; CHECK-NEXT: ret;
%r0 = fpext bfloat %a to float
%r1 = call float @llvm.nvvm.add.rn.f(float %r0, float %b)
%r2 = call float @llvm.nvvm.add.rz.f(float %r0, float %r1)
%r3 = call float @llvm.nvvm.add.rm.f(float %r0, float %r2)
%r4 = call float @llvm.nvvm.add.rp.f(float %r0, float %r3)
; SAT
%r5 = call float @llvm.nvvm.add.rn.sat.f(float %r0, float %r4)
%r6 = call float @llvm.nvvm.add.rz.sat.f(float %r0, float %r5)
%r7 = call float @llvm.nvvm.add.rm.sat.f(float %r0, float %r6)
%r8 = call float @llvm.nvvm.add.rp.sat.f(float %r0, float %r7)
ret float %r8
}
define float @test_add_f32_bf16_2(bfloat %a, float %b) {
; CHECK-NOF32FTZ-LABEL: test_add_f32_bf16_2(
; CHECK-NOF32FTZ: {
; CHECK-NOF32FTZ-NEXT: .reg .b16 %rs<2>;
; CHECK-NOF32FTZ-NEXT: .reg .b32 %r<3>;
; CHECK-NOF32FTZ-EMPTY:
; CHECK-NOF32FTZ-NEXT: // %bb.0:
; CHECK-NOF32FTZ-NEXT: ld.param.b16 %rs1, [test_add_f32_bf16_2_param_0];
; CHECK-NOF32FTZ-NEXT: ld.param.b32 %r1, [test_add_f32_bf16_2_param_1];
; CHECK-NOF32FTZ-NEXT: add.rn.f32.bf16 %r2, %rs1, %r1;
; CHECK-NOF32FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NOF32FTZ-NEXT: ret;
;
; CHECK-F32FTZ-LABEL: test_add_f32_bf16_2(
; CHECK-F32FTZ: {
; CHECK-F32FTZ-NEXT: .reg .b16 %rs<2>;
; CHECK-F32FTZ-NEXT: .reg .b32 %r<4>;
; CHECK-F32FTZ-EMPTY:
; CHECK-F32FTZ-NEXT: // %bb.0:
; CHECK-F32FTZ-NEXT: ld.param.b16 %rs1, [test_add_f32_bf16_2_param_0];
; CHECK-F32FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
; CHECK-F32FTZ-NEXT: ld.param.b32 %r2, [test_add_f32_bf16_2_param_1];
; CHECK-F32FTZ-NEXT: add.rn.ftz.f32 %r3, %r1, %r2;
; CHECK-F32FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-F32FTZ-NEXT: ret;
%r0 = fpext bfloat %a to float
%r1 = fadd float %r0, %b
ret float %r1
}
; SUB
define float @test_sub_f32_f16_1(half %a, float %b) {
; CHECK-LABEL: test_sub_f32_f16_1(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [test_sub_f32_f16_1_param_0];
; CHECK-NEXT: ld.param.b32 %r1, [test_sub_f32_f16_1_param_1];
; CHECK-NEXT: sub.rn.f32.f16 %r2, %rs1, %r1;
; CHECK-NEXT: sub.rz.f32.f16 %r3, %rs1, %r2;
; CHECK-NEXT: sub.rm.f32.f16 %r4, %rs1, %r3;
; CHECK-NEXT: sub.rm.f32.f16 %r5, %rs1, %r4;
; CHECK-NEXT: sub.rn.sat.f32.f16 %r6, %rs1, %r5;
; CHECK-NEXT: sub.rz.sat.f32.f16 %r7, %rs1, %r6;
; CHECK-NEXT: sub.rm.sat.f32.f16 %r8, %rs1, %r7;
; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-NEXT: ret;
%r0 = fpext half %a to float
%f0 = fneg float %b
%r1 = call float @llvm.nvvm.add.rn.f(float %r0, float %f0)
%f1 = fneg float %r1
%r2 = call float @llvm.nvvm.add.rz.f(float %r0, float %f1)
%f2 = fneg float %r2
%r3 = call float @llvm.nvvm.add.rm.f(float %r0, float %f2)
%f3 = fneg float %r3
%r4 = call float @llvm.nvvm.add.rm.f(float %r0, float %f3)
; SAT
%f4 = fneg float %r4
%r5 = call float @llvm.nvvm.add.rn.sat.f(float %r0, float %f4)
%f5 = fneg float %r5
%r6 = call float @llvm.nvvm.add.rz.sat.f(float %r0, float %f5)
%f6 = fneg float %r6
%r7 = call float @llvm.nvvm.add.rm.sat.f(float %r0, float %f6)
%f7 = fneg float %r7
%r8 = call float @llvm.nvvm.add.rp.sat.f(float %r0, float %f7)
ret float %r7
}
define float @test_sub_f32_f16_2(half %a, float %b) {
; CHECK-NOF32FTZ-LABEL: test_sub_f32_f16_2(
; CHECK-NOF32FTZ: {
; CHECK-NOF32FTZ-NEXT: .reg .b16 %rs<2>;
; CHECK-NOF32FTZ-NEXT: .reg .b32 %r<3>;
; CHECK-NOF32FTZ-EMPTY:
; CHECK-NOF32FTZ-NEXT: // %bb.0:
; CHECK-NOF32FTZ-NEXT: ld.param.b16 %rs1, [test_sub_f32_f16_2_param_0];
; CHECK-NOF32FTZ-NEXT: ld.param.b32 %r1, [test_sub_f32_f16_2_param_1];
; CHECK-NOF32FTZ-NEXT: sub.rn.f32.f16 %r2, %rs1, %r1;
; CHECK-NOF32FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NOF32FTZ-NEXT: ret;
;
; CHECK-F32FTZ-LABEL: test_sub_f32_f16_2(
; CHECK-F32FTZ: {
; CHECK-F32FTZ-NEXT: .reg .b16 %rs<2>;
; CHECK-F32FTZ-NEXT: .reg .b32 %r<4>;
; CHECK-F32FTZ-EMPTY:
; CHECK-F32FTZ-NEXT: // %bb.0:
; CHECK-F32FTZ-NEXT: ld.param.b16 %rs1, [test_sub_f32_f16_2_param_0];
; CHECK-F32FTZ-NEXT: cvt.ftz.f32.f16 %r1, %rs1;
; CHECK-F32FTZ-NEXT: ld.param.b32 %r2, [test_sub_f32_f16_2_param_1];
; CHECK-F32FTZ-NEXT: sub.rn.ftz.f32 %r3, %r1, %r2;
; CHECK-F32FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-F32FTZ-NEXT: ret;
%r0 = fpext half %a to float
%r1 = fsub float %r0, %b
ret float %r1
}
define float @test_sub_f32_bf16_1(bfloat %a, float %b) {
; CHECK-LABEL: test_sub_f32_bf16_1(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [test_sub_f32_bf16_1_param_0];
; CHECK-NEXT: ld.param.b32 %r1, [test_sub_f32_bf16_1_param_1];
; CHECK-NEXT: sub.rn.f32.bf16 %r2, %rs1, %r1;
; CHECK-NEXT: sub.rz.f32.bf16 %r3, %rs1, %r2;
; CHECK-NEXT: sub.rm.f32.bf16 %r4, %rs1, %r3;
; CHECK-NEXT: sub.rp.f32.bf16 %r5, %rs1, %r4;
; CHECK-NEXT: sub.rn.sat.f32.bf16 %r6, %rs1, %r5;
; CHECK-NEXT: sub.rz.sat.f32.bf16 %r7, %rs1, %r6;
; CHECK-NEXT: sub.rm.sat.f32.bf16 %r8, %rs1, %r7;
; CHECK-NEXT: sub.rp.sat.f32.bf16 %r9, %rs1, %r8;
; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
; CHECK-NEXT: ret;
%r0 = fpext bfloat %a to float
%f0 = fneg float %b
%r1 = call float @llvm.nvvm.add.rn.f(float %r0, float %f0)
%f1 = fneg float %r1
%r2 = call float @llvm.nvvm.add.rz.f(float %r0, float %f1)
%f2 = fneg float %r2
%r3 = call float @llvm.nvvm.add.rm.f(float %r0, float %f2)
%f3 = fneg float %r3
%r4 = call float @llvm.nvvm.add.rp.f(float %r0, float %f3)
; SAT
%f4 = fneg float %r4
%r5 = call float @llvm.nvvm.add.rn.sat.f(float %r0, float %f4)
%f5 = fneg float %r5
%r6 = call float @llvm.nvvm.add.rz.sat.f(float %r0, float %f5)
%f6 = fneg float %r6
%r7 = call float @llvm.nvvm.add.rm.sat.f(float %r0, float %f6)
%f7 = fneg float %r7
%r8 = call float @llvm.nvvm.add.rp.sat.f(float %r0, float %f7)
ret float %r8
}
define float @test_sub_f32_bf16_2(bfloat %a, float %b) {
; CHECK-NOF32FTZ-LABEL: test_sub_f32_bf16_2(
; CHECK-NOF32FTZ: {
; CHECK-NOF32FTZ-NEXT: .reg .b16 %rs<2>;
; CHECK-NOF32FTZ-NEXT: .reg .b32 %r<3>;
; CHECK-NOF32FTZ-EMPTY:
; CHECK-NOF32FTZ-NEXT: // %bb.0:
; CHECK-NOF32FTZ-NEXT: ld.param.b16 %rs1, [test_sub_f32_bf16_2_param_0];
; CHECK-NOF32FTZ-NEXT: ld.param.b32 %r1, [test_sub_f32_bf16_2_param_1];
; CHECK-NOF32FTZ-NEXT: sub.rn.f32.bf16 %r2, %rs1, %r1;
; CHECK-NOF32FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NOF32FTZ-NEXT: ret;
;
; CHECK-F32FTZ-LABEL: test_sub_f32_bf16_2(
; CHECK-F32FTZ: {
; CHECK-F32FTZ-NEXT: .reg .b16 %rs<2>;
; CHECK-F32FTZ-NEXT: .reg .b32 %r<4>;
; CHECK-F32FTZ-EMPTY:
; CHECK-F32FTZ-NEXT: // %bb.0:
; CHECK-F32FTZ-NEXT: ld.param.b16 %rs1, [test_sub_f32_bf16_2_param_0];
; CHECK-F32FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
; CHECK-F32FTZ-NEXT: ld.param.b32 %r2, [test_sub_f32_bf16_2_param_1];
; CHECK-F32FTZ-NEXT: sub.rn.ftz.f32 %r3, %r1, %r2;
; CHECK-F32FTZ-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-F32FTZ-NEXT: ret;
%r0 = fpext bfloat %a to float
%r1 = fsub float %r0, %b
ret float %r1
}
; FMA
define float @test_fma_f32_f16_1(half %a, half %b, float %c) {
; CHECK-LABEL: test_fma_f32_f16_1(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [test_fma_f32_f16_1_param_0];
; CHECK-NEXT: ld.param.b16 %rs2, [test_fma_f32_f16_1_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_fma_f32_f16_1_param_2];
; CHECK-NEXT: fma.rn.f32.f16 %r2, %rs1, %rs2, %r1;
; CHECK-NEXT: fma.rz.f32.f16 %r3, %rs1, %rs2, %r2;
; CHECK-NEXT: fma.rm.f32.f16 %r4, %rs1, %rs2, %r3;
; CHECK-NEXT: fma.rp.f32.f16 %r5, %rs1, %rs2, %r4;
; CHECK-NEXT: fma.rn.sat.f32.f16 %r6, %rs1, %rs2, %r5;
; CHECK-NEXT: fma.rz.sat.f32.f16 %r7, %rs1, %rs2, %r6;
; CHECK-NEXT: fma.rm.sat.f32.f16 %r8, %rs1, %rs2, %r7;
; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-NEXT: ret;
%r0 = fpext half %a to float
%r1 = fpext half %b to float
%r2 = call float @llvm.nvvm.fma.rn.f(float %r0, float %r1, float %c)
%r3 = call float @llvm.nvvm.fma.rz.f(float %r0, float %r1, float %r2)
%r4 = call float @llvm.nvvm.fma.rm.f(float %r0, float %r1, float %r3)
%r5 = call float @llvm.nvvm.fma.rp.f(float %r0, float %r1, float %r4)
; SAT
%r6 = call float @llvm.nvvm.fma.rn.sat.f(float %r0, float %r1, float %r5)
%r7 = call float @llvm.nvvm.fma.rz.sat.f(float %r0, float %r1, float %r6)
%r8 = call float @llvm.nvvm.fma.rm.sat.f(float %r0, float %r1, float %r7)
%r9 = call float @llvm.nvvm.fma.rp.sat.f(float %r0, float %r1, float %r8)
ret float %r8
}
define float @test_fma_f32_f16_2(half %a, half %b, float %c) {
; CHECK-NOF32FTZ-LABEL: test_fma_f32_f16_2(
; CHECK-NOF32FTZ: {
; CHECK-NOF32FTZ-NEXT: .reg .b16 %rs<3>;
; CHECK-NOF32FTZ-NEXT: .reg .b32 %r<3>;
; CHECK-NOF32FTZ-EMPTY:
; CHECK-NOF32FTZ-NEXT: // %bb.0:
; CHECK-NOF32FTZ-NEXT: ld.param.b16 %rs1, [test_fma_f32_f16_2_param_0];
; CHECK-NOF32FTZ-NEXT: ld.param.b16 %rs2, [test_fma_f32_f16_2_param_1];
; CHECK-NOF32FTZ-NEXT: ld.param.b32 %r1, [test_fma_f32_f16_2_param_2];
; CHECK-NOF32FTZ-NEXT: fma.rn.f32.f16 %r2, %rs1, %rs2, %r1;
; CHECK-NOF32FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NOF32FTZ-NEXT: ret;
;
; CHECK-F32FTZ-LABEL: test_fma_f32_f16_2(
; CHECK-F32FTZ: {
; CHECK-F32FTZ-NEXT: .reg .b16 %rs<3>;
; CHECK-F32FTZ-NEXT: .reg .b32 %r<5>;
; CHECK-F32FTZ-EMPTY:
; CHECK-F32FTZ-NEXT: // %bb.0:
; CHECK-F32FTZ-NEXT: ld.param.b16 %rs1, [test_fma_f32_f16_2_param_0];
; CHECK-F32FTZ-NEXT: cvt.ftz.f32.f16 %r1, %rs1;
; CHECK-F32FTZ-NEXT: ld.param.b16 %rs2, [test_fma_f32_f16_2_param_1];
; CHECK-F32FTZ-NEXT: cvt.ftz.f32.f16 %r2, %rs2;
; CHECK-F32FTZ-NEXT: ld.param.b32 %r3, [test_fma_f32_f16_2_param_2];
; CHECK-F32FTZ-NEXT: fma.rn.ftz.f32 %r4, %r1, %r2, %r3;
; CHECK-F32FTZ-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-F32FTZ-NEXT: ret;
%r0 = fpext half %a to float
%r1 = fpext half %b to float
%r2 = call float @llvm.fma.f32(float %r0, float %r1, float %c)
ret float %r2
}
define float @test_fma_f32_bf16_1(bfloat %a, bfloat %b, float %c) {
; CHECK-LABEL: test_fma_f32_bf16_1(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [test_fma_f32_bf16_1_param_0];
; CHECK-NEXT: ld.param.b16 %rs2, [test_fma_f32_bf16_1_param_1];
; CHECK-NEXT: ld.param.b32 %r1, [test_fma_f32_bf16_1_param_2];
; CHECK-NEXT: fma.rn.f32.bf16 %r2, %rs1, %rs2, %r1;
; CHECK-NEXT: fma.rz.f32.bf16 %r3, %rs1, %rs2, %r2;
; CHECK-NEXT: fma.rm.f32.bf16 %r4, %rs1, %rs2, %r3;
; CHECK-NEXT: fma.rp.f32.bf16 %r5, %rs1, %rs2, %r4;
; CHECK-NEXT: fma.rn.sat.f32.bf16 %r6, %rs1, %rs2, %r5;
; CHECK-NEXT: fma.rz.sat.f32.bf16 %r7, %rs1, %rs2, %r6;
; CHECK-NEXT: fma.rm.sat.f32.bf16 %r8, %rs1, %rs2, %r7;
; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
; CHECK-NEXT: ret;
%r0 = fpext bfloat %a to float
%r1 = fpext bfloat %b to float
%r2 = call float @llvm.nvvm.fma.rn.f(float %r0, float %r1, float %c)
%r3 = call float @llvm.nvvm.fma.rz.f(float %r0, float %r1, float %r2)
%r4 = call float @llvm.nvvm.fma.rm.f(float %r0, float %r1, float %r3)
%r5 = call float @llvm.nvvm.fma.rp.f(float %r0, float %r1, float %r4)
; SAT
%r6 = call float @llvm.nvvm.fma.rn.sat.f(float %r0, float %r1, float %r5)
%r7 = call float @llvm.nvvm.fma.rz.sat.f(float %r0, float %r1, float %r6)
%r8 = call float @llvm.nvvm.fma.rm.sat.f(float %r0, float %r1, float %r7)
%r9 = call float @llvm.nvvm.fma.rp.sat.f(float %r0, float %r1, float %r8)
ret float %r8
}
define float @test_fma_f32_bf16_2(bfloat %a, bfloat %b, float %c) {
; CHECK-NOF32FTZ-LABEL: test_fma_f32_bf16_2(
; CHECK-NOF32FTZ: {
; CHECK-NOF32FTZ-NEXT: .reg .b16 %rs<3>;
; CHECK-NOF32FTZ-NEXT: .reg .b32 %r<3>;
; CHECK-NOF32FTZ-EMPTY:
; CHECK-NOF32FTZ-NEXT: // %bb.0:
; CHECK-NOF32FTZ-NEXT: ld.param.b16 %rs1, [test_fma_f32_bf16_2_param_0];
; CHECK-NOF32FTZ-NEXT: ld.param.b16 %rs2, [test_fma_f32_bf16_2_param_1];
; CHECK-NOF32FTZ-NEXT: ld.param.b32 %r1, [test_fma_f32_bf16_2_param_2];
; CHECK-NOF32FTZ-NEXT: fma.rn.f32.bf16 %r2, %rs1, %rs2, %r1;
; CHECK-NOF32FTZ-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NOF32FTZ-NEXT: ret;
;
; CHECK-F32FTZ-LABEL: test_fma_f32_bf16_2(
; CHECK-F32FTZ: {
; CHECK-F32FTZ-NEXT: .reg .b16 %rs<3>;
; CHECK-F32FTZ-NEXT: .reg .b32 %r<5>;
; CHECK-F32FTZ-EMPTY:
; CHECK-F32FTZ-NEXT: // %bb.0:
; CHECK-F32FTZ-NEXT: ld.param.b16 %rs1, [test_fma_f32_bf16_2_param_0];
; CHECK-F32FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1;
; CHECK-F32FTZ-NEXT: ld.param.b16 %rs2, [test_fma_f32_bf16_2_param_1];
; CHECK-F32FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs2;
; CHECK-F32FTZ-NEXT: ld.param.b32 %r3, [test_fma_f32_bf16_2_param_2];
; CHECK-F32FTZ-NEXT: fma.rn.ftz.f32 %r4, %r1, %r2, %r3;
; CHECK-F32FTZ-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-F32FTZ-NEXT: ret;
%r0 = fpext bfloat %a to float
%r1 = fpext bfloat %b to float
%r2 = call float @llvm.fma.f32(float %r0, float %r1, float %c)
ret float %r2
}