[libclc] Optimize generic CLC fmin/fmax (#128506)

With this commit, the CLC fmin/fmax builtins use clang's
__builtin_elementwise_(min|max)imumnum which helps us generate LLVM
minimumnum/maximumnum intrinsics directly. These intrinsics uniformly
select the non-NaN input over the (quiet or signalling) NaN input, which
corresponds to what the OpenCL CTS tests.

These intrinsics maintain the vector types, as opposed to scalarizing,
which was previously happening. This commit therefore helps to optimize
codegen for those targets.

Note that there is ongoing discussion regarding how these builtins
should handle signalling NaNs in the OpenCL specification and whether
they should be able to return a quiet NaN as per the IEEE behaviour. If
the specification and/or CTS is ever updated to allow or mandate
returning a qNaN, these builtins could/should be updated to use
__builtin_elementwise_(min|max)num instead which would lower to LLVM
minnum/maxnum intrinsics.

The SPIR-V targets maintain the old implementations, as the LLVM ->
SPIR-V translator can't currently handle the LLVM intrinsics. The
implementation has been simplified to consistently use clang builtins,
as opposed to before where the half version was explicitly defined.

[1] https://github.com/KhronosGroup/OpenCL-CTS/pull/2285
This commit is contained in:
Fraser Cormack
2025-07-29 13:21:42 +01:00
committed by GitHub
parent 315e2e28b1
commit 586cacdbdd
9 changed files with 11 additions and 216 deletions

View File

@@ -1,5 +1,3 @@
math/clc_fmax.cl
math/clc_fmin.cl
math/clc_ldexp_override.cl
workitem/clc_get_global_offset.cl
workitem/clc_get_global_size.cl

View File

@@ -6,53 +6,10 @@
//
//===----------------------------------------------------------------------===//
#include <clc/clcmacro.h>
#include <clc/internal/clc.h>
#include <clc/relational/clc_isnan.h>
// Configure and include <clc/math/gentype.inc> to generate __clc_fmax:
// FUNCTION names the entry point, __IMPL_FUNCTION the builtin it forwards to,
// and __CLC_BODY the template body that gentype.inc pulls in.
// NOTE(review): __IMPL_FUNCTION and __CLC_BODY are each defined twice below
// with different values. This looks like removed and added diff lines shown
// together without +/- markers; per the commit description the new form is
// the __builtin_elementwise_maximumnum / binary_def.inc pair, replacing the
// scalarizing __builtin_fmaxf one. Confirm against the actual file.
#define __FLOAT_ONLY
#define __CLC_MIN_VECSIZE 1
#define FUNCTION __clc_fmax
#define __IMPL_FUNCTION __builtin_fmaxf
#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
#define __IMPL_FUNCTION(x) __builtin_elementwise_maximumnum
#define __CLC_BODY <clc/shared/binary_def.inc>
#include <clc/math/gentype.inc>
// Drop the configuration macros so later code starts from a clean state.
#undef __CLC_MIN_VECSIZE
#undef FUNCTION
#undef __IMPL_FUNCTION
// Double-precision __clc_fmax, compiled only when cl_khr_fp64 is defined.
// Sets FUNCTION to __clc_fmax and __IMPL_FUNCTION to __builtin_fmax, then
// includes gentype.inc with the scalarizing body.
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define __DOUBLE_ONLY
#define __CLC_MIN_VECSIZE 1
#define FUNCTION __clc_fmax
#define __IMPL_FUNCTION __builtin_fmax
#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
#include <clc/math/gentype.inc>
// Undo the per-instantiation macros before the next section.
#undef __CLC_MIN_VECSIZE
#undef FUNCTION
#undef __IMPL_FUNCTION
#endif
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Scalar half-precision fmax written without a builtin.
// When __clc_isnan(x) holds the result is y (whatever y is); otherwise when
// __clc_isnan(y) holds the result is x. With neither flagged, the larger
// operand is returned.
_CLC_DEF _CLC_OVERLOAD half __clc_fmax(half x, half y) {
if (__clc_isnan(x))
return y;
if (__clc_isnan(y))
return x;
// y is chosen only when strictly greater, so equal operands yield x.
return (x < y) ? y : x;
}
// Half-only pass over gentype.inc for __clc_fmax using the scalarizing body.
// No __IMPL_FUNCTION is set in this section.
// NOTE(review): presumably this generates the vector overloads on top of the
// scalar half __clc_fmax defined just above, and __CLC_SUPPORTED_VECSIZE_OR_1
// restricts which widths are generated - confirm in gentype.inc.
#define __HALF_ONLY
#define __CLC_SUPPORTED_VECSIZE_OR_1 2
#define FUNCTION __clc_fmax
#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
#include <clc/math/gentype.inc>
#undef FUNCTION
#endif

View File

@@ -6,52 +6,10 @@
//
//===----------------------------------------------------------------------===//
#include <clc/clcmacro.h>
#include <clc/internal/clc.h>
#include <clc/relational/clc_isnan.h>
// Configure and include <clc/math/gentype.inc> to generate __clc_fmin:
// FUNCTION names the entry point, __IMPL_FUNCTION the builtin it forwards to,
// and __CLC_BODY the template body that gentype.inc pulls in.
// NOTE(review): __IMPL_FUNCTION and __CLC_BODY are each defined twice below
// with different values. This looks like removed and added diff lines shown
// together without +/- markers; per the commit description the new form is
// the __builtin_elementwise_minimumnum / binary_def.inc pair, replacing the
// scalarizing __builtin_fminf one. Confirm against the actual file.
#define __FLOAT_ONLY
#define __CLC_MIN_VECSIZE 1
#define FUNCTION __clc_fmin
#define __IMPL_FUNCTION __builtin_fminf
#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
#define __IMPL_FUNCTION(x) __builtin_elementwise_minimumnum
#define __CLC_BODY <clc/shared/binary_def.inc>
#include <clc/math/gentype.inc>
// Drop the configuration macros so later code starts from a clean state.
#undef __CLC_MIN_VECSIZE
#undef FUNCTION
#undef __IMPL_FUNCTION
// Double-precision __clc_fmin, compiled only when cl_khr_fp64 is defined.
// Sets FUNCTION to __clc_fmin and __IMPL_FUNCTION to __builtin_fmin, then
// includes gentype.inc with the scalarizing body.
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define __DOUBLE_ONLY
#define __CLC_MIN_VECSIZE 1
#define FUNCTION __clc_fmin
#define __IMPL_FUNCTION __builtin_fmin
#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
#include <clc/math/gentype.inc>
// Undo the per-instantiation macros before the next section.
#undef __CLC_MIN_VECSIZE
#undef FUNCTION
#undef __IMPL_FUNCTION
#endif
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Scalar half-precision fmin written without a builtin.
// When __clc_isnan(x) holds the result is y (whatever y is); otherwise when
// __clc_isnan(y) holds the result is x. With neither flagged, the smaller
// operand is returned.
_CLC_DEF _CLC_OVERLOAD half __clc_fmin(half x, half y) {
if (__clc_isnan(x))
return y;
if (__clc_isnan(y))
return x;
// y is chosen only when strictly smaller, so equal operands yield x.
return (y < x) ? y : x;
}
// Half-only pass over gentype.inc for __clc_fmin using the scalarizing body.
// No __IMPL_FUNCTION is set in this section, and FUNCTION is not undefined
// afterwards within the lines shown here.
// NOTE(review): presumably this generates the vector overloads on top of the
// scalar half __clc_fmin defined just above - confirm in gentype.inc.
#define __HALF_ONLY
#define __CLC_SUPPORTED_VECSIZE_OR_1 2
#define FUNCTION __clc_fmin
#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
#include <clc/math/gentype.inc>
#endif

View File

@@ -1,4 +1,2 @@
math/clc_fmax.cl
math/clc_fmin.cl
math/clc_native_rsqrt.cl
math/clc_rsqrt_override.cl

View File

@@ -1,41 +0,0 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <clc/clcmacro.h>
#include <clc/internal/clc.h>
#include <clc/math/math.h>
// Scalar float fmax: each operand is first passed through
// __clc_flush_denormal_if_not_supported, and the processed values are then
// handed to __builtin_fmaxf.
_CLC_DEF _CLC_OVERLOAD float __clc_fmax(float x, float y) {
// Flush denormals if not enabled. Otherwise fmax instruction flushes the
// values for comparison, but outputs original denormal
x = __clc_flush_denormal_if_not_supported(x);
y = __clc_flush_denormal_if_not_supported(y);
return __builtin_fmaxf(x, y);
}
// Float-only pass over gentype.inc for __clc_fmax with the scalarizing body.
// NOTE(review): presumably this builds the vector overloads from the scalar
// float __clc_fmax defined above - confirm in binary_def_scalarize.inc.
#define __FLOAT_ONLY
#define FUNCTION __clc_fmax
#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
#include <clc/math/gentype.inc>
#undef FUNCTION
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
// Scalar double fmax: forwards straight to __builtin_fmax with no
// pre-processing of the operands (unlike the float overload in this file).
_CLC_DEF _CLC_OVERLOAD double __clc_fmax(double x, double y) {
return __builtin_fmax(x, y);
}
// Double-only pass over gentype.inc for __clc_fmax with the scalarizing body.
// NOTE(review): presumably this builds the vector overloads from the scalar
// double __clc_fmax defined above - confirm in binary_def_scalarize.inc.
#define __DOUBLE_ONLY
#define FUNCTION __clc_fmax
#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
#include <clc/math/gentype.inc>
#undef FUNCTION
#endif

View File

@@ -1,42 +0,0 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <clc/clcmacro.h>
#include <clc/internal/clc.h>
#include <clc/math/math.h>
// Scalar float fmin: each operand is first passed through
// __clc_flush_denormal_if_not_supported, and the processed values are then
// handed to __builtin_fminf.
_CLC_DEF _CLC_OVERLOAD float __clc_fmin(float x, float y) {
// Flush denormals if not enabled. Otherwise fmin instruction flushes the
// values for comparison, but outputs original denormal.
// (No canonicalize call is made here; only the flush helper is used.)
x = __clc_flush_denormal_if_not_supported(x);
y = __clc_flush_denormal_if_not_supported(y);
return __builtin_fminf(x, y);
}
// Float-only pass over gentype.inc for __clc_fmin with the scalarizing body.
// NOTE(review): presumably this builds the vector overloads from the scalar
// float __clc_fmin defined above - confirm in binary_def_scalarize.inc.
#define __FLOAT_ONLY
#define FUNCTION __clc_fmin
#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
#include <clc/math/gentype.inc>
#undef FUNCTION
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
// Scalar double fmin: forwards straight to __builtin_fmin with no
// pre-processing of the operands (unlike the float overload in this file).
_CLC_DEF _CLC_OVERLOAD double __clc_fmin(double x, double y) {
return __builtin_fmin(x, y);
}
// Double-only pass over gentype.inc for __clc_fmin with the scalarizing body.
// NOTE(review): presumably this builds the vector overloads from the scalar
// double __clc_fmin defined above - confirm in binary_def_scalarize.inc.
#define __DOUBLE_ONLY
#define FUNCTION __clc_fmin
#define __CLC_BODY <clc/shared/binary_def_scalarize.inc>
#include <clc/math/gentype.inc>
#undef FUNCTION
#endif

View File

@@ -1 +1,3 @@
math/clc_fmax.cl
math/clc_fmin.cl
math/clc_runtime_has_hw_fma32.cl

View File

@@ -8,40 +8,23 @@
#include <clc/clcmacro.h>
#include <clc/internal/clc.h>
#include <clc/relational/clc_isnan.h>
// Scalar float fmax: both operands go through __builtin_canonicalizef and the
// results are passed to __builtin_fmaxf.
// NOTE(review): this hunk is listed without +/- markers, so it is not clear
// from here which of these lines remain after the commit.
_CLC_DEF _CLC_OVERLOAD float __clc_fmax(float x, float y) {
// fcanonicalize removes sNaNs and flushes denormals if not enabled. Otherwise
// fmax instruction flushes the values for comparison, but outputs original
// denormal
x = __builtin_canonicalizef(x);
y = __builtin_canonicalizef(y);
return __builtin_fmaxf(x, y);
}
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
// Scalar double fmax: both operands go through __builtin_canonicalize and the
// results are passed to __builtin_fmax, mirroring the float overload above.
// NOTE(review): this hunk is listed without +/- markers, so it is not clear
// from here which of these lines remain after the commit.
_CLC_DEF _CLC_OVERLOAD double __clc_fmax(double x, double y) {
x = __builtin_canonicalize(x);
y = __builtin_canonicalize(y);
return __builtin_fmax(x, y);
}
#endif
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Scalar half fmax.
// NOTE(review): as listed, the body has two consecutive return statements:
// the hand-written __clc_isnan/compare sequence returns first, which leaves
// the trailing __builtin_fmaxf16 return unreachable. This looks like removed
// and added diff lines shown together without +/- markers; the commit
// description says the half version now uses a clang builtin instead of the
// explicit definition. Confirm against the actual file.
_CLC_DEF _CLC_OVERLOAD half __clc_fmax(half x, half y) {
if (__clc_isnan(x))
return y;
if (__clc_isnan(y))
return x;
// x is chosen only when strictly greater than y, so equal operands yield y.
return (y < x) ? x : y;
return __builtin_fmaxf16(x, y);
}
#endif
#define FUNCTION __clc_fmax

View File

@@ -8,41 +8,23 @@
#include <clc/clcmacro.h>
#include <clc/internal/clc.h>
#include <clc/relational/clc_isnan.h>
// Scalar float fmin: both operands go through __builtin_canonicalizef and the
// results are passed to __builtin_fminf.
// NOTE(review): this hunk is listed without +/- markers, so it is not clear
// from here which of these lines remain after the commit.
_CLC_DEF _CLC_OVERLOAD float __clc_fmin(float x, float y) {
// fcanonicalize removes sNaNs and flushes denormals if not enabled. Otherwise
// fmin instruction flushes the values for comparison, but outputs original
// denormal
x = __builtin_canonicalizef(x);
y = __builtin_canonicalizef(y);
return __builtin_fminf(x, y);
}
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
// Scalar double fmin: both operands go through __builtin_canonicalize and the
// results are passed to __builtin_fmin, mirroring the float overload above.
// NOTE(review): this hunk is listed without +/- markers, so it is not clear
// from here which of these lines remain after the commit.
_CLC_DEF _CLC_OVERLOAD double __clc_fmin(double x, double y) {
x = __builtin_canonicalize(x);
y = __builtin_canonicalize(y);
return __builtin_fmin(x, y);
}
#endif
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// Scalar half fmin.
// NOTE(review): as listed, the body has two consecutive return statements:
// the hand-written __clc_isnan/compare sequence returns first, which leaves
// the trailing __builtin_fminf16 return unreachable. This looks like removed
// and added diff lines shown together without +/- markers; the commit
// description says the half version now uses a clang builtin instead of the
// explicit definition. Confirm against the actual file.
_CLC_DEF _CLC_OVERLOAD half __clc_fmin(half x, half y) {
if (__clc_isnan(x))
return y;
if (__clc_isnan(y))
return x;
// y is chosen only when strictly smaller than x, so equal operands yield x.
return (y < x) ? y : x;
return __builtin_fminf16(x, y);
}
#endif
#define FUNCTION __clc_fmin