[libclc] Remove __attribute__((always_inline)) (#158791)

always_inline doesn't guarantee performance improvement.
Target-specific optimizations decide whether inlining is profitable.
Changes to amdgcn--amdhsa.bc:
* _Z9__clc_logDv16_f and _Z15__clc_remainderDv16_fS_ are not inlined.
* sincos vector function code size has doubled due to apparent
duplication.


Also fix a typo: use _CLC_DEF instead of _CLC_DECL on function definitions.
This commit is contained in:
Wenju He
2025-09-18 07:47:35 +08:00
committed by GitHub
parent 5cc41936f2
commit 7f3661128b
6 changed files with 19 additions and 23 deletions

View File

@@ -11,17 +11,13 @@
#define _CLC_OVERLOAD __attribute__((overloadable))
#define _CLC_DECL
#define _CLC_INLINE __attribute__((always_inline)) inline
#define _CLC_INLINE inline
#define _CLC_CONST __attribute__((const))
// avoid inlines for SPIR-V related targets since we'll optimise later in the
// chain
#if defined(CLC_SPIRV)
#define _CLC_DEF
#elif defined(CLC_CLSPV)
#if defined(CLC_CLSPV)
#define _CLC_DEF __attribute__((noinline)) __attribute__((clspv_libclc_builtin))
#else
#define _CLC_DEF __attribute__((always_inline))
#define _CLC_DEF
#endif
#if __OPENCL_C_VERSION__ == CL_VERSION_2_0 || \

View File

@@ -18,22 +18,22 @@
// The return type has the same base type as the input type, with the same
// vector size as the mask. Elements in the mask must be the same size (number
// of bits) as the input value, e.g. char8 ret = shuffle(char2 x, uchar8 mask);
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) x,
__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) y, __CLC_U_GENTYPE mask) {
return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, y, mask);
}
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) x,
__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) y, __CLC_U_GENTYPE mask) {
return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, y, mask);
}
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) x,
__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) y, __CLC_U_GENTYPE mask) {
return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, y, mask);
}
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(
__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) x,
__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) y, __CLC_U_GENTYPE mask) {
return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, y, mask);

View File

@@ -18,19 +18,19 @@
// The return type has the same base type as the input type, with the same
// vector size as the mask. Elements in the mask must be the same size (number
// of bits) as the input value, e.g. char8 ret = shuffle(char2 x, uchar8 mask);
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) x, __CLC_U_GENTYPE mask) {
return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, mask);
}
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) x, __CLC_U_GENTYPE mask) {
return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, mask);
}
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE
__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) x, __CLC_U_GENTYPE mask) {
return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, mask);
}
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(
__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) x, __CLC_U_GENTYPE mask) {
return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, mask);
}

View File

@@ -24,7 +24,7 @@
#ifdef __CLC_FPSIZE
#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_atomic_compare_exchange( \
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atomic_compare_exchange( \
volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator, \
__CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal, \
int MemoryScope) { \
@@ -38,7 +38,7 @@
#else
#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_atomic_compare_exchange( \
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atomic_compare_exchange( \
volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator, \
__CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal, \
int MemoryScope) { \

View File

@@ -31,7 +31,7 @@
#ifdef __CLC_NO_VALUE_ARG
#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \
volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \
int MemoryScope) { \
return __CLC_AS_RETTYPE(__CLC_IMPL_FUNCTION( \
@@ -39,7 +39,7 @@
}
#elif defined(__CLC_INC_DEC)
#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \
volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \
int MemoryScope) { \
return __CLC_AS_RETTYPE( \
@@ -48,7 +48,7 @@
}
#elif defined(__CLC_RETURN_VOID)
#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \
_CLC_OVERLOAD _CLC_DECL void __CLC_FUNCTION( \
_CLC_OVERLOAD _CLC_DEF void __CLC_FUNCTION( \
volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \
int MemoryOrder, int MemoryScope) { \
__CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, \
@@ -56,7 +56,7 @@
}
#else
#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \
volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \
int MemoryOrder, int MemoryScope) { \
return __CLC_AS_RETTYPE( \

View File

@@ -74,8 +74,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x,
return ret;
}
_CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_tanf_piby4(__CLC_FLOATN x,
__CLC_INTN regn) {
_CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_tanf_piby4(__CLC_FLOATN x,
__CLC_INTN regn) {
// Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4].
__CLC_FLOATN r = x * x;