[libclc] Implement erf/erfc vector function with loop since scalar function is large (#157055)

This PR reduces amdgcn--amdhsa.bc size by 1.8% and nvptx64--nvidiacl.bc
size by 4%.
The loop trip count is constant, so the backend can decide whether to unroll it.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Wenju He
2025-09-05 19:58:24 +08:00
committed by GitHub
parent 28d9255aa7
commit a271d07488
3 changed files with 30 additions and 2 deletions

View File

@@ -0,0 +1,28 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include <clc/utils.h>
#if __CLC_VECSIZE_OR_1 >= 2

// Fall back to calling __CLC_FUNCTION itself for each element when the
// including file has not supplied a separate __CLC_IMPL_FUNCTION.
#ifndef __CLC_IMPL_FUNCTION
#define __CLC_IMPL_FUNCTION __CLC_FUNCTION
#endif

// Vector overload of __CLC_FUNCTION (only emitted when __CLC_VECSIZE_OR_1 is
// at least 2). It invokes __CLC_IMPL_FUNCTION once per element inside a loop
// whose bound is the compile-time constant __CLC_VECSIZE_OR_1, then returns
// the results packed back into a vector.
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x) {
  // Overlay the vector with a scalar array so elements can be indexed by a
  // loop variable.
  union {
    __CLC_GENTYPE vec;
    __CLC_SCALAR_GENTYPE arr[__CLC_VECSIZE_OR_1];
  } input, output;

  input.vec = x;
  for (int lane = 0; lane < __CLC_VECSIZE_OR_1; ++lane) {
    output.arr[lane] = __CLC_IMPL_FUNCTION(input.arr[lane]);
  }
  return output.vec;
}

#endif // __CLC_VECSIZE_OR_1 >= 2

View File

@@ -507,5 +507,5 @@ _CLC_OVERLOAD _CLC_DEF half __clc_erf(half x) {
#endif
#define __CLC_FUNCTION __clc_erf
-#define __CLC_BODY <clc/shared/unary_def_scalarize.inc>
+#define __CLC_BODY <clc/shared/unary_def_scalarize_loop.inc>
#include <clc/math/gentype.inc>

View File

@@ -518,5 +518,5 @@ _CLC_OVERLOAD _CLC_DEF half __clc_erfc(half x) {
#endif
#define __CLC_FUNCTION __clc_erfc
-#define __CLC_BODY <clc/shared/unary_def_scalarize.inc>
+#define __CLC_BODY <clc/shared/unary_def_scalarize_loop.inc>
#include <clc/math/gentype.inc>