[libc] Improve memcmp latency and codegen
This is based on ideas from @nafi to:
- use a branchless version of 'cmp' for 'uint32_t',
- completely resolve the lexicographic comparison through vector operations when wide types are available.

We also get rid of byte reloads and serializing '__builtin_ctzll'.

I did not include the suggestion to replace comparisons of 'uint16_t' with two 'uint8_t' as it did not seem to help the codegen. This can be revisited in subsequent patches.

The code has been rewritten to reduce nested function calls, making the job of the inliner easier and preventing harmful code duplication.

Reviewed By: nafi3000

Differential Revision: https://reviews.llvm.org/D148717
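For reference, the "branchless version of 'cmp' for 'uint32_t'" boils down to comparing values already converted to big-endian (lexicographic) order and deriving -1/0/+1 from flags instead of branches. A minimal sketch of the idea, not the exact 'cmp_uint32_t' helper from memory_utils/utils.h:

    #include <cstdint>

    // Sketch: (a > b) and (a < b) are 0/1 flags, so their difference is the
    // -1/0/+1 memcmp-style result with no data-dependent branch. Inputs are
    // assumed to already be in big-endian byte order.
    static inline int32_t cmp_u32(uint32_t a, uint32_t b) {
      return static_cast<int32_t>(a > b) - static_cast<int32_t>(a < b);
    }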
@@ -45,6 +45,10 @@
 #define LIBC_TARGET_ARCH_IS_AARCH64
 #endif

+#if (defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_ARM))
+#define LIBC_TARGET_ARCH_IS_ANY_ARM
+#endif
+
 #if defined(__riscv) && (__riscv_xlen == 64)
 #define LIBC_TARGET_ARCH_IS_RISCV64
 #endif
@@ -53,8 +57,9 @@
 #define LIBC_TARGET_ARCH_IS_RISCV32
 #endif

-#if (defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_ARM))
-#define LIBC_TARGET_ARCH_IS_ANY_ARM
+#if (defined(LIBC_TARGET_ARCH_IS_RISCV64) || \
+     defined(LIBC_TARGET_ARCH_IS_RISCV32))
+#define LIBC_TARGET_ARCH_IS_ANY_RISCV
 #endif

 #endif // LLVM_LIBC_SUPPORT_MACROS_PROPERTIES_ARCHITECTURES_H
@@ -450,6 +450,12 @@ function(add_implementation name impl_name)
     endforeach()
   endif()

+  if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
+    # Prevent warning when passing x86 SIMD types as template arguments.
+    # e.g. "warning: ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]"
+    list(APPEND ADD_IMPL_COMPILE_OPTIONS "-Wno-ignored-attributes")
+  endif()
+
   add_entrypoint_object(${impl_name}
     NAME ${name}
     SRCS ${ADD_IMPL_SRCS}
@@ -564,7 +570,7 @@ endfunction()
 if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   add_memcpy(memcpy_x86_64_opt_sse2 COMPILE_OPTIONS -march=k8 REQUIRE SSE2)
   add_memcpy(memcpy_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
-  add_memcpy(memcpy_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
+  add_memcpy(memcpy_x86_64_opt_avx COMPILE_OPTIONS -march=sandybridge REQUIRE AVX)
   add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
   add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memcpy(memcpy)
@@ -24,6 +24,7 @@ add_header_library(
   libc.src.__support.CPP.type_traits
   libc.src.__support.macros.config
   libc.src.__support.macros.optimization
+  libc.src.__support.macros.properties.architectures
 )

 add_header_library(
@@ -19,31 +19,32 @@ namespace __llvm_libc {
 [[maybe_unused]] LIBC_INLINE MemcmpReturnType
 inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
   if (LIBC_UNLIKELY(count >= 384)) {
-    if (auto value = generic::Memcmp<16>::block(p1, p2))
+    if (auto value = generic::Memcmp<uint8x16_t>::block(p1, p2))
       return value;
     align_to_next_boundary<16, Arg::P1>(p1, p2, count);
   }
-  return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
+  return generic::Memcmp<uint8x16_t>::loop_and_tail(p1, p2, count);
 }

 [[maybe_unused]] LIBC_INLINE MemcmpReturnType
 inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) {
   if (LIBC_UNLIKELY(count >= 128)) { // [128, ∞]
-    if (auto value = generic::Memcmp<16>::block(p1, p2))
+    if (auto value = generic::Memcmp<uint8x16_t>::block(p1, p2))
       return value;
     align_to_next_boundary<16, Arg::P1>(p1, p2, count);
-    return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
+    return generic::Memcmp<uint8x16x2_t>::loop_and_tail(p1, p2, count);
   }
-  if (generic::Bcmp<16>::block(p1, p2)) // [16, 16]
-    return generic::Memcmp<16>::block(p1, p2);
+  if (generic::Bcmp<uint8x16_t>::block(p1, p2)) // [16, 16]
+    return generic::Memcmp<uint8x16_t>::block(p1, p2);
   if (count < 32) // [17, 31]
-    return generic::Memcmp<16>::tail(p1, p2, count);
-  if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
-    return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
+    return generic::Memcmp<uint8x16_t>::tail(p1, p2, count);
+  if (generic::Bcmp<uint8x16_t>::block(p1 + 16, p2 + 16)) // [32, 32]
+    return generic::Memcmp<uint8x16_t>::block(p1 + 16, p2 + 16);
   if (count < 64) // [33, 63]
-    return generic::Memcmp<32>::tail(p1, p2, count);
+    return generic::Memcmp<uint8x16x2_t>::tail(p1, p2, count);
   // [64, 127]
-  return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
+  return generic::Memcmp<uint8x16_t>::loop_and_tail(p1 + 32, p2 + 32,
+                                                    count - 32);
 }

 LIBC_INLINE MemcmpReturnType inline_memcmp_aarch64(CPtr p1, CPtr p2,
@@ -51,15 +52,15 @@ LIBC_INLINE MemcmpReturnType inline_memcmp_aarch64(CPtr p1, CPtr p2,
   if (count == 0)
     return MemcmpReturnType::ZERO();
   if (count == 1)
-    return generic::Memcmp<1>::block(p1, p2);
+    return generic::Memcmp<uint8_t>::block(p1, p2);
   if (count == 2)
-    return generic::Memcmp<2>::block(p1, p2);
+    return generic::Memcmp<uint16_t>::block(p1, p2);
   if (count == 3)
-    return generic::Memcmp<3>::block(p1, p2);
+    return generic::MemcmpSequence<uint16_t, uint8_t>::block(p1, p2);
   if (count <= 8)
-    return generic::Memcmp<4>::head_tail(p1, p2, count);
+    return generic::Memcmp<uint32_t>::head_tail(p1, p2, count);
   if (count <= 16)
-    return generic::Memcmp<8>::head_tail(p1, p2, count);
+    return generic::Memcmp<uint64_t>::head_tail(p1, p2, count);
   if constexpr (aarch64::kNeon)
     return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
   else
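A note on the 'head_tail' pattern used in the dispatch above: a size range such as [5, 8] is covered by two possibly overlapping fixed-size compares, one anchored at the first byte and one ending at the last. If the head blocks are equal, the overlapping bytes agree in both buffers, so the tail compare still yields the correct lexicographic answer. A minimal sketch of the idea ('cmp_u32_at' is an illustrative helper, not from the patch):

    #include <cstddef>
    #include <cstdint>

    // Sketch: memcmp of 5..8 bytes via two overlapping big-endian 4-byte
    // compares, mirroring generic::Memcmp<uint32_t>::head_tail.
    static inline int32_t memcmp_5_to_8(const char *p1, const char *p2,
                                        size_t count) {
      if (int32_t value = cmp_u32_at(p1, p2, 0)) // head: bytes [0, 4)
        return value;
      return cmp_u32_at(p1, p2, count - 4);      // tail: bytes [count-4, count)
    }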
@@ -15,6 +15,7 @@
 #include "src/string/memory_utils/op_aarch64.h"
 #include "src/string/memory_utils/op_builtin.h"
 #include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_riscv.h"
 #include "src/string/memory_utils/op_x86.h"

 #include <stddef.h> // size_t
@@ -22,21 +23,17 @@
 namespace __llvm_libc {

 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
-  LIBC_LOOP_NOUNROLL
-  for (; offset < count; ++offset)
-    if (p1[offset] != p2[offset])
-      return BcmpReturnType::NONZERO();
-  return BcmpReturnType::ZERO();
+inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) {
+  return generic::Bcmp<uint8_t>::loop_and_tail_offset(p1, p2, count, offset);
 }

 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
   constexpr size_t kAlign = sizeof(uint64_t);
   if (count <= 2 * kAlign)
-    return inline_bcmp_byte_per_byte(p1, p2, 0, count);
+    return inline_bcmp_byte_per_byte(p1, p2, count);
   size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
-  if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+  if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
     return value;
   size_t offset = bytes_to_p1_align;
   size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -55,16 +52,16 @@ inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
     if (a != b)
       return BcmpReturnType::NONZERO();
   }
-  return inline_bcmp_byte_per_byte(p1, p2, offset, count);
+  return inline_bcmp_byte_per_byte(p1, p2, count, offset);
 }

 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
   constexpr size_t kAlign = sizeof(uint32_t);
   if (count <= 2 * kAlign)
-    return inline_bcmp_byte_per_byte(p1, p2, 0, count);
+    return inline_bcmp_byte_per_byte(p1, p2, count);
   size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
-  if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+  if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
    return value;
   size_t offset = bytes_to_p1_align;
   size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -80,89 +77,82 @@ inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
     if (a != b)
       return BcmpReturnType::NONZERO();
   }
-  return inline_bcmp_byte_per_byte(p1, p2, offset, count);
+  return inline_bcmp_byte_per_byte(p1, p2, count, offset);
 }

 #if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
-  if (count < 256)
-    return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
-  if (auto value = generic::Bcmp<64>::block(p1, p2))
-    return value;
-  align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
+  return generic::Bcmp<uint64_t>::loop_and_tail_align_above(256, p1, p2, count);
 }
 #endif // defined(LIBC_TARGET_ARCH_IS_X86) ||
        // defined(LIBC_TARGET_ARCH_IS_AARCH64)

 #if defined(LIBC_TARGET_ARCH_IS_X86)
+#if defined(__SSE4_1__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
+inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
-  if (count < 256)
-    return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
-  if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
-    return value;
-  align_to_next_boundary<16, Arg::P1>(p1, p2, count);
-  return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+  return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
 }
+#endif // __SSE4_1__

+#if defined(__AVX__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
+inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
   if (count <= 64)
-    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
-  if (count <= 128)
-    return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
-  if (LIBC_UNLIKELY(count >= 256)) {
-    if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
-      return value;
-    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  }
-  return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
+    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
+  return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
 }
+#endif // __AVX__

+#if defined(__AVX512BW__)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
   if (count <= 64)
-    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
   if (count <= 128)
-    return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
-  if (LIBC_UNLIKELY(count >= 256)) {
-    if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
-      return value;
-    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  }
-  return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
+    return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
+  return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
 }
 #endif // __AVX512BW__

 [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
                                                             size_t count) {
   if (count == 0)
     return BcmpReturnType::ZERO();
   if (count == 1)
-    return generic::Bcmp<1>::block(p1, p2);
+    return generic::Bcmp<uint8_t>::block(p1, p2);
   if (count == 2)
-    return generic::Bcmp<2>::block(p1, p2);
-  if (count <= 4)
-    return generic::Bcmp<2>::head_tail(p1, p2, count);
-  if (count <= 8)
-    return generic::Bcmp<4>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint16_t>::block(p1, p2);
+  if (count == 3)
+    return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
+  if (count == 4)
+    return generic::Bcmp<uint32_t>::block(p1, p2);
+  if (count == 5)
+    return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
+  if (count == 6)
+    return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
+  if (count == 7)
+    return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
+  if (count == 8)
+    return generic::Bcmp<uint64_t>::block(p1, p2);
   if (count <= 16)
-    return generic::Bcmp<8>::head_tail(p1, p2, count);
-  if constexpr (x86::kAvx512BW)
-    return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
-  else if constexpr (x86::kAvx2)
-    return inline_bcmp_x86_avx2_gt16(p1, p2, count);
-  else if constexpr (x86::kSse2)
-    return inline_bcmp_x86_sse2_gt16(p1, p2, count);
-  else
-    return inline_bcmp_generic_gt16(p1, p2, count);
+    return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
+#if defined(__AVX512BW__)
+  return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
+#elif defined(__AVX__)
+  return inline_bcmp_x86_avx_gt16(p1, p2, count);
+#elif defined(__SSE4_1__)
+  return inline_bcmp_x86_sse41_gt16(p1, p2, count);
#else
+  return inline_bcmp_generic_gt16(p1, p2, count);
+#endif
 }
 #endif // defined(LIBC_TARGET_ARCH_IS_X86)
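The 'BcmpSequence' dispatch above decomposes an odd count into power-of-two chunks, e.g. 'BcmpSequence<uint32_t, uint16_t, uint8_t>' for exactly 7 bytes. The real template short-circuits after each chunk; a self-contained sketch of the same decomposition (OR-ing instead of early returns, which is equivalent for bcmp semantics):

    #include <cstdint>
    #include <cstring>

    // Sketch: bcmp of exactly 7 bytes as 4 + 2 + 1 chunks. XOR flags any
    // differing bits per chunk; the OR of all chunks is zero iff equal.
    static inline uint32_t bcmp7(const unsigned char *p1,
                                 const unsigned char *p2) {
      uint32_t a4, b4;
      uint16_t a2, b2;
      std::memcpy(&a4, p1, 4);
      std::memcpy(&b4, p2, 4);
      std::memcpy(&a2, p1 + 4, 2);
      std::memcpy(&b2, p2 + 4, 2);
      return (a4 ^ b4) | static_cast<uint32_t>(a2 ^ b2) |
             static_cast<uint32_t>(p1[6] ^ p2[6]);
    }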
@@ -178,19 +168,19 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   case 0:
     return BcmpReturnType::ZERO();
   case 1:
-    return generic::Bcmp<1>::block(p1, p2);
+    return generic::Bcmp<uint8_t>::block(p1, p2);
   case 2:
-    return generic::Bcmp<2>::block(p1, p2);
+    return generic::Bcmp<uint16_t>::block(p1, p2);
   case 3:
-    return generic::Bcmp<2>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint16_t>::head_tail(p1, p2, count);
   case 4:
-    return generic::Bcmp<4>::block(p1, p2);
+    return generic::Bcmp<uint32_t>::block(p1, p2);
   case 5:
   case 6:
   case 7:
-    return generic::Bcmp<4>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint32_t>::head_tail(p1, p2, count);
   case 8:
-    return generic::Bcmp<8>::block(p1, p2);
+    return generic::Bcmp<uint64_t>::block(p1, p2);
   case 9:
   case 10:
   case 11:
@@ -198,7 +188,7 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   case 12:
   case 13:
   case 14:
   case 15:
-    return generic::Bcmp<8>::head_tail(p1, p2, count);
+    return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
   }
 }
@@ -225,7 +215,7 @@ LIBC_INLINE BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
 #elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
   return inline_bcmp_aligned_access_32bit(p1, p2, count);
 #else
-  return inline_bcmp_byte_per_byte(p1, p2, 0, count);
+  return inline_bcmp_byte_per_byte(p1, p2, count);
 #endif
 }
@@ -13,6 +13,7 @@
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY LIBC_LOOP_NOUNROLL
 #include "src/__support/macros/properties/architectures.h"
 #include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_riscv.h"
 #include "src/string/memory_utils/utils.h" // CPtr MemcmpReturnType

 #include <stddef.h> // size_t
@@ -26,21 +27,17 @@
 namespace __llvm_libc {

 [[maybe_unused]] LIBC_INLINE MemcmpReturnType
-inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
-  LIBC_LOOP_NOUNROLL
-  for (; offset < count; ++offset)
-    if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
-      return value;
-  return MemcmpReturnType::ZERO();
+inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) {
+  return generic::Memcmp<uint8_t>::loop_and_tail_offset(p1, p2, count, offset);
 }

 [[maybe_unused]] LIBC_INLINE MemcmpReturnType
 inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
   constexpr size_t kAlign = sizeof(uint64_t);
   if (count <= 2 * kAlign)
-    return inline_memcmp_byte_per_byte(p1, p2, 0, count);
+    return inline_memcmp_byte_per_byte(p1, p2, count);
   size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
-  if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+  if (auto value = inline_memcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
     return value;
   size_t offset = bytes_to_p1_align;
   size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -56,21 +53,20 @@ inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
       b = load64_aligned<uint8_t, uint16_t, uint16_t, uint16_t, uint8_t>(
           p2, offset);
     uint64_t a = load64_aligned<uint64_t>(p1, offset);
-    if (a != b) {
-      // TODO use cmp_neq_uint64_t from D148717 once it's submitted.
-      return Endian::to_big_endian(a) < Endian::to_big_endian(b) ? -1 : 1;
-    }
+    if (a != b)
+      return cmp_neq_uint64_t(Endian::to_big_endian(a),
+                              Endian::to_big_endian(b));
   }
-  return inline_memcmp_byte_per_byte(p1, p2, offset, count);
+  return inline_memcmp_byte_per_byte(p1, p2, count, offset);
 }

 [[maybe_unused]] LIBC_INLINE MemcmpReturnType
 inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
   constexpr size_t kAlign = sizeof(uint32_t);
   if (count <= 2 * kAlign)
-    return inline_memcmp_byte_per_byte(p1, p2, 0, count);
+    return inline_memcmp_byte_per_byte(p1, p2, count);
   size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
-  if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+  if (auto value = inline_memcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
     return value;
   size_t offset = bytes_to_p1_align;
   size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -83,16 +79,10 @@ inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
     else
       b = load32_aligned<uint8_t, uint16_t, uint8_t>(p2, offset);
     uint32_t a = load32_aligned<uint32_t>(p1, offset);
-    if (a != b) {
-      // TODO use cmp_uint32_t from D148717 once it's submitted.
-      // We perform the difference as an uint64_t.
-      const int64_t diff = static_cast<int64_t>(Endian::to_big_endian(a)) -
-                           static_cast<int64_t>(Endian::to_big_endian(b));
-      // And reduce the uint64_t into an uint32_t.
-      return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
-    }
+    if (a != b)
+      return cmp_uint32_t(Endian::to_big_endian(a), Endian::to_big_endian(b));
   }
-  return inline_memcmp_byte_per_byte(p1, p2, offset, count);
+  return inline_memcmp_byte_per_byte(p1, p2, count, offset);
 }

 LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
@@ -105,7 +95,7 @@ LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
 #elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
   return inline_memcmp_aligned_access_32bit(p1, p2, count);
 #else
-  return inline_memcmp_byte_per_byte(p1, p2, 0, count);
+  return inline_memcmp_byte_per_byte(p1, p2, count);
 #endif
 }
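The 'Endian::to_big_endian' conversions above matter for correctness, not just speed: on a little-endian host a native integer comparison orders multi-byte values backwards relative to memcmp. A tiny self-contained demonstration (assumes a little-endian host and a GCC/Clang builtin):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      const unsigned char a[2] = {0x01, 0x00}; // memcmp: a > b (first byte wins)
      const unsigned char b[2] = {0x00, 0x02};
      uint16_t ua, ub;
      std::memcpy(&ua, a, 2); // loads as 0x0001 on a little-endian host
      std::memcpy(&ub, b, 2); // loads as 0x0200 on a little-endian host
      std::printf("native     a < b: %d\n", ua < ub); // 1: wrong order
      std::printf("big-endian a < b: %d\n",
                  __builtin_bswap16(ua) < __builtin_bswap16(ub)); // 0: matches memcmp
    }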
@@ -38,17 +38,17 @@ LIBC_INLINE void inline_memmove(Ptr dst, CPtr src, size_t count) {
 #if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
 #if defined(LIBC_TARGET_ARCH_IS_X86)
 #if defined(__AVX512F__)
-  using uint128_t = uint8x16_t;
-  using uint256_t = uint8x32_t;
-  using uint512_t = uint8x64_t;
+  using uint128_t = generic_v128;
+  using uint256_t = generic_v256;
+  using uint512_t = generic_v512;
 #elif defined(__AVX__)
-  using uint128_t = uint8x16_t;
-  using uint256_t = uint8x32_t;
-  using uint512_t = cpp::array<uint8x32_t, 2>;
+  using uint128_t = generic_v128;
+  using uint256_t = generic_v256;
+  using uint512_t = cpp::array<generic_v256, 2>;
 #elif defined(__SSE2__)
-  using uint128_t = uint8x16_t;
-  using uint256_t = cpp::array<uint8x16_t, 2>;
-  using uint512_t = cpp::array<uint8x16_t, 4>;
+  using uint128_t = generic_v128;
+  using uint256_t = cpp::array<generic_v128, 2>;
+  using uint512_t = cpp::array<generic_v128, 4>;
 #else
   using uint128_t = cpp::array<uint64_t, 2>;
   using uint256_t = cpp::array<uint64_t, 4>;
@@ -56,9 +56,9 @@ LIBC_INLINE void inline_memmove(Ptr dst, CPtr src, size_t count) {
 #endif
 #elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
   static_assert(aarch64::kNeon, "aarch64 supports vector types");
-  using uint128_t = uint8x16_t;
-  using uint256_t = uint8x32_t;
-  using uint512_t = uint8x64_t;
+  using uint128_t = generic_v128;
+  using uint256_t = generic_v256;
+  using uint512_t = generic_v512;
 #endif
   if (count == 0)
     return;
@@ -60,17 +60,17 @@ inline_memset_aligned_access_64bit(Ptr dst, uint8_t value, size_t count) {
 [[maybe_unused]] LIBC_INLINE static void
 inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
 #if defined(__AVX512F__)
-  using uint128_t = uint8x16_t;
-  using uint256_t = uint8x32_t;
-  using uint512_t = uint8x64_t;
+  using uint128_t = generic_v128;
+  using uint256_t = generic_v256;
+  using uint512_t = generic_v512;
 #elif defined(__AVX__)
-  using uint128_t = uint8x16_t;
-  using uint256_t = uint8x32_t;
-  using uint512_t = cpp::array<uint8x32_t, 2>;
+  using uint128_t = generic_v128;
+  using uint256_t = generic_v256;
+  using uint512_t = cpp::array<generic_v256, 2>;
 #elif defined(__SSE2__)
-  using uint128_t = uint8x16_t;
-  using uint256_t = cpp::array<uint8x16_t, 2>;
-  using uint512_t = cpp::array<uint8x16_t, 4>;
+  using uint128_t = generic_v128;
+  using uint256_t = cpp::array<generic_v128, 2>;
+  using uint512_t = cpp::array<generic_v128, 4>;
 #else
   using uint128_t = cpp::array<uint64_t, 2>;
   using uint256_t = cpp::array<uint64_t, 4>;
@@ -106,9 +106,9 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
 [[maybe_unused]] LIBC_INLINE static void
 inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) {
   static_assert(aarch64::kNeon, "aarch64 supports vector types");
-  using uint128_t = uint8x16_t;
-  using uint256_t = uint8x32_t;
-  using uint512_t = uint8x64_t;
+  using uint128_t = generic_v128;
+  using uint256_t = generic_v256;
+  using uint512_t = generic_v512;
   if (count == 0)
     return;
   if (count <= 3) {
@@ -48,7 +48,7 @@ struct BzeroCacheLine {
       offset += SIZE;
     } while (offset < count - SIZE);
     // Unaligned store, we can't use 'dc zva' here.
-    generic::Memset<uint8x64_t>::tail(dst, value, count);
+    generic::Memset<generic_v512>::tail(dst, value, count);
   }
 };
@@ -171,6 +171,100 @@ template <size_t Size> struct Bcmp {

 } // namespace __llvm_libc::aarch64

+namespace __llvm_libc::generic {
+
+///////////////////////////////////////////////////////////////////////////////
+// Specializations for uint16_t
+template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
+template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
+  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
+}
+template <>
+LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
+  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
+}
+template <>
+LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
+  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
+         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Specializations for uint32_t
+template <> struct cmp_is_expensive<uint32_t> : cpp::false_type {};
+template <>
+LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
+  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
+}
+template <>
+LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
+  const auto a = load_be<uint32_t>(p1, offset);
+  const auto b = load_be<uint32_t>(p2, offset);
+  return a > b ? 1 : a < b ? -1 : 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Specializations for uint64_t
+template <> struct cmp_is_expensive<uint64_t> : cpp::false_type {};
+template <>
+LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
+  return load<uint64_t>(p1, offset) != load<uint64_t>(p2, offset);
+}
+template <>
+LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
+  const auto a = load_be<uint64_t>(p1, offset);
+  const auto b = load_be<uint64_t>(p2, offset);
+  if (a != b)
+    return a > b ? 1 : -1;
+  return MemcmpReturnType::ZERO();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Specializations for uint8x16_t
+template <> struct is_vector<uint8x16_t> : cpp::true_type {};
+template <> struct cmp_is_expensive<uint8x16_t> : cpp::false_type {};
+template <>
+LIBC_INLINE uint32_t neq<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
+  for (size_t i = 0; i < 2; ++i) {
+    auto a = load<uint64_t>(p1, offset);
+    auto b = load<uint64_t>(p2, offset);
+    uint32_t cond = a != b;
+    if (cond)
+      return cond;
+    offset += sizeof(uint64_t);
+  }
+  return 0;
+}
+template <>
+LIBC_INLINE MemcmpReturnType cmp<uint8x16_t>(CPtr p1, CPtr p2, size_t offset) {
+  for (size_t i = 0; i < 2; ++i) {
+    auto a = load_be<uint64_t>(p1, offset);
+    auto b = load_be<uint64_t>(p2, offset);
+    if (a != b)
+      return cmp_neq_uint64_t(a, b);
+    offset += sizeof(uint64_t);
+  }
+  return MemcmpReturnType::ZERO();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Specializations for uint8x16x2_t
+template <> struct is_vector<uint8x16x2_t> : cpp::true_type {};
+template <> struct cmp_is_expensive<uint8x16x2_t> : cpp::false_type {};
+template <>
+LIBC_INLINE MemcmpReturnType cmp<uint8x16x2_t>(CPtr p1, CPtr p2,
+                                               size_t offset) {
+  for (size_t i = 0; i < 4; ++i) {
+    auto a = load_be<uint64_t>(p1, offset);
+    auto b = load_be<uint64_t>(p2, offset);
+    if (a != b)
+      return cmp_neq_uint64_t(a, b);
+    offset += sizeof(uint64_t);
+  }
+  return MemcmpReturnType::ZERO();
+}
+} // namespace __llvm_libc::generic
+
 #endif // LIBC_TARGET_ARCH_IS_AARCH64

 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
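The 'uint8x16_t' specializations above implement 'neq' and 'cmp' with two scalar 64-bit loads rather than vector reductions. For comparison, a pure NEON formulation of the 16-byte inequality test could look like the sketch below (illustrative only, not what the patch emits):

    #include <arm_neon.h>
    #include <cstdint>

    // Sketch: vceqq_u8 yields 0xFF lanes where bytes are equal; vminvq_u8
    // reduces across lanes, so the minimum is 0xFF iff all 16 bytes matched.
    static inline uint32_t neq16(const uint8_t *p1, const uint8_t *p2) {
      const uint8x16_t eq = vceqq_u8(vld1q_u8(p1), vld1q_u8(p2));
      return vminvq_u8(eq) != 0xFF;
    }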
@@ -33,31 +33,43 @@

 #include <stdint.h>

+static_assert((UINTPTR_MAX == 4294967295U) ||
+                  (UINTPTR_MAX == 18446744073709551615UL),
+              "We currently only support 32- or 64-bit platforms");
+
+#if defined(UINT64_MAX)
+#define LLVM_LIBC_HAS_UINT64
+#endif
+
 namespace __llvm_libc {
 // Compiler types using the vector attributes.
-using uint8x1_t = uint8_t __attribute__((__vector_size__(1)));
-using uint8x2_t = uint8_t __attribute__((__vector_size__(2)));
-using uint8x4_t = uint8_t __attribute__((__vector_size__(4)));
-using uint8x8_t = uint8_t __attribute__((__vector_size__(8)));
-using uint8x16_t = uint8_t __attribute__((__vector_size__(16)));
-using uint8x32_t = uint8_t __attribute__((__vector_size__(32)));
-using uint8x64_t = uint8_t __attribute__((__vector_size__(64)));
+using generic_v128 = uint8_t __attribute__((__vector_size__(16)));
+using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
+using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
 } // namespace __llvm_libc

 namespace __llvm_libc::generic {

 // We accept three types of values as elements for generic operations:
-// - scalar : unsigned integral types
-// - vector : compiler types using the vector attributes
+// - scalar : unsigned integral types,
+// - vector : compiler types using the vector attributes or platform builtins,
 // - array : a cpp::array<T, N> where T is itself either a scalar or a vector.
 // The following traits help discriminate between these cases.
-template <typename T>
-constexpr bool is_scalar_v = cpp::is_integral_v<T> && cpp::is_unsigned_v<T>;
-
-template <typename T>
-constexpr bool is_vector_v =
-    cpp::details::is_unqualified_any_of<T, uint8x1_t, uint8x2_t, uint8x4_t,
-                                        uint8x8_t, uint8x16_t, uint8x32_t,
-                                        uint8x64_t>();
+template <typename T> struct is_scalar : cpp::false_type {};
+template <> struct is_scalar<uint8_t> : cpp::true_type {};
+template <> struct is_scalar<uint16_t> : cpp::true_type {};
+template <> struct is_scalar<uint32_t> : cpp::true_type {};
+#ifdef LLVM_LIBC_HAS_UINT64
+template <> struct is_scalar<uint64_t> : cpp::true_type {};
+#endif // LLVM_LIBC_HAS_UINT64
+template <typename T> constexpr bool is_scalar_v = is_scalar<T>::value;
+
+template <typename T> struct is_vector : cpp::false_type {};
+template <> struct is_vector<generic_v128> : cpp::true_type {};
+template <> struct is_vector<generic_v256> : cpp::true_type {};
+template <> struct is_vector<generic_v512> : cpp::true_type {};
+template <typename T> constexpr bool is_vector_v = is_vector<T>::value;

 template <class T> struct is_array : cpp::false_type {};
 template <class T, size_t N> struct is_array<cpp::array<T, N>> {
@@ -69,7 +81,7 @@ template <typename T>
 constexpr bool is_element_type_v =
     is_scalar_v<T> || is_vector_v<T> || is_array_v<T>;

-//
+// Helper struct to retrieve the number of elements of an array.
 template <class T> struct array_size {};
 template <class T, size_t N>
 struct array_size<cpp::array<T, N>> : cpp::integral_constant<size_t, N> {};
@@ -114,105 +126,15 @@ template <typename T> T splat(uint8_t value) {
   }
 }

-static_assert((UINTPTR_MAX == 4294967295U) ||
-                  (UINTPTR_MAX == 18446744073709551615UL),
-              "We currently only support 32- or 64-bit platforms");
-
-#if defined(LIBC_TARGET_ARCH_IS_X86_64) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
-#define LLVM_LIBC_HAS_UINT64
-#endif
-
-namespace details {
-// Checks that each type is sorted in strictly decreasing order of size.
-// i.e. sizeof(First) > sizeof(Second) > ... > sizeof(Last)
-template <typename First> constexpr bool is_decreasing_size() {
-  return sizeof(First) == 1;
-}
-template <typename First, typename Second, typename... Next>
-constexpr bool is_decreasing_size() {
-  if constexpr (sizeof...(Next) > 0)
-    return sizeof(First) > sizeof(Second) && is_decreasing_size<Next...>();
-  else
-    return sizeof(First) > sizeof(Second) && is_decreasing_size<Second>();
-}
-
-template <size_t Size, typename... Ts> struct Largest;
-template <size_t Size> struct Largest<Size> : cpp::type_identity<uint8_t> {};
-template <size_t Size, typename T, typename... Ts>
-struct Largest<Size, T, Ts...> {
-  using next = Largest<Size, Ts...>;
-  using type = cpp::conditional_t<(Size >= sizeof(T)), T, typename next::type>;
-};
-
-} // namespace details
-
-// 'SupportedTypes' holds a list of natively supported types.
-// The types are instanciations of ScalarType or VectorType.
-// They should be ordered in strictly decreasing order.
-// The 'TypeFor<Size>' type retrieves is the largest supported type that can
-// handle 'Size' bytes. e.g.
-//
-// using ST = SupportedTypes<ScalarType<uint16_t>, ScalarType<uint8_t>>;
-// using Type = ST::TypeFor<10>;
-// static_assert(cpp:is_same_v<Type, ScalarType<uint16_t>>);
-
-template <typename First, typename... Ts> struct SupportedTypes {
-  static_assert(details::is_decreasing_size<First, Ts...>());
-
-  using MaxType = First;
-
-  template <size_t Size>
-  using TypeFor = typename details::Largest<Size, First, Ts...>::type;
-};
-
-// Map from sizes to structures offering static load, store and splat methods.
-// Note: On platforms lacking vector support, we use the ArrayType below and
-// decompose the operation in smaller pieces.
-
-// Lists a generic native types to use for Memset and Memmove operations.
-// TODO: Inject the native types within Memset and Memmove depending on the
-// target architectures and derive MaxSize from it.
-using NativeTypeMap = SupportedTypes<uint8x64_t, //
-                                     uint8x32_t, //
-                                     uint8x16_t,
-#if defined(LLVM_LIBC_HAS_UINT64)
-                                     uint64_t, // Not available on 32bit
-#endif
-                                     uint32_t, //
-                                     uint16_t, //
-                                     uint8_t>;
-
-namespace details {
-
-// Helper to test if a type is void.
-template <typename T> inline constexpr bool is_void_v = cpp::is_same_v<T, void>;
-
-// In case the 'Size' is not supported we can fall back to a sequence of smaller
-// operations using the largest natively supported type.
-template <size_t Size, size_t MaxSize> static constexpr bool useArrayType() {
-  return (Size > MaxSize) && ((Size % MaxSize) == 0) &&
-         !details::is_void_v<NativeTypeMap::TypeFor<MaxSize>>;
-}
-
-// Compute the type to handle an operation of 'Size' bytes knowing that the
-// underlying platform only support native types up to MaxSize bytes.
-template <size_t Size, size_t MaxSize>
-using getTypeFor = cpp::conditional_t<
-    useArrayType<Size, MaxSize>(),
-    cpp::array<NativeTypeMap::TypeFor<MaxSize>, Size / MaxSize>,
-    NativeTypeMap::TypeFor<Size>>;
-
-} // namespace details
-
 ///////////////////////////////////////////////////////////////////////////////
 // Memset
 ///////////////////////////////////////////////////////////////////////////////

 template <typename T> struct Memset {
+  static_assert(is_element_type_v<T>);
   static constexpr size_t SIZE = sizeof(T);

   LIBC_INLINE static void block(Ptr dst, uint8_t value) {
-    static_assert(is_element_type_v<T>);
     if constexpr (is_scalar_v<T> || is_vector_v<T>) {
       store<T>(dst, splat<T>(value));
     } else if constexpr (is_array_v<T>) {
@@ -247,9 +169,8 @@ template <typename T, typename... TS> struct MemsetSequence {
   static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
   LIBC_INLINE static void block(Ptr dst, uint8_t value) {
     Memset<T>::block(dst, value);
-    if constexpr (sizeof...(TS) > 0) {
+    if constexpr (sizeof...(TS) > 0)
       return MemsetSequence<TS...>::block(dst + sizeof(T), value);
-    }
   }
 };
@@ -258,6 +179,7 @@ template <typename T, typename... TS> struct MemsetSequence {
 ///////////////////////////////////////////////////////////////////////////////

 template <typename T> struct Memmove {
+  static_assert(is_element_type_v<T>);
   static constexpr size_t SIZE = sizeof(T);

   LIBC_INLINE static void block(Ptr dst, CPtr src) {
@@ -390,136 +312,257 @@ template <typename T> struct Memmove {
 };

 ///////////////////////////////////////////////////////////////////////////////
-// Bcmp
+// Low level operations for Bcmp and Memcmp that operate on memory locations.
 ///////////////////////////////////////////////////////////////////////////////
-template <size_t Size> struct Bcmp {
-  static constexpr size_t SIZE = Size;
-  static constexpr size_t MaxSize = LLVM_LIBC_IS_DEFINED(LLVM_LIBC_HAS_UINT64)
-                                        ? sizeof(uint64_t)
-                                        : sizeof(uint32_t);
-
-  template <typename T> LIBC_INLINE static uint32_t load_xor(CPtr p1, CPtr p2) {
-    static_assert(sizeof(T) <= sizeof(uint32_t));
-    return load<T>(p1) ^ load<T>(p2);
-  }
+
+// Same as load above but with an offset to the pointer.
+// Making the offset explicit hints the compiler to use relevant addressing mode
+// consistently.
+template <typename T> LIBC_INLINE static T load(CPtr ptr, size_t offset) {
+  return ::__llvm_libc::load<T>(ptr + offset);
+}

-  template <typename T>
-  LIBC_INLINE static uint32_t load_not_equal(CPtr p1, CPtr p2) {
-    return load<T>(p1) != load<T>(p2);
-  }
+// Same as above but also makes sure the loaded value is in big endian format.
+// This is useful when implementing lexicograhic comparisons as big endian
+// scalar comparison directly maps to lexicographic byte comparisons.
+template <typename T> LIBC_INLINE static T load_be(CPtr ptr, size_t offset) {
+  return Endian::to_big_endian(load<T>(ptr, offset));
+}

-  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
-    if constexpr (Size == 1) {
-      return load_xor<uint8_t>(p1, p2);
-    } else if constexpr (Size == 2) {
-      return load_xor<uint16_t>(p1, p2);
-    } else if constexpr (Size == 4) {
-      return load_xor<uint32_t>(p1, p2);
-    } else if constexpr (Size == 8) {
-      return load_not_equal<uint64_t>(p1, p2);
-    } else if constexpr (details::useArrayType<Size, MaxSize>()) {
-      for (size_t offset = 0; offset < Size; offset += MaxSize)
-        if (auto value = Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
-          return value;
-    } else {
-      deferred_static_assert("Unimplemented Size");
-    }
-    return BcmpReturnType::ZERO();
-  }
+// Equality: returns true iff values at locations (p1 + offset) and (p2 +
+// offset) compare equal.
+template <typename T> static bool eq(CPtr p1, CPtr p2, size_t offset);

-  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
-    return block(p1 + count - SIZE, p2 + count - SIZE);
-  }
+// Not equals: returns non-zero iff values at locations (p1 + offset) and (p2 +
+// offset) differ.
+template <typename T> static uint32_t neq(CPtr p1, CPtr p2, size_t offset);

-  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
-    return block(p1, p2) | tail(p1, p2, count);
-  }
+// Lexicographic comparison:
+// - returns 0 iff values at locations (p1 + offset) and (p2 + offset) compare
+//   equal.
+// - returns a negative value if value at location (p1 + offset) is
+//   lexicographically less than value at (p2 + offset).
+// - returns a positive value if value at location (p1 + offset) is
+//   lexicographically greater than value at (p2 + offset).
+template <typename T>
+static MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset);

-  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
-                                                  size_t count) {
-    static_assert(Size > 1, "a loop of size 1 does not need tail");
-    size_t offset = 0;
-    do {
-      if (auto value = block(p1 + offset, p2 + offset))
-        return value;
-      offset += SIZE;
-    } while (offset < count - SIZE);
-    return tail(p1, p2, count);
-  }
-};
+// Lexicographic comparison of non-equal values:
+// - returns a negative value if value at location (p1 + offset) is
+//   lexicographically less than value at (p2 + offset).
+// - returns a positive value if value at location (p1 + offset) is
+//   lexicographically greater than value at (p2 + offset).
+template <typename T>
+static MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, size_t offset);

 ///////////////////////////////////////////////////////////////////////////////
-// Memcmp
+// Memcmp implementation
+//
+// When building memcmp, not all types are considered equals.
+//
+// For instance, the lexicographic comparison of two uint8_t can be implemented
+// as a simple subtraction, but for wider operations the logic can be much more
+// involving, especially on little endian platforms.
+//
+// For such wider types it is a good strategy to test for equality first and
+// only do the expensive lexicographic comparison if necessary.
+//
+// Decomposing the algorithm like this for wider types allows us to have
+// efficient implementation of higher order functions like 'head_tail' or
+// 'loop_and_tail'.
 ///////////////////////////////////////////////////////////////////////////////
-template <size_t Size> struct Memcmp {
-  static constexpr size_t SIZE = Size;
-  static constexpr size_t MaxSize = LLVM_LIBC_IS_DEFINED(LLVM_LIBC_HAS_UINT64)
-                                        ? sizeof(uint64_t)
-                                        : sizeof(uint32_t);

-  template <typename T> LIBC_INLINE static T load_be(CPtr ptr) {
-    return Endian::to_big_endian(load<T>(ptr));
-  }
+// Type traits to decide whether we can use 'cmp' directly or if we need to
+// split the computation.
+template <typename T> struct cmp_is_expensive;

-  template <typename T>
-  LIBC_INLINE static MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) {
-    return load_be<T>(p1) - load_be<T>(p2);
-  }
+template <typename T> struct Memcmp {
+  static_assert(is_element_type_v<T>);
+  static constexpr size_t SIZE = sizeof(T);

-  template <typename T>
-  LIBC_INLINE static MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) {
-    const auto la = load_be<T>(p1);
-    const auto lb = load_be<T>(p2);
-    return la > lb ? 1 : la < lb ? -1 : 0;
-  }
-
-  LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
-    if constexpr (Size == 1) {
-      return load_be_diff<uint8_t>(p1, p2);
-    } else if constexpr (Size == 2) {
-      return load_be_diff<uint16_t>(p1, p2);
-    } else if constexpr (Size == 4) {
-      return load_be_cmp<uint32_t>(p1, p2);
-    } else if constexpr (Size == 8) {
-      return load_be_cmp<uint64_t>(p1, p2);
-    } else if constexpr (details::useArrayType<Size, MaxSize>()) {
-      for (size_t offset = 0; offset < Size; offset += MaxSize)
-        if (Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
-          return Memcmp<MaxSize>::block(p1 + offset, p2 + offset);
+private:
+  LIBC_INLINE static MemcmpReturnType block_offset(CPtr p1, CPtr p2,
+                                                   size_t offset) {
+    if constexpr (cmp_is_expensive<T>::value) {
+      if (!eq<T>(p1, p2, offset))
+        return cmp_neq<T>(p1, p2, offset);
+      return MemcmpReturnType::ZERO();
-    } else if constexpr (Size == 3) {
-      if (auto value = Memcmp<2>::block(p1, p2))
-        return value;
-      return Memcmp<1>::block(p1 + 2, p2 + 2);
     } else {
-      deferred_static_assert("Unimplemented Size");
+      return cmp<T>(p1, p2, offset);
     }
   }

+public:
+  LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
+    return block_offset(p1, p2, 0);
+  }
+
   LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
-    return block(p1 + count - SIZE, p2 + count - SIZE);
+    return block_offset(p1, p2, count - SIZE);
   }

   LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2,
                                                 size_t count) {
-    if (auto value = block(p1, p2))
-      return value;
+    if constexpr (cmp_is_expensive<T>::value) {
+      if (!eq<T>(p1, p2, 0))
+        return cmp_neq<T>(p1, p2, 0);
+    } else {
+      if (const auto value = cmp<T>(p1, p2, 0))
+        return value;
+    }
     return tail(p1, p2, count);
   }

   LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
                                                     size_t count) {
-    static_assert(Size > 1, "a loop of size 1 does not need tail");
-    size_t offset = 0;
-    do {
-      if (auto value = block(p1 + offset, p2 + offset))
-        return value;
+    return loop_and_tail_offset(p1, p2, count, 0);
+  }
+
+  LIBC_INLINE static MemcmpReturnType
+  loop_and_tail_offset(CPtr p1, CPtr p2, size_t count, size_t offset) {
+    if constexpr (SIZE > 1) {
+      const size_t limit = count - SIZE;
+      LIBC_LOOP_NOUNROLL
+      for (; offset < limit; offset += SIZE) {
+        if constexpr (cmp_is_expensive<T>::value) {
+          if (!eq<T>(p1, p2, offset))
+            return cmp_neq<T>(p1, p2, offset);
+        } else {
+          if (const auto value = cmp<T>(p1, p2, offset))
+            return value;
+        }
+      }
+      return block_offset(p1, p2, limit); // tail
+    } else {
+      // No need for a tail operation when SIZE == 1.
+      LIBC_LOOP_NOUNROLL
+      for (; offset < count; offset += SIZE)
+        if (auto value = cmp<T>(p1, p2, offset))
+          return value;
+      return MemcmpReturnType::ZERO();
+    }
+  }
+
+  LIBC_INLINE static MemcmpReturnType
+  loop_and_tail_align_above(size_t threshold, CPtr p1, CPtr p2, size_t count) {
+    const AlignHelper<sizeof(T)> helper(p1);
+    if (LIBC_UNLIKELY(count >= threshold) && helper.not_aligned()) {
+      if (auto value = block(p1, p2))
+        return value;
-      offset += SIZE;
-    } while (offset < count - SIZE);
-    return tail(p1, p2, count);
+      adjust(helper.offset(), p1, p2, count);
+    }
+    return loop_and_tail(p1, p2, count);
   }
 };

+template <typename T, typename... TS> struct MemcmpSequence {
+  static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
+  LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
+    // TODO: test suggestion in
+    // https://reviews.llvm.org/D148717?id=515724#inline-1446890
+    // once we have a proper way to check memory operation latency.
+    if constexpr (cmp_is_expensive<T>::value) {
+      if (!eq<T>(p1, p2, 0))
+        return cmp_neq<T>(p1, p2, 0);
+    } else {
+      if (auto value = cmp<T>(p1, p2, 0))
+        return value;
+    }
+    if constexpr (sizeof...(TS) > 0)
+      return MemcmpSequence<TS...>::block(p1 + sizeof(T), p2 + sizeof(T));
+    else
+      return MemcmpReturnType::ZERO();
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Bcmp
+///////////////////////////////////////////////////////////////////////////////
+template <typename T> struct Bcmp {
+  static_assert(is_element_type_v<T>);
+  static constexpr size_t SIZE = sizeof(T);
+
+  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
+    return neq<T>(p1, p2, 0);
+  }
+
+  LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+    const size_t tail_offset = count - SIZE;
+    return neq<T>(p1, p2, tail_offset);
+  }
+
+  LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+    if (const auto value = neq<T>(p1, p2, 0))
+      return value;
+    return tail(p1, p2, count);
+  }
+
+  LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
+                                                  size_t count) {
+    return loop_and_tail_offset(p1, p2, count, 0);
+  }
+
+  LIBC_INLINE static BcmpReturnType
+  loop_and_tail_offset(CPtr p1, CPtr p2, size_t count, size_t offset) {
+    if constexpr (SIZE > 1) {
+      const size_t limit = count - SIZE;
+      LIBC_LOOP_NOUNROLL
+      for (; offset < limit; offset += SIZE)
+        if (const auto value = neq<T>(p1, p2, offset))
+          return value;
+      return tail(p1, p2, count);
+    } else {
+      // No need for a tail operation when SIZE == 1.
+      LIBC_LOOP_NOUNROLL
+      for (; offset < count; offset += SIZE)
+        if (const auto value = neq<T>(p1, p2, offset))
+          return value;
+      return BcmpReturnType::ZERO();
+    }
+  }
+
+  LIBC_INLINE static BcmpReturnType
+  loop_and_tail_align_above(size_t threshold, CPtr p1, CPtr p2, size_t count) {
+    static_assert(SIZE > 1,
+                  "No need to align when processing one byte at a time");
+    const AlignHelper<sizeof(T)> helper(p1);
+    if (LIBC_UNLIKELY(count >= threshold) && helper.not_aligned()) {
+      if (auto value = block(p1, p2))
+        return value;
+      adjust(helper.offset(), p1, p2, count);
+    }
+    return loop_and_tail(p1, p2, count);
+  }
+};
+
+template <typename T, typename... TS> struct BcmpSequence {
+  static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
+  LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
+    if (auto value = neq<T>(p1, p2, 0))
+      return value;
+    if constexpr (sizeof...(TS) > 0)
+      return BcmpSequence<TS...>::block(p1 + sizeof(T), p2 + sizeof(T));
+    else
+      return BcmpReturnType::ZERO();
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Specializations for uint8_t
+template <> struct cmp_is_expensive<uint8_t> : public cpp::false_type {};
+template <> LIBC_INLINE bool eq<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
+  return load<uint8_t>(p1, offset) == load<uint8_t>(p2, offset);
+}
+template <> LIBC_INLINE uint32_t neq<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
+  return load<uint8_t>(p1, offset) ^ load<uint8_t>(p2, offset);
+}
+template <>
+LIBC_INLINE MemcmpReturnType cmp<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
+  return static_cast<int32_t>(load<uint8_t>(p1, offset)) -
+         static_cast<int32_t>(load<uint8_t>(p2, offset));
+}
+template <>
+LIBC_INLINE MemcmpReturnType cmp_neq<uint8_t>(CPtr p1, CPtr p2, size_t offset);
 } // namespace __llvm_libc::generic

 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
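As a usage note, the new 'MemcmpSequence' composes scalar compares for sizes no single native type covers: 'MemcmpSequence<uint16_t, uint8_t>' handles exactly 3 bytes, short-circuiting on the first differing chunk. A standalone sketch of the same folding (assumes a little-endian host for the byte swap):

    #include <cstdint>
    #include <cstring>

    // Sketch: 3-byte memcmp as a big-endian 2-byte compare followed by a
    // 1-byte compare, mirroring MemcmpSequence<uint16_t, uint8_t>.
    static inline int32_t memcmp3(const unsigned char *p1,
                                  const unsigned char *p2) {
      uint16_t a, b;
      std::memcpy(&a, p1, 2);
      std::memcpy(&b, p2, 2);
      if (int32_t value = static_cast<int32_t>(__builtin_bswap16(a)) -
                          static_cast<int32_t>(__builtin_bswap16(b)))
        return value;
      return static_cast<int32_t>(p1[2]) - static_cast<int32_t>(p2[2]);
    }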
libc/src/string/memory_utils/op_riscv.h (new file, 84 lines)
@@ -0,0 +1,84 @@
+//===-- RISC-V implementation of memory function building blocks ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides x86 specific building blocks to compose memory functions.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_RISCV_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_RISCV_H
+
+#include "src/__support/macros/properties/architectures.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
+
+#include "src/__support/common.h"
+#include "src/string/memory_utils/op_generic.h"
+
+namespace __llvm_libc::generic {
+
+///////////////////////////////////////////////////////////////////////////////
+// Specializations for uint16_t
+template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
+template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
+  return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
+}
+template <>
+LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
+  return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
+}
+template <>
+LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
+  return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
+         static_cast<int32_t>(load_be<uint16_t>(p2, offset));
+}
+template <>
+LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);
+
+///////////////////////////////////////////////////////////////////////////////
+// Specializations for uint32_t
+template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
+template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
+  return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
+}
+template <>
+LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
+  return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
+}
+template <>
+LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
+  const auto a = load_be<uint32_t>(p1, offset);
+  const auto b = load_be<uint32_t>(p2, offset);
+  return cmp_uint32_t(a, b);
+}
+template <>
+LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);
+
+///////////////////////////////////////////////////////////////////////////////
+// Specializations for uint64_t
+template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
+template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
+  return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
+}
+template <>
+LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
+  return !eq<uint64_t>(p1, p2, offset);
+}
+template <>
+LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
+template <>
+LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
+                                               size_t offset) {
+  const auto a = load_be<uint64_t>(p1, offset);
+  const auto b = load_be<uint64_t>(p2, offset);
+  return cmp_neq_uint64_t(a, b);
+}
+
+} // namespace __llvm_libc::generic
+
+#endif // LIBC_TARGET_ARCH_IS_ANY_RISCV
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_RISCV_H
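Note the asymmetry with the AArch64 specializations: here 'cmp_is_expensive<uint64_t>' is 'true_type' and 'cmp<uint64_t>' is declared but not defined, so callers must test 'eq' first and fall back to 'cmp_neq_uint64_t' only when the blocks are known to differ. For values known to be unequal, only the direction remains to compute; a hedged sketch of a branchless formulation (not the exact utils.h helper):

    #include <cstdint>

    // Sketch: -1/+1 for big-endian values known to be unequal. '(a < b)' is a
    // single set-less-than; '1 - 2 * flag' maps it to +1 or -1 branchlessly.
    static inline int32_t cmp_neq_u64(uint64_t a, uint64_t b) {
      return 1 - 2 * static_cast<int32_t>(a < b);
    }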
@@ -40,11 +40,13 @@
|
||||
namespace __llvm_libc::x86 {
|
||||
|
||||
// A set of constants to check compile time features.
|
||||
static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
|
||||
static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
|
||||
static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
|
||||
static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
|
||||
static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
|
||||
static LIBC_INLINE constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
|
||||
static LIBC_INLINE constexpr bool kSse41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
|
||||
static LIBC_INLINE constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
|
||||
static LIBC_INLINE constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
|
||||
static LIBC_INLINE constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
|
||||
static LIBC_INLINE constexpr bool kAvx512BW =
|
||||
LLVM_LIBC_IS_DEFINED(__AVX512BW__);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Memcpy repmovsb implementation
|
||||
@@ -54,221 +56,200 @@ struct Memcpy {
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Bcmp
|
||||
|
||||
// Base implementation for the Bcmp specializations.
// - BlockSize is either 16, 32, or 64, depending on the available compile-time
// features; it is used to switch between a "single native operation" and a
// "sequence of native operations".
// - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
static constexpr size_t SIZE = Size;
LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
if constexpr (Size == BlockSize) {
return BlockBcmp(p1, p2);
} else if constexpr (Size % BlockSize == 0) {
for (size_t offset = 0; offset < Size; offset += BlockSize)
if (auto value = BlockBcmp(p1 + offset, p2 + offset))
return value;
} else {
deferred_static_assert("SIZE not implemented");
}
return BcmpReturnType::ZERO();
}

LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
return block(p1 + count - Size, p2 + count - Size);
}

LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
return block(p1, p2) | tail(p1, p2, count);
}

LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
size_t count) {
static_assert(Size > 1, "a loop of size 1 does not need tail");
size_t offset = 0;
do {
if (auto value = block(p1 + offset, p2 + offset))
return value;
offset += Size;
} while (offset < count - Size);
return tail(p1, p2, count);
}
};
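// Editorial sketch (hypothetical instantiation, mirroring the aliases below):
// with Size == 32 and BlockSize == 16, 'block' takes the Size % BlockSize == 0
// branch and unrolls into two 16-byte probes.
//   using Bcmp32 = BcmpImpl<32, 16, bcmp16>; // i.e. x86::sse2::Bcmp<32>
//   BcmpReturnType r = Bcmp32::block(p1, p2); // non-zero iff the spans differ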

namespace sse2 {
LIBC_INLINE BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
#if defined(__SSE2__)
using T = char __attribute__((__vector_size__(16)));
// A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
const int mask =
_mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2)));
return static_cast<uint32_t>(mask);
#else
(void)p1;
(void)p2;
return BcmpReturnType::ZERO();
#endif // defined(__SSE2__)
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
} // namespace sse2

namespace avx2 {
LIBC_INLINE BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
#if defined(__AVX2__)
using T = char __attribute__((__vector_size__(32)));
// A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
const int mask =
_mm256_movemask_epi8(cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2)));
// _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit
// mask.
return static_cast<uint32_t>(mask);
#else
(void)p1;
(void)p2;
return BcmpReturnType::ZERO();
#endif // defined(__AVX2__)
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
} // namespace avx2

namespace avx512bw {
LIBC_INLINE BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
#if defined(__AVX512BW__)
using T = char __attribute__((__vector_size__(64)));
// A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
const uint64_t mask = _mm512_cmpneq_epi8_mask(
cpp::bit_cast<__m512i>(load<T>(p1)), cpp::bit_cast<__m512i>(load<T>(p2)));
const bool mask_is_set = mask != 0;
return static_cast<uint32_t>(mask_is_set);
#else
(void)p1;
(void)p2;
return BcmpReturnType::ZERO();
#endif // defined(__AVX512BW__)
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
} // namespace avx512bw

// Assuming that the mask is non-zero, the index of the first mismatching byte
// is the number of trailing zeros in the mask. We use trailing rather than
// leading zeros because the x86 architecture is little-endian.
LIBC_INLINE MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
uint64_t mask) {
const size_t diff_index = __builtin_ctzll(mask);
const int16_t ca = cpp::to_integer<uint8_t>(p1[diff_index]);
const int16_t cb = cpp::to_integer<uint8_t>(p2[diff_index]);
return ca - cb;
}
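// Editorial example (not part of the patch): if the buffers first differ at
// byte index 2, the mismatch mask has bit 2 as its lowest set bit, e.g.
// mask == 0b100 gives __builtin_ctzll(mask) == 2, and the bytes at index 2 are
// subtracted as int16_t so the difference cannot overflow.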

///////////////////////////////////////////////////////////////////////////////
// Memcmp

// Base implementation for the Memcmp specializations.
// - BlockSize is either 16, 32, or 64, depending on the available compile-time
// features; it is used to switch between a "single native operation" and a
// "sequence of native operations".
// - BlockMemcmp is the function that implements the memcmp logic.
// - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
struct MemcmpImpl {
static constexpr size_t SIZE = Size;
LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
if constexpr (Size == BlockSize) {
return BlockMemcmp(p1, p2);
} else if constexpr (Size % BlockSize == 0) {
for (size_t offset = 0; offset < Size; offset += BlockSize)
if (auto value = BlockBcmp(p1 + offset, p2 + offset))
return BlockMemcmp(p1 + offset, p2 + offset);
} else {
deferred_static_assert("SIZE not implemented");
}
return MemcmpReturnType::ZERO();
}

LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
return block(p1 + count - Size, p2 + count - Size);
}

LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2,
size_t count) {
if (auto value = block(p1, p2))
return value;
return tail(p1, p2, count);
}

LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
size_t count) {
static_assert(Size > 1, "a loop of size 1 does not need tail");
size_t offset = 0;
do {
if (auto value = block(p1 + offset, p2 + offset))
return value;
offset += Size;
} while (offset < count - Size);
return tail(p1, p2, count);
}
};
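// Editorial sketch (hypothetical instantiation): for sizes spanning several
// native blocks, 'block' probes each sub-block with the cheap equality check
// (BlockBcmp) and runs the costlier ordering computation (BlockMemcmp) only on
// the first differing sub-block.
//   using Memcmp64 = MemcmpImpl<64, 16, memcmp16, bcmp16>; // i.e. sse2::Memcmp<64>
//   MemcmpReturnType r = Memcmp64::block(p1, p2);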

namespace sse2 {
LIBC_INLINE MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
#if defined(__SSE2__)
using T = char __attribute__((__vector_size__(16)));
// A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
if (int mask =
_mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2))))
return char_diff_no_zero(p1, p2, mask);
return MemcmpReturnType::ZERO();
#else
(void)p1;
(void)p2;
return MemcmpReturnType::ZERO();
#endif // defined(__SSE2__)
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
} // namespace sse2

namespace avx2 {
LIBC_INLINE MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
#if defined(__AVX2__)
using T = char __attribute__((__vector_size__(32)));
// A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
if (int mask = _mm256_movemask_epi8(
cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2))))
return char_diff_no_zero(p1, p2, mask);
return MemcmpReturnType::ZERO();
#else
(void)p1;
(void)p2;
return MemcmpReturnType::ZERO();
#endif // defined(__AVX2__)
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
} // namespace avx2

namespace avx512bw {
LIBC_INLINE MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
#if defined(__AVX512BW__)
using T = char __attribute__((__vector_size__(64)));
// A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
if (uint64_t mask =
_mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(load<T>(p1)),
cpp::bit_cast<__m512i>(load<T>(p2))))
return char_diff_no_zero(p1, p2, mask);
return MemcmpReturnType::ZERO();
#else
(void)p1;
(void)p2;
return MemcmpReturnType::ZERO();
#endif // defined(__AVX512BW__)
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
} // namespace avx512bw

} // namespace __llvm_libc::x86

namespace __llvm_libc::generic {

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
static_cast<int32_t>(load_be<uint16_t>(p2, offset));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint32_t
template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load_be<uint32_t>(p1, offset);
const auto b = load_be<uint32_t>(p2, offset);
return cmp_uint32_t(a, b);
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);

///////////////////////////////////////////////////////////////////////////////
// Specializations for uint64_t
template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
}
template <>
LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
return !eq<uint64_t>(p1, p2, offset);
}
template <>
LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
size_t offset) {
const auto a = load_be<uint64_t>(p1, offset);
const auto b = load_be<uint64_t>(p2, offset);
return cmp_neq_uint64_t(a, b);
}

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m128i
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
return _mm_max_epu8(a, b);
}
LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15));
}
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
return _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value)));
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
const auto xored = _mm_xor_si128(a, b);
return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
const auto xored = _mm_xor_si128(a, b);
return _mm_testz_si128(xored, xored) == 0; // 0 iff xored == 0
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
const auto b = load<__m128i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
}
#endif // __SSE4_1__
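// Editorial note on the scheme above: 'vmax' equals 'a' wherever a >= b and
// equals 'b' wherever b >= a, so the compare-equal masks record, per byte,
// a >= b ('ge') and b >= a ('le'). 'bytewise_reverse' maps the first
// (lowest-address) byte to the most significant mask bit, so the larger mask,
// compared as an unsigned integer, identifies which side wins at the earliest
// mismatch. E.g., for hypothetical blocks differing only at byte 0 with
// a[0] > b[0]: bit 15 is set in 'ge' and clear in 'le', hence ge - le > 0.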

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m256i
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
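// Editorial note: AVX1 lacks most 256-bit integer operations (they arrived
// with AVX2), so eq/neq below perform the xor in the floating-point domain
// through casts; _mm256_testz_si256 itself is available in AVX1.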
template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
const auto xored = _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
}
template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
const auto xored = _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored == 0
}
#endif // __AVX__

#if defined(__AVX2__)
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
return _mm256_max_epu8(a, b);
}
LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
return _mm256_shuffle_epi8(value,
_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
16, 17, 18, 19, 20, 21, 22, 23, //
24, 25, 26, 27, 28, 29, 30, 31));
}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value)));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
return cmp_uint32_t(ge, le);
}
#endif // __AVX2__

///////////////////////////////////////////////////////////////////////////////
// Specializations for __m512i
#if defined(__AVX512BW__)
template <> struct is_vector<__m512i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
return _mm512_max_epu8(a, b);
}
LIBC_INLINE __m512i bytewise_reverse(__m512i value) {
return _mm512_shuffle_epi8(value,
_mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
8, 9, 10, 11, 12, 13, 14, 15, //
16, 17, 18, 19, 20, 21, 22, 23, //
24, 25, 26, 27, 28, 29, 30, 31, //
32, 33, 34, 35, 36, 37, 38, 39, //
40, 41, 42, 43, 44, 45, 46, 47, //
48, 49, 50, 51, 52, 53, 54, 55, //
56, 57, 58, 59, 60, 61, 62, 63));
}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
return _mm512_cmpeq_epi8_mask(bytewise_reverse(max), bytewise_reverse(value));
}
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
return _mm512_cmpneq_epi8_mask(a, b) == 0;
}
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
return (xored >> 32) | (xored & 0xFFFFFFFF);
}
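// Editorial note: 'neq' only has to be non-zero when the blocks differ, so the
// 64-bit mismatch mask is folded into 32 bits by or-ing its halves; a plain
// truncation could drop a mismatch located in the upper 32 bytes.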
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
const auto vmax = bytewise_max(a, b);
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX512BW__

} // namespace __llvm_libc::generic

#endif // LIBC_TARGET_ARCH_IS_X86_64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

@@ -11,10 +11,12 @@

#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/cstddef.h"
#include "src/__support/CPP/limits.h" // cpp::numeric_limits
#include "src/__support/CPP/type_traits.h"
#include "src/__support/endian.h"
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN
#include "src/__support/macros/properties/architectures.h"

#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t
@@ -149,6 +151,56 @@ private:
using MemcmpReturnType = StrictIntegralType<int32_t>;
using BcmpReturnType = StrictIntegralType<uint32_t>;

// This implements the semantics of 'memcmp': a negative value when 'a' is
// less than 'b', '0' when 'a' equals 'b', and a positive value otherwise.
LIBC_INLINE MemcmpReturnType cmp_uint32_t(uint32_t a, uint32_t b) {
// We perform the difference as an int64_t.
const int64_t diff = static_cast<int64_t>(a) - static_cast<int64_t>(b);
// For the int64_t to int32_t conversion we want the following properties:
// - int32_t[31:31] == 1 iff diff < 0
// - int32_t[31:0] == 0 iff diff == 0

// We also observe that:
// - When diff < 0: diff[63:32] == 0xffffffff and diff[31:0] != 0
// - When diff > 0: diff[63:32] == 0 and diff[31:0] != 0
// - When diff == 0: diff[63:32] == 0 and diff[31:0] == 0
// - https://godbolt.org/z/8W7qWP6e5
// - This implies that we need only look at diff[32:32] to determine the
// sign bit for the returned int32_t.

// So, we do the following:
// - int32_t[31:31] = diff[32:32]
// - int32_t[30:0] = diff[31:0] == 0 ? 0 : non-0.

// And we can achieve the above with the expression below. We could have also
// used (diff64 >> 1) | (diff64 & 0x1) but (diff64 & 0xFFFF) is faster than
// (diff64 & 0x1). https://godbolt.org/z/j3b569rW1
return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
}
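// Editorial worked example: with a = 0x80000000 and b = 0, diff is 2^31 and a
// naive truncation to int32_t would produce INT32_MIN (wrong sign), whereas
// (diff >> 1) | (diff & 0xFFFF) produces 0x40000000, which is positive as
// required. With a = 0 and b = 1, diff is -1 and the expression yields -1.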

// Returns a negative value if 'a' is less than 'b' and a positive value
// otherwise. This implements the semantics of 'memcmp' when we know that 'a'
// and 'b' differ.
LIBC_INLINE MemcmpReturnType cmp_neq_uint64_t(uint64_t a, uint64_t b) {
#if defined(LIBC_TARGET_ARCH_IS_X86_64)
// On x86, we choose the returned values so that they are just one unit apart,
// as this allows for better code generation.
static constexpr int32_t POSITIVE = cpp::numeric_limits<int32_t>::max();
static constexpr int32_t NEGATIVE = cpp::numeric_limits<int32_t>::min();
static_assert(cpp::bit_cast<uint32_t>(NEGATIVE) -
cpp::bit_cast<uint32_t>(POSITIVE) ==
1);
#else
// On RISC-V we simply use '1' and '-1' as it leads to branchless code.
// On ARMv8, both strategies lead to the same performance.
static constexpr int32_t POSITIVE = 1;
static constexpr int32_t NEGATIVE = -1;
#endif
static_assert(POSITIVE > 0);
static_assert(NEGATIVE < 0);
return a < b ? NEGATIVE : POSITIVE;
}
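// Editorial note: viewed as uint32_t, INT32_MIN (0x80000000) is exactly one
// above INT32_MAX (0x7FFFFFFF), which is what the static_assert above pins
// down; this lets the compiler derive the result from the borrow of 'a < b'
// (e.g. via sbb) instead of materializing a branch.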

// Loads bytes from memory (possibly unaligned) and materializes them as
// type.
template <typename T> LIBC_INLINE T load(CPtr ptr) {
@@ -280,6 +332,16 @@ void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2");
}

template <size_t SIZE> struct AlignHelper {
AlignHelper(CPtr ptr) : offset_(distance_to_next_aligned<SIZE>(ptr)) {}

LIBC_INLINE bool not_aligned() const { return offset_ != SIZE; }
LIBC_INLINE uintptr_t offset() const { return offset_; }

private:
uintptr_t offset_;
};

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H

@@ -18,79 +18,76 @@ namespace __llvm_libc {

[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = generic::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
return generic::Memcmp<uint64_t>::loop_and_tail_align_above(384, p1, p2,
count);
}
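// Editorial note: 'loop_and_tail_align_above', introduced by this patch,
// appears to subsume the pattern visible in the removed lines above (compare
// one block, align 'p1' once 'count' reaches the threshold, then run the block
// loop with a tail) in a single call.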

#if defined(__SSE4_1__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
inline_memcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
return generic::Memcmp<__m128i>::loop_and_tail_align_above(384, p1, p2,
count);
}
#endif // __SSE4_1__

#if defined(__AVX2__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
return generic::Memcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
return value;
align_to_next_boundary<32, Arg::P1>(p1, p2, count);
}
return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
return generic::Memcmp<__m256i>::head_tail(p1, p2, count);
return generic::Memcmp<__m256i>::loop_and_tail_align_above(384, p1, p2,
count);
}
#endif // __AVX2__

#if defined(__AVX512BW__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
return generic::Memcmp<__m128i>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
return generic::Memcmp<__m256i>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
if (LIBC_UNLIKELY(count >= 384)) {
if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
}
return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
return generic::Memcmp<__m512i>::head_tail(p1, p2, count);
return generic::Memcmp<__m512i>::loop_and_tail_align_above(384, p1, p2,
count);
}
#endif // __AVX512BW__

LIBC_INLINE MemcmpReturnType inline_memcmp_x86(CPtr p1, CPtr p2, size_t count) {

if (count == 0)
return MemcmpReturnType::ZERO();
if (count == 1)
return generic::Memcmp<1>::block(p1, p2);
return generic::Memcmp<uint8_t>::block(p1, p2);
if (count == 2)
return generic::Memcmp<2>::block(p1, p2);
return generic::Memcmp<uint16_t>::block(p1, p2);
if (count == 3)
return generic::Memcmp<3>::block(p1, p2);
if (count <= 8)
return generic::Memcmp<4>::head_tail(p1, p2, count);
return generic::MemcmpSequence<uint16_t, uint8_t>::block(p1, p2);
if (count == 4)
return generic::Memcmp<uint32_t>::block(p1, p2);
if (count == 5)
return generic::MemcmpSequence<uint32_t, uint8_t>::block(p1, p2);
if (count == 6)
return generic::MemcmpSequence<uint32_t, uint16_t>::block(p1, p2);
if (count == 7)
return generic::Memcmp<uint32_t>::head_tail(p1, p2, 7);
if (count == 8)
return generic::Memcmp<uint64_t>::block(p1, p2);
if (count <= 16)
return generic::Memcmp<8>::head_tail(p1, p2, count);
if constexpr (x86::kAvx512BW)
return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
else if constexpr (x86::kAvx2)
return inline_memcmp_x86_avx2_gt16(p1, p2, count);
else if constexpr (x86::kSse2)
return inline_memcmp_x86_sse2_gt16(p1, p2, count);
else
return inline_memcmp_generic_gt16(p1, p2, count);
return generic::Memcmp<uint64_t>::head_tail(p1, p2, count);
#if defined(__AVX512BW__)
return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
#elif defined(__AVX2__)
return inline_memcmp_x86_avx2_gt16(p1, p2, count);
#elif defined(__SSE4_1__)
return inline_memcmp_x86_sse41_gt16(p1, p2, count);
#else
return inline_memcmp_generic_gt16(p1, p2, count);
#endif
}

} // namespace __llvm_libc

#endif // LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCMP_IMPLEMENTATIONS_H

@@ -9,14 +9,11 @@
#include "memory_check_utils.h"
#include "src/string/memory_utils/op_aarch64.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_generic.h" // LLVM_LIBC_HAS_UINT64
#include "src/string/memory_utils/op_riscv.h"
#include "src/string/memory_utils/op_x86.h"
#include "test/UnitTest/Test.h"

#if defined(LIBC_TARGET_ARCH_IS_X86_64) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
#define LLVM_LIBC_HAS_UINT64
#endif

namespace __llvm_libc {

template <typename T> struct has_head_tail {
@@ -131,13 +128,13 @@ using MemsetImplementations = testing::TypeList<
generic::Memset<uint64_t>, generic::Memset<cpp::array<uint64_t, 2>>,
#endif
#ifdef __AVX512F__
generic::Memset<uint8x64_t>, generic::Memset<cpp::array<uint8x64_t, 2>>,
generic::Memset<generic_v512>, generic::Memset<cpp::array<generic_v512, 2>>,
#endif
#ifdef __AVX__
generic::Memset<uint8x32_t>, generic::Memset<cpp::array<uint8x32_t, 2>>,
generic::Memset<generic_v256>, generic::Memset<cpp::array<generic_v256, 2>>,
#endif
#ifdef __SSE2__
generic::Memset<uint8x16_t>, generic::Memset<cpp::array<uint8x16_t, 2>>,
generic::Memset<generic_v128>, generic::Memset<cpp::array<generic_v128, 2>>,
#endif
generic::Memset<uint32_t>, generic::Memset<cpp::array<uint32_t, 2>>, //
generic::Memset<uint16_t>, generic::Memset<cpp::array<uint16_t, 2>>, //
@@ -194,35 +191,36 @@ TYPED_TEST(LlvmLibcOpTest, Memset, MemsetImplementations) {
}

using BcmpImplementations = testing::TypeList<
#ifdef __SSE2__
x86::sse2::Bcmp<16>, //
x86::sse2::Bcmp<32>, //
x86::sse2::Bcmp<64>, //
x86::sse2::Bcmp<128>, //
#endif
#ifdef LIBC_TARGET_ARCH_IS_X86_64
#ifdef __SSE4_1__
generic::Bcmp<__m128i>,
#endif // __SSE4_1__
#ifdef __AVX2__
x86::avx2::Bcmp<32>, //
x86::avx2::Bcmp<64>, //
x86::avx2::Bcmp<128>, //
#endif
generic::Bcmp<__m256i>,
#endif // __AVX2__
#ifdef __AVX512BW__
x86::avx512bw::Bcmp<64>, //
x86::avx512bw::Bcmp<128>, //
#endif
generic::Bcmp<__m512i>,
#endif // __AVX512BW__

#endif // LIBC_TARGET_ARCH_IS_X86_64
#ifdef LIBC_TARGET_ARCH_IS_AARCH64
aarch64::Bcmp<16>, //
aarch64::Bcmp<32>, //
aarch64::Bcmp<32>,
#endif
#ifndef LIBC_TARGET_ARCH_IS_ARM // Removing non uint8_t types for ARM
generic::Bcmp<uint16_t>,
generic::Bcmp<uint32_t>, //
#ifdef LLVM_LIBC_HAS_UINT64
generic::Bcmp<8>, //
#endif
generic::Bcmp<1>, //
generic::Bcmp<2>, //
generic::Bcmp<4>, //
generic::Bcmp<16>, //
generic::Bcmp<32>, //
generic::Bcmp<64> //
>;
generic::Bcmp<uint64_t>,
#endif // LLVM_LIBC_HAS_UINT64
generic::BcmpSequence<uint16_t, uint8_t>,
generic::BcmpSequence<uint32_t, uint8_t>, //
generic::BcmpSequence<uint32_t, uint16_t>, //
generic::BcmpSequence<uint32_t, uint16_t, uint8_t>,
#endif // LIBC_TARGET_ARCH_IS_ARM
generic::BcmpSequence<uint8_t, uint8_t>,
generic::BcmpSequence<uint8_t, uint8_t, uint8_t>, //
generic::Bcmp<uint8_t>>;

// Adapt CheckBcmp signature to op implementation signatures.
template <auto FnImpl>
@@ -247,7 +245,8 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
ASSERT_TRUE((CheckBcmp<BlockImpl>(span1, span2, kSize)));
}
}
{ // Test head tail operations from kSize to 2 * kSize.
if constexpr (has_head_tail<Impl>::value) {
// Test head tail operations from kSize to 2 * kSize.
static constexpr auto HeadTailImpl = CmpAdaptor<Impl::head_tail>;
Buffer Buffer1(2 * kSize);
Buffer Buffer2(2 * kSize);
@@ -258,7 +257,8 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
ASSERT_TRUE((CheckBcmp<HeadTailImpl>(span1, span2, size)));
}
}
{ // Test loop operations from kSize to 3 * kSize.
if constexpr (has_loop_and_tail<Impl>::value) {
// Test loop operations from kSize to 3 * kSize.
if constexpr (kSize > 1) {
static constexpr auto LoopImpl = CmpAdaptor<Impl::loop_and_tail>;
Buffer Buffer1(3 * kSize);
@@ -274,32 +274,33 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
}

using MemcmpImplementations = testing::TypeList<
#ifdef LIBC_TARGET_ARCH_IS_X86_64
#ifdef __SSE2__
x86::sse2::Memcmp<16>, //
x86::sse2::Memcmp<32>, //
x86::sse2::Memcmp<64>, //
x86::sse2::Memcmp<128>, //
generic::Memcmp<__m128i>, //
#endif
#ifdef __AVX2__
x86::avx2::Memcmp<32>, //
x86::avx2::Memcmp<64>, //
x86::avx2::Memcmp<128>, //
generic::Memcmp<__m256i>, //
#endif
#ifdef __AVX512BW__
x86::avx512bw::Memcmp<64>, //
x86::avx512bw::Memcmp<128>, //
generic::Memcmp<__m512i>, //
#endif
#endif // LIBC_TARGET_ARCH_IS_X86_64
#ifdef LIBC_TARGET_ARCH_IS_AARCH64
generic::Memcmp<uint8x16_t>, //
generic::Memcmp<uint8x16x2_t>,
#endif
#ifndef LIBC_TARGET_ARCH_IS_ARM // Removing non uint8_t types for ARM
generic::Memcmp<uint16_t>,
generic::Memcmp<uint32_t>, //
#ifdef LLVM_LIBC_HAS_UINT64
generic::Memcmp<8>, //
#endif
generic::Memcmp<1>, //
generic::Memcmp<2>, //
generic::Memcmp<3>, //
generic::Memcmp<4>, //
generic::Memcmp<16>, //
generic::Memcmp<32>, //
generic::Memcmp<64> //
>;
generic::Memcmp<uint64_t>,
#endif // LLVM_LIBC_HAS_UINT64
generic::MemcmpSequence<uint16_t, uint8_t>,
generic::MemcmpSequence<uint32_t, uint16_t, uint8_t>, //
#endif // LIBC_TARGET_ARCH_IS_ARM
generic::MemcmpSequence<uint8_t, uint8_t>,
generic::MemcmpSequence<uint8_t, uint8_t, uint8_t>,
generic::Memcmp<uint8_t>>;

TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
using Impl = ParamType;
@@ -314,7 +315,8 @@ TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
ASSERT_TRUE((CheckMemcmp<BlockImpl>(span1, span2, kSize)));
}
}
{ // Test head tail operations from kSize to 2 * kSize.
if constexpr (has_head_tail<Impl>::value) {
// Test head tail operations from kSize to 2 * kSize.
static constexpr auto HeadTailImpl = CmpAdaptor<Impl::head_tail>;
Buffer Buffer1(2 * kSize);
Buffer Buffer2(2 * kSize);
@@ -325,7 +327,8 @@ TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
ASSERT_TRUE((CheckMemcmp<HeadTailImpl>(span1, span2, size)));
}
}
{ // Test loop operations from kSize to 3 * kSize.
if constexpr (has_loop_and_tail<Impl>::value) {
// Test loop operations from kSize to 3 * kSize.
if constexpr (kSize > 1) {
static constexpr auto LoopImpl = CmpAdaptor<Impl::loop_and_tail>;
Buffer Buffer1(3 * kSize);

@@ -1935,6 +1935,7 @@ libc_support_library(
"src/string/memory_utils/op_aarch64.h",
"src/string/memory_utils/op_builtin.h",
"src/string/memory_utils/op_generic.h",
"src/string/memory_utils/op_riscv.h",
"src/string/memory_utils/op_x86.h",
"src/string/memory_utils/utils.h",
],
@@ -1958,6 +1959,7 @@ libc_support_library(
":__support_cpp_array",
":__support_cpp_bit",
":__support_cpp_cstddef",
":__support_cpp_limits",
":__support_cpp_type_traits",
":__support_macros_attributes",
":__support_macros_config",