mirror of
https://github.com/intel/llvm.git
synced 2026-01-22 06:19:46 +08:00
This is a preparation for P2093R14 Formatted output. When the output of print is to the terminal it needs to use the native API. This means transcoding UTF-8 to UTF-16 on Windows. The encoder's interface is modeled after P2728 Unicode in the Library, Part 1: UTF Transcoding, but only the part required for P2093R14 is implemented. On Windows wchar_t is 16 bits; in order to test on platforms where wchar_t is 32 bits the transcoder has support for char16_t. It also adds a UTF-8 to UTF-32 encoder which is useful for other tests. Note it is possible to use <codecvt> for transcoding, but that header is deprecated. So rather write new code that is not deprecated; the hard part, decoding, has already been done. The <codecvt> header also requires locale support while the new code works without including <locale>. Note the current transcoder implementation can be optimized since it basically does UTF-8 -> UTF-32 -> UTF-16. The first goal is to have a working implementation. Since it's not part of the ABI it's possible to do the optimization later. Depends on D149672. Reviewed By: ldionne, tahonermann, #libc. Differential Revision: https://reviews.llvm.org/D150031
120 lines
4.0 KiB
C++
120 lines
4.0 KiB
C++
// -*- C++ -*-
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef _LIBCPP_PRINT
|
|
#define _LIBCPP_PRINT
|
|
|
|
#include <__assert> // all public C++ headers provide the assertion handler
|
|
#include <__concepts/same_as.h>
|
|
#include <__config>
|
|
#include <__format/unicode.h>
|
|
#include <string_view>
|
|
#include <version>
|
|
|
|
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
|
# pragma GCC system_header
|
|
#endif
|
|
|
|
_LIBCPP_BEGIN_NAMESPACE_STD
|
|
|
|
#if _LIBCPP_STD_VER >= 23
|
|
|
|
# ifndef _LIBCPP_HAS_NO_UNICODE
|
|
// This is the code to transcode UTF-8 to UTF-16. This is used on
|
|
// Windows for the native Unicode API. The code is modeled to make it
|
|
// easier to extend to
|
|
//
|
|
// P2728R0 Unicode in the Library, Part 1: UTF Transcoding
|
|
//
|
|
// This paper is still under heavy development so it makes no sense yet
|
|
// to strictly follow the paper.
|
|
namespace __unicode {
|
|
|
|
// The names of these concepts are modelled after P2728R0, but the
|
|
// implementation is not. char16_t may contain 32-bits so depending on the
|
|
// number of bits is an issue.
|
|
# ifdef _LIBCPP_SHORT_WCHAR
|
|
template <class _Tp>
|
|
concept __utf16_code_unit =
|
|
same_as<_Tp, char16_t>
|
|
# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
|
|
|| same_as<_Tp, wchar_t>
|
|
# endif
|
|
;
|
|
template <class _Tp>
|
|
concept __utf32_code_unit = same_as<_Tp, char32_t>;
|
|
# else // _LIBCPP_SHORT_WCHAR
|
|
template <class _Tp>
|
|
concept __utf16_code_unit = same_as<_Tp, char16_t>;
|
|
template <class _Tp>
|
|
concept __utf32_code_unit =
|
|
same_as<_Tp, char32_t>
|
|
# ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
|
|
|| same_as<_Tp, wchar_t>
|
|
# endif
|
|
;
|
|
# endif // _LIBCPP_SHORT_WCHAR
|
|
|
|
// Pass by reference since an output_iterator may not be copyable.
|
|
template <class _OutIt>
|
|
_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt&, char32_t) = delete;
|
|
|
|
template <class _OutIt>
|
|
requires __utf16_code_unit<iter_value_t<_OutIt>>
|
|
_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& __out_it, char32_t __value) {
|
|
_LIBCPP_ASSERT(__is_scalar_value(__value), "an invalid unicode scalar value results in invalid UTF-16");
|
|
|
|
if (__value < 0x10000) {
|
|
*__out_it++ = __value;
|
|
return;
|
|
}
|
|
|
|
__value -= 0x10000;
|
|
*__out_it++ = 0xd800 + (__value >> 10);
|
|
*__out_it++ = 0xdc00 + (__value & 0x3FF);
|
|
}
|
|
|
|
template <class _OutIt>
|
|
requires __utf32_code_unit<iter_value_t<_OutIt>>
|
|
_LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& __out_it, char32_t __value) {
|
|
_LIBCPP_ASSERT(__is_scalar_value(__value), "an invalid unicode scalar value results in invalid UTF-32");
|
|
*__out_it++ = __value;
|
|
}
|
|
|
|
template <class _OutIt, input_iterator _InIt>
|
|
requires output_iterator<_OutIt, const iter_value_t<_OutIt>&> && (!same_as<iter_value_t<_OutIt>, iter_value_t<_InIt>>)
|
|
_LIBCPP_HIDE_FROM_ABI constexpr _OutIt __transcode(_InIt __first, _InIt __last, _OutIt __out_it) {
|
|
// The __code_point_view has a basic_string_view interface.
|
|
// When transcoding becomes part of the standard we probably want to
|
|
// look at smarter algorithms.
|
|
// For example, when processing a code point that is encoded in
|
|
// 1 to 3 code units in UTF-8, the result will always be encoded
|
|
// in 1 code unit in UTF-16 (code points that require 4 code
|
|
// units in UTF-8 will require 2 code units in UTF-16).
|
|
//
|
|
// Note if P2728 is accepted types like int may become valid. In that case
|
|
// the __code_point_view should use a span. Libc++ will remove support for
|
|
// char_traits<int>.
|
|
basic_string_view<iter_value_t<_InIt>> __data{__first, __last};
|
|
__code_point_view<iter_value_t<_InIt>> __view{__data.begin(), __data.end()};
|
|
while (!__view.__at_end())
|
|
__unicode::__encode(__out_it, __view.__consume().__code_point);
|
|
return __out_it;
|
|
}
|
|
|
|
} // namespace __unicode
|
|
|
|
# endif // _LIBCPP_HAS_NO_UNICODE
|
|
|
|
#endif // _LIBCPP_STD_VER >= 23
|
|
|
|
_LIBCPP_END_NAMESPACE_STD
|
|
|
|
#endif // _LIBCPP_PRINT
|