mirror of git://gcc.gnu.org/git/gcc.git
1309 lines
36 KiB
C++
1309 lines
36 KiB
C++
// Unicode utilities -*- C++ -*-
|
|
|
|
// Copyright The GNU Toolchain Authors.
|
|
//
|
|
// This file is part of the GNU ISO C++ Library. This library is free
|
|
// software; you can redistribute it and/or modify it under the
|
|
// terms of the GNU General Public License as published by the
|
|
// Free Software Foundation; either version 3, or (at your option)
|
|
// any later version.
|
|
|
|
// This library is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
|
|
// Under Section 7 of GPL version 3, you are granted additional
|
|
// permissions described in the GCC Runtime Library Exception, version
|
|
// 3.1, as published by the Free Software Foundation.
|
|
|
|
// You should have received a copy of the GNU General Public License and
|
|
// a copy of the GCC Runtime Library Exception along with this program;
|
|
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
// <http://www.gnu.org/licenses/>.
|
|
|
|
/** @file include/bits/unicode.h
|
|
* This is an internal header file, included by other library headers.
|
|
* Do not attempt to use it directly. @headername{format}
|
|
*/
|
|
|
|
#ifndef _GLIBCXX_UNICODE_H
|
|
#define _GLIBCXX_UNICODE_H 1
|
|
|
|
#if __cplusplus >= 202002L
|
|
#include <array>
|
|
#include <bit> // bit_width
|
|
#include <charconv> // __detail::__from_chars_alnum_to_val_table
|
|
#include <string_view>
|
|
#include <cstdint>
|
|
#include <bits/stl_algo.h>
|
|
#include <bits/stl_iterator.h>
|
|
#include <bits/ranges_base.h> // iterator_t, sentinel_t, input_range, etc.
|
|
#include <bits/ranges_util.h> // view_interface
|
|
|
|
namespace std _GLIBCXX_VISIBILITY(default)
|
|
{
|
|
_GLIBCXX_BEGIN_NAMESPACE_VERSION
|
|
namespace __unicode
|
|
{
|
|
// A Unicode code point that is not a high or low surrogate.
|
|
constexpr bool
|
|
__is_scalar_value(char32_t __c)
|
|
{
|
|
if (__c < 0xD800) [[likely]]
|
|
return true;
|
|
return 0xDFFF < __c && __c <= 0x10FFFF;
|
|
}
|
|
|
|
// A code point that can be encoded in a single code unit of type _CharT.
|
|
template<typename _CharT>
|
|
constexpr bool
|
|
__is_single_code_unit(char32_t __c)
|
|
{
|
|
if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
|
|
return __c <= 0x7F; // ASCII character
|
|
else
|
|
return __c < __gnu_cxx::__int_traits<_CharT>::__max
|
|
&& __is_scalar_value(__c);
|
|
}
|
|
|
|
// Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template
|
|
|
|
struct _Repl
|
|
{
|
|
constexpr char32_t
|
|
operator()() const noexcept
|
|
{ return 0xFFFD; }
|
|
};
|
|
|
|
struct _Null_sentinel_t
|
|
{
|
|
template<input_iterator _It>
|
|
requires default_initializable<iter_value_t<_It>>
|
|
&& equality_comparable_with<iter_reference_t<_It>, iter_value_t<_It>>
|
|
friend constexpr auto
|
|
operator==(_It __it, _Null_sentinel_t)
|
|
{ return *__it == iter_value_t<_It>{}; }
|
|
};
|
|
|
|
// An iterator over an input range of FromFmt code units that yields either
|
|
// UTF-8, UTF-16, or UTF-32, as a range of ToFmt code units.
|
|
// The code units from the input range are interpreted as Unicode code points
|
|
// and the iterator produces the individual code unit for each code point.
|
|
// Invalid sequences in the input are replaced with U+FFDD so that the result
|
|
// is always valid UTF-8, UTF-16, or UTF-32.
|
|
//
|
|
// The iterator knows the bounds of the underlying input range and will not
|
|
// read outside those bounds (incrementing or decrementing at the boundary
|
|
// is erroneously idempotent).
|
|
//
|
|
// On construction, the iterator attemps to decode a single code point from
|
|
// the input range and then encode it into an internal buffer in the output
|
|
// format, e.g. if the input is UTF-8 and the output is UTF-16, it might read
|
|
// three char8_t code units from the input and store two char16_t code units
|
|
// in its buffer. Incrementing the iterator will first iterate over buffer,
|
|
// yielding each code unit in turn, and then extract another code point from
|
|
// the input. Failure to extract a valid code point from the input will store
|
|
// U+FFFD in the buffer, encoded as the appropriate code units of type ToFmt.
|
|
template<typename _FromFmt, typename _ToFmt,
|
|
input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
|
|
typename _ErrorHandler = _Repl>
|
|
requires convertible_to<iter_value_t<_Iter>, _FromFmt>
|
|
class _Utf_iterator
|
|
{
|
|
static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));
|
|
|
|
public:
|
|
using value_type = _ToFmt;
|
|
using difference_type = iter_difference_t<_Iter>;
|
|
using reference = value_type;
|
|
using iterator_concept
|
|
= std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,
|
|
bidirectional_iterator_tag>;
|
|
|
|
constexpr _Utf_iterator() = default;
|
|
|
|
constexpr
|
|
_Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
|
|
requires bidirectional_iterator<_Iter>
|
|
: _M_first_and_curr{__first, __it}, _M_last(__last)
|
|
{
|
|
if (_M_curr() != _M_last)
|
|
_M_read();
|
|
else
|
|
_M_buf = {};
|
|
}
|
|
|
|
constexpr
|
|
_Utf_iterator(_Iter __it, _Sent __last)
|
|
requires (!bidirectional_iterator<_Iter>)
|
|
: _M_first_and_curr{__it}, _M_last(__last)
|
|
{
|
|
if (_M_curr() != _M_last)
|
|
_M_read();
|
|
else
|
|
_M_buf = {};
|
|
}
|
|
|
|
template<class _Iter2, class _Sent2>
|
|
requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
|
|
constexpr
|
|
_Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
|
|
_ErrorHandler>& __other)
|
|
: _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
|
|
_M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
|
|
_M_last(__other._M_last)
|
|
{ }
|
|
|
|
[[nodiscard]]
|
|
constexpr _Iter
|
|
begin() const requires bidirectional_iterator<_Iter>
|
|
{ return _M_first(); }
|
|
|
|
[[nodiscard]]
|
|
constexpr _Sent
|
|
end() const { return _M_last; }
|
|
|
|
[[nodiscard]]
|
|
constexpr _Iter
|
|
base() const requires forward_iterator<_Iter>
|
|
{ return _M_curr(); }
|
|
|
|
[[nodiscard]]
|
|
constexpr iter_difference_t<_Iter>
|
|
_M_units() const requires forward_iterator<_Iter>
|
|
{ return _M_to_increment; }
|
|
|
|
[[nodiscard]]
|
|
constexpr value_type
|
|
operator*() const { return _M_buf[_M_buf_index]; }
|
|
|
|
constexpr _Utf_iterator&
|
|
operator++()
|
|
{
|
|
if (_M_buf_index + 1 < _M_buf_last)
|
|
++_M_buf_index; // Move to the next code unit in the buffer.
|
|
else if (_M_curr() != _M_last)
|
|
{
|
|
// Advance past the current code point (for non-forward iterators
|
|
// we already moved there after decoding the last code point).
|
|
if constexpr (forward_iterator<_Iter>)
|
|
std::advance(_M_curr(), _M_to_increment);
|
|
if (_M_curr() == _M_last)
|
|
_M_buf_index = 0;
|
|
else // Decode next code point from the input and update buffer.
|
|
_M_read();
|
|
}
|
|
// else erroneous, but ignored for now.
|
|
return *this;
|
|
}
|
|
|
|
constexpr _Utf_iterator
|
|
operator++(int)
|
|
{
|
|
auto __tmp = *this;
|
|
++*this;
|
|
return __tmp;
|
|
}
|
|
|
|
constexpr _Utf_iterator&
|
|
operator--() requires bidirectional_iterator<_Iter>
|
|
{
|
|
if (_M_buf_index > 0)
|
|
--_M_buf_index;
|
|
else if (_M_curr() != _M_first())
|
|
{
|
|
_M_read_reverse();
|
|
_M_buf_index = _M_buf_last - 1;
|
|
ranges::advance(_M_curr(), -_M_to_increment);
|
|
}
|
|
// else erroneous, but ignored for now.
|
|
return *this;
|
|
}
|
|
|
|
constexpr _Utf_iterator
|
|
operator--(int)
|
|
{
|
|
auto __tmp = *this;
|
|
--*this;
|
|
return __tmp;
|
|
}
|
|
|
|
[[nodiscard]]
|
|
friend constexpr bool
|
|
operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
|
|
requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
|
|
{
|
|
if constexpr (forward_iterator<_Iter>)
|
|
return __lhs._M_curr() == __rhs._M_curr()
|
|
&& __lhs._M_buf_index == __rhs._M_buf_index;
|
|
else if (__lhs._M_curr() != __rhs._M_curr())
|
|
return false;
|
|
else if (__lhs._M_buf_index == __rhs._M_buf_index
|
|
&& __lhs._M_buf_last == __rhs._M_buf_last)
|
|
return true;
|
|
else
|
|
return __lhs._M_buf_index == __lhs._M_buf_last
|
|
&& __rhs._M_buf_index == __rhs._M_buf_last;
|
|
}
|
|
|
|
[[nodiscard]]
|
|
friend constexpr bool
|
|
operator==(_Utf_iterator __lhs, _Sent __rhs)
|
|
{
|
|
if constexpr (forward_iterator<_Iter>)
|
|
return __lhs._M_curr() == __rhs;
|
|
else
|
|
return __lhs._M_curr() == __rhs
|
|
&& __lhs._M_buf_index == __lhs._M_buf_last;
|
|
}
|
|
|
|
private:
|
|
constexpr void
|
|
_M_read()
|
|
{
|
|
if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
|
|
_M_read_utf8();
|
|
else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
|
|
_M_read_utf16();
|
|
else
|
|
{
|
|
static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
|
|
_M_read_utf32();
|
|
}
|
|
}
|
|
|
|
constexpr void
|
|
_M_read_reverse() requires bidirectional_iterator<_Iter>
|
|
{
|
|
if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
|
|
_M_read_reverse_utf8();
|
|
else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
|
|
_M_read_reverse_utf16();
|
|
else
|
|
{
|
|
static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
|
|
_M_read_reverse_utf32();
|
|
}
|
|
}
|
|
|
|
template<typename>
|
|
struct _Guard
|
|
{
|
|
_Guard(void*, _Iter&) { }
|
|
};
|
|
|
|
template<typename _It> requires forward_iterator<_It>
|
|
struct _Guard<_It>
|
|
{
|
|
constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
|
|
_Utf_iterator* _M_this;
|
|
_It _M_orig;
|
|
};
|
|
|
|
constexpr char32_t
|
|
_M_read_utf8()
|
|
{
|
|
_Guard<_Iter> __g{this, _M_curr()};
|
|
char32_t __c{};
|
|
const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
|
|
uint8_t __u = *_M_curr()++;
|
|
uint8_t __to_incr = 1;
|
|
auto __incr = [&, this] {
|
|
++__to_incr;
|
|
return ++_M_curr();
|
|
};
|
|
|
|
if (__u <= 0x7F) [[likely]] // 0x00 to 0x7F
|
|
__c = __u;
|
|
else if (__u < 0xC2) [[unlikely]]
|
|
__c = _S_error();
|
|
else if (_M_curr() == _M_last) [[unlikely]]
|
|
__c = _S_error();
|
|
else if (__u <= 0xDF) // 0xC2 to 0xDF
|
|
{
|
|
__c = __u & 0x1F;
|
|
__u = *_M_curr();
|
|
|
|
if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
|
|
__c = _S_error();
|
|
else
|
|
{
|
|
__c = (__c << 6) | (__u & 0x3F);
|
|
__incr();
|
|
}
|
|
}
|
|
else if (__u <= 0xEF) // 0xE0 to 0xEF
|
|
{
|
|
const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
|
|
const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;
|
|
|
|
__c = __u & 0x0F;
|
|
__u = *_M_curr();
|
|
|
|
if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
|
|
__c = _S_error();
|
|
else if (__incr() == _M_last) [[unlikely]]
|
|
__c = _S_error();
|
|
else
|
|
{
|
|
__c = (__c << 6) | (__u & 0x3F);
|
|
__u = *_M_curr();
|
|
|
|
if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
|
|
__c = _S_error();
|
|
else
|
|
{
|
|
__c = (__c << 6) | (__u & 0x3F);
|
|
__incr();
|
|
}
|
|
}
|
|
}
|
|
else if (__u <= 0xF4) // 0xF0 to 0xF4
|
|
{
|
|
const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
|
|
const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;
|
|
|
|
__c = __u & 0x07;
|
|
__u = *_M_curr();
|
|
|
|
if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
|
|
__c = _S_error();
|
|
else if (__incr() == _M_last) [[unlikely]]
|
|
__c = _S_error();
|
|
else
|
|
{
|
|
__c = (__c << 6) | (__u & 0x3F);
|
|
__u = *_M_curr();
|
|
|
|
if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
|
|
__c = _S_error();
|
|
else if (__incr() == _M_last) [[unlikely]]
|
|
__c = _S_error();
|
|
else
|
|
{
|
|
__c = (__c << 6) | (__u & 0x3F);
|
|
__u = *_M_curr();
|
|
|
|
if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
|
|
__c = _S_error();
|
|
else
|
|
{
|
|
__c = (__c << 6) | (__u & 0x3F);
|
|
__incr();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else [[unlikely]]
|
|
__c = _S_error();
|
|
|
|
_M_update(__c, __to_incr);
|
|
|
|
return __c;
|
|
}
|
|
|
|
constexpr void
|
|
_M_read_utf16()
|
|
{
|
|
_Guard<_Iter> __g{this, _M_curr()};
|
|
char32_t __c{};
|
|
uint16_t __u = *_M_curr()++;
|
|
uint8_t __to_incr = 1;
|
|
|
|
if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
|
|
__c = __u;
|
|
else if (__u < 0xDC00 && _M_curr() != _M_last)
|
|
{
|
|
uint16_t __u2 = *_M_curr();
|
|
if (__u2 < 0xDC00 || __u2 > 0xDFFF) [[unlikely]]
|
|
__c = _S_error();
|
|
else
|
|
{
|
|
++_M_curr();
|
|
__to_incr = 2;
|
|
uint32_t __x = (__u & 0x3F) << 10 | (__u2 & 0x3FF);
|
|
uint32_t __w = (__u >> 6) & 0x1F;
|
|
__c = (__w + 1) << 16 | __x;
|
|
}
|
|
}
|
|
else
|
|
__c = _S_error();
|
|
|
|
_M_update(__c, __to_incr);
|
|
}
|
|
|
|
constexpr void
|
|
_M_read_utf32()
|
|
{
|
|
_Guard<_Iter> __g{this, _M_curr()};
|
|
char32_t __c = *_M_curr()++;
|
|
if (!__is_scalar_value(__c)) [[unlikely]]
|
|
__c = _S_error();
|
|
_M_update(__c, 1);
|
|
}
|
|
|
|
constexpr void
|
|
_M_read_reverse_utf8() requires bidirectional_iterator<_Iter>
|
|
{
|
|
const auto __first = _M_first();
|
|
auto __curr = _M_curr();
|
|
// The code point we decode:
|
|
char32_t __c{};
|
|
// The last code unit read:
|
|
uint8_t __u = *--__curr;
|
|
// Count of bytes read:
|
|
uint8_t __to_incr = 1;
|
|
|
|
if (__u <= 0x7F) [[likely]]
|
|
{
|
|
_M_update(__u, 1);
|
|
return;
|
|
}
|
|
|
|
// Continuation bytes match 10xxxxxx
|
|
auto __is_continuation = [](uint8_t __b) {
|
|
return (__b & 0xC0) == 0x80;
|
|
};
|
|
// 0xC0 and 0xC1 would produce overlong encodings of ASCII characters.
|
|
// 0xF5-0xFF would produce code points above U+10FFFF
|
|
auto __is_invalid = [](uint8_t __b) {
|
|
return (__b & 0xFE) == 0xC0 || __b >= 0xF5;
|
|
};
|
|
|
|
// No valid or invalid multibyte sequence is longer than 4 bytes,
|
|
// so skip back over at most four bytes.
|
|
while (__is_continuation(__u) && __to_incr < 4 && __curr != __first)
|
|
{
|
|
++__to_incr;
|
|
__u = *--__curr;
|
|
}
|
|
|
|
// If the last byte read was a continuation byte then either we read
|
|
// four continuation bytes, or stopped at the start of the sequence.
|
|
// Either way, the maximal subparts are the individual continuation
|
|
// bytes so each one should be replaced with U+FFFD.
|
|
if (__is_continuation(__u) || __is_invalid(__u)) [[unlikely]]
|
|
{
|
|
// Either found four continuation bytes (maximum allowed is three)
|
|
// or first non-continuation byte is an invalid UTF-8 code unit.
|
|
_M_update(_S_error(), 1);
|
|
return;
|
|
}
|
|
// __u is a valid start byte so use countl_one to get the expected
|
|
// length of the multibyte sequence that starts with this byte.
|
|
int __seq_length = std::countl_one((unsigned char)__u);
|
|
if (__seq_length < __to_incr) [[unlikely]]
|
|
{
|
|
// If the expected number of continuation bytes is less than
|
|
// the number we found, then the last continuation byte is a
|
|
// maximal subpart and the decremented iterator points to it.
|
|
_M_update(_S_error(), 1);
|
|
return;
|
|
}
|
|
|
|
auto __orig = std::__exchange(_M_curr(), std::move(__curr));
|
|
if (_M_read_utf8() == _S_error()) [[unlikely]]
|
|
{
|
|
if (_M_to_increment < __to_incr) // Read truncated sequence, set
|
|
_M_to_increment = 1; // curr to last continuation byte.
|
|
}
|
|
|
|
_M_curr() = std::move(__orig);
|
|
// operator--() will move back by _M_to_increment
|
|
}
|
|
|
|
constexpr void
|
|
_M_read_reverse_utf16() requires bidirectional_iterator<_Iter>
|
|
{
|
|
_Guard<_Iter> __g{this, _M_curr()};
|
|
char32_t __c{};
|
|
uint16_t __u = *--_M_curr();
|
|
uint8_t __to_incr = 1;
|
|
|
|
if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
|
|
__c = __u;
|
|
else if (__u >= 0xDC00 && _M_curr() != _M_first()) [[likely]]
|
|
{
|
|
// read a low surrogate, expect a high surrogate before it.
|
|
uint16_t __u2 = *--_M_curr();
|
|
if (__u2 < 0xD800 || __u2 >= 0xDC00) [[unlikely]]
|
|
__c = _S_error(); // unpaired low surrogate
|
|
else
|
|
{
|
|
__to_incr = 2;
|
|
uint32_t __x = (__u2 & 0x3F) << 10 | (__u & 0x3FF);
|
|
uint32_t __w = (__u2 >> 6) & 0x1F;
|
|
__c = (__w + 1) << 16 | __x;
|
|
}
|
|
}
|
|
else
|
|
__c = _S_error(); // unpaired surrogate
|
|
|
|
_M_update(__c, __to_incr);
|
|
}
|
|
|
|
constexpr void
|
|
_M_read_reverse_utf32() requires bidirectional_iterator<_Iter>
|
|
{
|
|
_Guard<_Iter> __g{this, _M_curr()};
|
|
char32_t __c = *--_M_curr();
|
|
if (!__is_scalar_value(__c)) [[unlikely]]
|
|
__c = _S_error();
|
|
_M_update(__c, 1);
|
|
}
|
|
|
|
// Encode the code point __c as one or more code units in _M_buf.
|
|
constexpr void
|
|
_M_update(char32_t __c, uint8_t __to_incr)
|
|
{
|
|
_M_to_increment = __to_incr;
|
|
_M_buf_index = 0;
|
|
if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
|
|
{
|
|
_M_buf[0] = __c;
|
|
_M_buf_last = 1;
|
|
}
|
|
else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
|
|
{
|
|
if (__is_single_code_unit<_ToFmt>(__c))
|
|
{
|
|
_M_buf[0] = __c;
|
|
_M_buf[1] = 0;
|
|
_M_buf_last = 1;
|
|
}
|
|
else
|
|
{
|
|
// From http://www.unicode.org/faq/utf_bom.html#utf16-4
|
|
const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
|
|
char16_t __lead = __lead_offset + (__c >> 10);
|
|
char16_t __trail = 0xDC00 + (__c & 0x3FF);
|
|
_M_buf[0] = __lead;
|
|
_M_buf[1] = __trail;
|
|
_M_buf_last = 2;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
static_assert(sizeof(_ToFmt) == 1);
|
|
int __bits = std::bit_width((uint32_t)__c);
|
|
if (__bits <= 7) [[likely]]
|
|
{
|
|
_M_buf[0] = __c;
|
|
_M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
|
|
_M_buf_last = 1;
|
|
}
|
|
else if (__bits <= 11)
|
|
{
|
|
_M_buf[0] = 0xC0 | (__c >> 6);
|
|
_M_buf[1] = 0x80 | (__c & 0x3F);
|
|
_M_buf[2] = _M_buf[3] = 0;
|
|
_M_buf_last = 2;
|
|
}
|
|
else if (__bits <= 16)
|
|
{
|
|
_M_buf[0] = 0xE0 | (__c >> 12);
|
|
_M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
|
|
_M_buf[2] = 0x80 | (__c & 0x3F);
|
|
_M_buf[3] = 0;
|
|
_M_buf_last = 3;
|
|
}
|
|
else
|
|
{
|
|
_M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
|
|
_M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
|
|
_M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
|
|
_M_buf[3] = 0x80 | (__c & 0x3F);
|
|
_M_buf_last = 4;
|
|
}
|
|
}
|
|
}
|
|
|
|
constexpr char32_t
|
|
_S_error()
|
|
{
|
|
char32_t __c = _ErrorHandler()();
|
|
__glibcxx_assert(__is_scalar_value(__c));
|
|
return __c;
|
|
}
|
|
|
|
constexpr _Iter
|
|
_M_first() const requires bidirectional_iterator<_Iter>
|
|
{ return _M_first_and_curr._M_first; }
|
|
|
|
constexpr _Iter&
|
|
_M_curr() { return _M_first_and_curr._M_curr; }
|
|
|
|
constexpr _Iter
|
|
_M_curr() const { return _M_first_and_curr._M_curr; }
|
|
|
|
// _M_first is not needed for non-bidirectional ranges.
|
|
template<typename _It>
|
|
struct _First_and_curr
|
|
{
|
|
_First_and_curr() = default;
|
|
|
|
constexpr
|
|
_First_and_curr(_It __curr) : _M_curr(__curr) { }
|
|
|
|
template<convertible_to<_It> _It2>
|
|
constexpr
|
|
_First_and_curr(const _First_and_curr<_It2>& __other)
|
|
: _M_curr(__other._M_curr) { }
|
|
|
|
// First code unit of the current code point for forward iterators,
|
|
// past-the-end of the current code point for input iterators.
|
|
_It _M_curr;
|
|
};
|
|
|
|
template<typename _It> requires bidirectional_iterator<_It>
|
|
struct _First_and_curr<_It>
|
|
{
|
|
_First_and_curr() = default;
|
|
|
|
constexpr
|
|
_First_and_curr(_It __first, _It __curr)
|
|
: _M_first(__first), _M_curr(__curr) { }
|
|
|
|
template<convertible_to<_It> _It2>
|
|
constexpr
|
|
_First_and_curr(const _First_and_curr<_It2>& __other)
|
|
: _M_first(__other._M_first), _M_curr(__other._M_curr) { }
|
|
|
|
_It _M_first; // Start of the underlying range.
|
|
_It _M_curr; // First code unit of the current code point.
|
|
};
|
|
|
|
// Iterators pointing to the start of the underlying range and to the
|
|
// start (or end, for non-forward iterators) of the current code point.
|
|
_First_and_curr<_Iter> _M_first_and_curr;
|
|
|
|
// The end of the underlying input range.
|
|
[[no_unique_address]] _Sent _M_last;
|
|
|
|
// Buffer holding the individual code units of the current code point.
|
|
array<value_type, 4 / sizeof(_ToFmt)> _M_buf;
|
|
|
|
uint8_t _M_buf_index = 0; // Index of current code unit in the buffer.
|
|
uint8_t _M_buf_last = 0; // Number of code units in the buffer.
|
|
uint8_t _M_to_increment = 0; // How far to advance _M_curr on increment.
|
|
|
|
template<typename _FromFmt2, typename _ToFmt2,
|
|
input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,
|
|
typename _ErrHandler>
|
|
requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>
|
|
friend class _Utf_iterator;
|
|
};
|
|
|
|
template<typename _ToFormat, ranges::input_range _Range>
|
|
class _Utf_view
|
|
: public ranges::view_interface<_Utf_view<_ToFormat, _Range>>
|
|
{
|
|
using _Iterator = _Utf_iterator<ranges::range_value_t<_Range>,
|
|
_ToFormat, ranges::iterator_t<_Range>,
|
|
ranges::sentinel_t<_Range>>;
|
|
|
|
template<typename _Iter, typename _Sent>
|
|
constexpr auto
|
|
_M_begin(_Iter __first, _Sent __last)
|
|
{
|
|
if constexpr (bidirectional_iterator<_Iter>)
|
|
return _Iterator(__first, __first, __last);
|
|
else
|
|
return _Iterator(__first, __last);
|
|
}
|
|
|
|
template<typename _Iter, typename _Sent>
|
|
constexpr auto
|
|
_M_end(_Iter __first, _Sent __last)
|
|
{
|
|
if constexpr (!is_same_v<_Iter, _Sent>)
|
|
return __last;
|
|
else if constexpr (bidirectional_iterator<_Iter>)
|
|
return _Iterator(__first, __last, __last);
|
|
else
|
|
return _Iterator(__last, __last);
|
|
}
|
|
|
|
_Range _M_base;
|
|
|
|
public:
|
|
constexpr explicit
|
|
_Utf_view(_Range&& __r) : _M_base(std::forward<_Range>(__r)) { }
|
|
|
|
constexpr auto begin()
|
|
{ return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }
|
|
|
|
constexpr auto end()
|
|
{ return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }
|
|
|
|
constexpr bool empty() const { return ranges::empty(_M_base); }
|
|
};
|
|
|
|
#ifdef __cpp_char8_t
|
|
template<typename _View>
|
|
using _Utf8_view = _Utf_view<char8_t, _View>;
|
|
#else
|
|
template<typename _View>
|
|
using _Utf8_view = _Utf_view<char, _View>;
|
|
#endif
|
|
template<typename _View>
|
|
using _Utf16_view = _Utf_view<char16_t, _View>;
|
|
template<typename _View>
|
|
using _Utf32_view = _Utf_view<char32_t, _View>;
|
|
|
|
inline namespace __v16_0_0
|
|
{
|
|
#define _GLIBCXX_GET_UNICODE_DATA 160000
|
|
#include "unicode-data.h"
|
|
#ifdef _GLIBCXX_GET_UNICODE_DATA
|
|
# error "Invalid unicode data"
|
|
#endif
|
|
|
|
// The field width of a code point.
|
|
constexpr int
|
|
__field_width(char32_t __c) noexcept
|
|
{
|
|
if (__c < __width_edges[0]) [[likely]]
|
|
return 1;
|
|
|
|
auto* __p = std::upper_bound(__width_edges, std::end(__width_edges), __c);
|
|
return (__p - __width_edges) % 2 + 1;
|
|
}
|
|
|
|
// @pre c <= 0x10FFFF
|
|
constexpr bool
|
|
__should_escape_category(char32_t __c) noexcept
|
|
{
|
|
constexpr uint32_t __mask = 0x01;
|
|
auto* __end = std::end(__escape_edges);
|
|
auto* __p = std::lower_bound(__escape_edges, __end,
|
|
(__c << 1u) + 2);
|
|
return __p[-1] & __mask;
|
|
}
|
|
|
|
|
|
// @pre c <= 0x10FFFF
|
|
constexpr _Gcb_property
|
|
__grapheme_cluster_break_property(char32_t __c) noexcept
|
|
{
|
|
constexpr uint32_t __mask = (1 << __gcb_shift_bits) - 1;
|
|
auto* __end = std::end(__gcb_edges);
|
|
auto* __p = std::lower_bound(__gcb_edges, __end,
|
|
(__c << __gcb_shift_bits) | __mask);
|
|
return _Gcb_property(__p[-1] & __mask);
|
|
}
|
|
|
|
constexpr bool
|
|
__is_incb_linker(char32_t __c) noexcept
|
|
{
|
|
const auto __end = std::end(__incb_linkers);
|
|
// Array is small enough that linear search is faster than binary search.
|
|
return _GLIBCXX_STD_A::find(__incb_linkers, __end, __c) != __end;
|
|
}
|
|
|
|
// @pre c <= 0x10FFFF
|
|
constexpr _InCB
|
|
__incb_property(char32_t __c) noexcept
|
|
{
|
|
if ((__c << 2) < __incb_edges[0]) [[likely]]
|
|
return _InCB(0);
|
|
|
|
constexpr uint32_t __mask = 0x3;
|
|
auto* __end = std::end(__incb_edges);
|
|
auto* __p = std::lower_bound(__incb_edges, __end, (__c << 2) | __mask);
|
|
return _InCB(__p[-1] & __mask);
|
|
}
|
|
|
|
constexpr bool
|
|
__is_extended_pictographic(char32_t __c)
|
|
{
|
|
if (__c < __xpicto_edges[0]) [[likely]]
|
|
return 0;
|
|
|
|
auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
|
|
return (__p - __xpicto_edges) % 2;
|
|
}
|
|
|
|
struct _Grapheme_cluster_iterator_base
|
|
{
|
|
char32_t _M_c; // First code point in the cluster.
|
|
_Gcb_property _M_prop; // GCB property of _M_c.
|
|
enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
|
|
_XPicto _M_xpicto_seq_state = _XPicto::_Init;
|
|
unsigned char _M_RI_count = 0;
|
|
bool _M_incb_linker_seen = false;
|
|
|
|
constexpr void
|
|
_M_reset(char32_t __c, _Gcb_property __p)
|
|
{
|
|
_M_c = __c;
|
|
_M_prop = __p;
|
|
_M_xpicto_seq_state = _XPicto::_Init;
|
|
_M_RI_count = 0;
|
|
_M_incb_linker_seen = false;
|
|
}
|
|
|
|
constexpr void
|
|
_M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
|
|
{
|
|
if (_M_xpicto_seq_state == _XPicto::_Failed)
|
|
return;
|
|
|
|
auto __next_state = _XPicto::_Failed;
|
|
if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
|
|
{
|
|
if (__p == _Gcb_property::_Gcb_ZWJ)
|
|
{
|
|
if (_M_xpicto_seq_state == _XPicto::_Matched)
|
|
__next_state = _XPicto::_Zwj;
|
|
// We check _M_c here so that we do the lookup at most once,
|
|
// and only for clusters containing at least one ZWJ.
|
|
else if (__is_extended_pictographic(_M_c))
|
|
__next_state = _XPicto::_Zwj;
|
|
}
|
|
else if (__p == _Gcb_property::_Gcb_Extend)
|
|
__next_state = _M_xpicto_seq_state; // no change
|
|
}
|
|
else // Zwj
|
|
{
|
|
// This assumes that all \p{Extended_Pictographic} emoji have
|
|
// Grapheme_Cluster_Break=Other.
|
|
if (__p == _Gcb_property::_Gcb_Other
|
|
&& __is_extended_pictographic(__c))
|
|
__next_state = _XPicto::_Matched;
|
|
}
|
|
_M_xpicto_seq_state = __next_state;
|
|
}
|
|
|
|
constexpr void
|
|
_M_update_ri_count(_Gcb_property __p)
|
|
{
|
|
if (__p == _Gcb_property::_Gcb_Regional_Indicator)
|
|
++_M_RI_count;
|
|
else
|
|
_M_RI_count = 0;
|
|
}
|
|
|
|
constexpr void
|
|
_M_update_incb_state(char32_t __c, _Gcb_property)
|
|
{
|
|
if (__is_incb_linker(__c))
|
|
_M_incb_linker_seen = true;
|
|
}
|
|
};
|
|
|
|
// Split a range into extended grapheme clusters.
|
|
template<ranges::forward_range _View> requires ranges::view<_View>
|
|
class _Grapheme_cluster_view
|
|
: public ranges::view_interface<_Grapheme_cluster_view<_View>>
|
|
{
|
|
public:
|
|
|
|
constexpr
|
|
_Grapheme_cluster_view(_View __v)
|
|
: _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
|
|
{ }
|
|
|
|
constexpr auto begin() const { return _M_begin; }
|
|
constexpr auto end() const { return _M_begin.end(); }
|
|
|
|
private:
|
|
struct _Iterator : private _Grapheme_cluster_iterator_base
|
|
{
|
|
private:
|
|
// Iterator over the underlying code points.
|
|
using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;
|
|
|
|
public:
|
|
// TODO: Change value_type to be subrange<_U32_iterator> instead?
|
|
// Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
|
|
// That would be the whole cluster, not just the first code point.
|
|
// Would need to store two iterators and find end of current cluster
|
|
// on increment, so operator* returns value_type(_M_base, _M_next).
|
|
using value_type = char32_t;
|
|
using iterator_concept = forward_iterator_tag;
|
|
using difference_type = ptrdiff_t;
|
|
|
|
constexpr
|
|
_Iterator(_U32_iterator __i)
|
|
: _M_base(__i)
|
|
{
|
|
if (__i != __i.end())
|
|
{
|
|
_M_c = *__i;
|
|
_M_prop = __grapheme_cluster_break_property(_M_c);
|
|
}
|
|
}
|
|
|
|
// The first code point of the current extended grapheme cluster.
|
|
constexpr value_type
|
|
operator*() const
|
|
{ return _M_c; }
|
|
|
|
constexpr auto
|
|
operator->() const
|
|
{ return &_M_c; }
|
|
|
|
// Move to the next extended grapheme cluster.
|
|
constexpr _Iterator&
|
|
operator++()
|
|
{
|
|
const auto __end = _M_base.end();
|
|
if (_M_base != __end)
|
|
{
|
|
auto __p_prev = _M_prop;
|
|
auto __it = _M_base;
|
|
while (++__it != __end)
|
|
{
|
|
char32_t __c = *__it;
|
|
auto __p = __grapheme_cluster_break_property(*__it);
|
|
_M_update_xpicto_seq_state(__c, __p);
|
|
_M_update_ri_count(__p);
|
|
_M_update_incb_state(__c, __p);
|
|
if (_M_is_break(__p_prev, __p, __it))
|
|
{
|
|
// Found a grapheme cluster break
|
|
_M_reset(__c, __p);
|
|
break;
|
|
}
|
|
__p_prev = __p;
|
|
}
|
|
_M_base = __it;
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
constexpr _Iterator
|
|
operator++(int)
|
|
{
|
|
auto __tmp = *this;
|
|
++*this;
|
|
return __tmp;
|
|
}
|
|
|
|
constexpr bool
|
|
operator==(const _Iterator& __i) const
|
|
{ return _M_base == __i._M_base; }
|
|
|
|
// This supports iter != iter.end()
|
|
constexpr bool
|
|
operator==(const ranges::sentinel_t<_View>& __i) const
|
|
{ return _M_base == __i; }
|
|
|
|
// Iterator to the start of the current cluster.
|
|
constexpr auto base() const { return _M_base.base(); }
|
|
|
|
// The end of the underlying view (not the end of the current cluster!)
|
|
constexpr auto end() const { return _M_base.end(); }
|
|
|
|
// Field width of the first code point in the cluster.
|
|
constexpr int
|
|
width() const noexcept
|
|
{ return __field_width(_M_c); }
|
|
|
|
private:
|
|
_U32_iterator _M_base;
|
|
|
|
// Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
|
|
// http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
|
|
// This implements the rules from TR29 revision 43 in Unicode 15.1.0.
|
|
// Return true if there is a break between code point with property p1
|
|
// and code point with property p2.
|
|
constexpr bool
|
|
_M_is_break(_Gcb_property __p1, _Gcb_property __p2,
|
|
_U32_iterator __curr) const
|
|
{
|
|
using enum _Gcb_property;
|
|
|
|
if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
|
|
return true; // Break after Control or LF.
|
|
|
|
if (__p1 == _Gcb_CR)
|
|
return __p2 != _Gcb_LF; // Do not break between a CR and LF.
|
|
|
|
// Rule GB5
|
|
if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
|
|
return true; // Break before Control, CR or LF.
|
|
|
|
// Rule GB6
|
|
if (__p1 == _Gcb_L)
|
|
switch (__p2)
|
|
{
|
|
case _Gcb_L:
|
|
case _Gcb_V:
|
|
case _Gcb_LV:
|
|
case _Gcb_LVT:
|
|
return false; // Do not break Hangul syllable sequences.
|
|
default:
|
|
return true;
|
|
}
|
|
|
|
// Rule GB7
|
|
if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
|
|
switch (__p2)
|
|
{
|
|
case _Gcb_V:
|
|
case _Gcb_T:
|
|
return false; // Do not break Hangul syllable sequences.
|
|
default:
|
|
return true;
|
|
}
|
|
|
|
// Rule GB8
|
|
if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
|
|
return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.
|
|
|
|
// Rule GB9
|
|
if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
|
|
return false; // Do not break before extending characters or ZWJ.
|
|
|
|
// The following GB9x rules only apply to extended grapheme clusters,
|
|
// which is what the C++ standard uses (not legacy grapheme clusters).
|
|
|
|
// Rule GB9a
|
|
if (__p2 == _Gcb_SpacingMark)
|
|
return false; // Do not break before SpacingMarks,
|
|
// Rule GB9b
|
|
if (__p1 == _Gcb_Prepend)
|
|
return false; // or after Prepend characters.
|
|
|
|
// Rule GB9c (Unicode 15.1.0)
|
|
// Do not break within certain combinations with
|
|
// Indic_Conjunct_Break (InCB)=Linker.
|
|
if (_M_incb_linker_seen
|
|
&& __incb_property(_M_c) == _InCB::_Consonant
|
|
&& __incb_property(*__curr) == _InCB::_Consonant)
|
|
{
|
|
// Match [_M_base, __curr] against regular expression
|
|
// Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
|
|
bool __have_linker = false;
|
|
auto __it = _M_base;
|
|
while (++__it != __curr)
|
|
{
|
|
if (__is_incb_linker(*__it))
|
|
__have_linker = true;
|
|
else
|
|
{
|
|
auto __incb = __incb_property(*__it);
|
|
if (__incb == _InCB::_Consonant)
|
|
__have_linker = false;
|
|
else if (__incb != _InCB::_Extend)
|
|
break;
|
|
}
|
|
}
|
|
if (__it == __curr && __have_linker)
|
|
return false;
|
|
}
|
|
|
|
// Rule GB11
|
|
// Do not break within emoji modifier sequences
|
|
// or emoji zwj sequences.
|
|
if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
|
|
return false;
|
|
|
|
// Rules GB12 and GB13
|
|
// Do not break within emoji flag sequences. That is, do not break
|
|
// between regional indicator (RI) symbols if there is an odd number
|
|
// of RI characters before the break point.
|
|
if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
|
|
return (_M_RI_count & 1) == 0;
|
|
|
|
// Rule GB999
|
|
return true; // Otherwise, break everywhere.
|
|
}
|
|
};
|
|
|
|
_Iterator _M_begin;
|
|
};
|
|
|
|
} // namespace __v16_0_0
|
|
|
|
// Return the field width of a string.
|
|
template<typename _CharT>
|
|
constexpr size_t
|
|
__field_width(basic_string_view<_CharT> __s)
|
|
{
|
|
if (__s.empty()) [[unlikely]]
|
|
return 0;
|
|
_Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
|
|
auto __it = __gc.begin();
|
|
const auto __end = __gc.end();
|
|
size_t __n = __it.width();
|
|
while (++__it != __end)
|
|
__n += __it.width();
|
|
return __n;
|
|
}
|
|
|
|
// Truncate a string to at most `__max` field width units, and return the
|
|
// resulting field width.
|
|
template<typename _CharT>
|
|
constexpr size_t
|
|
__truncate(basic_string_view<_CharT>& __s, size_t __max)
|
|
{
|
|
if (__s.empty()) [[unlikely]]
|
|
return 0;
|
|
|
|
_Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
|
|
auto __it = __gc.begin();
|
|
const auto __end = __gc.end();
|
|
size_t __n = __it.width();
|
|
if (__n > __max)
|
|
{
|
|
__s = {};
|
|
return 0;
|
|
}
|
|
while (++__it != __end)
|
|
{
|
|
size_t __n2 = __n + __it.width();
|
|
if (__n2 > __max)
|
|
{
|
|
__s = basic_string_view<_CharT>(__s.begin(), __it.base());
|
|
return __n;
|
|
}
|
|
__n = __n2;
|
|
}
|
|
return __n;
|
|
}
|
|
|
|
template<typename _CharT>
|
|
consteval bool
|
|
__literal_encoding_is_unicode()
|
|
{
|
|
if constexpr (is_same_v<_CharT, char16_t>)
|
|
return true;
|
|
else if constexpr (is_same_v<_CharT, char32_t>)
|
|
return true;
|
|
#ifdef __cpp_char8_t
|
|
else if constexpr (is_same_v<_CharT, char8_t>)
|
|
return true;
|
|
#endif
|
|
|
|
const char* __enc = "";
|
|
|
|
#ifdef __GNUC_EXECUTION_CHARSET_NAME
|
|
auto __remove_iso10646_prefix = [](const char* __s) {
|
|
// GNU iconv allows "ISO-10646/" prefix (case-insensitive).
|
|
if (__s[0] == 'I' || __s[0] == 'i')
|
|
if (__s[1] == 'S' || __s[1] == 's')
|
|
if (__s[2] == 'O' || __s[2] == 'o')
|
|
if (string_view(__s + 3).starts_with("-10646/"))
|
|
return __s + 10;
|
|
return __s;
|
|
};
|
|
|
|
if constexpr (is_same_v<_CharT, char>)
|
|
__enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
|
|
# if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
|
|
else
|
|
__enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
|
|
# endif
|
|
|
|
if ((__enc[0] == 'U' || __enc[0] == 'u')
|
|
&& (__enc[1] == 'T' || __enc[1] == 't')
|
|
&& (__enc[2] == 'F' || __enc[2] == 'f'))
|
|
{
|
|
__enc += 3;
|
|
if (__enc[0] == '-')
|
|
++__enc;
|
|
if (__enc[0] == '8')
|
|
return __enc[1] == '\0' || string_view(__enc + 1) == "//";
|
|
else if constexpr (!is_same_v<_CharT, char>)
|
|
{
|
|
string_view __s(__enc);
|
|
if (__s.ends_with("//"))
|
|
__s.remove_suffix(2);
|
|
if (__s.ends_with("LE") || __s.ends_with("BE"))
|
|
__s.remove_suffix(2);
|
|
return __s == "16" || __s == "32";
|
|
}
|
|
}
|
|
#elif defined __clang_literal_encoding__
|
|
if constexpr (is_same_v<_CharT, char>)
|
|
__enc = __clang_literal_encoding__;
|
|
# if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
|
|
else
|
|
__enc = __clang_wide_literal_encoding__;
|
|
# endif
|
|
// Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
|
|
string_view __s(__enc);
|
|
if (__s == "UTF-8")
|
|
return true;
|
|
else if constexpr (!is_same_v<_CharT, char>)
|
|
return __s == "UTF-16" || __s == "UTF-32";
|
|
#endif
|
|
|
|
return false;
|
|
}
|
|
|
|
consteval bool
|
|
__literal_encoding_is_utf8()
|
|
{ return __literal_encoding_is_unicode<char>(); }
|
|
|
|
consteval bool
|
|
__literal_encoding_is_extended_ascii()
|
|
{
|
|
return '0' == 0x30 && 'A' == 0x41 && 'Z' == 0x5a
|
|
&& 'a' == 0x61 && 'z' == 0x7a;
|
|
}
|
|
|
|
// https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
|
|
constexpr bool
|
|
__charset_alias_match(string_view __a, string_view __b)
|
|
{
|
|
// Map alphanumeric chars to their base 64 value, everything else to 127.
|
|
auto __map = [](char __c, bool& __num) -> unsigned char {
|
|
if (__c == '0') [[unlikely]]
|
|
return __num ? 0 : 127;
|
|
const auto __v = __detail::__from_chars_alnum_to_val(__c);
|
|
__num = __v < 10;
|
|
return __v;
|
|
};
|
|
|
|
auto __ptr_a = __a.begin(), __end_a = __a.end();
|
|
auto __ptr_b = __b.begin(), __end_b = __b.end();
|
|
bool __num_a = false, __num_b = false;
|
|
|
|
while (true)
|
|
{
|
|
// Find the value of the next alphanumeric character in each string.
|
|
unsigned char __val_a{}, __val_b{};
|
|
while (__ptr_a != __end_a
|
|
&& (__val_a = __map(*__ptr_a, __num_a)) == 127)
|
|
++__ptr_a;
|
|
while (__ptr_b != __end_b
|
|
&& (__val_b = __map(*__ptr_b, __num_b)) == 127)
|
|
++__ptr_b;
|
|
// Stop when we reach the end of a string, or get a mismatch.
|
|
if (__ptr_a == __end_a)
|
|
return __ptr_b == __end_b;
|
|
else if (__ptr_b == __end_b)
|
|
return false;
|
|
else if (__val_a != __val_b)
|
|
return false; // Found non-matching characters.
|
|
++__ptr_a;
|
|
++__ptr_b;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
} // namespace __unicode
|
|
|
|
namespace ranges
|
|
{
|
|
template<typename _To, typename _Range>
|
|
inline constexpr bool
|
|
enable_borrowed_range<std::__unicode::_Utf_view<_To, _Range>>
|
|
= enable_borrowed_range<_Range>;
|
|
|
|
template<typename _Range>
|
|
inline constexpr bool
|
|
enable_borrowed_range<std::__unicode::_Grapheme_cluster_view<_Range>>
|
|
= enable_borrowed_range<_Range>;
|
|
} // namespace ranges
|
|
|
|
_GLIBCXX_END_NAMESPACE_VERSION
|
|
} // namespace std
|
|
#endif // C++20
|
|
#endif // _GLIBCXX_UNICODE_H
|