gcc/libstdc++-v3/include/bits/unicode.h

// Unicode utilities -*- C++ -*-

// Copyright The GNU Toolchain Authors.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

/** @file include/bits/unicode.h
 *  This is an internal header file, included by other library headers.
 *  Do not attempt to use it directly. @headername{format}
 */

#ifndef _GLIBCXX_UNICODE_H
#define _GLIBCXX_UNICODE_H 1

#if __cplusplus >= 202002L
#include <array>
#include <bit>      // bit_width
#include <charconv> // __detail::__from_chars_alnum_to_val_table
#include <string_view>
#include <cstdint>
#include <bits/stl_algo.h>
#include <bits/stl_iterator.h>
#include <bits/ranges_base.h> // iterator_t, sentinel_t, input_range, etc.
#include <bits/ranges_util.h> // view_interface

namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
namespace __unicode
{
  // A Unicode code point that is not a high or low surrogate.
  constexpr bool
  __is_scalar_value(char32_t __c)
  {
    if (__c < 0xD800) [[likely]]
      return true;
    return 0xDFFF < __c && __c <= 0x10FFFF;
  }

  // A code point that can be encoded in a single code unit of type _CharT.
  template<typename _CharT>
    constexpr bool
    __is_single_code_unit(char32_t __c)
    {
      if constexpr (__gnu_cxx::__int_traits<_CharT>::__max <= 0xFF)
	return __c <= 0x7F; // ASCII character
      else
	return __c < __gnu_cxx::__int_traits<_CharT>::__max
		       && __is_scalar_value(__c);
    }

  // Based on https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2728r6.html#add-the-transcoding-iterator-template

  struct _Repl
  {
    constexpr char32_t
    operator()() const noexcept
    { return 0xFFFD; }
  };

  struct _Null_sentinel_t
  {
    template<input_iterator _It>
      requires default_initializable<iter_value_t<_It>>
	&& equality_comparable_with<iter_reference_t<_It>, iter_value_t<_It>>
      friend constexpr auto
      operator==(_It __it, _Null_sentinel_t)
      { return *__it == iter_value_t<_It>{}; }
  };

  // An iterator over an input range of FromFmt code units that yields either
  // UTF-8, UTF-16, or UTF-32, as a range of ToFmt code units.
  // The code units from the input range are interpreted as Unicode code points
  // and the iterator produces the individual code unit for each code point.
  // Invalid sequences in the input are replaced with U+FFDD so that the result
  // is always valid UTF-8, UTF-16, or UTF-32.
  //
  // The iterator knows the bounds of the underlying input range and will not
  // read outside those bounds (incrementing or decrementing at the boundary
  // is erroneously idempotent).
  //
  // On construction, the iterator attemps to decode a single code point from
  // the input range and then encode it into an internal buffer in the output
  // format, e.g. if the input is UTF-8 and the output is UTF-16, it might read
  // three char8_t code units from the input and store two char16_t code units
  // in its buffer. Incrementing the iterator will first iterate over buffer,
  // yielding each code unit in turn, and then extract another code point from
  // the input. Failure to extract a valid code point from the input will store
  // U+FFFD in the buffer, encoded as the appropriate code units of type ToFmt.
  template<typename _FromFmt, typename _ToFmt,
	   input_iterator _Iter, sentinel_for<_Iter> _Sent = _Iter,
	   typename _ErrorHandler = _Repl>
    requires convertible_to<iter_value_t<_Iter>, _FromFmt>
    class _Utf_iterator
    {
      static_assert(forward_iterator<_Iter> || noexcept(_ErrorHandler()()));

    public:
      using value_type = _ToFmt;
      using difference_type = iter_difference_t<_Iter>;
      using reference = value_type;
      using iterator_concept
	= std::__detail::__clamp_iter_cat<__iter_category_t<_Iter>,
					  bidirectional_iterator_tag>;

      constexpr _Utf_iterator() = default;

      constexpr
      _Utf_iterator(_Iter __first, _Iter __it, _Sent __last)
      requires bidirectional_iterator<_Iter>
      : _M_first_and_curr{__first, __it}, _M_last(__last)
      {
	if (_M_curr() != _M_last)
	  _M_read();
	else
	  _M_buf = {};
      }

      constexpr
      _Utf_iterator(_Iter __it, _Sent __last)
      requires (!bidirectional_iterator<_Iter>)
      : _M_first_and_curr{__it}, _M_last(__last)
      {
	if (_M_curr() != _M_last)
	  _M_read();
	else
	  _M_buf = {};
      }

      template<class _Iter2, class _Sent2>
	requires convertible_to<_Iter2, _Iter> && convertible_to<_Sent2, _Sent>
	constexpr
	_Utf_iterator(const _Utf_iterator<_FromFmt, _ToFmt, _Iter2, _Sent2,
					  _ErrorHandler>& __other)
	: _M_buf(__other._M_buf), _M_first_and_curr(__other._M_first_and_curr),
	  _M_buf_index(__other._M_buf_index), _M_buf_last(__other._M_buf_last),
	  _M_last(__other._M_last)
	{ }

      [[nodiscard]]
      constexpr _Iter
      begin() const requires bidirectional_iterator<_Iter>
      { return _M_first(); }

      [[nodiscard]]
      constexpr _Sent
      end() const { return _M_last; }

      [[nodiscard]]
      constexpr _Iter
      base() const requires forward_iterator<_Iter>
      { return _M_curr(); }

      [[nodiscard]]
      constexpr iter_difference_t<_Iter>
      _M_units() const requires forward_iterator<_Iter>
      { return _M_to_increment; }

      [[nodiscard]]
      constexpr value_type
      operator*() const { return _M_buf[_M_buf_index]; }

      constexpr _Utf_iterator&
      operator++()
      {
	if (_M_buf_index + 1 < _M_buf_last)
	  ++_M_buf_index; // Move to the next code unit in the buffer.
	else if (_M_curr() != _M_last)
	  {
	    // Advance past the current code point (for non-forward iterators
	    // we already moved there after decoding the last code point).
	    if constexpr (forward_iterator<_Iter>)
	      std::advance(_M_curr(), _M_to_increment);
	    if (_M_curr() == _M_last)
	      _M_buf_index = 0;
	    else // Decode next code point from the input and update buffer.
	      _M_read();
	  }
	// else erroneous, but ignored for now.
	return *this;
      }

      constexpr _Utf_iterator
      operator++(int)
      {
	auto __tmp = *this;
	++*this;
	return __tmp;
      }

      constexpr _Utf_iterator&
      operator--() requires bidirectional_iterator<_Iter>
      {
	if (_M_buf_index > 0)
	  --_M_buf_index;
	else if (_M_curr() != _M_first())
	  {
	    _M_read_reverse();
	    _M_buf_index = _M_buf_last - 1;
	    ranges::advance(_M_curr(), -_M_to_increment);
	  }
	// else erroneous, but ignored for now.
	return *this;
      }

      constexpr _Utf_iterator
      operator--(int)
      {
	auto __tmp = *this;
	--*this;
	return __tmp;
      }

      [[nodiscard]]
      friend constexpr bool
      operator==(_Utf_iterator __lhs, _Utf_iterator __rhs)
      requires forward_iterator<_Iter> || requires (_Iter __i) { __i != __i; }
      {
	if constexpr (forward_iterator<_Iter>)
	  return __lhs._M_curr() == __rhs._M_curr()
		   && __lhs._M_buf_index == __rhs._M_buf_index;
	else if (__lhs._M_curr() != __rhs._M_curr())
	  return false;
	else if (__lhs._M_buf_index == __rhs._M_buf_index
		   && __lhs._M_buf_last == __rhs._M_buf_last)
	  return true;
	else
	  return __lhs._M_buf_index == __lhs._M_buf_last
		   && __rhs._M_buf_index == __rhs._M_buf_last;
      }

      [[nodiscard]]
      friend constexpr bool
      operator==(_Utf_iterator __lhs, _Sent __rhs)
      {
	if constexpr (forward_iterator<_Iter>)
	  return __lhs._M_curr() == __rhs;
	else
	  return __lhs._M_curr() == __rhs
		   && __lhs._M_buf_index == __lhs._M_buf_last;
      }

    private:
      constexpr void
      _M_read()
      {
	if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
	  _M_read_utf8();
	else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
	  _M_read_utf16();
	else
	  {
	    static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
	    _M_read_utf32();
	  }
      }

      constexpr void
      _M_read_reverse() requires bidirectional_iterator<_Iter>
      {
	if constexpr (sizeof(_FromFmt) == sizeof(uint8_t))
	  _M_read_reverse_utf8();
	else if constexpr (sizeof(_FromFmt) == sizeof(uint16_t))
	  _M_read_reverse_utf16();
	else
	  {
	    static_assert(sizeof(_FromFmt) == sizeof(uint32_t));
	    _M_read_reverse_utf32();
	  }
      }

      template<typename>
	struct _Guard
	{
	  _Guard(void*, _Iter&) { }
	};

      template<typename _It> requires forward_iterator<_It>
	struct _Guard<_It>
	{
	  constexpr ~_Guard() { _M_this->_M_curr() = std::move(_M_orig); }
	  _Utf_iterator* _M_this;
	  _It _M_orig;
	};

      constexpr char32_t
      _M_read_utf8()
      {
	_Guard<_Iter> __g{this, _M_curr()};
	char32_t __c{};
	const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
	uint8_t __u = *_M_curr()++;
	uint8_t __to_incr = 1;
	auto __incr = [&, this] {
	  ++__to_incr;
	  return ++_M_curr();
	};

	if (__u <= 0x7F) [[likely]]      // 0x00 to 0x7F
	  __c = __u;
	else if (__u < 0xC2) [[unlikely]]
	  __c = _S_error();
	else if (_M_curr() == _M_last) [[unlikely]]
	  __c = _S_error();
	else if (__u <= 0xDF) // 0xC2 to 0xDF
	  {
	    __c = __u & 0x1F;
	    __u = *_M_curr();

	    if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
	      __c = _S_error();
	    else
	      {
		__c = (__c << 6) | (__u & 0x3F);
		__incr();
	      }
	  }
	else if (__u <= 0xEF) // 0xE0 to 0xEF
	  {
	    const uint8_t __lo_bound_2 = __u == 0xE0 ? 0xA0 : __lo_bound;
	    const uint8_t __hi_bound_2 = __u == 0xED ? 0x9F : __hi_bound;

	    __c = __u & 0x0F;
	    __u = *_M_curr();

	    if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
	      __c = _S_error();
	    else if (__incr() == _M_last) [[unlikely]]
	      __c = _S_error();
	    else
	      {
		__c = (__c << 6) | (__u & 0x3F);
		__u = *_M_curr();

		if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
		  __c = _S_error();
		else
		  {
		    __c = (__c << 6) | (__u & 0x3F);
		    __incr();
		  }
	      }
	  }
	else if (__u <= 0xF4) // 0xF0 to 0xF4
	  {
	    const uint8_t __lo_bound_2 = __u == 0xF0 ? 0x90 : __lo_bound;
	    const uint8_t __hi_bound_2 = __u == 0xF4 ? 0x8F : __hi_bound;

	    __c = __u & 0x07;
	    __u = *_M_curr();

	    if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
	      __c = _S_error();
	    else if (__incr() == _M_last) [[unlikely]]
	      __c = _S_error();
	    else
	      {
		__c = (__c << 6) | (__u & 0x3F);
		__u = *_M_curr();

		if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
		  __c = _S_error();
		else if (__incr() == _M_last) [[unlikely]]
		  __c = _S_error();
		else
		  {
		    __c = (__c << 6) | (__u & 0x3F);
		    __u = *_M_curr();

		    if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
		      __c = _S_error();
		    else
		      {
			__c = (__c << 6) | (__u & 0x3F);
			__incr();
		      }
		  }
	      }
	  }
	else [[unlikely]]
	  __c = _S_error();

	_M_update(__c, __to_incr);

	return __c;
      }

      constexpr void
      _M_read_utf16()
      {
	_Guard<_Iter> __g{this, _M_curr()};
	char32_t __c{};
	uint16_t __u = *_M_curr()++;
	uint8_t __to_incr = 1;

	if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
	  __c = __u;
	else if (__u < 0xDC00 && _M_curr() != _M_last)
	  {
	    uint16_t __u2 = *_M_curr();
	    if (__u2 < 0xDC00 || __u2 > 0xDFFF) [[unlikely]]
	      __c = _S_error();
	    else
	      {
		++_M_curr();
		__to_incr = 2;
		uint32_t __x = (__u & 0x3F) << 10 | (__u2 & 0x3FF);
		uint32_t __w = (__u >> 6) & 0x1F;
		__c = (__w + 1) << 16 | __x;
	      }
	  }
	else
	  __c = _S_error();

	_M_update(__c, __to_incr);
      }

      constexpr void
      _M_read_utf32()
      {
	_Guard<_Iter> __g{this, _M_curr()};
	char32_t __c = *_M_curr()++;
	if (!__is_scalar_value(__c)) [[unlikely]]
	  __c = _S_error();
	_M_update(__c, 1);
      }

      constexpr void
      _M_read_reverse_utf8() requires bidirectional_iterator<_Iter>
      {
	const auto __first = _M_first();
	auto __curr = _M_curr();
	// The code point we decode:
	char32_t __c{};
	// The last code unit read:
	uint8_t __u = *--__curr;
	// Count of bytes read:
	uint8_t __to_incr = 1;

	if (__u <= 0x7F) [[likely]]
	  {
	    _M_update(__u, 1);
	    return;
	  }

	// Continuation bytes match 10xxxxxx
	auto __is_continuation = [](uint8_t __b) {
	  return (__b & 0xC0) == 0x80;
	};
	// 0xC0 and 0xC1 would produce overlong encodings of ASCII characters.
	// 0xF5-0xFF would produce code points above U+10FFFF
	auto __is_invalid = [](uint8_t __b) {
	  return (__b & 0xFE) == 0xC0 || __b >= 0xF5;
	};

	// No valid or invalid multibyte sequence is longer than 4 bytes,
	// so skip back over at most four bytes.
	while (__is_continuation(__u) && __to_incr < 4 && __curr != __first)
	  {
	    ++__to_incr;
	    __u = *--__curr;
	  }

	// If the last byte read was a continuation byte then either we read
	// four continuation bytes, or stopped at the start of the sequence.
	// Either way, the maximal subparts are the individual continuation
	// bytes so each one should be replaced with U+FFFD.
	if (__is_continuation(__u) || __is_invalid(__u)) [[unlikely]]
	  {
	    // Either found four continuation bytes (maximum allowed is three)
	    // or first non-continuation byte is an invalid UTF-8 code unit.
	    _M_update(_S_error(), 1);
	    return;
	  }
	// __u is a valid start byte so use countl_one to get the expected
	// length of the multibyte sequence that starts with this byte.
	int __seq_length = std::countl_one((unsigned char)__u);
	if (__seq_length < __to_incr) [[unlikely]]
	  {
	    // If the expected number of continuation bytes is less than
	    // the number we found, then the last continuation byte is a
	    // maximal subpart and the decremented iterator points to it.
	    _M_update(_S_error(), 1);
	    return;
	  }

	auto __orig = std::__exchange(_M_curr(), std::move(__curr));
	if (_M_read_utf8() == _S_error()) [[unlikely]]
	  {
	    if (_M_to_increment < __to_incr) // Read truncated sequence, set
	      _M_to_increment = 1;           // curr to last continuation byte.
	  }

	_M_curr() = std::move(__orig);
	// operator--() will move back by _M_to_increment
      }

      constexpr void
      _M_read_reverse_utf16() requires bidirectional_iterator<_Iter>
      {
	_Guard<_Iter> __g{this, _M_curr()};
	char32_t __c{};
	uint16_t __u = *--_M_curr();
	uint8_t __to_incr = 1;

	if (__u < 0xD800 || __u > 0xDFFF) [[likely]]
	  __c = __u;
	else if (__u >= 0xDC00 && _M_curr() != _M_first()) [[likely]]
	  {
	    // read a low surrogate, expect a high surrogate before it.
	    uint16_t __u2 = *--_M_curr();
	    if (__u2 < 0xD800 || __u2 >= 0xDC00) [[unlikely]]
	      __c = _S_error(); // unpaired low surrogate
	    else
	      {
		__to_incr = 2;
		uint32_t __x = (__u2 & 0x3F) << 10 | (__u & 0x3FF);
		uint32_t __w = (__u2 >> 6) & 0x1F;
		__c = (__w + 1) << 16 | __x;
	      }
	  }
	else
	  __c = _S_error(); // unpaired surrogate

	_M_update(__c, __to_incr);
      }

      constexpr void
      _M_read_reverse_utf32() requires bidirectional_iterator<_Iter>
      {
	_Guard<_Iter> __g{this, _M_curr()};
	char32_t __c = *--_M_curr();
	if (!__is_scalar_value(__c)) [[unlikely]]
	  __c = _S_error();
	_M_update(__c, 1);
      }

      // Encode the code point __c as one or more code units in _M_buf.
      constexpr void
      _M_update(char32_t __c, uint8_t __to_incr)
      {
	_M_to_increment = __to_incr;
	_M_buf_index = 0;
	if constexpr (sizeof(_ToFmt) == sizeof(uint32_t))
	  {
	    _M_buf[0] = __c;
	    _M_buf_last = 1;
	  }
	else if constexpr (sizeof(_ToFmt) == sizeof(uint16_t))
	  {
	    if (__is_single_code_unit<_ToFmt>(__c))
	      {
		_M_buf[0] = __c;
		_M_buf[1] = 0;
		_M_buf_last = 1;
	      }
	    else
	      {
		// From http://www.unicode.org/faq/utf_bom.html#utf16-4
		const char32_t __lead_offset = 0xD800 - (0x10000 >> 10);
		char16_t __lead = __lead_offset + (__c >> 10);
		char16_t __trail = 0xDC00 + (__c & 0x3FF);
		_M_buf[0] = __lead;
		_M_buf[1] = __trail;
		_M_buf_last = 2;
	      }
	  }
	else
	  {
	    static_assert(sizeof(_ToFmt) == 1);
	    int __bits = std::bit_width((uint32_t)__c);
	    if (__bits <= 7) [[likely]]
	      {
		_M_buf[0] = __c;
		_M_buf[1] = _M_buf[2] = _M_buf[3] = 0;
		_M_buf_last = 1;
	      }
	    else if (__bits <= 11)
	      {
		_M_buf[0] = 0xC0 | (__c >> 6);
		_M_buf[1] = 0x80 | (__c & 0x3F);
		_M_buf[2] = _M_buf[3] = 0;
		_M_buf_last = 2;
	      }
	    else if (__bits <= 16)
	      {
		_M_buf[0] = 0xE0 | (__c >> 12);
		_M_buf[1] = 0x80 | ((__c >> 6) & 0x3F);
		_M_buf[2] = 0x80 | (__c & 0x3F);
		_M_buf[3] = 0;
		_M_buf_last = 3;
	      }
	    else
	      {
		_M_buf[0] = 0xF0 | ((__c >> 18) & 0x07);
		_M_buf[1] = 0x80 | ((__c >> 12) & 0x3F);
		_M_buf[2] = 0x80 | ((__c >> 6) & 0x3F);
		_M_buf[3] = 0x80 | (__c & 0x3F);
		_M_buf_last = 4;
	      }
	  }
      }

      constexpr char32_t
      _S_error()
      {
	char32_t __c = _ErrorHandler()();
	__glibcxx_assert(__is_scalar_value(__c));
	return __c;
      }

      constexpr _Iter
      _M_first() const requires bidirectional_iterator<_Iter>
      { return _M_first_and_curr._M_first; }

      constexpr _Iter&
      _M_curr() { return _M_first_and_curr._M_curr; }

      constexpr _Iter
      _M_curr() const { return _M_first_and_curr._M_curr; }

      // _M_first is not needed for non-bidirectional ranges.
      template<typename _It>
	struct _First_and_curr
	{
	  _First_and_curr() = default;

	  constexpr
	  _First_and_curr(_It __curr) : _M_curr(__curr) { }

	  template<convertible_to<_It> _It2>
	    constexpr
	    _First_and_curr(const _First_and_curr<_It2>& __other)
	    : _M_curr(__other._M_curr) { }

	  // First code unit of the current code point for forward iterators,
	  // past-the-end of the current code point for input iterators.
	  _It _M_curr;
	};

      template<typename _It> requires bidirectional_iterator<_It>
	struct _First_and_curr<_It>
	{
	  _First_and_curr() = default;

	  constexpr
	  _First_and_curr(_It __first, _It __curr)
	  : _M_first(__first), _M_curr(__curr) { }

	  template<convertible_to<_It> _It2>
	    constexpr
	    _First_and_curr(const _First_and_curr<_It2>& __other)
	    : _M_first(__other._M_first), _M_curr(__other._M_curr) { }

	  _It _M_first; // Start of the underlying range.
	  _It _M_curr;  // First code unit of the current code point.
	};

      // Iterators pointing to the start of the underlying range and to the
      // start (or end, for non-forward iterators) of the current code point.
      _First_and_curr<_Iter> _M_first_and_curr;

      // The end of the underlying input range.
      [[no_unique_address]] _Sent _M_last;

      // Buffer holding the individual code units of the current code point.
      array<value_type, 4 / sizeof(_ToFmt)> _M_buf;

      uint8_t _M_buf_index = 0;    // Index of current code unit in the buffer.
      uint8_t _M_buf_last = 0;     // Number of code units in the buffer.
      uint8_t _M_to_increment = 0; // How far to advance _M_curr on increment.

      template<typename _FromFmt2, typename _ToFmt2,
	       input_iterator _Iter2, sentinel_for<_Iter2> _Sent2,
	       typename _ErrHandler>
	requires convertible_to<iter_value_t<_Iter2>, _FromFmt2>
	friend class _Utf_iterator;
    };

  template<typename _ToFormat, ranges::input_range _Range>
    class _Utf_view
    : public ranges::view_interface<_Utf_view<_ToFormat, _Range>>
    {
      using _Iterator = _Utf_iterator<ranges::range_value_t<_Range>,
				      _ToFormat, ranges::iterator_t<_Range>,
				      ranges::sentinel_t<_Range>>;

      template<typename _Iter, typename _Sent>
	constexpr auto
	_M_begin(_Iter __first, _Sent __last)
	{
	  if constexpr (bidirectional_iterator<_Iter>)
	    return _Iterator(__first, __first, __last);
	  else
	    return _Iterator(__first, __last);
	}

      template<typename _Iter, typename _Sent>
	constexpr auto
	_M_end(_Iter __first, _Sent __last)
	{
	  if constexpr (!is_same_v<_Iter, _Sent>)
	    return __last;
	  else if constexpr (bidirectional_iterator<_Iter>)
	    return _Iterator(__first, __last, __last);
	  else
	    return _Iterator(__last, __last);
	}

      _Range _M_base;

    public:
      constexpr explicit
      _Utf_view(_Range&& __r) : _M_base(std::forward<_Range>(__r)) { }

      constexpr auto begin()
      { return _M_begin(ranges::begin(_M_base), ranges::end(_M_base)); }

      constexpr auto end()
      { return _M_end(ranges::begin(_M_base), ranges::end(_M_base)); }

      constexpr bool empty() const { return ranges::empty(_M_base); }
    };

#ifdef __cpp_char8_t
  template<typename _View>
    using _Utf8_view = _Utf_view<char8_t, _View>;
#else
  template<typename _View>
    using _Utf8_view = _Utf_view<char, _View>;
#endif
  template<typename _View>
    using _Utf16_view = _Utf_view<char16_t, _View>;
  template<typename _View>
    using _Utf32_view = _Utf_view<char32_t, _View>;

inline namespace __v16_0_0
{
#define _GLIBCXX_GET_UNICODE_DATA 160000
#include "unicode-data.h"
#ifdef _GLIBCXX_GET_UNICODE_DATA
# error "Invalid unicode data"
#endif

  // The field width of a code point.
  constexpr int
  __field_width(char32_t __c) noexcept
  {
    if (__c < __width_edges[0]) [[likely]]
      return 1;

    auto* __p = std::upper_bound(__width_edges, std::end(__width_edges), __c);
    return (__p - __width_edges) % 2 + 1;
  }

  // @pre c <= 0x10FFFF
  constexpr bool
  __should_escape_category(char32_t __c) noexcept
  {
    constexpr uint32_t __mask = 0x01;
    auto* __end = std::end(__escape_edges);
    auto* __p = std::lower_bound(__escape_edges, __end,
				 (__c << 1u) + 2);
    return __p[-1] & __mask;
  }


  // @pre c <= 0x10FFFF
  constexpr _Gcb_property
  __grapheme_cluster_break_property(char32_t __c) noexcept
  {
    constexpr uint32_t __mask = (1 << __gcb_shift_bits) - 1;
    auto* __end = std::end(__gcb_edges);
    auto* __p = std::lower_bound(__gcb_edges, __end,
				 (__c << __gcb_shift_bits) | __mask);
    return _Gcb_property(__p[-1] & __mask);
  }

  constexpr bool
  __is_incb_linker(char32_t __c) noexcept
  {
    const auto __end = std::end(__incb_linkers);
    // Array is small enough that linear search is faster than binary search.
    return _GLIBCXX_STD_A::find(__incb_linkers, __end, __c) != __end;
  }

  // @pre c <= 0x10FFFF
  constexpr _InCB
  __incb_property(char32_t __c) noexcept
  {
    if ((__c << 2) < __incb_edges[0]) [[likely]]
      return _InCB(0);

    constexpr uint32_t __mask = 0x3;
    auto* __end = std::end(__incb_edges);
    auto* __p = std::lower_bound(__incb_edges, __end, (__c << 2) | __mask);
    return _InCB(__p[-1] & __mask);
  }

  constexpr bool
  __is_extended_pictographic(char32_t __c)
  {
    if (__c < __xpicto_edges[0]) [[likely]]
      return 0;

    auto* __p = std::upper_bound(__xpicto_edges, std::end(__xpicto_edges), __c);
    return (__p - __xpicto_edges) % 2;
  }

  struct _Grapheme_cluster_iterator_base
  {
    char32_t _M_c; // First code point in the cluster.
    _Gcb_property _M_prop; // GCB property of _M_c.
    enum class _XPicto : unsigned char { _Init, _Zwj, _Matched, _Failed };
    _XPicto _M_xpicto_seq_state = _XPicto::_Init;
    unsigned char _M_RI_count = 0;
    bool _M_incb_linker_seen = false;

    constexpr void
    _M_reset(char32_t __c, _Gcb_property __p)
    {
      _M_c = __c;
      _M_prop = __p;
      _M_xpicto_seq_state = _XPicto::_Init;
      _M_RI_count = 0;
      _M_incb_linker_seen = false;
    }

    constexpr void
    _M_update_xpicto_seq_state(char32_t __c, _Gcb_property __p)
    {
      if (_M_xpicto_seq_state == _XPicto::_Failed)
	return;

      auto __next_state = _XPicto::_Failed;
      if (_M_xpicto_seq_state != _XPicto::_Zwj) // i.e. Init or Matched
	{
	  if (__p == _Gcb_property::_Gcb_ZWJ)
	    {
	      if (_M_xpicto_seq_state == _XPicto::_Matched)
		__next_state = _XPicto::_Zwj;
	      // We check _M_c here so that we do the lookup at most once,
	      // and only for clusters containing at least one ZWJ.
	      else if (__is_extended_pictographic(_M_c))
		__next_state = _XPicto::_Zwj;
	    }
	  else if (__p == _Gcb_property::_Gcb_Extend)
	    __next_state = _M_xpicto_seq_state; // no change
	}
      else // Zwj
	{
	  // This assumes that all \p{Extended_Pictographic} emoji have
	  // Grapheme_Cluster_Break=Other.
	  if (__p == _Gcb_property::_Gcb_Other
		&& __is_extended_pictographic(__c))
	    __next_state = _XPicto::_Matched;
	}
      _M_xpicto_seq_state = __next_state;
    }

    constexpr void
    _M_update_ri_count(_Gcb_property __p)
    {
      if (__p == _Gcb_property::_Gcb_Regional_Indicator)
	++_M_RI_count;
      else
	_M_RI_count = 0;
    }

    constexpr void
    _M_update_incb_state(char32_t __c, _Gcb_property)
    {
      if (__is_incb_linker(__c))
	_M_incb_linker_seen = true;
    }
  };

  // Split a range into extended grapheme clusters.
  template<ranges::forward_range _View> requires ranges::view<_View>
    class _Grapheme_cluster_view
    : public ranges::view_interface<_Grapheme_cluster_view<_View>>
    {
    public:

      constexpr
      _Grapheme_cluster_view(_View __v)
      : _M_begin(_Utf32_view<_View>(std::move(__v)).begin())
      { }

      constexpr auto begin() const { return _M_begin; }
      constexpr auto end() const { return _M_begin.end(); }

    private:
      struct _Iterator : private _Grapheme_cluster_iterator_base
      {
      private:
	// Iterator over the underlying code points.
	using _U32_iterator = ranges::iterator_t<_Utf32_view<_View>>;

      public:
	// TODO: Change value_type to be subrange<_U32_iterator> instead?
	// Alternatively, value_type could be _Utf32_view<iterator_t<_View>>.
	// That would be the whole cluster, not just the first code point.
	// Would need to store two iterators and find end of current cluster
	// on increment, so operator* returns value_type(_M_base, _M_next).
	using value_type = char32_t;
	using iterator_concept = forward_iterator_tag;
	using difference_type = ptrdiff_t;

	constexpr
	_Iterator(_U32_iterator __i)
	: _M_base(__i)
	{
	  if (__i != __i.end())
	    {
	      _M_c = *__i;
	      _M_prop = __grapheme_cluster_break_property(_M_c);
	    }
	}

	// The first code point of the current extended grapheme cluster.
	constexpr value_type
	operator*() const
	{ return _M_c; }

	constexpr auto
	operator->() const
	{ return &_M_c; }

	// Move to the next extended grapheme cluster.
	constexpr _Iterator&
	operator++()
	{
	  const auto __end = _M_base.end();
	  if (_M_base != __end)
	    {
	      auto __p_prev = _M_prop;
	      auto __it = _M_base;
	      while (++__it != __end)
		{
		  char32_t __c = *__it;
		  auto __p = __grapheme_cluster_break_property(*__it);
		  _M_update_xpicto_seq_state(__c, __p);
		  _M_update_ri_count(__p);
		  _M_update_incb_state(__c, __p);
		  if (_M_is_break(__p_prev, __p, __it))
		    {
		      // Found a grapheme cluster break
		      _M_reset(__c, __p);
		      break;
		    }
		  __p_prev = __p;
		}
	      _M_base = __it;
	    }
	  return *this;
	}

	constexpr _Iterator
	operator++(int)
	{
	  auto __tmp = *this;
	  ++*this;
	  return __tmp;
	}

	constexpr bool
	operator==(const _Iterator& __i) const
	{ return _M_base == __i._M_base; }

	// This supports iter != iter.end()
	constexpr bool
	operator==(const ranges::sentinel_t<_View>& __i) const
	{ return _M_base == __i; }

	// Iterator to the start of the current cluster.
	constexpr auto base() const { return _M_base.base(); }

	// The end of the underlying view (not the end of the current cluster!)
	constexpr auto end() const { return _M_base.end(); }

	// Field width of the first code point in the cluster.
	constexpr int
	width() const noexcept
	{ return __field_width(_M_c); }

      private:
	_U32_iterator _M_base;

	// Implement the Grapheme Cluster Boundary Rules from Unicode Annex #29
	// http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
	// This implements the rules from TR29 revision 43 in Unicode 15.1.0.
	// Return true if there is a break between code point with property p1
	// and code point with property p2.
	constexpr bool
	_M_is_break(_Gcb_property __p1, _Gcb_property __p2,
		    _U32_iterator __curr) const
	{
	  using enum _Gcb_property;

	  if (__p1 == _Gcb_Control || __p1 == _Gcb_LF)
	    return true; // Break after Control or LF.

	  if (__p1 == _Gcb_CR)
	    return __p2 != _Gcb_LF; // Do not break between a CR and LF.

	  // Rule GB5
	  if (__p2 == _Gcb_Control || __p2 == _Gcb_CR || __p2 == _Gcb_LF)
	    return true; // Break before Control, CR or LF.

	  // Rule GB6
	  if (__p1 == _Gcb_L)
	    switch (__p2)
	    {
	      case _Gcb_L:
	      case _Gcb_V:
	      case _Gcb_LV:
	      case _Gcb_LVT:
		return false; // Do not break Hangul syllable sequences.
	      default:
		return true;
	      }

	  // Rule GB7
	  if (__p1 == _Gcb_LV || __p1 == _Gcb_V)
	    switch (__p2)
	    {
	      case _Gcb_V:
	      case _Gcb_T:
		return false; // Do not break Hangul syllable sequences.
	      default:
		return true;
	      }

	  // Rule GB8
	  if (__p1 == _Gcb_LVT || __p1 == _Gcb_T)
	    return __p2 != _Gcb_T; // Do not break Hangul syllable sequences.

	  // Rule GB9
	  if (__p2 == _Gcb_Extend || __p2 == _Gcb_ZWJ)
	    return false; // Do not break before extending characters or ZWJ.

	  // The following GB9x rules only apply to extended grapheme clusters,
	  // which is what the C++ standard uses (not legacy grapheme clusters).

	  // Rule GB9a
	  if (__p2 == _Gcb_SpacingMark)
	    return false; // Do not break before SpacingMarks,
	  // Rule GB9b
	  if (__p1 == _Gcb_Prepend)
	    return false; // or after Prepend characters.

	  // Rule GB9c (Unicode 15.1.0)
	  // Do not break within certain combinations with
	  // Indic_Conjunct_Break (InCB)=Linker.
	  if (_M_incb_linker_seen
		&& __incb_property(_M_c) == _InCB::_Consonant
		&& __incb_property(*__curr) == _InCB::_Consonant)
	    {
	      // Match [_M_base, __curr] against regular expression
	      // Consonant ([Extend Linker]* Linker [Extend Linker]* Consonant)+
	      bool __have_linker = false;
	      auto __it = _M_base;
	      while (++__it != __curr)
		{
		  if (__is_incb_linker(*__it))
		    __have_linker = true;
		  else
		    {
		      auto __incb = __incb_property(*__it);
		      if (__incb == _InCB::_Consonant)
			__have_linker = false;
		      else if (__incb != _InCB::_Extend)
			break;
		    }
		}
	      if (__it == __curr && __have_linker)
		return false;
	    }

	  // Rule GB11
	  // Do not break within emoji modifier sequences
	  // or emoji zwj sequences.
	  if (__p1 == _Gcb_ZWJ && _M_xpicto_seq_state == _XPicto::_Matched)
	    return false;

	  // Rules GB12 and GB13
	  // Do not break within emoji flag sequences. That is, do not break
	  // between regional indicator (RI) symbols if there is an odd number
	  // of RI characters before the break point.
	  if (__p1 == _Gcb_property::_Gcb_Regional_Indicator && __p1 == __p2)
	    return (_M_RI_count & 1) == 0;

	  // Rule GB999
	  return true; // Otherwise, break everywhere.
	}
      };

      _Iterator _M_begin;
    };

} // namespace __v16_0_0

  // Return the field width of a string.
  template<typename _CharT>
    constexpr size_t
    __field_width(basic_string_view<_CharT> __s)
    {
      if (__s.empty()) [[unlikely]]
	return 0;
      _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
      auto __it = __gc.begin();
      const auto __end = __gc.end();
      size_t __n = __it.width();
      while (++__it != __end)
	__n += __it.width();
      return __n;
    }

  // Truncate a string to at most `__max` field width units, and return the
  // resulting field width.
  template<typename _CharT>
    constexpr size_t
    __truncate(basic_string_view<_CharT>& __s, size_t __max)
    {
      if (__s.empty()) [[unlikely]]
	return 0;

      _Grapheme_cluster_view<basic_string_view<_CharT>> __gc(__s);
      auto __it = __gc.begin();
      const auto __end = __gc.end();
      size_t __n = __it.width();
      if (__n > __max)
	{
	  __s = {};
	  return 0;
	}
      while (++__it != __end)
	{
	  size_t __n2 = __n + __it.width();
	  if (__n2 > __max)
	    {
	      __s = basic_string_view<_CharT>(__s.begin(), __it.base());
	      return __n;
	    }
	  __n = __n2;
	}
      return __n;
    }

  template<typename _CharT>
    consteval bool
    __literal_encoding_is_unicode()
    {
      if constexpr (is_same_v<_CharT, char16_t>)
	return true;
      else if constexpr (is_same_v<_CharT, char32_t>)
	  return true;
#ifdef __cpp_char8_t
      else if constexpr (is_same_v<_CharT, char8_t>)
	return true;
#endif

      const char* __enc = "";

#ifdef __GNUC_EXECUTION_CHARSET_NAME
      auto __remove_iso10646_prefix = [](const char* __s) {
	// GNU iconv allows "ISO-10646/" prefix (case-insensitive).
	if (__s[0] == 'I' || __s[0] == 'i')
	  if (__s[1] == 'S' || __s[1] == 's')
	    if (__s[2] == 'O' || __s[2] == 'o')
	      if (string_view(__s + 3).starts_with("-10646/"))
		return __s + 10;
	return __s;
      };

      if constexpr (is_same_v<_CharT, char>)
	__enc = __remove_iso10646_prefix(__GNUC_EXECUTION_CHARSET_NAME);
# if defined _GLIBCXX_USE_WCHAR_T && defined __GNUC_WIDE_EXECUTION_CHARSET_NAME
      else
	__enc = __remove_iso10646_prefix(__GNUC_WIDE_EXECUTION_CHARSET_NAME);
# endif

      if ((__enc[0] == 'U' || __enc[0] == 'u')
	    && (__enc[1] == 'T' || __enc[1] == 't')
	    && (__enc[2] == 'F' || __enc[2] == 'f'))
	{
	  __enc += 3;
	  if (__enc[0] == '-')
	    ++__enc;
	  if (__enc[0] == '8')
	    return __enc[1] == '\0' || string_view(__enc + 1) == "//";
	  else if constexpr (!is_same_v<_CharT, char>)
	    {
	      string_view __s(__enc);
	      if (__s.ends_with("//"))
		__s.remove_suffix(2);
	      if (__s.ends_with("LE") || __s.ends_with("BE"))
		__s.remove_suffix(2);
	      return __s == "16" || __s == "32";
	    }
	}
#elif defined __clang_literal_encoding__
      if constexpr (is_same_v<_CharT, char>)
	__enc = __clang_literal_encoding__;
# if defined _GLIBCXX_USE_WCHAR_T && defined __clang_wide_literal_encoding__
      else
	__enc = __clang_wide_literal_encoding__;
# endif
      // Clang accepts "-fexec-charset=utf-8" but the macro is still uppercase.
      string_view __s(__enc);
      if (__s == "UTF-8")
	return true;
      else if constexpr (!is_same_v<_CharT, char>)
	return __s == "UTF-16" || __s == "UTF-32";
#endif

      return false;
    }

  consteval bool
  __literal_encoding_is_utf8()
  { return __literal_encoding_is_unicode<char>(); }

  consteval bool
  __literal_encoding_is_extended_ascii()
  {
    return '0' == 0x30 && 'A' == 0x41 && 'Z' == 0x5a
	     && 'a' == 0x61 && 'z' == 0x7a;
  }

  // https://www.unicode.org/reports/tr22/tr22-8.html#Charset_Alias_Matching
  constexpr bool
  __charset_alias_match(string_view __a, string_view __b)
  {
    // Map alphanumeric chars to their base 64 value, everything else to 127.
    auto __map = [](char __c, bool& __num) -> unsigned char {
      if (__c == '0') [[unlikely]]
	return __num ? 0 : 127;
      const auto __v = __detail::__from_chars_alnum_to_val(__c);
      __num = __v < 10;
      return __v;
    };

    auto __ptr_a = __a.begin(), __end_a = __a.end();
    auto __ptr_b = __b.begin(), __end_b = __b.end();
    bool __num_a = false, __num_b = false;

    while (true)
      {
	// Find the value of the next alphanumeric character in each string.
	unsigned char __val_a{}, __val_b{};
	while (__ptr_a != __end_a
		 && (__val_a = __map(*__ptr_a, __num_a)) == 127)
	  ++__ptr_a;
	while (__ptr_b != __end_b
		 && (__val_b = __map(*__ptr_b, __num_b)) == 127)
	  ++__ptr_b;
	// Stop when we reach the end of a string, or get a mismatch.
	if (__ptr_a == __end_a)
	  return __ptr_b == __end_b;
	else if (__ptr_b == __end_b)
	  return false;
	else if (__val_a != __val_b)
	  return false; // Found non-matching characters.
	++__ptr_a;
	++__ptr_b;
      }
    return true;
  }

} // namespace __unicode

namespace ranges
{
  template<typename _To, typename _Range>
    inline constexpr bool
    enable_borrowed_range<std::__unicode::_Utf_view<_To, _Range>>
      = enable_borrowed_range<_Range>;

  template<typename _Range>
    inline constexpr bool
    enable_borrowed_range<std::__unicode::_Grapheme_cluster_view<_Range>>
      = enable_borrowed_range<_Range>;
} // namespace ranges

_GLIBCXX_END_NAMESPACE_VERSION
} // namespace std
#endif // C++20
#endif // _GLIBCXX_UNICODE_H