libstdc++: Implement _Escaping_sink to avoid construction of string

This patch implements _Escaping_sink that stores characters in a local (stack)
buffer. When the buffer is full, the range of characters is escaped and written
to the underlying sink.

To support the above, the __write_escaped_unicode_part function is defined.
It takes __str and __prev_esc by reference. The __prev_esc value is updated
based on the last character written. If the buffer ends with an incomplete
code point sequence, __str is left non-empty and the code units of that
sequence are not written. _Escaping_sink then copies these characters to the
front of the buffer to reconstruct the full code point.

__formatter_str::_M_format_range now uses _Escaping_sink to escape any
non-contiguous character sequences.

libstdc++-v3/ChangeLog:

	* include/std/format (__format::__write_escape_seqs)
	(__format::_Escaping_sink): Define.
	(__format::__write_escaped_unicode_part): Extract from
	__format::__write_escaped_unicode.
	(__format::__write_escaped_unicode): Forward to
	__write_escaped_unicode_part.
	(__formatter_str::_M_format_range): Use _Escaping_sink.
	* testsuite/std/format/ranges/string.cc: New tests for
	characters whose code points will be split in the buffer and
	for escaping. Invoke test_padding.

Reviewed-by: Patrick Palka <ppalka@redhat.com>
Reviewed-by: Jonathan Wakely <jwakely@redhat.com>
Signed-off-by: Tomasz Kamiński <tkaminsk@redhat.com>
This commit is contained in:
Tomasz Kamiński 2025-07-08 18:50:50 +02:00
parent 9f13fd1b07
commit 59cabe08b5
2 changed files with 231 additions and 55 deletions

View File

@ -105,6 +105,7 @@ namespace __format
template<typename _CharT> class _Sink; template<typename _CharT> class _Sink;
template<typename _CharT> class _Fixedbuf_sink; template<typename _CharT> class _Fixedbuf_sink;
template<typename _Out, typename _CharT> class _Padding_sink; template<typename _Out, typename _CharT> class _Padding_sink;
template<typename _Out, typename _CharT> class _Escaping_sink;
// Output iterator that writes to a type-erase character sink. // Output iterator that writes to a type-erase character sink.
template<typename _CharT> template<typename _CharT>
@ -1066,6 +1067,17 @@ namespace __format
return ++__out; return ++__out;
} }
template<typename _Out, typename _CharT>
_Out
__write_escape_seqs(_Out __out, basic_string_view<_CharT> __units)
{
using _UChar = make_unsigned_t<_CharT>;
for (_CharT __c : __units)
__out = __format::__write_escape_seq(
__out, static_cast<_UChar>(__c), _Escapes<_CharT>::_S_x());
return __out;
}
template<typename _Out, typename _CharT> template<typename _Out, typename _CharT>
_Out _Out
__write_escaped_char(_Out __out, _CharT __c) __write_escaped_char(_Out __out, _CharT __c)
@ -1124,12 +1136,10 @@ namespace __format
template<typename _CharT, typename _Out> template<typename _CharT, typename _Out>
_Out _Out
__write_escaped_unicode(_Out __out, __write_escaped_unicode_part(_Out __out, basic_string_view<_CharT>& __str,
basic_string_view<_CharT> __str, bool& __prev_esc, _Term_char __term)
_Term_char __term)
{ {
using _Str_view = basic_string_view<_CharT>; using _Str_view = basic_string_view<_CharT>;
using _UChar = make_unsigned_t<_CharT>;
using _Esc = _Escapes<_CharT>; using _Esc = _Escapes<_CharT>;
static constexpr char32_t __replace = U'\uFFFD'; static constexpr char32_t __replace = U'\uFFFD';
@ -1143,10 +1153,10 @@ namespace __format
}(); }();
__unicode::_Utf_view<char32_t, _Str_view> __v(std::move(__str)); __unicode::_Utf_view<char32_t, _Str_view> __v(std::move(__str));
__str = {};
auto __first = __v.begin(); auto __first = __v.begin();
auto const __last = __v.end(); auto const __last = __v.end();
bool __prev_esc = true;
while (__first != __last) while (__first != __last)
{ {
bool __esc_ascii = false; bool __esc_ascii = false;
@ -1185,15 +1195,32 @@ namespace __format
__out = __format::__write_escaped_char(__out, *__first.base()); __out = __format::__write_escaped_char(__out, *__first.base());
else if (__esc_unicode) else if (__esc_unicode)
__out = __format::__write_escape_seq(__out, *__first, _Esc::_S_u()); __out = __format::__write_escape_seq(__out, *__first, _Esc::_S_u());
else // __esc_replace // __esc_replace
for (_CharT __c : _Str_view(__first.base(), __first._M_units())) else if (_Str_view __units(__first.base(), __first._M_units());
__out = __format::__write_escape_seq(__out, __units.end() != __last.base())
static_cast<_UChar>(__c), __out = __format::__write_escape_seqs(__out, __units);
_Esc::_S_x()); else
{
__str = __units;
return __out;
}
__prev_esc = true; __prev_esc = true;
++__first; ++__first;
} }
return __out;
}
// Writes the whole of __str to __out in escaped form and returns the
// resulting output iterator.  The work is delegated to
// __write_escaped_unicode_part; whatever that call leaves behind in __str
// is then written with __write_escape_seqs.
template<typename _CharT, typename _Out>
_Out
__write_escaped_unicode(_Out __out, basic_string_view<_CharT> __str,
_Term_char __term)
{
// Initial state handed to __write_escaped_unicode_part, which updates it
// through its bool& parameter; the updated value is not used afterwards.
bool __prev_escape = true;
__out = __format::__write_escaped_unicode_part(__out, __str,
__prev_escape, __term);
// __str was passed by reference: it now holds only the code units of a
// trailing incomplete sequence (or is empty), so escape them one by one.
__out = __format::__write_escape_seqs(__out, __str);
return __out; return __out;
} }
@ -1412,55 +1439,28 @@ namespace __format
size_t(ranges::distance(__rg))); size_t(ranges::distance(__rg)));
return format(__str, __fc); return format(__str, __fc);
} }
else if (!_M_spec._M_debug) else
{ {
auto __handle_debug = [this, &__rg]<typename _NOut>(_NOut __nout)
{
if (!_M_spec._M_debug)
return ranges::copy(__rg, std::move(__nout)).out;
_Escaping_sink<_NOut, _CharT>
__sink(std::move(__nout), _Term_quote);
ranges::copy(__rg, __sink.out());
return __sink._M_finish();
};
const size_t __padwidth = _M_spec._M_get_width(__fc); const size_t __padwidth = _M_spec._M_get_width(__fc);
if (__padwidth == 0 && _M_spec._M_prec_kind == _WP_none) if (__padwidth == 0 && _M_spec._M_prec_kind == _WP_none)
return ranges::copy(__rg, __fc.out()).out; return __handle_debug(__fc.out());
_Padding_sink<_Out, _CharT> __sink(__fc.out(), __padwidth, _Padding_sink<_Out, _CharT>
_M_spec._M_get_precision(__fc)); __sink(__fc.out(), __padwidth, _M_spec._M_get_precision(__fc));
ranges::copy(__rg, __sink.out()); __handle_debug(__sink.out());
return __sink._M_finish(_M_spec._M_align, _M_spec._M_fill); return __sink._M_finish(_M_spec._M_align, _M_spec._M_fill);
} }
else if constexpr (ranges::forward_range<_Rg> || ranges::sized_range<_Rg>)
{
const size_t __n(ranges::distance(__rg));
size_t __w = __n;
if constexpr (!__unicode::__literal_encoding_is_unicode<_CharT>())
if (size_t __max = _M_spec._M_get_precision(__fc); __n > __max)
__w == __max;
if (__w <= __format::__stackbuf_size<_CharT>)
{
_CharT __buf[__format::__stackbuf_size<_CharT>];
ranges::copy_n(ranges::begin(__rg), __w, __buf);
return _M_format_escaped(_String_view(__buf, __n), __fc);
}
else if constexpr (ranges::random_access_range<_Rg>)
{
ranges::iterator_t<_Rg> __first = ranges::begin(__rg);
ranges::subrange __sub(__first, ranges::next(__first, __w));
return _M_format_escaped(_String(from_range, __sub), __fc);
}
else if (__w <= __n)
{
ranges::subrange __sub(
counted_iterator(ranges::begin(__rg), __w),
default_sentinel);
return _M_format_escaped(_String(from_range, __sub), __fc);
}
else if constexpr (ranges::sized_range<_Rg>)
return _M_format_escaped(_String(from_range, __rg), __fc);
else
{
// N.B. preserve the computed size
ranges::subrange __sub(__rg, __n);
return _M_format_escaped(_String(from_range, __sub), __fc);
}
}
else
return _M_format_escaped(_String(from_range, __rg), __fc);
} }
constexpr void constexpr void
@ -3997,6 +3997,93 @@ namespace __format
} }
}; };
// A sink that buffers the characters written to it (via _Buf_sink<_CharT>)
// and forwards them in escaped form to the wrapped output iterator _M_out.
// The terminator selected by __term is written when the sink is constructed
// and again by _M_finish().
template<typename _Out, typename _CharT>
class _Escaping_sink : public _Buf_sink<_CharT>
{
using _Esc = _Escapes<_CharT>;
// Destination for the escaped output.
_Out _M_out;
// Terminator kind; passed to the escaping functions and to _Esc::_S_term.
_Term_char _M_term : 2;
// Escaping state carried between successive calls to
// __write_escaped_unicode_part (its __prev_esc in/out argument).
unsigned _M_prev_escape : 1;
// Cached result of _M_out._M_discarding(); stays false unless _Out is
// _Sink_iter<_CharT>.
unsigned _M_out_discards : 1;
// Refreshes _M_out_discards from the underlying sink.  Only done when _Out
// is _Sink_iter<_CharT>, the one case in which _M_discarding() is queried.
void
_M_sync_discarding()
{
if constexpr (is_same_v<_Out, _Sink_iter<_CharT>>)
_M_out_discards = _M_out._M_discarding();
}
// Escapes the currently used part of the buffer into _M_out.  For Unicode
// literal encodings, the code units that __write_escaped_unicode_part
// leaves in __str are kept at the front of the buffer for the next call.
void
_M_write()
{
span<_CharT> __bytes = this->_M_used();
basic_string_view<_CharT> __str(__bytes.data(), __bytes.size());
// Number of characters to keep in the buffer after this write.
size_t __rem = 0;
if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>())
{
// A bit-field cannot be bound to the bool& parameter, so go through a
// local copy and store the updated value back afterwards.
bool __prev_escape = _M_prev_escape;
_M_out = __format::__write_escaped_unicode_part(
std::move(_M_out), __str, __prev_escape, _M_term);
_M_prev_escape = __prev_escape;
// __str now refers only to the unwritten tail, if any.
__rem = __str.size();
// Move the unwritten tail to the start of the buffer unless it is
// already there.
if (__rem > 0 && __str.data() != this->_M_buf) [[unlikely]]
ranges::move(__str, this->_M_buf);
}
else
_M_out = __format::__write_escaped_ascii(
std::move(_M_out), __str, _M_term);
// Restart the buffer with the first __rem characters retained
// (NOTE(review): _M_reset is defined in the base class, not shown here).
this->_M_reset(this->_M_buf, __rem);
_M_sync_discarding();
}
// Called when the buffer is full: flush it through _M_write(), or just
// rewind it when the underlying output reports that it discards.
void
_M_overflow() override
{
if (_M_out_discards)
this->_M_rewind();
else
_M_write();
}
// Reports the cached discarding state of the underlying output.
bool
_M_discarding() const override
{ return _M_out_discards; }
public:
// Takes ownership of __out and immediately writes the opening terminator
// for __term to it.
[[__gnu__::__always_inline__]]
explicit
_Escaping_sink(_Out __out, _Term_char __term)
: _M_out(std::move(__out)), _M_term(__term),
_M_prev_escape(true), _M_out_discards(false)
{
_M_out = __format::__write(std::move(_M_out), _Esc::_S_term(_M_term));
_M_sync_discarding();
}
// Flushes any buffered characters, writes the closing terminator and
// returns the underlying output iterator.  When the underlying output is
// discarding, the iterator is returned without writing anything further.
_Out
_M_finish()
{
if (_M_out_discards)
return std::move(_M_out);
if (!this->_M_used().empty())
{
_M_write();
// Anything still buffered after _M_write() is an incomplete trailing
// sequence that can no longer be completed: escape its code units
// individually.
if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>())
if (auto __rem = this->_M_used(); !__rem.empty())
{
basic_string_view<_CharT> __str(__rem.data(), __rem.size());
_M_out = __format::__write_escape_seqs(std::move(_M_out), __str);
}
}
return __format::__write(std::move(_M_out), _Esc::_S_term(_M_term));
}
};
enum class _Arg_t : unsigned char { enum class _Arg_t : unsigned char {
_Arg_none, _Arg_bool, _Arg_c, _Arg_i, _Arg_u, _Arg_ll, _Arg_ull, _Arg_none, _Arg_bool, _Arg_c, _Arg_i, _Arg_u, _Arg_ll, _Arg_ull,
_Arg_flt, _Arg_dbl, _Arg_ldbl, _Arg_str, _Arg_sv, _Arg_ptr, _Arg_handle, _Arg_flt, _Arg_dbl, _Arg_ldbl, _Arg_str, _Arg_sv, _Arg_ptr, _Arg_handle,

View File

@ -279,6 +279,93 @@ void test_padding()
VERIFY( strip_prefix(resv, 46, '*') ); VERIFY( strip_prefix(resv, 46, '*') );
VERIFY( strip_quotes(resv) ); VERIFY( strip_quotes(resv) );
VERIFY( resv == in ); VERIFY( resv == in );
// width is 5, size is 15
in = "\u2160\u2161\u2162\u2163\u2164";
in += in; // width is 10, size is 30
in += in; // width is 20, size is 60
in += in; // width is 40, size is 120
in += in; // width is 80, size is 240
in += in; // width is 160, size is 480
lc.assign_range(in);
resv = res = std::format("{:s}", lc);
VERIFY( resv == in );
resv = res = std::format("{:*>10s}", lc);
VERIFY( resv == in );
resv = res = std::format("{:*>200s}", lc);
VERIFY( strip_prefix(resv, 40, '*') );
VERIFY( resv == in );
resv = res = std::format("{:?s}", lc);
VERIFY( strip_quotes(resv) );
VERIFY( resv == in );
resv = res = std::format("{:*>10?s}", lc);
VERIFY( strip_quotes(resv) );
VERIFY( resv == in );
resv = res = std::format("{:*>200?s}", lc);
VERIFY( strip_prefix(resv, 38, '*') );
VERIFY( strip_quotes(resv) );
VERIFY( resv == in );
}
// Checks "{:s}" and "{:?s}" formatting of a std::forward_list<char>:
// first for input containing characters that the debug format escapes,
// then for a long input whose final character is truncated.
void test_escaping()
{
std::string res;
std::string_view resv;
const std::string_view input =
"\t\n\r\\\""
"\u008a" // Cc, Control, Line Tabulation Set,
"\u00ad" // Cf, Format, Soft Hyphen
"\u1d3d" // Lm, Modifier letter, Modifier Letter Capital Ou
"\u00a0" // Zs, Space Separator, No-Break Space (NBSP)
"\u2029" // Zp, Paragraph Separator, Paragraph Separator
"\U0001f984" // So, Other Symbol, Unicorn Face
;
// Expected debug-format output for input, without the surrounding quotes.
const std::string_view output =
R"(\t\n\r\\\")"
R"(\u{8a})"
R"(\u{ad})"
"\u1d3d"
R"(\u{a0})"
R"(\u{2029})"
"\U0001f984";
std::forward_list<char> lc(std::from_range, input);
// non-debug format, chars copied as is
resv = res = std::format("{:s}", lc);
VERIFY( resv == input );
// debug format, compared against output after removing the quotes
resv = res = std::format("{:?s}", lc);
VERIFY( strip_quotes(resv) );
VERIFY( resv == output );
// width is 5, size is 15
std::string in = "\u2160\u2161\u2162\u2163\u2164";
in += in; // width is 10, size is 30
in += in; // width is 20, size is 60
in += in; // width is 40, size is 120
in += in; // width is 80, size is 240
in += in; // width is 160, size is 480
std::string_view inv = in;
// last character is incomplete
lc.assign_range(inv.substr(0, 479));
// non-debug format, chars copied as is
resv = res = std::format("{:s}", lc);
VERIFY( resv == inv.substr(0, 479) );
// debug-format, incomplete code-point sequence is escaped
resv = res = std::format("{:?s}", lc);
VERIFY( strip_quotes(resv) );
// the first 477 bytes (159 complete 3-byte characters) are unchanged
VERIFY( resv.substr(0, 477) == inv.substr(0, 477) );
resv.remove_prefix(477);
// the 2 remaining bytes of the truncated character are escaped one by one
VERIFY( resv == R"(\x{e2}\x{85})" );
} }
int main() int main()
@ -287,4 +374,6 @@ int main()
test_outputs<char>(); test_outputs<char>();
test_outputs<wchar_t>(); test_outputs<wchar_t>();
test_nested(); test_nested();
test_padding();
test_escaping();
} }