diagnostics: Fix mojibake from displaying UTF-8 on Windows consoles

UTF-8 characters in diagnostic output (such as the warning emoji ⚠️
used by -fanalyzer) display as mojibake on Windows unless the UTF-8
code page is in use.

This patch adds UTF-8 to UTF-16 conversion when outputting to a console
on Windows.

gcc/ChangeLog:
	* pretty-print.cc (decode_utf8_char): Move forward declaration.
	(mingw_utf8_str_to_utf16_str): New function to convert UTF-8 to UTF-16.
	(is_console_handle): New function to detect Windows console handles.
	(write_all): Add UTF-8 to UTF-16 conversion for console output,
	falling back to WriteFile for ASCII strings and regular files.

Signed-off-by: Peter Damianov <peter0x44@disroot.org>
Signed-off-by: Jonathan Yong <10walls@gmail.com>
This commit is contained in:
Peter Damianov 2025-09-19 20:02:37 +01:00 committed by Jonathan Yong
parent 214372031a
commit bd352bd592
1 changed files with 129 additions and 3 deletions

View File

@ -38,11 +38,18 @@ along with GCC; see the file COPYING3. If not see
#include <iconv.h>
#endif
/* Forward declaration: decode_utf8_char is called by the MinGW console
   output code below (mingw_utf8_str_to_utf16_str), so it has to be
   declared before that code.  Presumably defined later in this file --
   the definition is not visible from here.  */
static int
decode_utf8_char (const unsigned char *, size_t len, unsigned int *);
#ifdef __MINGW32__
/* Replacement for fputs() that handles ANSI escape codes on Windows NT.
Contributed by: Liu Hao (lh_mouse at 126 dot com)
Extended by: Peter Damianov
Converts UTF-8 to UTF-16 if outputting to a console, so that emojis and
various other Unicode characters are not displayed as mojibake.
XXX: This file is compiled into libcommon.a that will be self-contained.
It looks like these functions cannot be put anywhere else. */
@ -50,11 +57,132 @@ along with GCC; see the file COPYING3. If not see
#define WIN32_LEAN_AND_MEAN 1
#include <windows.h>
/* Convert the UTF-8 string [UTF8_STR, UTF8_STR + UTF8_LEN) to UTF-16.

   Returns true if a conversion was performed.  In that case *UTF16_STR
   points to a buffer obtained from xmalloc that holds *UTF16_LEN UTF-16
   code units (no terminating NUL is written); the caller must free it.

   Returns false, without allocating any memory, if the string is empty,
   consists only of ASCII characters, or cannot be converted (malformed
   UTF-8, or a code point that UTF-16 cannot encode).  On every false
   return *UTF16_STR is NULL and *UTF16_LEN is 0, so the caller can fall
   back to writing the original bytes.  */
static bool
mingw_utf8_str_to_utf16_str (const char *utf8_str, size_t utf8_len,
                             wchar_t **utf16_str, size_t *utf16_len)
{
  /* Give the outputs a defined value on every path, not only for the
     empty string, so that a false return never leaves them
     uninitialized.  */
  *utf16_str = NULL;
  *utf16_len = 0;

  if (utf8_len == 0)
    return false;  /* No conversion needed for empty string.  */

  /* First pass: scan for non-ASCII and count UTF-16 code units needed.  */
  size_t utf16_count = 0;
  const unsigned char *p = (const unsigned char *) utf8_str;
  const unsigned char *end = p + utf8_len;
  bool found_non_ascii = false;
  while (p < end)
    {
      if (*p <= 127)
        {
          /* ASCII character - count as 1 UTF-16 unit and advance.  */
          utf16_count++;
          p++;
          continue;
        }

      /* Non-ASCII character - decode UTF-8 sequence.  */
      found_non_ascii = true;
      unsigned int codepoint;
      int utf8_char_len = decode_utf8_char (p, end - p, &codepoint);
      if (utf8_char_len == 0)
        return false;  /* Invalid UTF-8.  */

      /* UTF-16 can only encode scalar values up to U+10FFFF, and the
         range U+D800..U+DFFF is reserved for surrogates.  Anything else
         would produce malformed UTF-16 in the second pass, so treat it
         like invalid input.  */
      if (codepoint > 0x10FFFF
          || (codepoint >= 0xD800 && codepoint <= 0xDFFF))
        return false;

      if (codepoint <= 0xFFFF)
        utf16_count += 1;  /* Single UTF-16 unit.  */
      else
        utf16_count += 2;  /* Surrogate pair.  */
      p += utf8_char_len;
    }

  /* If string is pure ASCII, no conversion needed.  */
  if (!found_non_ascii)
    return false;

  *utf16_str = (wchar_t *) xmalloc (utf16_count * sizeof (wchar_t));
  *utf16_len = utf16_count;

  /* Second pass: convert UTF-8 to UTF-16.  The first pass has already
     validated every sequence, so decoding cannot fail here and the
     output never exceeds UTF16_COUNT units.  */
  wchar_t *out = *utf16_str;
  p = (const unsigned char *) utf8_str;
  while (p < end)
    {
      if (*p <= 127)
        {
          /* ASCII character.  */
          *out++ = (wchar_t) *p++;
          continue;
        }

      /* Non-ASCII character - decode and convert.  */
      unsigned int codepoint;
      int utf8_char_len = decode_utf8_char (p, end - p, &codepoint);
      if (codepoint <= 0xFFFF)
        *out++ = (wchar_t) codepoint;
      else
        {
          /* Convert to UTF-16 surrogate pair: the high surrogate carries
             the top 10 bits, the low surrogate the bottom 10 bits.  */
          codepoint -= 0x10000;
          *out++ = (wchar_t) (0xD800 + (codepoint >> 10));
          *out++ = (wchar_t) (0xDC00 + (codepoint & 0x3FF));
        }
      p += utf8_char_len;
    }

  return true;
}
/* Return true if H refers to a console, i.e. if GetConsoleMode
   succeeds on it.  */
static bool
is_console_handle (HANDLE h)
{
  /* The mode value itself is not used; only the success or failure of
     the query matters.  */
  DWORD console_mode;
  if (!GetConsoleMode (h, &console_mode))
    return false;
  return true;
}
/* Write all bytes in [s,s+n) into the specified stream.
If outputting to a Windows console, convert UTF-8 to UTF-16 if needed.
Errors are ignored. */
static void
write_all (HANDLE h, const char *s, size_t n)
{
/* If writing to console, try to convert from UTF-8 to UTF-16 and use
WriteConsoleW. mingw_utf8_str_to_utf16_str returns false if the string
is empty, pure ASCII, or invalid UTF-8, in which case we fall back to the
regular WriteFile path. */
if (is_console_handle (h))
{
wchar_t *utf16_str;
size_t utf16_len;
if (mingw_utf8_str_to_utf16_str (s, n, &utf16_str, &utf16_len))
{
DWORD written;
WriteConsoleW (h, utf16_str, utf16_len, &written, NULL);
free (utf16_str);
return;
}
/* If UTF-8 conversion returned false, fall back to WriteFile. */
}
/* WriteFile for regular files or when UTF-16 conversion is not needed. */
size_t rem = n;
DWORD step;
@ -712,8 +840,6 @@ mingw_ansi_fputs (const char *str, FILE *fp)
#endif /* __MINGW32__ */
static int
decode_utf8_char (const unsigned char *, size_t len, unsigned int *);
static void pp_quoted_string (pretty_printer *, const char *, size_t = -1);
extern void