Commit 119ff0b0 authored by Nicolas Pitre, committed by Greg Kroah-Hartman
Browse files

vt: update gen_ucs_width.py to produce more space efficient tables



Split table ranges into BMP (16-bit) and non-BMP (above 16-bit).
This reduces the corresponding text size by 20-25%.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
Link: https://lore.kernel.org/r/20250410011839.64418-10-nico@fluxnic.net


Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent cd6937d4
Loading
Loading
Loading
Loading
+113 −41
Original line number Diff line number Diff line
@@ -132,13 +132,49 @@ def generate_ucs_width():
        ranges.append((start, prev))
        return ranges

    # Function to split ranges into BMP (16-bit) and non-BMP (above 16-bit)
    def split_ranges_by_size(ranges):
        """Partition (start, end) ranges at the 16-bit boundary.

        Args:
            ranges: iterable of (start, end) pairs of inclusive code points.

        Returns:
            A (bmp, non_bmp) pair of lists. Ranges ending at or below
            U+FFFF land in the first list, ranges starting above U+FFFF
            land in the second, and a range that straddles the boundary
            is cut into a piece ending at U+FFFF and a piece starting at
            U+10000.
        """
        bmp_last = 0xFFFF
        low, high = [], []

        for first, last in ranges:
            if last <= bmp_last:
                # Whole range fits in 16 bits.
                low.append((first, last))
                continue
            if first > bmp_last:
                # Whole range lies above the 16-bit limit.
                high.append((first, last))
                continue
            # Range crosses the boundary: emit one piece per table.
            low.append((first, bmp_last))
            high.append((bmp_last + 1, last))

        return low, high

    # Extract ranges for each width
    zero_width_ranges = ranges_optimize(width_map, 0)
    double_width_ranges = ranges_optimize(width_map, 2)

    # Split ranges into BMP and non-BMP
    zero_width_bmp, zero_width_non_bmp = split_ranges_by_size(zero_width_ranges)
    double_width_bmp, double_width_non_bmp = split_ranges_by_size(double_width_ranges)

    # Get Unicode version information
    unicode_version = unicodedata.unidata_version

    # Function to generate code point description comments
    def get_code_point_comment(start, end):
        """Return a C comment describing the code point range start..end.

        The comment uses the Unicode character names of the endpoints when
        both lookups succeed; otherwise it falls back to U+XXXX notation
        for the whole comment. A single-code-point range (start == end)
        yields a comment with just one name or one U+XXXX value.

        Args:
            start: first code point of the range (int).
            end: last code point of the range (int).

        Returns:
            A string of the form "/* ... */".
        """
        try:
            start_char_desc = unicodedata.name(chr(start))
            if start == end:
                return f"/* {start_char_desc} */"
            end_char_desc = unicodedata.name(chr(end))
            return f"/* {start_char_desc} - {end_char_desc} */"
        except (ValueError, OverflowError):
            # unicodedata.name() raises ValueError for unnamed code points and
            # chr() raises ValueError/OverflowError for out-of-range integers.
            # Only those trigger the numeric fallback; anything else (e.g.
            # KeyboardInterrupt or a genuine bug) must propagate.
            if start == end:
                return f"/* U+{start:04X} */"
            return f"/* U+{start:04X} - U+{end:04X} */"

    # Generate C implementation file
    with open(c_file, 'w') as f:
        f.write(f"""\
@@ -156,62 +192,77 @@ def generate_ucs_width():
#include <linux/bsearch.h>
#include <linux/consolemap.h>

struct interval {{
struct interval16 {{
	uint16_t first;
	uint16_t last;
}};

struct interval32 {{
	uint32_t first;
	uint32_t last;
}};

/* Zero-width character ranges */
static const struct interval zero_width_ranges[] = {{
/* Zero-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */
static const struct interval16 zero_width_bmp[] = {{
""")

        for start, end in zero_width_ranges:
            try:
                start_char_desc = unicodedata.name(chr(start)) if start < 0x10000 else f"U+{start:05X}"
                if start == end:
                    comment = f"/* {start_char_desc} */"
                else:
                    end_char_desc = unicodedata.name(chr(end)) if end < 0x10000 else f"U+{end:05X}"
                    comment = f"/* {start_char_desc} - {end_char_desc} */"
            except:
                if start == end:
                    comment = f"/* U+{start:05X} */"
                else:
                    comment = f"/* U+{start:05X} - U+{end:05X} */"
        for start, end in zero_width_bmp:
            comment = get_code_point_comment(start, end)
            f.write(f"\t{{ 0x{start:04X}, 0x{end:04X} }}, {comment}\n")

        f.write("""\
};

/* Zero-width character ranges (non-BMP, U+10000 and above) */
static const struct interval32 zero_width_non_bmp[] = {
""")

        for start, end in zero_width_non_bmp:
            comment = get_code_point_comment(start, end)
            f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n")

        f.write("""\
};

/* Double-width character ranges */
static const struct interval double_width_ranges[] = {
/* Double-width character ranges (BMP - Basic Multilingual Plane, U+0000 to U+FFFF) */
static const struct interval16 double_width_bmp[] = {
""")

        for start, end in double_width_ranges:
            try:
                start_char_desc = unicodedata.name(chr(start)) if start < 0x10000 else f"U+{start:05X}"
                if start == end:
                    comment = f"/* {start_char_desc} */"
                else:
                    end_char_desc = unicodedata.name(chr(end)) if end < 0x10000 else f"U+{end:05X}"
                    comment = f"/* {start_char_desc} - {end_char_desc} */"
            except:
                if start == end:
                    comment = f"/* U+{start:05X} */"
                else:
                    comment = f"/* U+{start:05X} - U+{end:05X} */"
        for start, end in double_width_bmp:
            comment = get_code_point_comment(start, end)
            f.write(f"\t{{ 0x{start:04X}, 0x{end:04X} }}, {comment}\n")

        f.write("""\
};

/* Double-width character ranges (non-BMP, U+10000 and above) */
static const struct interval32 double_width_non_bmp[] = {
""")

        for start, end in double_width_non_bmp:
            comment = get_code_point_comment(start, end)
            f.write(f"\t{{ 0x{start:05X}, 0x{end:05X} }}, {comment}\n")

        f.write("""\
};


static int ucs_cmp(const void *key, const void *element)
static int ucs_cmp16(const void *key, const void *element)
{
	uint16_t cp = *(uint16_t *)key;
	const struct interval16 *e = element;

	if (cp > e->last)
		return 1;
	if (cp < e->first)
		return -1;
	return 0;
}

static int ucs_cmp32(const void *key, const void *element)
{
	uint32_t cp = *(uint32_t *)key;
	const struct interval *e = element;
	const struct interval32 *e = element;

	if (cp > e->last)
		return 1;
@@ -220,13 +271,22 @@ static int ucs_cmp(const void *key, const void *element)
	return 0;
}

static bool is_in_interval(uint32_t cp, const struct interval *intervals, size_t count)
static bool is_in_interval16(uint16_t cp, const struct interval16 *intervals, size_t count)
{
	if (cp < intervals[0].first || cp > intervals[count - 1].last)
		return false;

	return __inline_bsearch(&cp, intervals, count,
				sizeof(*intervals), ucs_cmp) != NULL;
				sizeof(*intervals), ucs_cmp16) != NULL;
}

static bool is_in_interval32(uint32_t cp, const struct interval32 *intervals, size_t count)
{
	if (cp < intervals[0].first || cp > intervals[count - 1].last)
		return false;

	return __inline_bsearch(&cp, intervals, count,
				sizeof(*intervals), ucs_cmp32) != NULL;
}

/**
@@ -237,7 +297,9 @@ static bool is_in_interval(uint32_t cp, const struct interval *intervals, size_t
 */
bool ucs_is_zero_width(uint32_t cp)
{
	return is_in_interval(cp, zero_width_ranges, ARRAY_SIZE(zero_width_ranges));
	return (cp <= 0xFFFF)
	       ? is_in_interval16(cp, zero_width_bmp, ARRAY_SIZE(zero_width_bmp))
	       : is_in_interval32(cp, zero_width_non_bmp, ARRAY_SIZE(zero_width_non_bmp));
}

/**
@@ -248,17 +310,27 @@ bool ucs_is_zero_width(uint32_t cp)
 */
bool ucs_is_double_width(uint32_t cp)
{
	return is_in_interval(cp, double_width_ranges, ARRAY_SIZE(double_width_ranges));
	return (cp <= 0xFFFF)
	       ? is_in_interval16(cp, double_width_bmp, ARRAY_SIZE(double_width_bmp))
	       : is_in_interval32(cp, double_width_non_bmp, ARRAY_SIZE(double_width_non_bmp));
}
""")

    # Print summary
    zero_width_count = sum(end - start + 1 for start, end in zero_width_ranges)
    double_width_count = sum(end - start + 1 for start, end in double_width_ranges)
    zero_width_bmp_count = sum(end - start + 1 for start, end in zero_width_bmp)
    zero_width_non_bmp_count = sum(end - start + 1 for start, end in zero_width_non_bmp)
    double_width_bmp_count = sum(end - start + 1 for start, end in double_width_bmp)
    double_width_non_bmp_count = sum(end - start + 1 for start, end in double_width_non_bmp)

    total_zero_width = zero_width_bmp_count + zero_width_non_bmp_count
    total_double_width = double_width_bmp_count + double_width_non_bmp_count

    print(f"Generated {c_file} with:")
    print(f"- {len(zero_width_ranges)} zero-width ranges covering ~{zero_width_count} code points")
    print(f"- {len(double_width_ranges)} double-width ranges covering ~{double_width_count} code points")
    print(f"- {len(zero_width_bmp)} zero-width BMP ranges (16-bit) covering ~{zero_width_bmp_count} code points")
    print(f"- {len(zero_width_non_bmp)} zero-width non-BMP ranges (32-bit) covering ~{zero_width_non_bmp_count} code points")
    print(f"- {len(double_width_bmp)} double-width BMP ranges (16-bit) covering ~{double_width_bmp_count} code points")
    print(f"- {len(double_width_non_bmp)} double-width non-BMP ranges (32-bit) covering ~{double_width_non_bmp_count} code points")
    print(f"Total: {len(zero_width_bmp) + len(zero_width_non_bmp) + len(double_width_bmp) + len(double_width_non_bmp)} ranges covering ~{total_zero_width + total_double_width} code points")

# Script entry point: run the generator only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    generate_ucs_width()