mirror of git://gcc.gnu.org/git/gcc.git
				
				
				
			
		
			
				
	
	
		
			966 lines
		
	
	
		
			25 KiB
		
	
	
	
		
			Perl
		
	
	
	
			
		
		
	
	
			966 lines
		
	
	
		
			25 KiB
		
	
	
	
		
			Perl
		
	
	
	
| # chartables.pl - A perl program to generate tables for use by the
 | |
| # Character class.
 | |
| 
 | |
| # Copyright (C) 1998, 1999  Red Hat, Inc.
 | |
| #
 | |
| # This file is part of libjava.
 | |
| # 
 | |
| # This software is copyrighted work licensed under the terms of the
 | |
| # Libjava License.  Please consult the file "LIBJAVA_LICENSE" for
 | |
| # details.
 | |
| 
 | |
| # This program requires a `unidata.txt' file of the form distributed
 | |
| # on the Unicode 2.0 CD ROM.  Or, get it more conveniently here:
 | |
| # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
 | |
| # Version `2.1.8' of this file was last used to update the Character class.
 | |
| 
 | |
| # Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
 | |
| # "The Java Language Specification", ISBN 0-201-63451-1
 | |
| # plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
 | |
| 
 | |
| # Usage: perl chartables.pl [-n] UnicodeData-VERSION.txt
 | |
| # If this exits with nonzero status, then you must investigate the
 | |
| # cause of the problem.
 | |
| # Diagnostics and other information to stderr.
 | |
| # This creates the new include/java-chartables.h and
 | |
| # include/java-chardecomp.h files directly.
 | |
| # With -n, the files are not created, but all processing
 | |
| # still occurs.
 | |
| 
 | |
| # Fields in the table.
 | |
| $CODE = 0;
 | |
| $NAME = 1;
 | |
| $CATEGORY = 2;
 | |
| $DECOMPOSITION = 5;
 | |
| $DECIMAL = 6;
 | |
| $DIGIT = 7;
 | |
| $NUMERIC = 8;
 | |
| $UPPERCASE = 12;
 | |
| $LOWERCASE = 13;
 | |
| $TITLECASE = 14;
 | |
| 
 | |
| # A special case.
 | |
| $TAMIL_DIGIT_ONE  = 0x0be7;
 | |
| $TAMIL_DIGIT_NINE = 0x0bef;
 | |
| 
 | |
| # These are endpoints of legitimate gaps in the tables.
 | |
| $CJK_IDEOGRAPH_END = 0x9fa5;
 | |
| $HANGUL_END = 0xd7a3;
 | |
| $HIGH_SURROGATE_END = 0xdb7f;
 | |
| $PRIVATE_HIGH_SURROGATE_END = 0xdbff;
 | |
| $LOW_SURROGATE_END = 0xdfff;
 | |
| $PRIVATE_END = 0xf8ff;
 | |
| 
 | |
| %title_to_upper = ();
 | |
| %title_to_lower = ();
 | |
| %numerics  = ();
 | |
| %name = ();
 | |
| 
 | |
| @digit_start = ();
 | |
| @digit_end   = ();
 | |
| 
 | |
| @space_start = ();
 | |
| @space_end   = ();
 | |
| 
 | |
| # @letter_start = ();
 | |
| # @letter_end   = ();
 | |
| 
 | |
| @all_start = ();
 | |
| @all_end   = ();
 | |
| @all_cats  = ();
 | |
| 
 | |
| @upper_start = ();
 | |
| @upper_end   = ();
 | |
| @upper_map   = ();
 | |
| %upper_anom  = ();
 | |
| 
 | |
| @lower_start = ();
 | |
| @lower_end   = ();
 | |
| @lower_map   = ();
 | |
| %lower_anom  = ();
 | |
| 
 | |
| @attributes = ();
 | |
| 
 | |
| # There are a few characters which actually need two attributes.
 | |
| # These are special-cased.
 | |
| $ROMAN_START = 0x2160;
 | |
| $ROMAN_END   = 0x217f;
 | |
| %second_attributes = ();
 | |
| 
 | |
| $prevcode = -1;
 | |
| $status = 0;
 | |
| 
 | |
| %category_map =
 | |
| (
 | |
|  'Mn' => 'NON_SPACING_MARK',
 | |
|  'Mc' => 'COMBINING_SPACING_MARK',
 | |
|  'Me' => 'ENCLOSING_MARK',
 | |
|  'Nd' => 'DECIMAL_DIGIT_NUMBER',
 | |
|  'Nl' => 'LETTER_NUMBER',
 | |
|  'No' => 'OTHER_NUMBER',
 | |
|  'Zs' => 'SPACE_SEPARATOR',
 | |
|  'Zl' => 'LINE_SEPARATOR',
 | |
|  'Zp' => 'PARAGRAPH_SEPARATOR',
 | |
|  'Cc' => 'CONTROL',
 | |
|  'Cf' => 'FORMAT',
 | |
|  'Cs' => 'SURROGATE',
 | |
|  'Co' => 'PRIVATE_USE',
 | |
|  'Cn' => 'UNASSIGNED',
 | |
|  'Lu' => 'UPPERCASE_LETTER',
 | |
|  'Ll' => 'LOWERCASE_LETTER',
 | |
|  'Lt' => 'TITLECASE_LETTER',
 | |
|  'Lm' => 'MODIFIER_LETTER',
 | |
|  'Lo' => 'OTHER_LETTER',
 | |
|  'Pc' => 'CONNECTOR_PUNCTUATION',
 | |
|  'Pd' => 'DASH_PUNCTUATION',
 | |
|  'Ps' => 'START_PUNCTUATION',
 | |
|  'Pe' => 'END_PUNCTUATION',
 | |
|  'Pi' => 'START_PUNCTUATION',
 | |
|  'Pf' => 'END_PUNCTUATION',
 | |
|  'Po' => 'OTHER_PUNCTUATION',
 | |
|  'Sm' => 'MATH_SYMBOL',
 | |
|  'Sc' => 'CURRENCY_SYMBOL',
 | |
|  'Sk' => 'MODIFIER_SYMBOL',
 | |
|  'So' => 'OTHER_SYMBOL'
 | |
|  );
 | |
| 
 | |
| # These maps characters to their decompositions.
 | |
| %canonical_decomposition = ();
 | |
| %full_decomposition = ();
 | |
| 
 | |
| 
 | |
| # Handle `-n' and open output files.
 | |
| local ($f1, $f2) = ('include/java-chartables.h',
 | |
| 		    'include/java-chardecomp.h');
 | |
| if ($ARGV[0] eq '-n')
 | |
| {
 | |
|     shift @ARGV;
 | |
|     $f1 = '/dev/null';
 | |
|     $f2 = '/dev/null';
 | |
| }
 | |
| 
 | |
| open (CHARTABLE, "> $f1");
 | |
| open (DECOMP, "> $f2");
 | |
| 
 | |
| # Process the Unicode file.
 | |
| while (<>)
 | |
| {
 | |
|     chop;
 | |
|     # Specify a limit for split so that we pick up trailing fields.
 | |
|     # We make the limit larger than we need, to catch the case where
 | |
|     # there are extra fields.
 | |
|     @fields = split (';', $_, 30);
 | |
|     # Convert code to number.
 | |
|     $ncode = hex ($fields[$CODE]);
 | |
| 
 | |
|     if ($#fields != 14)
 | |
|     {
 | |
| 	print STDERR ("Entry for \\u", $fields[$CODE],
 | |
| 		      " has wrong number of fields: ", $#fields, "\n");
 | |
|     }
 | |
| 
 | |
|     $name{$fields[$CODE]} = $fields[$NAME];
 | |
| 
 | |
|     # If we've found a gap in the table, fill it in.
 | |
|     if ($ncode != $prevcode + 1)
 | |
|     {
 | |
| 	&process_gap (*fields, $prevcode, $ncode);
 | |
|     }
 | |
| 
 | |
|     &process_char (*fields, $ncode);
 | |
| 
 | |
|     $prevcode = $ncode;
 | |
| }
 | |
| 
 | |
| if ($prevcode != 0xffff)
 | |
| {
 | |
|     # Setting of `fields' parameter doesn't matter here.
 | |
|     &process_gap (*fields, $prevcode, 0x10000);
 | |
| }
 | |
| 
 | |
| print CHARTABLE "// java-chartables.h - Character tables for java.lang.Character -*- c++ -*-\n\n";
 | |
| print CHARTABLE "#ifndef __JAVA_CHARTABLES_H__\n";
 | |
| print CHARTABLE "#define __JAVA_CHARTABLES_H__\n\n";
 | |
| print CHARTABLE "// These tables are automatically generated by the chartables.pl\n";
 | |
| print CHARTABLE "// script.  DO NOT EDIT the tables.  Instead, fix the script\n";
 | |
| print CHARTABLE "// and run it again.\n\n";
 | |
| print CHARTABLE "// This file should only be included by natCharacter.cc\n\n";
 | |
| 
 | |
| 
 | |
| $bytes = 0;
 | |
| 
 | |
| # Titlecase mapping tables.
 | |
| if ($#title_to_lower != $#title_to_upper)
 | |
| {
 | |
|     # If this fails we need to reimplement toTitleCase.
 | |
|     print STDERR "titlecase mappings have different sizes\n";
 | |
|     $status = 1;
 | |
| }
 | |
| # Also ensure that the tables are entirely parallel.
 | |
| foreach $key (sort keys %title_to_lower)
 | |
| {
 | |
|     if (! defined $title_to_upper{$key})
 | |
|     {
 | |
| 	print STDERR "titlecase mappings have different entries\n";
 | |
| 	$status = 1;
 | |
|     }
 | |
| }
 | |
| &print_single_map ("title_to_lower_table", %title_to_lower);
 | |
| &print_single_map ("title_to_upper_table", %title_to_upper);
 | |
| 
 | |
| print CHARTABLE "#ifdef COMPACT_CHARACTER\n\n";
 | |
| 
 | |
| printf CHARTABLE "#define TAMIL_DIGIT_ONE 0x%04x\n\n", $TAMIL_DIGIT_ONE;
 | |
| 
 | |
| # All numeric values.
 | |
| &print_numerics;
 | |
| 
 | |
| # Digits only.
 | |
| &print_block ("digit_table", *digit_start, *digit_end);
 | |
| 
 | |
| # Space characters.
 | |
| &print_block ("space_table", *space_start, *space_end);
 | |
| 
 | |
| # Letters.  We used to generate a separate letter table.  But this
 | |
| # doesn't really seem worthwhile.  Simply using `all_table' saves us
 | |
| # about 800 bytes, and only adds 3 table probes to isLetter.
 | |
| # &print_block ("letter_table", *letter_start, *letter_end);
 | |
| 
 | |
| # Case tables.
 | |
| &print_case_table ("upper", *upper_start, *upper_end, *upper_map, *upper_anom);
 | |
| &print_case_table ("lower", *lower_start, *lower_end, *lower_map, *lower_anom);
 | |
| 
 | |
| # Everything else.
 | |
| &print_all_block (*all_start, *all_end, *all_cats);
 | |
| 
 | |
| print CHARTABLE "#else /* COMPACT_CHARACTER */\n\n";
 | |
| 
 | |
| printf CHARTABLE "#define ROMAN_START 0x%04x\n", $ROMAN_START;
 | |
| printf CHARTABLE "#define ROMAN_END   0x%04x\n\n", $ROMAN_END;
 | |
| 
 | |
| &print_fast_tables (*all_start, *all_end, *all_cats,
 | |
| 		    *attributes, *second_attributes);
 | |
| 
 | |
| print CHARTABLE "#endif /* COMPACT_CHARACTER */\n\n";
 | |
| 
 | |
| print CHARTABLE "#endif /* __JAVA_CHARTABLES_H__ */\n";
 | |
| 
 | |
| printf STDERR "Approximately %d bytes of data generated (compact case)\n",
 | |
|     $bytes;
 | |
| 
 | |
| 
 | |
| # Now generate decomposition tables.
 | |
| printf DECOMP "// java-chardecomp.h - Decomposition character tables -*- c++ -*-\n\n";
 | |
| printf DECOMP "#ifndef __JAVA_CHARDECOMP_H__\n";
 | |
| printf DECOMP "#define __JAVA_CHARDECOMP_H__\n\n";
 | |
| print DECOMP "// These tables are automatically generated by the chartables.pl\n";
 | |
| print DECOMP "// script.  DO NOT EDIT the tables.  Instead, fix the script\n";
 | |
| print DECOMP "// and run it again.\n\n";
 | |
| print DECOMP "// This file should only be included by natCollator.cc\n\n";
 | |
| 
 | |
| print DECOMP "struct decomp_entry\n{\n";
 | |
| print DECOMP "  jchar key;\n";
 | |
| print DECOMP "  const char *value;\n";
 | |
| print DECOMP "};\n\n";
 | |
| 
 | |
| &write_decompositions;
 | |
| 
 | |
| printf DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";
 | |
| 
 | |
| 
 | |
| close (CHARTABLE);
 | |
| close (DECOMP);
 | |
| 
 | |
| exit $status;
 | |
| 
 | |
| 
 | |
| # Process a gap in the space.
 | |
| sub process_gap
 | |
| {
 | |
|     local (*fields, $prevcode, $ncode) = @_;
 | |
|     local (@gap_fields, $i);
 | |
| 
 | |
|     if ($ncode == $CJK_IDEOGRAPH_END
 | |
| 	|| $ncode == $HANGUL_END
 | |
| 	|| $ncode == $HIGH_SURROGATE_END
 | |
| 	|| $ncode == $PRIVATE_HIGH_SURROGATE_END
 | |
| 	|| $ncode == $LOW_SURROGATE_END
 | |
| 	|| $ncode == $PRIVATE_END)
 | |
|     {
 | |
| 	# The characters in the gap we just found are known to
 | |
| 	# have the same properties as the character at the end of
 | |
| 	# the gap.
 | |
| 	@gap_fields = @fields;
 | |
|     }
 | |
|     else
 | |
|     {
 | |
| 	# This prints too much to be enabled.
 | |
| 	# print STDERR "Gap found at \\u", $fields[$CODE], "\n";
 | |
| 	@gap_fields = ('', '', 'Cn', '', '', '', '', '', '', '', '',
 | |
| 		       '', '', '', '');
 | |
|     }
 | |
| 
 | |
|     for ($i = $prevcode + 1; $i < $ncode; ++$i)
 | |
|     {
 | |
| 	$gap_fields[$CODE] = sprintf ("%04x", $i);
 | |
| 	$gap_fields[$NAME] = "CHARACTER " . $gap_fields[$CODE];
 | |
| 	&process_char (*gap_fields, $i);
 | |
|     }
 | |
| }
 | |
| 
 | |
| # Process a single character.
 | |
| sub process_char
 | |
| {
 | |
|     local (*fields, $ncode) = @_;
 | |
| 
 | |
|     if ($fields[$DECOMPOSITION] ne '')
 | |
|     {
 | |
| 	&add_decomposition ($ncode, $fields[$DECOMPOSITION]);
 | |
|     }
 | |
| 
 | |
|     # If this is a titlecase character, mark it.
 | |
|     if ($fields[$CATEGORY] eq 'Lt')
 | |
|     {
 | |
| 	$title_to_upper{$fields[$CODE]} = $fields[$UPPERCASE];
 | |
| 	$title_to_lower{$fields[$CODE]} = $fields[$LOWERCASE];
 | |
|     }
 | |
|     else
 | |
|     {
 | |
| 	# For upper and lower case mappings, we try to build compact
 | |
| 	# tables that map range onto range.  We specifically want to
 | |
| 	# avoid titlecase characters.  Java specifies a range check to
 | |
| 	# make sure the character is not between 0x2000 and 0x2fff.
 | |
| 	# We avoid that here because we need to generate table entries
 | |
| 	# -- toLower and toUpper still work in that range.
 | |
| 	if ($fields[$UPPERCASE] eq ''
 | |
| 	    && ($fields[$LOWERCASE] ne ''
 | |
| 		|| $fields[$NAME] =~ /CAPITAL (LETTER|LIGATURE)/))
 | |
| 	{
 | |
| 	    if ($fields[$LOWERCASE] ne '')
 | |
| 	    {
 | |
| 		&update_case_block (*upper_start, *upper_end, *upper_map,
 | |
| 				    $fields[$CODE], $fields[$LOWERCASE]);
 | |
| 		&set_attribute ($ncode, hex ($fields[$LOWERCASE]));
 | |
| 	    }
 | |
| 	    else
 | |
| 	    {
 | |
| 		$upper_anom{$fields[$CODE]} = 1;
 | |
| 	    }
 | |
| 	}
 | |
| 	elsif ($fields[$LOWERCASE] ne '')
 | |
| 	{
 | |
| 	    print STDERR ("Java missed upper case char \\u",
 | |
| 			  $fields[$CODE], "\n");
 | |
| 	}
 | |
| 	elsif ($fields[$CATEGORY] eq 'Lu')
 | |
| 	{
 | |
| 	    # This case is for letters which are marked as upper case
 | |
| 	    # but for which there is no lower case equivalent.  For
 | |
| 	    # instance, LATIN LETTER YR.
 | |
| 	}
 | |
| 
 | |
| 	if ($fields[$LOWERCASE] eq ''
 | |
| 	    && ($fields[$UPPERCASE] ne ''
 | |
| 		|| $fields[$NAME] =~ /SMALL (LETTER|LIGATURE)/))
 | |
| 	{
 | |
| 	    if ($fields[$UPPERCASE] ne '')
 | |
| 	    {
 | |
| 		&update_case_block (*lower_start, *lower_end, *lower_map,
 | |
| 				    $fields[$CODE], $fields[$UPPERCASE]);
 | |
| 		&set_attribute ($ncode, hex ($fields[$UPPERCASE]));
 | |
| 	    }
 | |
| 	    else
 | |
| 	    {
 | |
| 		$lower_anom{$fields[$CODE]} = 1;
 | |
| 	    }
 | |
| 	}
 | |
| 	elsif ($fields[$UPPERCASE] ne '')
 | |
| 	{
 | |
| 	    print STDERR ("Java missed lower case char \\u",
 | |
| 			  $fields[$CODE], "\n");
 | |
| 	}
 | |
| 	elsif ($fields[$CATEGORY] eq 'Ll')
 | |
| 	{
 | |
| 	    # This case is for letters which are marked as lower case
 | |
| 	    # but for which there is no upper case equivalent.  For
 | |
| 	    # instance, FEMININE ORDINAL INDICATOR.
 | |
| 	}
 | |
|     }
 | |
| 
 | |
| 
 | |
|     # If we have a non-decimal numeric value, add it to the list.
 | |
|     if ($fields[$CATEGORY] eq 'Nd'
 | |
| 	&& ($ncode < 0x2000 || $ncode > 0x2fff)
 | |
| 	&& $fields[$NAME] =~ /DIGIT/)
 | |
|     {
 | |
| 	# This is a digit character that is handled elsewhere.
 | |
|     }
 | |
|     elsif ($fields[$DIGIT] ne '' || $fields[$NUMERIC] ne '')
 | |
|     {
 | |
| 	# Do a simple check.
 | |
| 	if ($fields[$DECIMAL] ne '')
 | |
| 	{
 | |
| 	    # This catches bugs in an earlier implementation of
 | |
| 	    # chartables.pl.  Now it is here for historical interest
 | |
| 	    # only.
 | |
| 	    # print STDERR ("Character \u", $fields[$CODE],
 | |
| 	    # " would have been missed as digit\n");
 | |
| 	}
 | |
| 
 | |
| 	local ($val) = $fields[$DIGIT];
 | |
| 	$val = $fields[$NUMERIC] if $val eq '';
 | |
| 	local ($ok) = 1;
 | |
| 
 | |
| 	# If we have a value which is not a positive integer, then we
 | |
| 	# set the value to -2 to make life easier for
 | |
| 	# Character.getNumericValue.
 | |
| 	if ($val !~ m/^[0-9]+$/)
 | |
| 	{
 | |
| 	    if ($fields[$CATEGORY] ne 'Nl'
 | |
| 		&& $fields[$CATEGORY] ne 'No')
 | |
| 	    {
 | |
| 		# This shows a few errors in the Unicode table.  These
 | |
| 		# characters have a missing Numeric field, and the `N'
 | |
| 		# for the mirrored field shows up there instead.  I
 | |
| 		# reported these characters to errata@unicode.org on
 | |
| 		# Thu Sep 10 1998.  They said it will be fixed in the
 | |
| 		# 2.1.6 release of the tables.
 | |
| 		print STDERR ("Character \u", $fields[$CODE],
 | |
| 			      " has value but is not numeric; val = '",
 | |
| 			      $val, "'\n");
 | |
| 		# We skip these.
 | |
| 		$ok = 0;
 | |
| 	    }
 | |
| 	    $val = "-2";
 | |
| 	}
 | |
| 
 | |
| 	if ($ok)
 | |
| 	{
 | |
| 	    $numerics{$fields[$CODE]} = $val;
 | |
| 	    &set_attribute ($ncode, $val);
 | |
| 	}
 | |
|     }
 | |
| 
 | |
|     # We build a table that lists ranges of ordinary decimal values.
 | |
|     # At each step we make sure that the digits are in the correct
 | |
|     # order, with no holes, as this is assumed by Character.  If this
 | |
|     # fails, reimplementation is required.  This implementation
 | |
|     # dovetails nicely with the Java Spec, which has strange rules for
 | |
|     # what constitutes a decimal value.  In particular the Unicode
 | |
|     # name must contain the word `DIGIT'.  The spec doesn't directly
 | |
|     # say that digits must have type `Nd' (or that their value must an
 | |
|     # integer), but that can be inferred from the list of digits in
 | |
|     # the book(s).  Currently the only Unicode characters whose name
 | |
|     # includes `DIGIT' which would not fit are the Tibetan "half"
 | |
|     # digits.
 | |
|     if ($fields[$CATEGORY] eq 'Nd')
 | |
|     {
 | |
| 	if (($ncode < 0x2000 || $ncode > 0x2fff)
 | |
| 	    && $fields[$NAME] =~ /DIGIT/)
 | |
| 	{
 | |
| 	    &update_digit_block (*digit_start, *digit_end, $fields[$CODE],
 | |
| 				 $fields[$DECIMAL]);
 | |
| 	    &set_attribute ($ncode, $fields[$DECIMAL]);
 | |
| 	}
 | |
| 	else
 | |
| 	{
 | |
| 	    # If this fails then Character.getType will fail.  We
 | |
| 	    # assume that things in `digit_table' are the only
 | |
| 	    # category `Nd' characters.
 | |
| 	    print STDERR ("Character \u", $fields[$CODE],
 | |
| 			  " is class Nd but not in digit table\n");
 | |
| 	    $status = 1;
 | |
| 	}
 | |
|     }
 | |
| 
 | |
|     # Keep track of space characters.
 | |
|     if ($fields[$CATEGORY] =~ /Z[slp]/)
 | |
|     {
 | |
| 	&update_block (*space_start, *space_end, $fields[$CODE]);
 | |
|     }
 | |
| 
 | |
|     # Keep track of letters.
 | |
|     # if ($fields[$CATEGORY] =~ /L[ultmo]/)
 | |
|     # {
 | |
|     # 	&update_letter_block (*letter_start, *letter_end, $fields[$CODE],
 | |
|     # 			      $fields[$CATEGORY]);
 | |
|     # }
 | |
| 
 | |
|     # Keep track of all characters.  You might think we wouldn't have
 | |
|     # to do this for uppercase letters, or other characters we already
 | |
|     # "classify".  The problem is that this classification is
 | |
|     # different.  E.g., \u216f is uppercase by Java rules, but is a
 | |
|     # LETTER_NUMBER here.
 | |
|     &update_all_block (*all_start, *all_end, *all_cats,
 | |
| 		       $fields[$CODE], $fields[$CATEGORY]);
 | |
| }
 | |
| 
 | |
| 
 | |
| # Called to add a new decomposition.
 | |
| sub add_decomposition
 | |
| {
 | |
|     local ($ncode, $value) = @_;
 | |
|     local ($is_full) = 0;
 | |
|     local ($first) = 1;
 | |
|     local (@decomp) = ();
 | |
| 
 | |
|     foreach (split (' ', $value))
 | |
|     {
 | |
| 	if ($first && /^\<.*\>$/)
 | |
| 	{
 | |
| 	    $is_full = 1;
 | |
| 	}
 | |
| 	else
 | |
| 	{
 | |
| 	    push (@decomp, hex ($_));
 | |
| 	}
 | |
| 	$first = 0;
 | |
|     }
 | |
| 
 | |
|     # We pack the value into a string because this means we can stick
 | |
|     # with Perl 4 features.
 | |
|     local ($s) = pack "I*", @decomp;
 | |
|     if ($is_full)
 | |
|     {
 | |
| 	$full_decomposition{$ncode} = $s;
 | |
|     }
 | |
|     else
 | |
|     {
 | |
| 	$canonical_decomposition{$ncode} = $s;
 | |
|     }
 | |
| }
 | |
| 
 | |
| # Write a single decomposition table.
 | |
| sub write_single_decomposition
 | |
| {
 | |
|     local ($name, $is_canon, %table) = @_;
 | |
| 
 | |
|     printf DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
 | |
| 
 | |
|     local ($key, @expansion, $char);
 | |
|     local ($first_line) = 1;
 | |
| 
 | |
|     for ($key = 0; $key <= 65535; ++$key)
 | |
|     {
 | |
| 	next if ! defined $table{$key};
 | |
| 
 | |
| 	printf DECOMP ",\n"
 | |
| 	    unless $first_line;
 | |
| 	$first_line = 0;
 | |
| 
 | |
| 	printf DECOMP "  { 0x%04x, \"", $key;
 | |
| 
 | |
| 	# We represent the expansion as a series of bytes, terminated
 | |
| 	# with a double nul.  This is ugly, but relatively
 | |
| 	# space-efficient.  Most expansions are short, but there are a
 | |
| 	# few that are very long (e.g. \uFDFA).  This means that if we
 | |
| 	# chose a fixed-space representation we would waste a lot of
 | |
| 	# space.
 | |
| 	@expansion = unpack "I*", $table{$key};
 | |
| 	foreach $char (@expansion)
 | |
| 	{
 | |
| 	    printf DECOMP "\\x%02x\\x%02x", ($char / 256), ($char % 256);
 | |
| 	}
 | |
| 
 | |
| 	printf DECOMP "\" }";
 | |
|     }
 | |
| 
 | |
|     printf DECOMP "\n};\n\n";
 | |
| }
 | |
| 
 | |
| sub write_decompositions
 | |
| {
 | |
|     &write_single_decomposition ('canonical', 1, %canonical_decomposition);
 | |
|     &write_single_decomposition ('full', 0, %full_decomposition);
 | |
| }
 | |
| 
 | |
| # We represent a block of characters with a pair of lists.  This
 | |
| # function updates the pair to account for the new character.  Returns
 | |
| # 1 if we added to the old block, 0 otherwise.
 | |
| sub update_block
 | |
| {
 | |
|     local (*start, *end, $char) = @_;
 | |
| 
 | |
|     local ($nchar) = hex ($char);
 | |
|     local ($count) = $#end;
 | |
|     if ($count >= 0 && $end[$count] == $nchar - 1)
 | |
|     {
 | |
| 	++$end[$count];
 | |
| 	return 1;
 | |
|     }
 | |
|     else
 | |
|     {
 | |
| 	++$count;
 | |
| 	$start[$count] = $nchar;
 | |
| 	$end[$count] = $nchar;
 | |
|     }
 | |
|     return 0;
 | |
| }
 | |
| 
 | |
| # Return true if we will be appending this character to the end of the
 | |
| # existing block.
 | |
| sub block_append_p
 | |
| {
 | |
|     local (*end, $char) = @_;
 | |
|     return $#end >= 0 && $end[$#end] == $char - 1;
 | |
| }
 | |
| 
 | |
| # This updates the digit block.  This table is much like an ordinary
 | |
| # block, but it has an extra constraint.
 | |
| sub update_digit_block
 | |
| {
 | |
|     local (*start, *end, $char, $value) = @_;
 | |
| 
 | |
|     &update_block ($start, $end, $char);
 | |
|     local ($nchar) = hex ($char);
 | |
| 
 | |
|     # We want to make sure that the new digit's value is correct for
 | |
|     # its place in the block.  However, we special-case Tamil digits,
 | |
|     # since Tamil does not have a digit `0'.
 | |
|     local ($count) = $#start;
 | |
|     if (($nchar < $TAMIL_DIGIT_ONE || $nchar > $TAMIL_DIGIT_NINE)
 | |
| 	&& $nchar - $start[$count] != $value)
 | |
|     {
 | |
| 	# If this fails then Character.digit_value will be wrong.
 | |
| 	print STDERR "Character \\u", $char, " violates digit constraint\n";
 | |
| 	$status = 1;
 | |
|     }
 | |
| }
 | |
| 
 | |
| # Update letter table.  We could be smart about avoiding upper or
 | |
| # lower case letters, but it is much simpler to just track them all.
 | |
| sub update_letter_block
 | |
| {
 | |
|     local (*start, *end, $char, $category) = @_;
 | |
| 
 | |
|     &update_block (*start, *end, $char);
 | |
| }
 | |
| 
 | |
| # Update `all' table.  This table holds all the characters we don't
 | |
| # already categorize for other reasons.  FIXME: if a given type has
 | |
| # very few characters, we should just inline the code.  E.g., there is
 | |
| # only one paragraph separator.
 | |
| sub update_all_block
 | |
| {
 | |
|     local (*start, *end, *cats, $char, $category) = @_;
 | |
| 
 | |
|     local ($nchar) = hex ($char);
 | |
|     local ($count) = $#end;
 | |
|     if ($count >= 0
 | |
| 	&& $end[$count] == $nchar - 1
 | |
| 	&& $cats[$count] eq $category)
 | |
|     {
 | |
| 	++$end[$count];
 | |
|     }
 | |
|     else
 | |
|     {
 | |
| 	++$count;
 | |
| 	$start[$count] = $nchar;
 | |
| 	$end[$count] = $nchar;
 | |
| 	$cats[$count] = $category;
 | |
|     }
 | |
| }
 | |
| 
 | |
| # Update a case table.  We handle case tables specially because we
 | |
| # want to map (e.g.) a block of uppercase characters directly onto the
 | |
| # corresponding block of lowercase characters.  Therefore we generate
 | |
| # a new entry when the block would no longer map directly.
 | |
| sub update_case_block
 | |
| {
 | |
|     local (*start, *end, *map, $char, $mapchar) = @_;
 | |
| 
 | |
|     local ($nchar) = hex ($char);
 | |
|     local ($nmap) = hex ($mapchar);
 | |
| 
 | |
|     local ($count) = $#end;
 | |
|     if ($count >= 0
 | |
| 	&& $end[$count] == $nchar - 1
 | |
| 	&& $nchar - $start[$count] == $nmap - $map[$count])
 | |
|     {
 | |
| 	++$end[$count];
 | |
|     }
 | |
|     else
 | |
|     {
 | |
| 	++$count;
 | |
| 	$start[$count] = $nchar;
 | |
| 	$end[$count] = $nchar;
 | |
| 	$map[$count] = $nmap;
 | |
|     }
 | |
| }
 | |
| 
 | |
| # Set the attribute value for the character.  Each character can have
 | |
| # only one attribute.
 | |
| sub set_attribute
 | |
| {
 | |
|     local ($ncode, $attr) = @_;
 | |
| 
 | |
|     if ($attributes{$ncode} ne '' && $attributes{$ncode} ne $attr)
 | |
|     {
 | |
| 	if ($ncode >= $ROMAN_START && $ncode <= $ROMAN_END)
 | |
| 	{
 | |
| 	    $second_attributes{$ncode} = $attr;
 | |
| 	}
 | |
| 	else
 | |
| 	{
 | |
| 	    printf STDERR "character \\u%04x already has attribute\n", $ncode;
 | |
| 	}
 | |
|     }
 | |
|     # Attributes can be interpreted as unsigned in some situations,
 | |
|     # so we check against 65535.  This could cause errors -- we need
 | |
|     # to check the interpretation here.
 | |
|     elsif ($attr < -32768 || $attr > 65535)
 | |
|     {
 | |
| 	printf STDERR "attribute out of range for character \\u%04x\n", $ncode;
 | |
|     }
 | |
|     else
 | |
|     {
 | |
| 	$attributes{$ncode} = $attr;
 | |
|     }
 | |
| }
 | |
| 
 | |
| 
 | |
| # Print a block table.
 | |
| sub print_block
 | |
| {
 | |
|     local ($title, *start, *end) = @_;
 | |
| 
 | |
|     print CHARTABLE "static const jchar ", $title, "[][2] =\n";
 | |
|     print CHARTABLE "  {\n";
 | |
| 
 | |
|     local ($i) = 0;
 | |
|     while ($i <= $#start)
 | |
|     {
 | |
| 	print CHARTABLE "    { ";
 | |
| 	&print_char ($start[$i]);
 | |
| 	print CHARTABLE ", ";
 | |
| 	&print_char ($end[$i]);
 | |
| 	print CHARTABLE " }";
 | |
| 	print CHARTABLE "," if ($i != $#start);
 | |
| 	print CHARTABLE "\n";
 | |
| 	++$i;
 | |
| 	$bytes += 4;		# Two bytes per char.
 | |
|     }
 | |
| 
 | |
|     print CHARTABLE "  };\n\n";
 | |
| }
 | |
| 
 | |
| # Print the numerics table.
 | |
| sub print_numerics
 | |
| {
 | |
|     local ($i, $key, $count, @keys);
 | |
| 
 | |
|     $i = 0;
 | |
|     @keys = sort keys %numerics;
 | |
|     $count = @keys;
 | |
| 
 | |
|     print CHARTABLE "static const jchar numeric_table[] =\n";
 | |
|     print CHARTABLE "  { ";
 | |
|     foreach $key (@keys)
 | |
|     {
 | |
| 	&print_char (hex ($key));
 | |
| 	++$i;
 | |
| 	print CHARTABLE ", " if $i < $count;
 | |
| 	# Print 5 per line.
 | |
| 	print CHARTABLE "\n    " if ($i % 5 == 0);
 | |
| 	$bytes += 2;		# One character.
 | |
|     }
 | |
|     print CHARTABLE " };\n\n";
 | |
| 
 | |
|     print CHARTABLE "static const jshort numeric_value[] =\n";
 | |
|     print CHARTABLE "  { ";
 | |
|     $i = 0;
 | |
|     foreach $key (@keys)
 | |
|     {
 | |
| 	print CHARTABLE $numerics{$key};
 | |
| 	if ($numerics{$key} > 32767 || $numerics{$key} < -32768)
 | |
| 	{
 | |
| 	    # This means our generated type info is incorrect.  We
 | |
| 	    # could just detect and work around this here, but I'm
 | |
| 	    # lazy.
 | |
| 	    print STDERR "numeric value won't fit in a short\n";
 | |
| 	    $status = 1;
 | |
| 	}
 | |
| 	++$i;
 | |
| 	print CHARTABLE ", " if $i < $count;
 | |
| 	# Print 10 per line.
 | |
| 	print CHARTABLE "\n    " if ($i % 10 == 0);
 | |
| 	$bytes += 2;		# One short.
 | |
|     }
 | |
|     print CHARTABLE " };\n\n";
 | |
| }
 | |
| 
 | |
| # Print a table that maps one single letter onto another.  It assumes
 | |
| # the map is index by char code.
 | |
| sub print_single_map
 | |
| {
 | |
|     local ($title, %map) = @_;
 | |
| 
 | |
|     local (@keys) = sort keys %map;
 | |
|     $num = @keys;
 | |
|     print CHARTABLE "static const jchar ", $title, "[][2] =\n";
 | |
|     print CHARTABLE "  {\n";
 | |
|     $i = 0;
 | |
|     for $key (@keys)
 | |
|     {
 | |
| 	print CHARTABLE "    { ";
 | |
| 	&print_char (hex ($key));
 | |
| 	print CHARTABLE ", ";
 | |
| 	&print_char (hex ($map{$key}));
 | |
| 	print CHARTABLE " }";
 | |
| 	++$i;
 | |
| 	if ($i < $num)
 | |
| 	{
 | |
| 	    print CHARTABLE ",";
 | |
| 	}
 | |
| 	else
 | |
| 	{
 | |
| 	    print CHARTABLE " ";
 | |
| 	}
 | |
| 	print CHARTABLE "   // ", $name{$key}, "\n";
 | |
| 	$bytes += 4;		# Two bytes per char.
 | |
|     }
 | |
|     print CHARTABLE "  };\n\n";
 | |
| }
 | |
| 
 | |
| # Print the `all' block.
 | |
| sub print_all_block
 | |
| {
 | |
|     local (*start, *end, *cats) = @_;
 | |
| 
 | |
|     &print_block ("all_table", *start, *end);
 | |
| 
 | |
|     local ($i) = 0;
 | |
|     local ($sum) = 0;
 | |
|     while ($i <= $#start)
 | |
|     {
 | |
| 	$sum += $end[$i] - $start[$i] + 1;
 | |
| 	++$i;
 | |
|     }
 | |
|     # We do this computation just to make sure it isn't cheaper to
 | |
|     # simply list all the characters individually.
 | |
|     printf STDERR ("all_table encodes %d characters in %d entries\n",
 | |
| 		   $sum, $#start + 1);
 | |
| 
 | |
|     print CHARTABLE "static const jbyte category_table[] =\n";
 | |
|     print CHARTABLE "  { ";
 | |
| 
 | |
|     $i = 0;
 | |
|     while ($i <= $#cats)
 | |
|     {
 | |
| 	if ($i > 0 && $cats[$i] eq $cats[$i - 1])
 | |
| 	{
 | |
| 	    # This isn't an error.  We can have a duplicate because
 | |
| 	    # two ranges are not adjacent while the intervening
 | |
| 	    # characters are left out of the table for other reasons.
 | |
| 	    # We could exploit this to make the table a little smaller.
 | |
| 	    # printf STDERR "Duplicate all entry at \\u%04x\n", $start[$i];
 | |
| 	}
 | |
| 	print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]};
 | |
| 	print CHARTABLE ", " if ($i < $#cats);
 | |
| 	++$i;
 | |
| 	print CHARTABLE "\n    ";
 | |
| 	++$bytes;
 | |
|     }
 | |
|     print CHARTABLE "  };\n\n";
 | |
| }
 | |
| 
 | |
| # Print case table.
 | |
| sub print_case_table
 | |
| {
 | |
|     local ($title, *start, *end, *map, *anomalous) = @_;
 | |
| 
 | |
|     &print_block ($title . '_case_table', *start, *end);
 | |
| 
 | |
|     print CHARTABLE "static const jchar ", $title, "_case_map_table[] =\n";
 | |
|     print CHARTABLE "  { ";
 | |
| 
 | |
|     local ($i) = 0;
 | |
|     while ($i <= $#map)
 | |
|     {
 | |
| 	&print_char ($map[$i]);
 | |
| 	print CHARTABLE ", " if $i < $#map;
 | |
| 	++$i;
 | |
| 	print CHARTABLE "\n    " if $i % 5 == 0;
 | |
| 	$bytes += 2;
 | |
|     }
 | |
|     print CHARTABLE "  };\n";
 | |
| 
 | |
| 
 | |
|     local ($key, @keys);
 | |
|     @keys = sort keys %anomalous;
 | |
| 
 | |
|     if ($title eq 'upper')
 | |
|     {
 | |
| 	if ($#keys >= 0)
 | |
| 	{
 | |
| 	    # If these are found we need to change Character.isUpperCase.
 | |
| 	    print STDERR "Found anomalous upper case characters\n";
 | |
| 	    $status = 1;
 | |
| 	}
 | |
|     }
 | |
|     else
 | |
|     {
 | |
| 	print CHARTABLE "\n";
 | |
| 	print CHARTABLE "static const jchar ", $title, "_anomalous_table[] =\n";
 | |
| 	print CHARTABLE "  { ";
 | |
| 	$i = 0;
 | |
| 	foreach $key (@keys)
 | |
| 	{
 | |
| 	    &print_char (hex ($key));
 | |
| 	    print CHARTABLE ", " if $i < $#keys;
 | |
| 	    ++$i;
 | |
| 	    print CHARTABLE "\n    " if $i % 5 == 0;
 | |
| 	    $bytes += 2;
 | |
| 	}
 | |
| 	print CHARTABLE "  };\n";
 | |
|     }
 | |
| 
 | |
|     print CHARTABLE "\n";
 | |
| }
 | |
| 
 | |
| # Print the type table and attributes table for the fast version.
 | |
| sub print_fast_tables
 | |
| {
 | |
|     local (*start, *end, *cats, *atts, *second_atts) = @_;
 | |
| 
 | |
|     print CHARTABLE "static const jbyte type_table[] =\n{ ";
 | |
| 
 | |
|     local ($i, $j);
 | |
|     for ($i = 0; $i <= $#cats; ++$i)
 | |
|     {
 | |
| 	for ($j = $start[$i]; $j <= $end[$i]; ++$j)
 | |
| 	{
 | |
| 	    print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]};
 | |
| 	    print CHARTABLE "," if ($i < $#cats || $j < $end[$i]);
 | |
| 	    print CHARTABLE "\n    ";
 | |
| 	}
 | |
|     }
 | |
|     print CHARTABLE "\n };\n\n";
 | |
| 
 | |
|     print CHARTABLE "static const jshort attribute_table[] =\n{ ";
 | |
|     for ($i = 0; $i <= 0xffff; ++$i)
 | |
|     {
 | |
| 	$atts{$i} = 0 if ! defined $atts{$i};
 | |
| 	print CHARTABLE $atts{$i};
 | |
| 	print CHARTABLE ", " if $i < 0xffff;
 | |
| 	print CHARTABLE "\n    " if $i % 5 == 1;
 | |
|     }
 | |
|     print CHARTABLE "\n };\n\n";
 | |
| 
 | |
|     print CHARTABLE "static const jshort secondary_attribute_table[] =\n{ ";
 | |
|     for ($i = $ROMAN_START; $i <= $ROMAN_END; ++$i)
 | |
|     {
 | |
| 	print CHARTABLE $second_atts{$i};
 | |
| 	print CHARTABLE ", " if $i < $ROMAN_END;
 | |
| 	print CHARTABLE "\n    " if $i % 5 == 1;
 | |
|     }
 | |
|     print CHARTABLE "\n };\n\n";
 | |
| }
 | |
| 
 | |
| # Print a character constant.
 | |
| sub print_char
 | |
| {
 | |
|     local ($ncode) = @_;
 | |
|     printf CHARTABLE "0x%04x", $ncode;
 | |
| }
 |