mirror of git://gcc.gnu.org/git/gcc.git
				
				
				
			
		
			
				
	
	
		
			147 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
			
		
		
	
	
			147 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
#!/usr/bin/perl -w
# unicode-decomp.pl - script to generate database for java.text.Collator
# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
#
# This file is part of libjava.
#
# This software is copyrighted work licensed under the terms of the
# Libjava License.  Please consult the file "LIBJAVA_LICENSE" for
# details.

# Code for reading UnicodeData.txt and generating the decomposition
# tables in java-chardecomp.h.  For now, the relevant Unicode definition
# files are found in libjava/gnu/gcj/convert/.
#
# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <java-chardecomp.h>
#   where <UnicodeData.txt> is obtained from www.unicode.org (named
#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and
#   <java-chardecomp.h> is the final location of
#   include/java-chardecomp.h.
#   As of JDK 1.4, use Unicode version 3.0.0 for best results.
#
# If this exits with nonzero status, then you must investigate the
# cause of the problem.
# Diagnostics and other information to stderr.
# With -n, the files are not created, but all processing still occurs.

use strict;

# These map characters (as numeric code values) to their decompositions.
# Each value is the expansion packed with pack "n*", i.e. as a string of
# 16-bit big-endian units.  Entries whose decomposition carries a
# <tag> go into %full_decomposition; all others are canonical.
my %canonical_decomposition = ();
my %full_decomposition = ();

# Handle `-n' and open output files.
# With -n the output is sent to /dev/null so that all processing still
# occurs.  The output name is only substituted when an input file was
# actually given; otherwise we fall through to the usage message instead
# of trying to open an undefined file name.
if (@ARGV && $ARGV[0] eq '-n')
{
    shift @ARGV;
    $ARGV[1] = '/dev/null' if @ARGV;
}
die "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>\n"
    unless @ARGV == 2;

# Three-argument open: the file name is never interpreted as a mode.
open (my $unicode, '<', $ARGV[0])
    or die "Can't open Unicode attribute file: $!\n";

# Process the Unicode file.
$| = 1;   # autoflush the currently selected output handle
my $count = 0;
print STDERR "Parsing attributes file";
while (my $line = <$unicode>)
{
    # One progress dot per 1000 input lines.
    print STDERR "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;

    # Only the first field (character code, in hex) and the sixth field
    # (decomposition) are used.
    my ($ch, undef, undef, undef, undef, $decomp) = split /;/, $line;

    # Blank or truncated records have no sixth field; skip them rather
    # than emitting "uninitialized value" warnings.
    next unless defined $decomp;
    $ch = hex ($ch);

    if ($decomp ne '')
    {
        my $is_full = 0;
        my @decomp = ();
        foreach my $token (split (' ', $decomp))
        {
            # A token of the form <...> is a tag, not a character; its
            # presence marks the entry as a "full" decomposition.
            if ($token =~ m/^<.*>$/)
            {
                $is_full = 1;
                next;
            }
            push (@decomp, hex ($token));
        }

        my $s = pack "n*", @decomp;
        if ($is_full)
        {
            $full_decomposition{$ch} = $s;
        }
        else
        {
            $canonical_decomposition{$ch} = $s;
        }
    }
}
close ($unicode);

# Now generate decomposition tables.
# DECOMP stays a package filehandle because the table-writing
# subroutines below print to it by name.
open (DECOMP, '>', $ARGV[1]) or die "Can't open output file: $!\n";
print STDERR "\nGenerating tables\n";
print DECOMP <<EOF;
// java-chardecomp.h - Decomposition character tables -*- c++ -*-

#ifndef __JAVA_CHARDECOMP_H__
#define __JAVA_CHARDECOMP_H__


// These tables are automatically generated by the $0
// script.  DO NOT EDIT the tables.  Instead, fix the script
// and run it again.

// This file should only be included by natCollator.cc

struct decomp_entry
{
  jchar key;
  const char *value;
};

EOF

# The `&' form is kept because the subroutine is defined further down;
# the explicit empty argument list stops the current @_ being passed on.
&write_decompositions ();

print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";

# Buffered write errors only surface at close, so check it: a failure
# here must give a nonzero exit status.
close (DECOMP) or die "Can't close output file: $!\n";
print STDERR "Done\n";
exit;


# Write a single decomposition table to the DECOMP filehandle as a C
# array named "${name}_decomposition".
#
# Arguments:
#   $name     - prefix used to build the name of the generated array.
#   $is_canon - accepted so that existing callers keep working; it is
#               not used.
#   %table    - maps a numeric character code to its expansion, packed
#               as a string of 16-bit big-endian units (pack "n*").
#
# Only keys in the range 0 .. 0xffff are written, in ascending order.
# No prototype is declared: every caller in this file uses the `&' call
# form, which bypasses prototypes anyway.
sub write_single_decomposition
{
    my ($name, undef, %table) = @_;
    my @entries = ();

    for my $key (0 .. 0xffff)
    {
        next if ! defined $table{$key};

        # We represent the expansion as a series of bytes, two per
        # 16-bit unit, high byte first.  This is ugly, but relatively
        # space-efficient.  Most expansions are short, but there are a
        # few that are very long (e.g. \uFDFA).  This means that if we
        # chose a fixed-space representation we would waste a lot of
        # space.
        # NOTE(review): the literal holds only the expansion bytes; the
        # only terminator is the NUL the C compiler appends to a string
        # literal.  An older comment here spoke of a "double nul" --
        # confirm against the reader in natCollator.cc.
        my $bytes = join ('',
                          map { sprintf ("\\x%02x\\x%02x",
                                         $_ >> 8, $_ & 0xff) }
                          unpack ("n*", $table{$key}));

        push (@entries, sprintf ("  { 0x%04x, \"%s\" }", $key, $bytes));
    }

    print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
    print DECOMP join (",\n", @entries);
    print DECOMP "\n};\n\n";
}

# Emit both decomposition tables, canonical first and then full, by
# handing each one to write_single_decomposition.
sub write_decompositions()
{
    # Each row: table name, is-canonical flag, reference to the table.
    my @tables = (
        [ 'canonical', 1, \%canonical_decomposition ],
        [ 'full',      0, \%full_decomposition ],
    );

    foreach my $spec (@tables)
    {
        my ($label, $canon_flag, $map) = @$spec;
        &write_single_decomposition ($label, $canon_flag, %$map);
    }
}