unicode-decomp.pl: Move from chartables.pl...

2002-03-04 Eric Blake <ebb9@email.byu.edu> * scripts/unicode-decomp.pl: Move from chartables.pl, and remove the code for generating include/java-chartables.h. * scripts/unicode-blocks.pl: Move from scripts/blocks.pl, and merge with Classpath. * scripts/unicode-muncher.pl: Copy from Classpath. * scripts/MakeCharTables.java: New file. * gnu/gcj/convert/Blocks-3.txt: New file. * gnu/gcj/convert/UnicodeData-3.0.0.txt: New file. * gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html: New file. * gnu/java/lang/CharData.java: Copy from Classpath. * Makefile.am (ordinary_java_source_files): Add gnu/java/lang/CharData.java. * configure.in: Remove --enable-fast-character option. * java/lang/Character.java: Merge algorithms and Javadoc with Classpath. * java/lang/natCharacter.cc: Implement Unicode lookup table more efficiently. * include/java-chardecomp.h: Regenerate. * include/java-chartables.h: Regenerate. From-SVN: r50368
2002-03-06 18:54:45 +00:00 · 2002-03-06 18:54:45 +00:00 · 1fa782725c
commit 1fa782725c
parent b87e4a4c6f
17 changed files with 16513 additions and 84861 deletions
--- a/libjava/scripts/unicode-decomp.pl
+++ b/libjava/scripts/unicode-decomp.pl
@ -0,0 +1,146 @@
+#!/usr/bin/perl -w
+# unicode-decomp.pl - script to generate database for java.text.Collator
+# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
+#
+# This file is part of libjava.
+# 
+# This software is copyrighted work licensed under the terms of the
+# Libjava License.  Please consult the file "LIBJAVA_LICENSE" for
+# details.
+
+# Code for reading UnicodeData.txt and generating the decomposition
+# tables in include/java-chardecomp.h.  For now, the relevant Unicode
+# definition files are found in libjava/gnu/gcj/convert/.
+#
+# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <java-chardecomp.h>
+#   where <UnicodeData.txt> is obtained from www.unicode.org (named
+#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and
+#   <java-chardecomp.h> is the final location of include/java-chardecomp.h.
+#   As of JDK 1.4, use Unicode version 3.0.0 for best results.
+#
+# If this exits with nonzero status, then you must investigate the
+# cause of the problem.
+# Diagnostics and other information to stderr.
+# With -n, the files are not created, but all processing still occurs.
+
+# These map characters to their decompositions.  Keys are numeric code
+# points; values are the expansion packed as big-endian 16-bit units
+# (pack "n*").
+my %canonical_decomposition = ();
+my %full_decomposition = ();
+
+# Handle `-n' and open the input file.  With `-n' the flag is dropped
+# and the output name is forced to /dev/null, so all processing still
+# occurs but no file is created; an output name given on the command
+# line is overridden.  The override is applied only when at least the
+# input name is present, so that a bare `-n' still yields the usage
+# message instead of an attempt to open an undefined file name.
+if (@ARGV && $ARGV[0] eq '-n')
+{
+    shift @ARGV;
+    $ARGV[1] = '/dev/null' if @ARGV >= 1;
+}
+die "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>\n"
+    unless @ARGV == 2;
+
+# Three-argument open: the file name is used verbatim and is never
+# parsed for a mode character or surrounding whitespace.
+open (UNICODE, '<', $ARGV[0])
+    || die "Can't open Unicode attribute file: $!\n";
+
+# Process the Unicode file.  Each line is a semicolon-separated record:
+# field 0 is the code point in hex and field 5 is the decomposition, a
+# space-separated list of hex values optionally containing a <tag>.
+# A decomposition carrying a <tag> is stored in %full_decomposition;
+# one without is stored in %canonical_decomposition.
+$| = 1;
+my $count = 0;
+print STDERR "Parsing attributes file";
+while (my $line = <UNICODE>)
+{
+    # One progress dot per 1000 input lines.
+    print STDERR "." unless $count++ % 1000;
+    chomp $line;
+    $line =~ s/\r//g;
+    my ($ch, undef, undef, undef, undef, $decomp) = split (';', $line);
+
+    # Blank or truncated lines yield no sixth field; skip them rather
+    # than tripping "uninitialized value" warnings under -w.
+    next unless defined ($decomp) && $decomp ne '';
+    $ch = hex ($ch);
+
+    my $is_full = 0;
+    my @units = ();
+    foreach my $item (split (' ', $decomp))
+    {
+        if ($item =~ /^\<.*\>$/)
+        {
+            # A <tag> is not part of the expansion; it only marks the
+            # entry as belonging to the full table.
+            $is_full = 1;
+            next;
+        }
+        push (@units, hex ($item));
+    }
+
+    # NOTE(review): pack "n" keeps only the low 16 bits of each value;
+    # confirm the input never lists code points above 0xFFFF here.
+    my $packed = pack ("n*", @units);
+    if ($is_full)
+    {
+        $full_decomposition{$ch} = $packed;
+    }
+    else
+    {
+        $canonical_decomposition{$ch} = $packed;
+    }
+}
+close (UNICODE);
+
+# Now generate decomposition tables.  The output file receives a fixed
+# C++ header preamble, the two tables, and the closing include guard.
+# Three-argument open keeps the file name from being parsed for a mode.
+open (DECOMP, '>', $ARGV[1]) || die "Can't open output file: $!\n";
+print STDERR "\nGenerating tables\n";
+print DECOMP <<EOF;
+// java-chardecomp.h - Decomposition character tables -*- c++ -*-
+
+#ifndef __JAVA_CHARDECOMP_H__
+#define __JAVA_CHARDECOMP_H__
+
+
+// These tables are automatically generated by the $0
+// script.  DO NOT EDIT the tables.  Instead, fix the script
+// and run it again.
+
+// This file should only be included by natCollator.cc
+
+struct decomp_entry
+{
+  jchar key;
+  const char *value;
+};
+
+EOF
+
+# Explicit empty argument list, so the callee does not inherit our @_.
+&write_decompositions ();
+
+print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";
+
+# Buffered write errors (for example a full disk) are only reported
+# when the handle is closed, so a failed close must produce a nonzero
+# exit status instead of a silently truncated header.
+close (DECOMP) || die "Can't close output file: $!\n";
+print STDERR "Done\n";
+exit;
+
+
+# Write a single decomposition table to the DECOMP file handle.
+#   $name     - prefix of the emitted C array, "${name}_decomposition".
+#   $is_canon - accepted from the caller but not used by this routine.
+#   %table    - maps a code point to its expansion packed with "n*".
+# Only keys in the range 0 .. 0xffff are written, in ascending order.
+sub write_single_decomposition($$%)
+{
+    my ($name, $is_canon, %table) = @_;
+    my @rows = ();
+
+    foreach my $code (grep { defined $table{$_} } 0 .. 0xffff)
+    {
+        # The expansion is emitted as a C string literal in which each
+        # 16-bit unit becomes two hex-escaped bytes, high byte first.
+        # This is ugly, but relatively space-efficient: most
+        # expansions are short while a few are very long (e.g.
+        # \uFDFA), so a fixed-width representation would waste a lot
+        # of space.  No explicit terminator is written here.
+        # NOTE(review): an earlier comment described the value as
+        # terminated with a double nul; confirm what the reader of
+        # this table expects.
+        my $bytes = join ('',
+                          map { sprintf ("\\x%02x\\x%02x", $_ >> 8, $_ & 0xff) }
+                          unpack ("n*", $table{$code}));
+        push (@rows, sprintf ("  { 0x%04x, \"", $code) . $bytes . "\" }");
+    }
+
+    print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
+    print DECOMP join (",\n", @rows);
+    print DECOMP "\n};\n\n";
+}
+
+# Emit both tables, canonical first and then full, by handing each
+# hash to write_single_decomposition together with its name and flag.
+sub write_decompositions()
+{
+    my @tables = (
+        [ 'canonical', 1, \%canonical_decomposition ],
+        [ 'full',      0, \%full_decomposition ],
+    );
+
+    foreach my $spec (@tables)
+    {
+        my ($name, $is_canon, $table) = @$spec;
+        &write_single_decomposition ($name, $is_canon, %$table);
+    }
+}