mirror of git://gcc.gnu.org/git/gcc.git
				
				
				
			
		
			
				
	
	
		
			1018 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			Java
		
	
	
	
			
		
		
	
	
			1018 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			Java
		
	
	
	
| /* RuleBasedCollator.java -- Concrete Collator Class
 | |
|    Copyright (C) 1998, 1999, 2000, 2001, 2003, 2004  Free Software Foundation, Inc.
 | |
| 
 | |
| This file is part of GNU Classpath.
 | |
| 
 | |
| GNU Classpath is free software; you can redistribute it and/or modify
 | |
| it under the terms of the GNU General Public License as published by
 | |
| the Free Software Foundation; either version 2, or (at your option)
 | |
| any later version.
 | |
|  
 | |
| GNU Classpath is distributed in the hope that it will be useful, but
 | |
| WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
| General Public License for more details.
 | |
| 
 | |
| You should have received a copy of the GNU General Public License
 | |
| along with GNU Classpath; see the file COPYING.  If not, write to the
 | |
| Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 | |
| 02111-1307 USA.
 | |
| 
 | |
| Linking this library statically or dynamically with other modules is
 | |
| making a combined work based on this library.  Thus, the terms and
 | |
| conditions of the GNU General Public License cover the whole
 | |
| combination.
 | |
| 
 | |
| As a special exception, the copyright holders of this library give you
 | |
| permission to link this library with independent modules to produce an
 | |
| executable, regardless of the license terms of these independent
 | |
| modules, and to copy and distribute the resulting executable under
 | |
| terms of your choice, provided that you also meet, for each linked
 | |
| independent module, the terms and conditions of the license of that
 | |
| module.  An independent module is a module which is not derived from
 | |
| or based on this library.  If you modify this library, you may extend
 | |
| this exception to your version of the library, but you are not
 | |
| obligated to do so.  If you do not wish to do so, delete this
 | |
| exception statement from your version. */
 | |
| 
 | |
| package java.text;
 | |
| 
 | |
| import java.util.Enumeration;
 | |
| import java.util.HashMap;
 | |
| import java.util.Vector;
 | |
| 
 | |
| /* Written using "Java Class Libraries", 2nd edition, plus online
 | |
|  * API docs for JDK 1.2 from http://www.javasoft.com.
 | |
|  * Status: Believed complete and correct
 | |
|  */
 | |
| 
 | |
| /**
 | |
|  * This class is a concrete subclass of <code>Collator</code> suitable
 | |
|  * for string collation in a wide variety of languages.  An instance of
 | |
|  * this class is normally returned by the <code>getInstance</code> method
 | |
|  * of <code>Collator</code> with rules predefined for the requested
 | |
|  * locale.  However, an instance of this class can be created manually
 | |
|  * with any desired rules.
 | |
|  * <p>
 | |
|  * Rules take the form of a <code>String</code> with the following syntax
 | |
|  * <ul>
 | |
|  * <li> Modifier: '@'</li>
 | |
|  * <li> Relation: '<' | ';' | ',' | '=' : <text></li>
 | |
|  * <li> Reset: '&' : <text></li>
 | |
|  * </ul>
 | |
|  * The modifier character indicates that accents sort backward as is the
 | |
|  * case with French.  The modifier applies to all rules <b>after</b>
 | |
|  * the modifier but before the next primary sequence. If placed at the end
 | |
|  * of the sequence it applies to all unknown accented characters.
 | |
|  * The relational operators specify how the text 
 | |
|  * argument relates to the previous term.  The relation characters have
 | |
|  * the following meanings:
 | |
|  * <ul>
 | |
|  * <li>'<' - The text argument is greater than the prior term at the primary
 | |
|  * difference level.</li>
 | |
|  * <li>';' - The text argument is greater than the prior term at the secondary
 | |
|  * difference level.</li>
 | |
|  * <li>',' - The text argument is greater than the prior term at the tertiary
 | |
|  * difference level.</li>
 | |
|  * <li>'=' - The text argument is equal to the prior term</li>
 | |
|  * </ul>
 | |
|  * <p>
 | |
|  * As for the text argument itself, this is any sequence of Unicode
 | |
|  * characters not in the following ranges: 0x0009-0x000D, 0x0020-0x002F,
 | |
|  * 0x003A-0x0040, 0x005B-0x0060, and 0x007B-0x007E. If these characters are 
 | |
|  * desired, they must be enclosed in single quotes.  If any whitespace is 
 | |
|  * encountered, it is ignored.  (For example, "a b" is equal to "ab").  
 | |
|  * <p>
 | |
|  * The reset operation inserts the following rule at the point where the
 | |
|  * text argument to it exists in the previously declared rule string.  This
 | |
|  * makes it easy to add new rules to an existing string by simply including
 | |
|  * them in a reset sequence at the end.  Note that the text argument, or
 | |
|  * at least the first character of it, must be present somewhere in the
 | |
|  * previously declared rules in order to be inserted properly.  If this
 | |
|  * is not satisfied, a <code>ParseException</code> will be thrown. 
 | |
|  * <p>
 | |
|  * This system of configuring <code>RuleBasedCollator</code> is needlessly
 | |
|  * complex and the people at Taligent who developed it (along with the folks
 | |
|  * at Sun who accepted it into the Java standard library) deserve a slow
 | |
|  * and agonizing death.
 | |
|  * <p>
 | |
|  * Here are a couple of examples of rule strings:
 | |
|  * <p>
 | |
|  * "< a < b < c" - This string says that c is greater than b which is 
 | |
|  * greater than a, with all differences being primary differences.
 | |
|  * <p>
 | |
|  * "< a,A < b,B < c,C" - This string says that 'A' is greater than 'a' with
 | |
|  * a tertiary strength comparison.  Both 'b' and 'B' are greater than 'a' and
 | |
|  * 'A' during a primary strength comparison.  But 'B' is greater than 'b'
 | |
|  * under a tertiary strength comparison.
 | |
|  * <p>
 | |
|  * "< a < c & a < b " - This sequence is identical in function to the 
 | |
|  * "< a < b < c" rule string above.  The '&' reset symbol indicates that
 | |
|  * the rule "< b" is to be inserted after the text argument "a" in the
 | |
|  * previous rule string segment.
 | |
|  * <p>
 | |
|  * "< a < b & y < z" - This is an error.  The character 'y' does not appear
 | |
|  * anywhere in the previous rule string segment so the rule following the
 | |
|  * reset rule cannot be inserted.
 | |
|  * <p>
 | |
|  * "< a & A @ < e & E < f& F" - This sequence is equivalent to the following
 | |
|  * "< a & A < E & e < f & F".
 | |
|  * <p>
 | |
|  * For a description of the various comparison strength types, see the
 | |
|  * documentation for the <code>Collator</code> class.
 | |
|  * <p>
 | |
|  * As an additional complication to this already overly complex rule scheme,
 | |
|  * if any characters precede the first rule, these characters are considered
 | |
|  * ignorable.  They will be treated as if they did not exist during 
 | |
|  * comparisons.  For example, "- < a < b ..." would make '-' an ignorable
 | |
|  * character such that the strings "high-tech" and "hightech" would
 | |
|  * be considered identical.
 | |
|  * <p>
 | |
|  * A <code>ParseException</code> will be thrown for any of the following
 | |
|  * conditions:
 | |
|  * <ul>
 | |
|  * <li>Unquoted punctuation characters in a text argument.</li>
 | |
|  * <li>A relational or reset operator not followed by a text argument</li>
 | |
|  * <li>A reset operator where the text argument is not present in
 | |
|  * the previous rule string section.</li>
 | |
|  * </ul>
 | |
|  *
 | |
|  * @author Aaron M. Renn (arenn@urbanophile.com)
 | |
|  * @author Tom Tromey (tromey@cygnus.com)
 | |
|  * @author Guilhem Lavaux (guilhem@kaffe.org)
 | |
|  */
 | |
| public class RuleBasedCollator extends Collator
 | |
| {
 | |
|   /**
 | |
|    * This class describes what rank has a character (or a sequence of characters) 
 | |
|    * in the lexicographic order. Each element in a rule has a collation element.
 | |
|    */
 | |
|   final static class CollationElement
 | |
|   {
 | |
|     String key;
 | |
|     int primary;
 | |
|     short secondary;
 | |
|     short tertiary;
 | |
|     short equality;
 | |
|     boolean ignore;
 | |
|     String expansion;
 | |
| 
 | |
|     CollationElement(String key, int primary, short secondary, short tertiary,
 | |
| 		     short equality, String expansion, boolean ignore)
 | |
|     {
 | |
|       this.key = key;
 | |
|       this.primary = primary;
 | |
|       this.secondary = secondary;
 | |
|       this.tertiary = tertiary;
 | |
|       this.equality = equality;
 | |
|       this.ignore = ignore;
 | |
|       this.expansion = expansion;
 | |
|     }
 | |
| 
 | |
|     final int getValue()
 | |
|     {
 | |
|       return (primary << 16) + (secondary << 8) + tertiary;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * Basic collation instruction (internal format) to build the series of
 | |
|    * collation elements. It contains an instruction which specifies the new
 | |
|    * state of the generator. The sequence of instruction should not contain
 | |
|    * RESET (it is used by
 | |
|    * {@link #mergeRules(int,java.lang.String,java.util.Vector,java.util.Vector)})
 | |
|    * as a temporary state while merging two sets of instructions.
 | |
|    */
 | |
|   final static class CollationSorter
 | |
|   {
 | |
|     static final int GREATERP = 0;
 | |
|     static final int GREATERS = 1;
 | |
|     static final int GREATERT = 2;
 | |
|     static final int EQUAL = 3;
 | |
|     static final int RESET = 4;
 | |
|     static final int INVERSE_SECONDARY = 5;
 | |
|     
 | |
|     int comparisonType;
 | |
|     String textElement;
 | |
|     int hashText;
 | |
|     int offset;
 | |
|     boolean ignore;
 | |
| 
 | |
|     String expansionOrdering;
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This is the original rule string, as passed to the constructor.
 | |
|    */
 | |
|   private String rules;
 | |
| 
 | |
|   /**
 | |
|    * This is the table of collation element values, filled by buildCollationVector.
 | |
|    */
 | |
|   private Object[] ce_table;
 | |
| 
 | |
|   /**
 | |
|    * Quick-prefix finder: maps the key of each collation element to the element.
 | |
|    */
 | |
|   HashMap prefix_tree;
 | |
| 
 | |
|   /**
 | |
|    * This is the value following the last primary sequence entered into
 | |
|    * <code>ce_table</code>. It is used to compute the
 | |
|    * ordering value of unspecified characters.
 | |
|    */
 | |
|   private int last_primary_value;
 | |
| 
 | |
|   /**
 | |
|    * This is the value following the last tertiary sequence of the
 | |
|    * primary 0, entered into
 | |
|    * <code>ce_table</code>. It is used to compute the
 | |
|    * ordering value of an unspecified accented character.
 | |
|    */
 | |
|   private int last_tertiary_value;
 | |
| 
 | |
|   /**
 | |
|    * This variable is true if accents need to be sorted
 | |
|    * in the other direction.
 | |
|    */
 | |
|   private boolean inverseAccentComparison;
 | |
| 
 | |
|   /**
 | |
|    * This collation element is special to unknown sequences.
 | |
|    * The JDK uses it to mark and sort the characters which have
 | |
|    * no collation rules.
 | |
|    */
 | |
|   static final CollationElement SPECIAL_UNKNOWN_SEQ = 
 | |
|     new CollationElement("", (short) 32767, (short) 0, (short) 0,
 | |
| 			 (short) 0, null, false);
 | |
|   
 | |
|   /**
 | |
|    * This method initializes a new instance of <code>RuleBasedCollator</code>
 | |
|    * with the specified collation rules.  Note that an application normally
 | |
|    * obtains an instance of <code>RuleBasedCollator</code> by calling the
 | |
|    * <code>getInstance</code> method of <code>Collator</code>.  That method
 | |
|    * automatically loads the proper set of rules for the desired locale.
 | |
|    *
 | |
|    * @param rules The collation rule string.
 | |
|    *
 | |
|    * @exception ParseException If the rule string contains syntax errors.
 | |
|    */
 | |
|   public RuleBasedCollator(String rules) throws ParseException
 | |
|   {
 | |
|     if (rules.equals(""))
 | |
|       throw new ParseException("empty rule set", 0);
 | |
|     
 | |
|     this.rules = rules;
 | |
| 
 | |
|     buildCollationVector(parseString(rules));
 | |
|     buildPrefixAccess();
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method returns the number of common characters at the beginning
 | |
|    * of the string of the two parameters.
 | |
|    *
 | |
|    * @param prefix A string considered as a prefix to test against
 | |
|    * the other string.
 | |
|    * @param s A string to test the prefix against.
 | |
|    * @return The number of common characters.
 | |
|    */
 | |
|   static int findPrefixLength(String prefix, String s)
 | |
|   {
 | |
|     int index;
 | |
|     int len = prefix.length();
 | |
| 
 | |
|     for (index = 0; index < len && index < s.length(); ++index)
 | |
|       {
 | |
| 	if (prefix.charAt(index) != s.charAt(index))
 | |
| 	  return index;
 | |
|       }
 | |
| 
 | |
| 
 | |
|     return index;
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * Here we are merging two sets of sorting instructions: 'patch' into 'main'. This methods
 | |
|    * checks whether it is possible to find an anchor point for the rules to be merged and
 | |
|    * then insert them at that precise point.
 | |
|    *
 | |
|    * @param offset Offset in the string containing rules of the beginning of the rules
 | |
|    * being merged in.
 | |
|    * @param starter Text of the rules being merged.
 | |
|    * @param main Repository of all already parsed rules.
 | |
|    * @param patch Rules to be merged into the repository.
 | |
|    * @throws ParseException if it is impossible to find an anchor point for the new rules.
 | |
|    */
 | |
|   private void mergeRules(int offset, String starter, Vector main, Vector patch)
 | |
|     throws ParseException 
 | |
|   {
 | |
|     Enumeration elements = main.elements();
 | |
|     int insertion_point = -1;
 | |
|     int max_length = 0;
 | |
|     
 | |
|     /* We must check that no rules conflict with another already present. If it
 | |
|      * is the case delete the old rule. 
 | |
|      */
 | |
|     
 | |
|     /* For the moment good old O(N^2) algorithm.
 | |
|      */
 | |
|     for (int i = 0; i < patch.size(); i++)
 | |
|       {
 | |
| 	int j = 0;
 | |
| 	
 | |
| 	while (j < main.size())
 | |
| 	  {
 | |
| 	    CollationSorter rule1 = (CollationSorter) patch.elementAt(i);
 | |
| 	    CollationSorter rule2 = (CollationSorter) main.elementAt(j);
 | |
| 	    
 | |
| 	    if (rule1.textElement.equals(rule2.textElement))
 | |
| 	      main.removeElementAt(j);
 | |
| 	    else
 | |
| 	      j++;
 | |
| 	  }
 | |
|       }
 | |
| 
 | |
|     // Find the insertion point... O(N)
 | |
|     for (int i = 0; i < main.size(); i++)
 | |
|       {
 | |
| 	CollationSorter sorter = (CollationSorter) main.elementAt(i);
 | |
| 	int length = findPrefixLength(starter, sorter.textElement);
 | |
| 		
 | |
| 	if (length > max_length)
 | |
| 	  {
 | |
| 	    max_length = length;
 | |
| 	    insertion_point = i+1;
 | |
| 	  }
 | |
|       }
 | |
| 
 | |
|     if (insertion_point < 0)
 | |
|       throw new ParseException("no insertion point found for " + starter, offset);
 | |
| 
 | |
|     if (max_length < starter.length())
 | |
|       {
 | |
| 	/*
 | |
| 	 * We need to expand the first entry. It must be sorted
 | |
| 	 * like if it was the reference key itself (like the spec
 | |
| 	 * said. So the first entry is special: the element is
 | |
| 	 * replaced by the specified text element for the sorting.
 | |
| 	 * This text replace the old one for comparisons. However
 | |
| 	 * to preserve the behaviour we replace the first key (corresponding
 | |
| 	 * to the found prefix) by a new code rightly ordered in the
 | |
| 	 * sequence. The rest of the subsequence must be appended
 | |
| 	 * to the end of the sequence.
 | |
| 	 */
 | |
| 	CollationSorter sorter = (CollationSorter) patch.elementAt(0);
 | |
| 	CollationSorter expansionPrefix =
 | |
| 	  (CollationSorter) main.elementAt(insertion_point-1);
 | |
| 	
 | |
| 	sorter.expansionOrdering = starter.substring(max_length); // Skip the first good prefix element
 | |
| 		
 | |
| 	main.insertElementAt(sorter, insertion_point);
 | |
| 	
 | |
| 	/*
 | |
| 	 * This is a new set of rules. Append to the list.
 | |
| 	 */
 | |
| 	patch.removeElementAt(0);
 | |
| 	insertion_point++;
 | |
|       }
 | |
| 
 | |
|     // Now insert all elements of patch at the insertion point.
 | |
|     for (int i = 0; i < patch.size(); i++)
 | |
|       main.insertElementAt(patch.elementAt(i), i+insertion_point);
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method parses a string and build a set of sorting instructions. The parsing
 | |
|    * may only be partial on the case the rules are to be merged sometime later.
 | |
|    * 
 | |
|    * @param stop_on_reset If this parameter is true then the parser stops when it
 | |
|    * encounters a reset instruction. In the other case, it tries to parse the subrules
 | |
|    * and merged it in the same repository.
 | |
|    * @param v Output vector for the set of instructions.
 | |
|    * @param base_offset Offset in the string to begin parsing.
 | |
|    * @param rules Rules to be parsed.
 | |
|    * @return -1 if the parser reached the end of the string, an integer representing the
 | |
|    * offset in the string at which it stopped parsing. 
 | |
|    * @throws ParseException if something turned wrong during the parsing. To get details
 | |
|    * decode the message.
 | |
|    */
 | |
|   private int subParseString(boolean stop_on_reset, Vector v,
 | |
| 			     int base_offset, String rules)
 | |
|     throws ParseException
 | |
|   {
 | |
|     boolean ignoreChars = (base_offset == 0);
 | |
|     int operator = -1;
 | |
|     StringBuffer sb = new StringBuffer();
 | |
|     boolean doubleQuote = false;
 | |
|     boolean eatingChars = false;
 | |
|     boolean nextIsModifier = false;
 | |
|     boolean isModifier = false;
 | |
|     int i;
 | |
|     
 | |
| main_parse_loop:
 | |
|     for (i = 0; i < rules.length(); i++)
 | |
|       {
 | |
| 	char c = rules.charAt(i);
 | |
| 	int type = -1;
 | |
| 	
 | |
| 	if (!eatingChars &&
 | |
| 	    ((c >= 0x09 && c <= 0x0D) || (c == 0x20)))
 | |
| 	      continue;
 | |
| 
 | |
| 	isModifier = nextIsModifier;
 | |
| 	nextIsModifier = false;
 | |
| 
 | |
| 	if (eatingChars && c != '\'')
 | |
| 	  {
 | |
| 	    doubleQuote = false;
 | |
| 	    sb.append(c);
 | |
| 	    continue;
 | |
| 	  }
 | |
| 	if (doubleQuote && eatingChars)
 | |
| 	  {
 | |
| 	    sb.append(c);
 | |
| 	    doubleQuote = false;
 | |
| 	    continue;
 | |
| 	  }
 | |
| 
 | |
| 	switch (c) {
 | |
| 	case '!':
 | |
| 	  throw new ParseException
 | |
| 	    ("Modifier '!' is not yet supported by Classpath", i+base_offset);
 | |
| 	case '<':
 | |
| 	  type = CollationSorter.GREATERP;
 | |
| 	  break;
 | |
| 	case ';':
 | |
| 	  type = CollationSorter.GREATERS;
 | |
| 	  break;
 | |
| 	case ',':
 | |
| 	  type = CollationSorter.GREATERT;
 | |
| 	  break;
 | |
| 	case '=':
 | |
| 	  type = CollationSorter.EQUAL;
 | |
| 	  break;
 | |
| 	case '\'':
 | |
| 	  eatingChars = !eatingChars;
 | |
| 	  doubleQuote = true;
 | |
| 	  break;
 | |
| 	case '@':
 | |
| 	  if (ignoreChars)
 | |
| 	    throw new ParseException
 | |
| 	      ("comparison list has not yet been started. You may only use"
 | |
| 	       + "(<,;=&)", i+base_offset);
 | |
| 	  // Inverse the order of secondaries from now on.
 | |
| 	  nextIsModifier = true;
 | |
| 	  type = CollationSorter.INVERSE_SECONDARY;
 | |
| 	  break;
 | |
| 	case '&':
 | |
| 	  type = CollationSorter.RESET;
 | |
| 	  if (stop_on_reset)
 | |
| 	    break main_parse_loop;
 | |
| 	  break;
 | |
| 	default:
 | |
| 	  if (operator < 0)
 | |
| 	    throw new ParseException
 | |
| 	      ("operator missing at " + (i+base_offset), i+base_offset);
 | |
| 	  if (!eatingChars &&
 | |
| 	      ((c >= 0x21 && c <= 0x2F) 
 | |
| 	       || (c >= 0x3A && c <= 0x40)
 | |
| 	       || (c >= 0x5B && c <= 0x60)
 | |
| 	       || (c >= 0x7B && c <= 0x7E)))
 | |
| 	    throw new ParseException
 | |
| 	      ("unquoted punctuation character '"+c+"'", i+base_offset);
 | |
| 
 | |
| 	  //type = ignoreChars ? CollationSorter.IGNORE : -1;
 | |
| 	  sb.append(c);
 | |
| 	  break;
 | |
| 	}
 | |
| 
 | |
| 	if (type  < 0)
 | |
| 	  continue;
 | |
| 
 | |
| 	if (operator < 0)
 | |
| 	  {
 | |
| 	    operator = type;
 | |
| 	    continue;
 | |
| 	  }
 | |
| 
 | |
| 	if (sb.length() == 0 && !isModifier)
 | |
| 	  throw new ParseException
 | |
| 	    ("text element empty at " + (i+base_offset), i+base_offset);
 | |
| 
 | |
| 	if (operator == CollationSorter.RESET)
 | |
| 	  {
 | |
| 	    /* Reposition in the sorting list at the position
 | |
| 	     * indicated by the text element.
 | |
| 	     */
 | |
| 	    String subrules = rules.substring(i);
 | |
| 	    Vector sorted_rules = new Vector();
 | |
| 	    int idx;
 | |
| 
 | |
| 	    // Parse the subrules but do not iterate through all
 | |
| 	    // sublist. This is the priviledge of the first call.
 | |
| 	    idx = subParseString(true, sorted_rules, base_offset+i, subrules);
 | |
|     
 | |
| 	    // Merge new parsed rules into the list.
 | |
| 	    mergeRules(base_offset+i, sb.toString(), v, sorted_rules);
 | |
| 	    sb.setLength(0);
 | |
| 	    
 | |
| 	    // Reset state to none.
 | |
| 	    operator = -1;
 | |
| 	    type = -1;
 | |
| 	    // We have found a new subrule at 'idx' but it has not been parsed.
 | |
| 	    if (idx >= 0)
 | |
| 	      {
 | |
| 		i += idx-1;
 | |
| 		continue main_parse_loop;
 | |
| 	      }
 | |
| 	    else
 | |
| 		// No more rules.
 | |
| 		break main_parse_loop;
 | |
| 	  }
 | |
| 
 | |
| 	CollationSorter sorter = new CollationSorter();
 | |
| 	
 | |
| 	if (operator == CollationSorter.GREATERP)
 | |
| 	  ignoreChars = false;
 | |
| 
 | |
| 	sorter.comparisonType = operator;
 | |
| 	sorter.textElement = sb.toString();
 | |
| 	sorter.hashText = sorter.textElement.hashCode();
 | |
| 	sorter.offset = base_offset+rules.length();
 | |
| 	sorter.ignore = ignoreChars;
 | |
| 	sb.setLength(0);
 | |
| 
 | |
| 	v.add(sorter);
 | |
| 	operator = type;
 | |
|       }
 | |
| 
 | |
|     if (operator >= 0)
 | |
|       {
 | |
| 	CollationSorter sorter = new CollationSorter();
 | |
| 	int pos = rules.length() + base_offset;
 | |
| 
 | |
| 	if ((sb.length() != 0 && nextIsModifier)
 | |
| 	    || (sb.length() == 0 && !nextIsModifier && !eatingChars))
 | |
| 	  throw new ParseException("text element empty at " + pos, pos);
 | |
| 
 | |
| 	if (operator == CollationSorter.GREATERP)
 | |
| 	  ignoreChars = false;
 | |
| 
 | |
| 	sorter.comparisonType = operator;
 | |
| 	sorter.textElement = sb.toString();
 | |
|  	sorter.hashText = sorter.textElement.hashCode();
 | |
| 	sorter.offset = base_offset+pos;
 | |
| 	sorter.ignore = ignoreChars;
 | |
| 	v.add(sorter);
 | |
|       }
 | |
| 
 | |
|     if (i == rules.length())
 | |
|       return -1;
 | |
|     else
 | |
|       return i;
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method creates a copy of this object.
 | |
|    *
 | |
|    * @return A copy of this object.
 | |
|    */
 | |
|   public Object clone()
 | |
|   {
 | |
|     return super.clone();
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method completely parses a string 'rules' containing sorting rules.
 | |
|    *
 | |
|    * @param rules String containing the rules to be parsed. 
 | |
|    * @return A set of sorting instructions stored in a Vector.
 | |
|    * @throws ParseException if something turned wrong during the parsing. To get details
 | |
|    * decode the message.
 | |
|    */
 | |
|   private Vector parseString(String rules) 
 | |
|     throws ParseException
 | |
|   {
 | |
|     Vector v = new Vector();
 | |
| 
 | |
|     // result of the first subParseString is not absolute (may be -1 or a
 | |
|     // positive integer). But we do not care.
 | |
|     subParseString(false, v, 0, rules);
 | |
|     
 | |
|     return v;
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method uses the sorting instructions built by {@link #parseString}
 | |
|    * to build collation elements which can be directly used to sort strings.
 | |
|    *
 | |
|    * @param parsedElements Parsed instructions stored in a Vector.
 | |
|    * @throws ParseException if the order of the instructions are not valid.
 | |
|    */
 | |
|   private void buildCollationVector(Vector parsedElements)
 | |
|     throws ParseException
 | |
|   {
 | |
|     int primary_seq = 0;
 | |
|     int last_tertiary_seq = 0;
 | |
|     short secondary_seq = 0;
 | |
|     short tertiary_seq = 0;
 | |
|     short equality_seq = 0;
 | |
|     boolean inverseComparisons = false;
 | |
|     final boolean DECREASING = false;
 | |
|     final boolean INCREASING = true;
 | |
|     boolean secondaryType = INCREASING;
 | |
|     Vector v = new Vector();
 | |
| 
 | |
|     // elts is completely sorted.
 | |
| element_loop:
 | |
|     for (int i = 0; i < parsedElements.size(); i++)
 | |
|       {
 | |
| 	CollationSorter elt = (CollationSorter) parsedElements.elementAt(i);
 | |
| 	boolean ignoreChar = false;
 | |
| 
 | |
| 	switch (elt.comparisonType)
 | |
| 	  {
 | |
| 	  case CollationSorter.GREATERP:
 | |
| 	    primary_seq++;
 | |
| 	    if (inverseComparisons)
 | |
| 	      {
 | |
| 		secondary_seq = Short.MAX_VALUE;
 | |
| 		secondaryType = DECREASING;
 | |
| 	      }
 | |
| 	    else
 | |
| 	      {
 | |
| 		secondary_seq = 0;
 | |
| 		secondaryType = INCREASING;
 | |
| 	      }
 | |
| 	    tertiary_seq = 0;
 | |
| 	    equality_seq = 0;
 | |
| 	    inverseComparisons = false;
 | |
| 	    break;
 | |
| 	  case CollationSorter.GREATERS:
 | |
| 	    if (secondaryType == DECREASING)
 | |
| 	      secondary_seq--;
 | |
| 	    else
 | |
| 	      secondary_seq++;
 | |
| 	    tertiary_seq = 0;
 | |
| 	    equality_seq = 0;
 | |
| 	    break;
 | |
| 	  case CollationSorter.INVERSE_SECONDARY:
 | |
| 	    inverseComparisons = true;
 | |
| 	    continue element_loop;
 | |
| 	  case CollationSorter.GREATERT:
 | |
| 	    tertiary_seq++;
 | |
| 	    if (primary_seq == 0)
 | |
| 	      last_tertiary_seq = tertiary_seq;
 | |
| 	    equality_seq = 0;
 | |
| 	    break;
 | |
| 	  case CollationSorter.EQUAL:
 | |
| 	    equality_seq++;
 | |
| 	    break;
 | |
| 	  case CollationSorter.RESET:
 | |
| 	    throw new ParseException
 | |
| 	      ("Invalid reached state 'RESET'. Internal error", elt.offset);
 | |
| 	  default:
 | |
| 	    throw new ParseException
 | |
| 	      ("Invalid unknown state '" + elt.comparisonType + "'", elt.offset);
 | |
| 	  }
 | |
| 
 | |
| 	v.add(new CollationElement(elt.textElement, primary_seq,
 | |
| 				   secondary_seq, tertiary_seq,
 | |
| 				   equality_seq, elt.expansionOrdering, elt.ignore));
 | |
|       }
 | |
| 
 | |
|     this.inverseAccentComparison = inverseComparisons; 
 | |
| 
 | |
|     ce_table = v.toArray();
 | |
| 
 | |
|     last_primary_value = primary_seq+1;
 | |
|     last_tertiary_value = last_tertiary_seq+1;
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * Build a tree where all keys are the texts of collation elements and data is
 | |
|    * the collation element itself. The tree is used when extracting all prefix
 | |
|    * for a given text.
 | |
|    */
 | |
|   private void buildPrefixAccess()
 | |
|   {
 | |
|     prefix_tree = new HashMap();
 | |
| 
 | |
|     for (int i = 0; i < ce_table.length; i++)
 | |
|       {
 | |
| 	CollationElement e = (CollationElement) ce_table[i];
 | |
| 
 | |
| 	prefix_tree.put(e.key, e);
 | |
|       }
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method returns an integer which indicates whether the first
 | |
|    * specified <code>String</code> is less than, greater than, or equal to
 | |
|    * the second.  The value depends not only on the collation rules in
 | |
|    * effect, but also the strength and decomposition settings of this object.
 | |
|    *
 | |
|    * @param source The first <code>String</code> to compare.
 | |
|    * @param target A second <code>String</code> to compare to the first.
 | |
|    *
 | |
|    * @return A negative integer if source < target, a positive integer
 | |
|    * if source > target, or 0 if source == target.
 | |
|    */
 | |
|   public int compare(String source, String target)
 | |
|   {
 | |
|     CollationElementIterator cs, ct;
 | |
|     CollationElement ord1block = null;
 | |
|     CollationElement ord2block = null;
 | |
|     boolean advance_block_1 = true;
 | |
|     boolean advance_block_2 = true;
 | |
| 
 | |
|     cs = getCollationElementIterator(source);
 | |
|     ct = getCollationElementIterator(target);
 | |
| 
 | |
|     for(;;)
 | |
|       {
 | |
| 	int ord1;
 | |
| 	int ord2;
 | |
| 
 | |
| 	/*
 | |
| 	 * We have to check whether the characters are ignorable.
 | |
| 	 * If it is the case then forget them. 
 | |
| 	 */
 | |
| 	if (advance_block_1)
 | |
| 	  {
 | |
| 	    ord1block = cs.nextBlock();
 | |
| 	    if (ord1block != null && ord1block.ignore)
 | |
| 	      continue;
 | |
| 	  }
 | |
| 	
 | |
| 	if (advance_block_2)
 | |
| 	  {
 | |
| 	    ord2block = ct.nextBlock();
 | |
| 	    if (ord2block != null && ord2block.ignore)
 | |
| 	      {
 | |
| 	        advance_block_1 = false;
 | |
| 	        continue;
 | |
| 	      }
 | |
| 	 }
 | |
| 	else
 | |
| 	  advance_block_2 = true;
 | |
| 
 | |
| 	if (!advance_block_1)
 | |
| 	  advance_block_1 = true;
 | |
| 
 | |
| 	if (ord1block != null)
 | |
| 	  ord1 = ord1block.getValue();
 | |
| 	else
 | |
| 	  {
 | |
| 	    if (ord2block == null)
 | |
| 	      return 0;
 | |
| 	    return -1;
 | |
| 	  }
 | |
| 
 | |
| 	if (ord2block == null)
 | |
| 	  return 1;
 | |
| 	
 | |
| 	ord2 = ord2block.getValue();
 | |
| 	
 | |
| 	// We know chars are totally equal, so skip
 | |
|         if (ord1 == ord2)
 | |
| 	  {
 | |
| 	    if (getStrength() == IDENTICAL)
 | |
| 	      if (!ord1block.key.equals(ord2block.key))
 | |
| 		return ord1block.key.compareTo(ord2block.key);
 | |
| 	    continue;
 | |
| 	  }
 | |
| 
 | |
|         // Check for primary strength differences
 | |
|         int prim1 = CollationElementIterator.primaryOrder(ord1); 
 | |
|         int prim2 = CollationElementIterator.primaryOrder(ord2); 
 | |
| 	
 | |
| 	if (prim1 == 0 && getStrength() < TERTIARY)
 | |
| 	  {
 | |
|             advance_block_2 = false;
 | |
| 	    continue;
 | |
| 	  }
 | |
| 	else if (prim2 == 0 && getStrength() < TERTIARY)
 | |
| 	  {
 | |
| 	    advance_block_1 = false;
 | |
| 	    continue;
 | |
| 	  }
 | |
| 
 | |
|         if (prim1 < prim2)
 | |
|           return -1;
 | |
|         else if (prim1 > prim2)
 | |
|           return 1;
 | |
|         else if (getStrength() == PRIMARY)
 | |
|           continue;
 | |
| 
 | |
|         // Check for secondary strength differences
 | |
|         int sec1 = CollationElementIterator.secondaryOrder(ord1);
 | |
|         int sec2 = CollationElementIterator.secondaryOrder(ord2);
 | |
| 
 | |
| 	if (sec1 < sec2)
 | |
|           return -1;
 | |
|         else if (sec1 > sec2)
 | |
|           return 1;
 | |
|         else if (getStrength() == SECONDARY)
 | |
|           continue;
 | |
| 
 | |
|         // Check for tertiary differences
 | |
|         int tert1 = CollationElementIterator.tertiaryOrder(ord1);
 | |
|         int tert2 = CollationElementIterator.tertiaryOrder(ord2);
 | |
| 
 | |
|         if (tert1 < tert2)
 | |
|           return -1;
 | |
|         else if (tert1 > tert2)
 | |
|           return 1;
 | |
| 	else if (getStrength() == TERTIARY)
 | |
| 	  continue;
 | |
| 
 | |
| 	// Apparently JDK does this (at least for my test case).
 | |
| 	return ord1block.key.compareTo(ord2block.key);    
 | |
|       }
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method tests this object for equality against the specified 
 | |
|    * object.  This will be true if and only if the specified object is
 | |
|    * another reference to this object.
 | |
|    *
 | |
|    * @param obj The <code>Object</code> to compare against this object.
 | |
|    *
 | |
|    * @return <code>true</code> if the specified object is equal to this object,
 | |
|    * <code>false</code> otherwise.
 | |
|    */
 | |
|   public boolean equals(Object obj)
 | |
|   {
 | |
|     if (obj == this)
 | |
|       return true;
 | |
|     else
 | |
|       return false;
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method builds a default collation element without invoking
 | |
|    * the database created from the rules passed to the constructor.
 | |
|    *
 | |
|    * @param c Character which needs a collation element.
 | |
|    * @return A valid brand new CollationElement instance.
 | |
|    */
 | |
|   CollationElement getDefaultElement(char c)
 | |
|   {
 | |
|     int v;
 | |
| 
 | |
|     // Preliminary support for generic accent sorting inversion (I don't know if all
 | |
|     // characters in the range should be sorted backward). This is the place
 | |
|     // to fix this if needed.
 | |
|     if (inverseAccentComparison && (c >= 0x02B9 && c <= 0x0361))
 | |
|       v = 0x0361 - ((int) c - 0x02B9);
 | |
|     else
 | |
|       v = (short) c;
 | |
|     return new CollationElement("" + c, last_primary_value + v,
 | |
| 				(short) 0, (short) 0, (short) 0, null, false);
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method builds a default collation element for an accented character
 | |
|    * without invoking the database created from the rules passed to the constructor.
 | |
|    *
 | |
|    * @param c Character which needs a collation element.
 | |
|    * @return A valid brand new CollationElement instance.
 | |
|    */
 | |
|   CollationElement getDefaultAccentedElement(char c)
 | |
|   {
 | |
|     int v;
 | |
| 
 | |
|     // Preliminary support for generic accent sorting inversion (I don't know if all
 | |
|     // characters in the range should be sorted backward). This is the place
 | |
|     // to fix this if needed.
 | |
|     if (inverseAccentComparison && (c >= 0x02B9 && c <= 0x0361))
 | |
|       v = 0x0361 - ((int) c - 0x02B9);
 | |
|     else
 | |
|       v = (short) c;
 | |
|     return new CollationElement("" + c, (short) 0,
 | |
| 				(short) 0, (short) (last_tertiary_value + v), (short) 0, null, false);
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method returns an instance for <code>CollationElementIterator</code>
 | |
|    * for the specified <code>String</code> under the collation rules for this
 | |
|    * object.
 | |
|    *
 | |
|    * @param source The <code>String</code> to return the
 | |
|    * <code>CollationElementIterator</code> instance for.
 | |
|    *
 | |
|    * @return A <code>CollationElementIterator</code> for the specified
 | |
|    * <code>String</code>.
 | |
|    */
 | |
|   public CollationElementIterator getCollationElementIterator(String source)
 | |
|   {
 | |
|     return new CollationElementIterator(this, source);
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method returns an instance of <code>CollationElementIterator</code>
 | |
|    * for the <code>String</code> represented by the specified
 | |
|    * <code>CharacterIterator</code>.
 | |
|    *
 | |
|    * @param source The <code>CharacterIterator</code> with the desired <code>String</code>.
 | |
|    *
 | |
|    * @return A <code>CollationElementIterator</code> for the specified <code>String</code>.
 | |
|    */
 | |
|   public CollationElementIterator getCollationElementIterator(CharacterIterator source)
 | |
|   {
 | |
|     StringBuffer expand = new StringBuffer("");
 | |
|     
 | |
|     // Right now we assume that we will read from the beginning of the string.
 | |
|     for (char c = source.first();
 | |
| 	 c != CharacterIterator.DONE;
 | |
| 	 c = source.next())
 | |
|       decomposeCharacter(c, expand);
 | |
| 
 | |
|     return getCollationElementIterator(expand.toString());
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method returns an instance of <code>CollationKey</code> for the
 | |
|    * specified <code>String</code>.  The object returned will have a
 | |
|    * more efficient mechanism for its comparison function that could
 | |
|    * provide speed benefits if multiple comparisons are performed, such
 | |
|    * as during a sort.
 | |
|    *
 | |
|    * @param source The <code>String</code> to create a <code>CollationKey</code> for.
 | |
|    *
 | |
|    * @return A <code>CollationKey</code> for the specified <code>String</code>.
 | |
|    */
 | |
|   public CollationKey getCollationKey(String source)
 | |
|   {
 | |
|     CollationElementIterator cei = getCollationElementIterator(source);
 | |
|     Vector vect = new Vector(25);
 | |
| 
 | |
|     int ord = cei.next();
 | |
|     cei.reset(); //set to start of string
 | |
| 
 | |
|     while (ord != CollationElementIterator.NULLORDER)
 | |
|       {
 | |
| 	// If the primary order is null, it means this is an ignorable
 | |
| 	// character.
 | |
| 	if (CollationElementIterator.primaryOrder(ord) == 0)
 | |
| 	  {
 | |
|             ord = cei.next();
 | |
| 	    continue;
 | |
| 	  }
 | |
|         switch (getStrength())
 | |
|           {
 | |
|             case PRIMARY:
 | |
| 	      ord = CollationElementIterator.primaryOrder(ord);
 | |
| 	      break;
 | |
| 	      
 | |
|             case SECONDARY:
 | |
| 	      ord = CollationElementIterator.primaryOrder(ord) << 8;
 | |
| 	      ord |= CollationElementIterator.secondaryOrder(ord);
 | |
| 
 | |
|             default:
 | |
|                break;
 | |
|           }
 | |
| 
 | |
|         vect.add(new Integer(ord)); 
 | |
| 	ord = cei.next(); //increment to next key
 | |
|       }
 | |
| 
 | |
|     Object[] objarr = vect.toArray();
 | |
|     byte[] key = new byte[objarr.length * 4];
 | |
| 
 | |
|     for (int i = 0; i < objarr.length; i++)
 | |
|       {
 | |
|         int j = ((Integer) objarr[i]).intValue();
 | |
|         key [i * 4] = (byte) ((j & 0xFF000000) >> 24);
 | |
|         key [i * 4 + 1] = (byte) ((j & 0x00FF0000) >> 16);
 | |
|         key [i * 4 + 2] = (byte) ((j & 0x0000FF00) >> 8);
 | |
|         key [i * 4 + 3] = (byte) (j & 0x000000FF);
 | |
|       }
 | |
| 
 | |
|     return new CollationKey(this, source, key);
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method returns a <code>String</code> containing the collation rules
 | |
|    * for this object.
 | |
|    *
 | |
|    * @return The collation rules for this object.
 | |
|    */
 | |
|   public String getRules()
 | |
|   {
 | |
|     return rules;
 | |
|   }
 | |
| 
 | |
|   /**
 | |
|    * This method returns a hash value for this object.
 | |
|    *
 | |
|    * @return A hash value for this object.
 | |
|    */
 | |
|   public int hashCode()
 | |
|   {
 | |
|     return System.identityHashCode(this);
 | |
|   }
 | |
| }
 |