Index: lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 956028) +++ lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (working copy) @@ -103,7 +103,7 @@ // build a cache of sorted transitions for every state allTransitions = new Transition[runAutomaton.getSize()][]; for (State state : this.automaton.getNumberedStates()) { - state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order); + state.sortTransitions(Transition.CompareByMinMaxThenDest); state.trimTransitionsArray(); allTransitions[state.getNumber()] = state.transitionsArray; } @@ -158,11 +158,7 @@ // seek to the next possible string; if (nextString()) { // reposition - - // FIXME: this is really bad to turn off - // but it cannot work correctly until terms are in utf8 order. - linear = false; - + if (linear) setLinear(infinitePosition); return seekBytesRef; @@ -188,15 +184,15 @@ } for (int i = 0; i < allTransitions[state].length; i++) { Transition t = allTransitions[state][i]; - if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 && - compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) { + if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) && + (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) { maxInterval = t.getMax(); break; } } - // 0xef terms don't get the optimization... not worth the trouble. - if (maxInterval != 0xef) - maxInterval = incrementUTF16(maxInterval); + // 0xff terms don't get the optimization... not worth the trouble. 
+ if (maxInterval != 0xff) + maxInterval = incrementUTF8(maxInterval); int length = position + 1; /* position + maxTransition */ if (linearUpperBound.bytes.length < length) linearUpperBound.bytes = new byte[length]; @@ -281,7 +277,7 @@ // if the next character is U+FFFF and is not part of the useful portion, // then by definition it puts us in a reject state, and therefore this // path is dead. there cannot be any higher transitions. backtrack. - c = incrementUTF16(c); + c = incrementUTF8(c); if (c == -1) return false; } @@ -295,8 +291,8 @@ for (int i = 0; i < transitions.length; i++) { Transition transition = transitions[i]; - if (compareToUTF16(transition.getMax(), c) >= 0) { - int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin(); + if (transition.getMax() >= c) { + int nextChar = Math.max(c, transition.getMin()); // append either the next sequential char, or the minimum transition seekBytesRef.grow(seekBytesRef.length + 1); seekBytesRef.length++; @@ -342,9 +338,9 @@ private boolean backtrack(int position) { while (position > 0) { int nextChar = seekBytesRef.bytes[position - 1] & 0xff; - // if a character is 0xef its a dead-end too, - // because there is no higher character in UTF-16 sort order. - nextChar = incrementUTF16(nextChar); + // if a byte is 0xff it's a dead-end too, + // because there is no higher byte value in UTF-8 sort order. 
+ nextChar = incrementUTF8(nextChar); if (nextChar != -1) { seekBytesRef.bytes[position - 1] = (byte) nextChar; seekBytesRef.length = position; @@ -355,34 +351,11 @@ return false; /* all solutions exhausted */ } - /* return the next utf8 byte in utf16 order, or -1 if exhausted */ - private final int incrementUTF16(int utf8) { + /* return the next utf8 byte in utf8 order, or -1 if exhausted */ + private final int incrementUTF8(int utf8) { switch(utf8) { - case 0xed: return 0xf0; - case 0xfd: return 0xee; - case 0xee: return 0xef; - case 0xef: return -1; + case 0xff: return -1; default: return utf8 + 1; } } - - int compareToUTF16(int aByte, int bByte) { - if (aByte != bByte) { - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 0x10; - } - } - return aByte - bByte; - } - return 0; - } } Index: lucene/src/java/org/apache/lucene/util/automaton/Transition.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/Transition.java (revision 956028) +++ lucene/src/java/org/apache/lucene/util/automaton/Transition.java (working copy) @@ -210,64 +210,4 @@ } public static final Comparator CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle(); - - private static class UTF8InUTF16Order { - protected int compareCodePoint(int aByte, int bByte) { - if (aByte != bByte) { - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - 
if ((bByte&0xfe) == 0xee) { - bByte += 0x10; - } - } - return aByte - bByte; - } - return 0; - } - } - - private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator { - public int compare(Transition t1, Transition t2) { - if (t1.to != t2.to) { - if (t1.to == null) return -1; - else if (t2.to == null) return 1; - else if (t1.to.number < t2.to.number) return -1; - else if (t1.to.number > t2.to.number) return 1; - } - int minComp = compareCodePoint(t1.min, t2.min); - if (minComp != 0) return minComp; - int maxComp = compareCodePoint(t1.max, t2.max); - if (maxComp != 0) return maxComp; - return 0; - } - } - - public static final Comparator CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle(); - - private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator { - public int compare(Transition t1, Transition t2) { - int minComp = compareCodePoint(t1.min, t2.min); - if (minComp != 0) return minComp; - int maxComp = compareCodePoint(t1.max, t2.max); - if (maxComp != 0) return maxComp; - if (t1.to != t2.to) { - if (t1.to == null) return -1; - else if (t2.to == null) return 1; - else if (t1.to.number < t2.to.number) return -1; - else if (t1.to.number > t2.to.number) return 1; - } - return 0; - } - } - - public static final Comparator CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle(); - - }