Index: lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java (revision 937269) +++ lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java (working copy) @@ -124,55 +124,4 @@ Automaton a = new RegExp("((\uD866\uDF05)|\uFB94).*").toAutomaton(); assertAutomatonHits(2, a); } - - /** - * Test that AutomatonQuery properly seeks to supplementary characters. - * Transitions are modeled as UTF-16 code units, so without special handling - * by default it will try to seek to a lead surrogate with some DFAs - */ - public void testSeekSurrogate() throws IOException { - Automaton a = new RegExp("\uD866[a\uDF05\uFB93][a-z]{0,5}[fl]").toAutomaton(); - assertAutomatonHits(1, a); - } - - /** - * Try seeking to an ending lead surrogate. - */ - public void testSeekSurrogate2() throws IOException { - Automaton a = new RegExp("\uD866(\uDF06ghijkl)?").toAutomaton(); - assertAutomatonHits(1, a); - } - - /** - * Try seeking to an starting trail surrogate. - */ - public void testSeekSurrogate3() throws IOException { - Automaton a = new RegExp("[\uDF06\uFB94]mnopqr").toAutomaton(); - assertAutomatonHits(1, a); - } - - /** - * Try seeking to an medial/final trail surrogate. - */ - public void testSeekSurrogate4() throws IOException { - Automaton a = new RegExp("a[\uDF06\uFB94]bc").toAutomaton(); - assertAutomatonHits(1, a); - } - - /** - * Ensure the 'constant suffix' does not contain a leading trail surrogate. - */ - public void testSurrogateSuffix() throws IOException { - Automaton a = new RegExp(".*[\uD865\uD866]\uDF06ghijkl").toAutomaton(); - assertAutomatonHits(1, a); - } - - /** - * Try when the constant suffix is only a leading trail surrogate. - * instead this must use an empty suffix. 
- */ - public void testSurrogateSuffix2() throws IOException { - Automaton a = new RegExp(".*\uDF05").toAutomaton(); - assertAutomatonHits(1, a); - } } Index: lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java (revision 937269) +++ lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java (working copy) @@ -32,6 +32,7 @@ import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; import org.apache.lucene.util.automaton.RunAutomaton; @@ -46,7 +47,7 @@ @Override protected void setUp() throws Exception { super.setUp(); - random = newRandom(System.nanoTime()); + random = newRandom(); RAMDirectory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); @@ -87,7 +88,7 @@ } private class SimpleAutomatonTermsEnum extends FilteredTermsEnum { - RunAutomaton runAutomaton = new RunAutomaton(automaton); + CharacterRunAutomaton runAutomaton = new CharacterRunAutomaton(automaton); UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); private SimpleAutomatonTermsEnum(IndexReader reader, String field) throws IOException { @@ -111,7 +112,7 @@ /** test a bunch of random regular expressions */ public void testRegexps() throws Exception { - for (int i = 0; i < 500; i++) + for (int i = 0; i < 20; i++) assertSame(randomRegex()); } @@ -144,7 +145,7 @@ TopDocs smartDocs = searcher.search(smart, 25); TopDocs dumbDocs = searcher.search(dumb, 25); - assertEquals(dumbDocs.totalHits, smartDocs.totalHits); + assertEquals("for re:" + regexp + ", automaton: " + new RegExp(regexp).toAutomaton().toString(), dumbDocs.totalHits, smartDocs.totalHits); } char buffer[] = new 
char[20]; Index: lucene/src/test/org/apache/lucene/util/_TestUtil.java =================================================================== --- lucene/src/test/org/apache/lucene/util/_TestUtil.java (revision 937269) +++ lucene/src/test/org/apache/lucene/util/_TestUtil.java (working copy) @@ -111,4 +111,8 @@ buf.append("]"); return buf.toString(); } + /** start and end are BOTH inclusive */ + public static int nextInt(Random r, int start, int end) { + return start + r.nextInt(end-start+1); + } } Index: lucene/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java =================================================================== --- lucene/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java (revision 937269) +++ lucene/src/test/org/apache/lucene/util/automaton/TestLevenshteinAutomata.java (working copy) @@ -169,7 +169,7 @@ } private void assertBruteForce(String input, Automaton dfa, int distance) { - RunAutomaton ra = new RunAutomaton(dfa); + CharacterRunAutomaton ra = new CharacterRunAutomaton(dfa); int maxLen = input.length() + distance + 1; int maxNum = (int) Math.pow(2, maxLen); for (int i = 0; i < maxNum; i++) { Index: lucene/src/test/org/apache/lucene/util/automaton/TestUTF32SpecialCase.java =================================================================== --- lucene/src/test/org/apache/lucene/util/automaton/TestUTF32SpecialCase.java (revision 0) +++ lucene/src/test/org/apache/lucene/util/automaton/TestUTF32SpecialCase.java (revision 0) @@ -0,0 +1,20 @@ +package org.apache.lucene.util.automaton; + +import junit.framework.TestCase; + +public class TestUTF32SpecialCase extends TestCase { + public void testCase() { + RegExp re = new RegExp(".?"); + Automaton automaton = re.toAutomaton(); + CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); + ByteRunAutomaton bra = new ByteRunAutomaton(automaton); + // make sure character dfa accepts empty string + assertTrue(cra.isAccept(cra.getInitialState())); + 
assertTrue(cra.run("")); + assertTrue(cra.run(new char[0], 0, 0)); + + // make sure byte dfa accepts empty string + assertTrue(bra.isAccept(bra.getInitialState())); + assertTrue(bra.run(new byte[0], 0, 0)); + } +} Property changes on: lucene/src/test/org/apache/lucene/util/automaton/TestUTF32SpecialCase.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java =================================================================== --- lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java (revision 0) +++ lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java (revision 0) @@ -0,0 +1,131 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; +import java.util.Random; + +public class TestUTF32ToUTF8 extends LuceneTestCase { + + private static final int MAX_UNICODE = 0x10FFFF; + + final BytesRef b = new BytesRef(4); + + private boolean matches(ByteRunAutomaton a, int code) { + char[] chars = Character.toChars(code); + UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b); + return a.run(b.bytes, 0, b.length); + } + + private void testOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters) { + + // Verify correct ints are accepted + for(int iter=0;iter= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) | + (code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) { + iter--; + continue; + } + assertTrue("DFA for range " + startCode + "-" + endCode + " failed to match code=" + code, + matches(a, code)); + } + + // Verify invalid ints are not accepted + final int invalidRange = MAX_UNICODE - (endCode - startCode + 1); + if (invalidRange > 0) { + for(int iter=0;iter= startCode) { + code = endCode + 1 + x - startCode; + } else { + code = x; + } + if ((code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) | + (code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) { + iter--; + continue; + } + assertFalse("DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code, + matches(a, code)); + + } + } + } + + // Evenly picks random code point from the 4 "buckets" + // (bucket = same #bytes when encoded to utf8) + private int getCodeStart(Random r) { + switch(r.nextInt(4)) { + case 0: + return _TestUtil.nextInt(r, 0, 128); + case 1: + return _TestUtil.nextInt(r, 128, 2048); + case 2: + return _TestUtil.nextInt(r, 2048, 65536); + default: + return _TestUtil.nextInt(r, 65536, 1+MAX_UNICODE); + } + } + + public void 
testRandomRanges() throws Exception { + final Random r = newRandom(); + int ITERS = 10; + int ITERS_PER_DFA = 100; + for(int iter=0;iter termComp; /** @@ -80,39 +77,26 @@ * Construct an enumerator based upon an automaton, enumerating the specified * field, working on a supplied reader. *

- * @lucene.internal Use the public ctor instead. This constructor allows the - * (dangerous) option of passing in a pre-compiled RunAutomaton. If you use - * this ctor and compile your own RunAutomaton, you are responsible for - * ensuring it is in sync with the Automaton object, including internal - * State numbering, or you will get undefined behavior. + * @lucene.internal Use the public ctor instead. *

- * @param preCompiled optional pre-compiled RunAutomaton (can be null) + * @param runAutomaton pre-compiled ByteRunAutomaton * @param finite true if the automaton accepts a finite language */ - AutomatonTermsEnum(Automaton automaton, RunAutomaton preCompiled, + AutomatonTermsEnum(ByteRunAutomaton runAutomaton, Term queryTerm, IndexReader reader, boolean finite) throws IOException { super(reader, queryTerm.field()); - this.automaton = automaton; + this.automaton = runAutomaton.getAutomaton(); this.finite = finite; - /* - * tableize the automaton. this also ensures it is deterministic, and has no - * transitions to dead states. it also invokes Automaton.setStateNumbers to - * number the original states (this is how they are tableized) - */ - if (preCompiled == null) - runAutomaton = new RunAutomaton(this.automaton); - else - runAutomaton = preCompiled; + this.runAutomaton = runAutomaton; + commonSuffixRef = finite ? null : SpecialOperations.getCommonSuffixBytesRef(automaton); - commonSuffixRef = finite ? null : new BytesRef(getValidUTF16Suffix(SpecialOperations - .getCommonSuffix(automaton))); - // build a cache of sorted transitions for every state allTransitions = new Transition[runAutomaton.getSize()][]; + Comparator comparator = new UTF8InUTF16OrderTransitionComparator(false); for (State state : this.automaton.getStates()) - allTransitions[state.getNumber()] = state.getSortedTransitionArray(false); + allTransitions[state.getNumber()] = state.getSortedTransitionArray(comparator); // used for path tracking, where each bit is a numbered state. 
visited = new long[runAutomaton.getSize()]; @@ -128,7 +112,7 @@ */ public AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader) throws IOException { - this(automaton, null, queryTerm, reader, SpecialOperations.isFinite(automaton)); + this(new ByteRunAutomaton(automaton), queryTerm, reader, SpecialOperations.isFinite(automaton)); } /** @@ -138,8 +122,7 @@ @Override protected AcceptStatus accept(final BytesRef term) { if (commonSuffixRef == null || term.endsWith(commonSuffixRef)) { - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); - if (runAutomaton.run(utf16.result, 0, utf16.length)) + if (runAutomaton.run(term.bytes, term.offset, term.length)) return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK; else return (linear && termComp.compare(term, linearUpperBound) < 0) ? @@ -153,15 +136,14 @@ @Override protected BytesRef nextSeekTerm(final BytesRef term) throws IOException { if (term == null) { + seekBytesRef.copy(""); // return the empty term, as its valid - if (runAutomaton.run("")) { - seekBytesRef.copy(""); + if (runAutomaton.run(new byte[0], 0, 0)) { return seekBytesRef; } - - utf16.copyText(""); } else { - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + // nocommit: necessary? 
+ seekBytesRef.copy(term); } // seek to the next possible string; @@ -169,8 +151,6 @@ // reposition if (linear) setLinear(infinitePosition); - UnicodeUtil.nextValidUTF16String(utf16); - UnicodeUtil.UTF16toUTF8(utf16.result, 0, utf16.length, seekBytesRef); return seekBytesRef; } // no more possible strings can match @@ -187,27 +167,28 @@ */ private void setLinear(int position) { int state = runAutomaton.getInitialState(); - char maxInterval = 0xffff; - for (int i = 0; i < position; i++) - state = runAutomaton.step(state, utf16.result[i]); + int maxInterval = 0xef; + for (int i = 0; i < position; i++) { + state = runAutomaton.step(state, seekBytesRef.bytes[i] & 0xff); + assert state >= 0: "state=" + state; + } for (int i = 0; i < allTransitions[state].length; i++) { Transition t = allTransitions[state][i]; - if (t.getMin() <= utf16.result[position] && utf16.result[position] <= t.getMax()) { + if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 && + compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) { maxInterval = t.getMax(); break; } } - // 0xffff terms don't get the optimization... not worth the trouble. - if (maxInterval < 0xffff) - maxInterval++; + // 0xef terms don't get the optimization... not worth the trouble. 
+ if (maxInterval != 0xef) + maxInterval = incrementUTF16(maxInterval); int length = position + 1; /* position + maxTransition */ - if (linearUpperBoundUTF16.result.length < length) - linearUpperBoundUTF16.result = new char[length]; - System.arraycopy(utf16.result, 0, linearUpperBoundUTF16.result, 0, position); - linearUpperBoundUTF16.result[position] = maxInterval; - linearUpperBoundUTF16.setLength(length); - UnicodeUtil.nextValidUTF16String(linearUpperBoundUTF16); - UnicodeUtil.UTF16toUTF8(linearUpperBoundUTF16.result, 0, length, linearUpperBound); + if (linearUpperBound.bytes.length < length) + linearUpperBound.bytes = new byte[length]; + System.arraycopy(seekBytesRef.bytes, 0, linearUpperBound.bytes, 0, position); + linearUpperBound.bytes[position] = (byte) maxInterval; + linearUpperBound.length = length; } /** @@ -229,9 +210,9 @@ linear = false; state = runAutomaton.getInitialState(); // walk the automaton until a character is rejected. - for (pos = 0; pos < utf16.length; pos++) { + for (pos = 0; pos < seekBytesRef.length; pos++) { visited[state] = curGen; - int nextState = runAutomaton.step(state, utf16.result[pos]); + int nextState = runAutomaton.step(state, seekBytesRef.bytes[pos] & 0xff); if (nextState == -1) break; // we found a loop, record it for faster enumeration @@ -249,7 +230,7 @@ } else { /* no more solutions exist from this useful portion, backtrack */ if (!backtrack(pos)) /* no more solutions at all */ return false; - else if (runAutomaton.run(utf16.result, 0, utf16.length)) + else if (runAutomaton.run(seekBytesRef.bytes, 0, seekBytesRef.length)) /* String is good to go as-is */ return true; /* else advance further */ @@ -280,19 +261,18 @@ * the next lexicographic character must be greater than the existing * character, if it exists. 
*/ - char c = 0; - if (position < utf16.length) { - c = utf16.result[position]; + int c = 0; + if (position < seekBytesRef.length) { + c = seekBytesRef.bytes[position] & 0xff; // if the next character is U+FFFF and is not part of the useful portion, // then by definition it puts us in a reject state, and therefore this // path is dead. there cannot be any higher transitions. backtrack. - if (c == '\uFFFF') + c = incrementUTF16(c); + if (c == -1) return false; - else - c++; } - utf16.setLength(position); + seekBytesRef.length = position; visited[state] = curGen; Transition transitions[] = allTransitions[state]; @@ -301,11 +281,12 @@ for (int i = 0; i < transitions.length; i++) { Transition transition = transitions[i]; - if (transition.getMax() >= c) { - char nextChar = (char) Math.max(c, transition.getMin()); + if (compareToUTF16(transition.getMax(), c) >= 0) { + int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin(); // append either the next sequential char, or the minimum transition - utf16.setLength(utf16.length + 1); - utf16.result[utf16.length - 1] = nextChar; + seekBytesRef.grow(seekBytesRef.length + 1); + seekBytesRef.length++; + seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) nextChar; state = transition.getDest().getNumber(); /* * as long as is possible, continue down the minimal path in @@ -323,11 +304,12 @@ // we found a loop, record it for faster enumeration if (!finite && !linear && visited[state] == curGen) { linear = true; - infinitePosition = utf16.length; + infinitePosition = seekBytesRef.length; } // append the minimum transition - utf16.setLength(utf16.length + 1); - utf16.result[utf16.length - 1] = transition.getMin(); + seekBytesRef.grow(seekBytesRef.length + 1); + seekBytesRef.length++; + seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin(); } return true; } @@ -345,13 +327,13 @@ */ private boolean backtrack(int position) { while (position > 0) { - char nextChar = utf16.result[position - 
1]; - // if a character is U+FFFF its a dead-end too, + int nextChar = seekBytesRef.bytes[position - 1] & 0xff; + // if a character is 0xef its a dead-end too, // because there is no higher character in UTF-16 sort order. - if (nextChar != '\uFFFF') { - nextChar++; - utf16.result[position - 1] = nextChar; - utf16.setLength(position); + nextChar = incrementUTF16(nextChar); + if (nextChar != -1) { + seekBytesRef.bytes[position - 1] = (byte) nextChar; + seekBytesRef.length = position; return true; } position--; @@ -359,19 +341,35 @@ return false; /* all solutions exhausted */ } - /** - * if the suffix starts with a low surrogate, remove it. - * This won't be quite as efficient, but can be converted to valid UTF-8 - * - * This isn't nearly as complex as cleanupPosition, because its not - * going to use this suffix to walk any path thru the terms. - * - */ - private String getValidUTF16Suffix(String suffix) { - if (suffix != null && suffix.length() > 0 && - Character.isLowSurrogate(suffix.charAt(0))) - return suffix.substring(1); - else - return suffix; + // nocommit: probably not efficient + /* return the next utf8 byte in utf16 order, or -1 if exhausted */ + int incrementUTF16(int utf8) { + switch(utf8) { + case 0xed: return 0xf0; + case 0xfd: return 0xee; + case 0xee: return 0xef; + case 0xef: return -1; + default: return utf8 + 1; + } } + + int compareToUTF16(int aByte, int bByte) { + if (aByte != bByte) { + // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order + + // We know the terms are not equal, but, we may + // have to carefully fixup the bytes at the + // difference to match UTF16's sort order: + if (aByte >= 0xee && bByte >= 0xee) { + if ((aByte & 0xfe) == 0xee) { + aByte += 0x10; + } + if ((bByte&0xfe) == 0xee) { + bByte += 0x10; + } + } + return aByte - bByte; + } + return 0; + } } Index: lucene/src/java/org/apache/lucene/search/WildcardQuery.java =================================================================== --- 
lucene/src/java/org/apache/lucene/search/WildcardQuery.java (revision 937269) +++ lucene/src/java/org/apache/lucene/search/WildcardQuery.java (working copy) @@ -63,8 +63,8 @@ String wildcardText = wildcardquery.text(); - for (int i = 0; i < wildcardText.length(); i++) { - final char c = wildcardText.charAt(i); + for (int i = 0; i < wildcardText.length();) { + final int c = wildcardText.codePointAt(i); switch(c) { case WILDCARD_STRING: automata.add(BasicAutomata.makeAnyString()); @@ -75,6 +75,7 @@ default: automata.add(BasicAutomata.makeChar(c)); } + i += Character.charCount(c); } return BasicOperations.concatenate(automata); Index: lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (revision 937269) +++ lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (working copy) @@ -28,6 +28,7 @@ import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.BasicAutomata; import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; import org.apache.lucene.util.automaton.RunAutomaton; @@ -58,8 +59,9 @@ private final int termLength; private int maxEdits; + // nocommit, don't need to keep around private List automata; - private List runAutomata; + private List runAutomata; private final IndexReader reader; private final Term term; @@ -112,9 +114,9 @@ throws IOException { initAutomata(editDistance); if (automata != null && editDistance < automata.size()) { - return new AutomatonFuzzyTermsEnum(automata.get(editDistance), term, + return new AutomatonFuzzyTermsEnum(term, reader, minSimilarity, runAutomata.subList(0, editDistance + 1) - .toArray(new RunAutomaton[0]), lastTerm); + .toArray(new ByteRunAutomaton[0]), lastTerm); } else { return null; } @@ -127,7 +129,7 @@ LevenshteinAutomata 
builder = new LevenshteinAutomata(term.text().substring(realPrefixLength)); automata = new ArrayList(maxDistance); - runAutomata = new ArrayList(maxDistance); + runAutomata = new ArrayList(maxDistance); for (int i = 0; i <= maxDistance; i++) { Automaton a = builder.toAutomaton(i); // constant prefix @@ -137,7 +139,7 @@ a = BasicOperations.concatenate(prefix, a); } automata.add(a); - runAutomata.add(new RunAutomaton(a)); + runAutomata.add(new ByteRunAutomaton(a)); } } } @@ -252,7 +254,7 @@ * and comparison is linear to length of the term (rather than quadratic) */ final class AutomatonFuzzyTermsEnum extends AutomatonTermsEnum { - private final RunAutomaton matchers[]; + private final ByteRunAutomaton matchers[]; // used for unicode conversion from BytesRef byte[] to char[] private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); @@ -266,9 +268,9 @@ private final MultiTermQuery.BoostAttribute boostAtt = attributes().addAttribute(MultiTermQuery.BoostAttribute.class); - public AutomatonFuzzyTermsEnum(Automaton automaton, Term queryTerm, - IndexReader reader, float minSimilarity, RunAutomaton matchers[], BytesRef lastTerm) throws IOException { - super(automaton, matchers[matchers.length - 1], queryTerm, reader, true); + public AutomatonFuzzyTermsEnum(Term queryTerm, + IndexReader reader, float minSimilarity, ByteRunAutomaton matchers[], BytesRef lastTerm) throws IOException { + super(matchers[matchers.length - 1], queryTerm, reader, true); this.minimumSimilarity = minSimilarity; this.scale_factor = 1.0f / (1.0f - minimumSimilarity); this.matchers = matchers; @@ -285,11 +287,16 @@ return AcceptStatus.YES_AND_SEEK; } - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); - + boolean converted = false; // TODO: benchmark doing this backwards for (int i = 1; i < matchers.length; i++) - if (matchers[i].run(utf16.result, 0, utf16.length)) { + if (matchers[i].run(term.bytes, term.offset, term.length)) { + // nocommit, use codepoint length? 
+ // this sucks, we convert just to score based on length. + if (!converted) { + UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + converted = true; + } final float similarity = 1.0f - ((float) i / (float) (Math.min(utf16.length, fullSearchTermLength))); if (similarity > minimumSimilarity) { Index: lucene/src/java/org/apache/lucene/util/automaton/UTF8InUTF16OrderTransitionComparator.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/UTF8InUTF16OrderTransitionComparator.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/automaton/UTF8InUTF16OrderTransitionComparator.java (revision 0) @@ -0,0 +1,76 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; +import java.util.Comparator; + +public class UTF8InUTF16OrderTransitionComparator implements Comparator, Serializable { + boolean to_first; + + public UTF8InUTF16OrderTransitionComparator(boolean to_first) { + this.to_first = to_first; + } + + /** + * Compares by (min, reverse max, to) or (to, min, reverse max). 
+ */ + public int compare(Transition t1, Transition t2) { + if (to_first) { + if (t1.to != t2.to) { + if (t1.to == null) return -1; + else if (t2.to == null) return 1; + else if (t1.to.number < t2.to.number) return -1; + else if (t1.to.number > t2.to.number) return 1; + } + } + int minComp = compareCodePoint(t1.min, t2.min); + if (minComp != 0) return minComp; + int maxComp = compareCodePoint(t1.max, t2.max); + if (maxComp != 0) return maxComp; + if (!to_first) { + if (t1.to != t2.to) { + if (t1.to == null) return -1; + else if (t2.to == null) return 1; + else if (t1.to.number < t2.to.number) return -1; + else if (t1.to.number > t2.to.number) return 1; + } + } + return 0; + } + + private int compareCodePoint(int aByte, int bByte) { + if (aByte != bByte) { + // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order + + // We know the terms are not equal, but, we may + // have to carefully fixup the bytes at the + // difference to match UTF16's sort order: + if (aByte >= 0xee && bByte >= 0xee) { + if ((aByte & 0xfe) == 0xee) { + aByte += 0x10; + } + if ((bByte&0xfe) == 0xee) { + bByte += 0x10; + } + } + return aByte - bByte; + } + return 0; + } +} Property changes on: lucene/src/java/org/apache/lucene/util/automaton/UTF8InUTF16OrderTransitionComparator.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/automaton/Automaton.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/Automaton.java (revision 937269) +++ lucene/src/java/org/apache/lucene/util/automaton/Automaton.java (working copy) @@ -319,17 +319,17 @@ */ void totalize() { State s = new State(); - s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE, + s.transitions.add(new Transition(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT, s)); for (State p : getStates()) { - int maxi = 
Character.MIN_VALUE; + int maxi = Character.MIN_CODE_POINT; for (Transition t : p.getSortedTransitions(false)) { - if (t.min > maxi) p.transitions.add(new Transition((char) maxi, - (char) (t.min - 1), s)); + if (t.min > maxi) p.transitions.add(new Transition(maxi, + (t.min - 1), s)); if (t.max + 1 > maxi) maxi = t.max + 1; } - if (maxi <= Character.MAX_VALUE) p.transitions.add(new Transition( - (char) maxi, Character.MAX_VALUE, s)); + if (maxi <= Character.MAX_CODE_POINT) p.transitions.add(new Transition( + maxi, Character.MAX_CODE_POINT, s)); } } @@ -362,39 +362,39 @@ if (t.min <= max + 1) { if (t.max > max) max = t.max; } else { - if (p != null) s.transitions.add(new Transition((char) min, - (char) max, p)); + if (p != null) s.transitions.add(new Transition(min, + max, p)); min = t.min; max = t.max; } } else { - if (p != null) s.transitions.add(new Transition((char) min, - (char) max, p)); + if (p != null) s.transitions.add(new Transition(min, + max, p)); p = t.to; min = t.min; max = t.max; } } if (p != null) s.transitions - .add(new Transition((char) min, (char) max, p)); + .add(new Transition(min, max, p)); } } /** * Returns sorted array of all interval start points. 
*/ - char[] getStartPoints() { - Set pointset = new HashSet(); + int[] getStartPoints() { + Set pointset = new HashSet(); for (State s : getStates()) { - pointset.add(Character.MIN_VALUE); + pointset.add(Character.MIN_CODE_POINT); for (Transition t : s.transitions) { pointset.add(t.min); - if (t.max < Character.MAX_VALUE) pointset.add((char) (t.max + 1)); + if (t.max < Character.MAX_CODE_POINT) pointset.add((t.max + 1)); } } - char[] points = new char[pointset.size()]; + int[] points = new int[pointset.size()]; int n = 0; - for (Character m : pointset) + for (Integer m : pointset) points[n++] = m; Arrays.sort(points); return points; @@ -470,9 +470,9 @@ if (isSingleton()) { State p = new State(); initial = p; - for (int i = 0; i < singleton.length(); i++) { + for (int i = 0, cp = 0; i < singleton.length(); i += Character.charCount(cp)) { State q = new State(); - p.transitions.add(new Transition(singleton.charAt(i), q)); + p.transitions.add(new Transition(cp = singleton.codePointAt(i), q)); p = q; } p.accept = true; @@ -542,7 +542,11 @@ StringBuilder b = new StringBuilder(); if (isSingleton()) { b.append("singleton: "); - for (char c : singleton.toCharArray()) + int length = singleton.codePointCount(0, singleton.length()); + int codepoints[] = new int[length]; + for (int i = 0, j = 0, cp = 0; i < singleton.length(); i += Character.charCount(cp)) + codepoints[j++] = cp = singleton.codePointAt(i); + for (int c : codepoints) Transition.appendCharString(c, b); b.append("\n"); } else { Index: lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java (revision 0) @@ -0,0 +1,51 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class CharacterRunAutomaton extends RunAutomaton { + + public CharacterRunAutomaton(Automaton a) { + super(a, Character.MAX_CODE_POINT, false); + } + + /** + * Returns true if the given string is accepted by this automaton. + */ + public boolean run(String s) { + int p = initial; + int l = s.length(); + for (int i = 0, cp = 0; i < l; i += Character.charCount(cp)) { + p = step(p, cp = s.codePointAt(i)); + if (p == -1) return false; + } + return accept[p]; + } + + /** + * Returns true if the given string is accepted by this automaton + */ + public boolean run(char[] s, int offset, int length) { + int p = initial; + int l = offset + length; + for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) { + p = step(p, cp = Character.codePointAt(s, i, l)); + if (p == -1) return false; + } + return accept[p]; + } +} Property changes on: lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (revision 
937269) +++ lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (working copy) @@ -70,8 +70,8 @@ Set tr = a.initial.getTransitions(); if (tr.size() == 1) { Transition t = tr.iterator().next(); - if (t.to == a.initial && t.min == Character.MIN_VALUE - && t.max == Character.MAX_VALUE) return; + if (t.to == a.initial && t.min == Character.MIN_CODE_POINT + && t.max == Character.MAX_CODE_POINT) return; } a.totalize(); // make arrays for numbered states and effective alphabet @@ -82,7 +82,7 @@ states[number] = q; q.number = number++; } - char[] sigma = a.getStartPoints(); + int[] sigma = a.getStartPoints(); // initialize data structures ArrayList>> reverse = new ArrayList>>(); for (int q = 0; q < states.length; q++) { @@ -121,7 +121,7 @@ partition.get(j).add(qq); block[qq.number] = j; for (int x = 0; x < sigma.length; x++) { - char y = sigma[x]; + int y = sigma[x]; State p = qq.step(y); reverse.get(p.number).get(x).add(qq); reverse_nonempty[p.number][x] = true; Index: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java (revision 0) @@ -0,0 +1,302 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; + +import java.util.IdentityHashMap; +import java.util.List; +import java.util.ArrayList; +import java.util.Map; +import java.util.HashMap; + +// TODO +// - do we really need the .bits...? if not we can make util in UnicodeUtil to convert 1 char into a BytesRef + +public class UTF32ToUTF8 { + + // Unicode boundaries for UTF8 bytes 1,2,3,4 + private static final int[] startCodes = new int[] {0, 128, 2048, 65536}; + private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111}; + + static int[] MASKS = new int[32]; + static { + int v = 2; + for(int i=0;i<32;i++) { + MASKS[i] = v-1; + v *= 2; + } + } + + // Represents one of the N utf8 bytes that (in sequence) + // define a code point. value is the byte value; bits is + // how many bits are "used" by utf8 at that byte + private static class UTF8Byte { + int value; // TODO: change to byte + byte bits; + } + + // Holds a single code point, as a sequence of 1-4 utf8 bytes: + // TODO: maybe move to UnicodeUtil? 
+ private static class UTF8Sequence { + private final UTF8Byte[] bytes; + private int len; + + public UTF8Sequence() { + bytes = new UTF8Byte[4]; + for(int i=0;i<4;i++) { + bytes[i] = new UTF8Byte(); + } + } + + public int byteAt(int idx) { + return bytes[idx].value; + } + + public int numBits(int idx) { + return bytes[idx].bits; + } + + private void set(int code) { + if (code < 128) { + // 0xxxxxxx + bytes[0].value = code; + bytes[0].bits = 7; + len = 1; + } else if (code < 2048) { + // 110yyyxx 10xxxxxx + bytes[0].value = (6 << 5) | (code >> 6); + bytes[0].bits = 5; + setRest(code, 1); + len = 2; + } else if (code < 65536) { + // 1110yyyy 10yyyyxx 10xxxxxx + bytes[0].value = (14 << 4) | (code >> 12); + bytes[0].bits = 4; + setRest(code, 2); + len = 3; + } else { + // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx + bytes[0].value = (30 << 3) | (code >> 18); + bytes[0].bits = 3; + setRest(code, 3); + len = 4; + } + } + + private void setRest(int code, int numBytes) { + for(int i=0;i> 6; + } + } + + public String toString() { + StringBuilder b = new StringBuilder(); + for(int i=0;i 0) { + b.append(' '); + } + b.append(Integer.toBinaryString(bytes[i].value)); + } + return b.toString(); + } + } + + private final UTF8Sequence startUTF8 = new UTF8Sequence(); + private final UTF8Sequence endUTF8 = new UTF8Sequence(); + + private final UTF8Sequence tmpUTF8a = new UTF8Sequence(); + private final UTF8Sequence tmpUTF8b = new UTF8Sequence(); + + // Builds necessary utf8 edges between start & end + void convertOneEdge(State start, State end, int startCodePoint, int endCodePoint) { + startUTF8.set(startCodePoint); + endUTF8.set(endCodePoint); + //System.out.println("start = " + startUTF8); + //System.out.println(" end = " + endUTF8); + build(start, end, startUTF8, endUTF8, 0); + } + + private void build(State start, State end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) { + + // Break into start, middle, end: + if (startUTF8.byteAt(upto) == endUTF8.byteAt(upto)) { + // Degen 
case: lead with the same byte: + if (upto == startUTF8.len-1 && upto == endUTF8.len-1) { + // Super degen: just single edge, one UTF8 byte: + start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); + return; + } else { + assert startUTF8.len > upto+1; + assert endUTF8.len > upto+1; + State n = new State(); + + // Single value leading edge + start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single + + // Recurse for the rest + build(n, end, startUTF8, endUTF8, 1+upto); + } + } else if (startUTF8.len == endUTF8.len) { + if (upto == startUTF8.len-1) { + start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); // type=startend + } else { + start(start, end, startUTF8, upto, false); + if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) { + // There is a middle + all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1); + } + end(start, end, endUTF8, upto, false); + } + } else { + + // start + start(start, end, startUTF8, upto, true); + + // possibly middle, spanning multiple num bytes + int byteCount = 1+startUTF8.len-upto; + final int limit = endUTF8.len-upto; + while (byteCount < limit) { + // wasteful: we only need first byte, and, we should + // statically encode this first byte: + tmpUTF8a.set(startCodes[byteCount-1]); + tmpUTF8b.set(endCodes[byteCount-1]); + all(start, end, + tmpUTF8a.byteAt(0), + tmpUTF8b.byteAt(0), + tmpUTF8a.len - 1); + byteCount++; + } + + // end + end(start, end, endUTF8, upto, true); + } + } + + private static void start(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) { + if (upto == utf8.len-1) { + // Done recursing + start.addTransition(new Transition(utf8.byteAt(upto), utf8.byteAt(upto) | MASKS[utf8.numBits(upto)-1], end)); // type=start + } else { + State n = new State(); + start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=start + start(n, end, utf8, 1+upto, true); + int endCode = 
utf8.byteAt(upto) | MASKS[utf8.numBits(upto)-1]; + if (doAll && utf8.byteAt(upto) != endCode) { + all(start, end, utf8.byteAt(upto)+1, endCode, utf8.len-upto-1); + } + } + } + + private static void end(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) { + if (upto == utf8.len-1) { + // Done recursing + start.addTransition(new Transition(utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto)-1]), utf8.byteAt(upto), end)); // type=end + } else { + final int startCode; + if (utf8.numBits(upto) == 5) { + // special case -- avoid created unused edges (utf8 + // doesn't accept certain byte sequences) -- there + // are other cases we could optimize too: + startCode = 194; + } else { + startCode = utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto)-1]); + } + if (doAll && utf8.byteAt(upto) != startCode) { + all(start, end, startCode, utf8.byteAt(upto)-1, utf8.len-upto-1); + } + State n = new State(); + start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=end + end(n, end, utf8, 1+upto, true); + } + } + + private static void all(State start, State end, int startCode, int endCode, int left) { + if (left == 0) { + start.addTransition(new Transition(startCode, endCode, end)); // type=all + } else { + State lastN = new State(); + start.addTransition(new Transition(startCode, endCode, lastN)); // type=all + while (left > 1) { + State n = new State(); + lastN.addTransition(new Transition(128, 191, n)); // type=all* + left--; + lastN = n; + } + lastN.addTransition(new Transition(128, 191, end)); // type = all* + } + } + + /** Converts an incoming utf32 automaton to an equivalent + * utf8 one. The incoming automaton need not be + * deterministic. Note that the returned automaton will + * not in general be deterministic, so you must + * determinize it if that's needed. 
*/ + public Automaton convert(Automaton utf32) { + if (utf32.isSingleton()) { + utf32 = utf32.cloneExpanded(); + } + + Map map = new HashMap(); + List pending = new ArrayList(); + State utf32State = utf32.getInitialState(); + pending.add(utf32State); + Automaton utf8 = new Automaton(); + utf8.setDeterministic(false); + + State utf8State = utf8.getInitialState(); + utf8State.setAccept(utf32State.isAccept()); + + map.put(utf32State, utf8State); + + while(pending.size() != 0) { + utf32State = pending.remove(pending.size()-1); + utf8State = map.get(utf32State); + for(Transition t : utf32State.getSortedTransitions(false)) { + final State destUTF32 = t.getDest(); + State destUTF8 = map.get(destUTF32); + if (destUTF8 == null) { + destUTF8 = new State(); + destUTF8.setAccept(destUTF32.isAccept()); + map.put(destUTF32, destUTF8); + pending.add(destUTF32); + } + convertOneEdge(utf8State, destUTF8, t.getMin(), t.getMax()); + } + } + + return utf8; + } + + public static void main(String[] args) { + final int startCode = Integer.parseInt(args[0]); + final int endCode = Integer.parseInt(args[1]); + + Automaton a = new Automaton(); + State start = a.getInitialState(); + State end = new State(); + end.setAccept(true); + + UTF32ToUTF8 converter = new UTF32ToUTF8(); + converter.convertOneEdge(start, end, startCode, endCode); + } +} Property changes on: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 937269) +++ lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (working copy) @@ -70,35 +70,35 @@ State s = new State(); a.initial = s; s.accept = true; - s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE, + 
s.transitions.add(new Transition(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT, s)); a.deterministic = true; return a; } /** - * Returns a new (deterministic) automaton that accepts any single character. + * Returns a new (deterministic) automaton that accepts any single codepoint. */ public static Automaton makeAnyChar() { - return makeCharRange(Character.MIN_VALUE, Character.MAX_VALUE); + return makeCharRange(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); } /** - * Returns a new (deterministic) automaton that accepts a single character of + * Returns a new (deterministic) automaton that accepts a single codepoint of * the given value. */ - public static Automaton makeChar(char c) { + public static Automaton makeChar(int c) { Automaton a = new Automaton(); - a.singleton = Character.toString(c); + a.singleton = new String(Character.toChars(c)); a.deterministic = true; return a; } /** - * Returns a new (deterministic) automaton that accepts a single char whose + * Returns a new (deterministic) automaton that accepts a single codepoint whose * value is in the given interval (including both end points). */ - public static Automaton makeCharRange(char min, char max) { + public static Automaton makeCharRange(int min, int max) { if (min == max) return makeChar(min); Automaton a = new Automaton(); State s1 = new State(); Index: lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java (revision 0) @@ -0,0 +1,38 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class ByteRunAutomaton extends RunAutomaton { + + public ByteRunAutomaton(Automaton a) { + super(new UTF32ToUTF8().convert(a), 256, true); + } + + /** + * Returns true if the given byte array is accepted by this automaton + */ + public boolean run(byte[] s, int offset, int length) { + int p = initial; + int l = offset + length; + for (int i = offset; i < l; i++) { + p = step(p, s[i] & 0xFF); + if (p == -1) return false; + } + return accept[p]; + } +} Property changes on: lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/automaton/State.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/State.java (revision 937269) +++ lucene/src/java/org/apache/lucene/util/automaton/State.java (working copy) @@ -32,6 +32,7 @@ import java.io.Serializable; import java.util.Arrays; import java.util.Collection; +import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -106,11 +107,12 @@ /** * Performs lookup in transitions, assuming determinism. 
* - * @param c character to look up + * @param c codepoint to look up * @return destination state, null if no matching outgoing transition - * @see #step(char, Collection) + * @see #step(int, Collection) */ - public State step(char c) { + public State step(int c) { + assert c >= 0; for (Transition t : transitions) if (t.min <= c && c <= t.max) return t.to; return null; @@ -119,11 +121,11 @@ /** * Performs lookup in transitions, allowing nondeterminism. * - * @param c character to look up + * @param c codepoint to look up * @param dest collection where destination states are stored - * @see #step(char) + * @see #step(int) */ - public void step(char c, Collection dest) { + public void step(int c, Collection dest) { for (Transition t : transitions) if (t.min <= c && c <= t.max) dest.add(t.to); } @@ -138,12 +140,16 @@ * Returns transitions sorted by (min, reverse max, to) or (to, min, reverse * max) */ - public Transition[] getSortedTransitionArray(boolean to_first) { + public Transition[] getSortedTransitionArray(Comparator comparator) { Transition[] e = transitions.toArray(new Transition[transitions.size()]); - Arrays.sort(e, new TransitionComparator(to_first)); + Arrays.sort(e, comparator); return e; } + public Transition[] getSortedTransitionArray(boolean to_first) { + return getSortedTransitionArray(new TransitionComparator(to_first)); + } + /** * Returns sorted list of outgoing transitions. * @@ -155,6 +161,9 @@ return Arrays.asList(getSortedTransitionArray(to_first)); } + public List getSortedTransitions(Comparator comparator) { + return Arrays.asList(getSortedTransitionArray(comparator)); + } /** * Return this state's number. 
Index: lucene/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (revision 937269) +++ lucene/src/java/org/apache/lucene/util/automaton/LevenshteinAutomata.java (working copy) @@ -39,13 +39,13 @@ public static final int MAXIMUM_SUPPORTED_DISTANCE = 2; /* input word */ final String input; - final char word[]; + final int word[]; /* the automata alphabet. */ - final char alphabet[]; + final int alphabet[]; /* the unicode ranges outside of alphabet */ - final char rangeLower[]; - final char rangeUpper[]; + final int rangeLower[]; + final int rangeUpper[]; int numRanges = 0; ParametricDescription descriptions[]; @@ -55,35 +55,39 @@ */ public LevenshteinAutomata(String input) { this.input = input; - this.word = input.toCharArray(); + int length = Character.codePointCount(input, 0, input.length()); + word = new int[length]; + for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) { + word[j++] = cp = input.codePointAt(i); + } // calculate the alphabet - SortedSet set = new TreeSet(); + SortedSet set = new TreeSet(); for (int i = 0; i < word.length; i++) set.add(word[i]); - alphabet = new char[set.size()]; - Iterator iterator = set.iterator(); + alphabet = new int[set.size()]; + Iterator iterator = set.iterator(); for (int i = 0; i < alphabet.length; i++) alphabet[i] = iterator.next(); - rangeLower = new char[alphabet.length + 2]; - rangeUpper = new char[alphabet.length + 2]; + rangeLower = new int[alphabet.length + 2]; + rangeUpper = new int[alphabet.length + 2]; // calculate the unicode range intervals that exclude the alphabet // these are the ranges for all unicode characters not in the alphabet int lower = 0; for (int i = 0; i < alphabet.length; i++) { - char higher = alphabet[i]; + int higher = alphabet[i]; if (higher > lower) { - rangeLower[numRanges] = (char) lower; - 
rangeUpper[numRanges] = (char) (higher - 1); + rangeLower[numRanges] = lower; + rangeUpper[numRanges] = higher - 1; numRanges++; } lower = higher + 1; } /* add the final endpoint */ - if (lower <= 0xFFFF) { - rangeLower[numRanges] = (char) lower; - rangeUpper[numRanges] = '\uFFFF'; + if (lower <= Character.MAX_CODE_POINT) { + rangeLower[numRanges] = lower; + rangeUpper[numRanges] = Character.MAX_CODE_POINT; numRanges++; } @@ -129,7 +133,7 @@ final int end = xpos + Math.min(word.length - xpos, range); for (int x = 0; x < alphabet.length; x++) { - final char ch = alphabet[x]; + final int ch = alphabet[x]; // get the characteristic vector at this position wrt ch final int cvec = getVector(ch, xpos, end); int dest = description.transition(k, xpos, cvec); @@ -157,7 +161,7 @@ * Get the characteristic vector X(x, V) * where V is substring(pos, end) */ - int getVector(char x, int pos, int end) { + int getVector(int x, int pos, int end) { int vector = 0; for (int i = pos; i < end; i++) { vector <<= 1; Index: lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 937269) +++ lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java (working copy) @@ -33,8 +33,8 @@ import java.util.Comparator; /** - * Comparator for state {@link Transition}s that orders unicode char range - * transitions in lexicographic order. + * Comparator for state {@link Transition}s that orders unicode codepoint range + * transitions in codepoint order. 
* * @lucene.experimental */ Index: lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 937269) +++ lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (working copy) @@ -33,6 +33,8 @@ import java.util.HashSet; import java.util.Set; +import org.apache.lucene.util.BytesRef; + /** * Special automata operations. * @@ -46,7 +48,7 @@ * Finds the largest entry whose value is less than or equal to c, or 0 if * there is no such entry. */ - static int findIndex(char c, char[] points) { + static int findIndex(int c, int[] points) { int a = 0; int b = points.length; while (b - a > 1) { @@ -96,7 +98,7 @@ if (!s.accept && s.transitions.size() == 1) { Transition t = s.transitions.iterator().next(); if (t.min == t.max && !visited.contains(t.to)) { - b.append(t.min); + b.appendCodePoint(t.min); s = t.to; done = false; } @@ -105,6 +107,28 @@ return b.toString(); } + public static BytesRef getCommonPrefixBytesRef(Automaton a) { + if (a.isSingleton()) return new BytesRef(a.singleton); + BytesRef ref = new BytesRef(10); + HashSet visited = new HashSet(); + State s = a.initial; + boolean done; + do { + done = true; + visited.add(s); + if (!s.accept && s.transitions.size() == 1) { + Transition t = s.transitions.iterator().next(); + if (t.min == t.max && !visited.contains(t.to)) { + ref.grow(++ref.length); + ref.bytes[ref.length - 1] = (byte)t.min; + s = t.to; + done = false; + } + } + } while (!done); + return ref; + } + /** * Returns the longest string that is a suffix of all accepted strings and * visits each state at most once. 
@@ -119,9 +143,32 @@ Automaton r = a.clone(); reverse(r); r.determinize(); - return reverseUnicode3(SpecialOperations.getCommonPrefix(r)); + return new StringBuilder(SpecialOperations.getCommonPrefix(r)).reverse().toString(); } + public static BytesRef getCommonSuffixBytesRef(Automaton a) { + if (a.isSingleton()) // if singleton, the suffix is the string itself. + return new BytesRef(a.singleton); + + // reverse the language of the automaton, then reverse its common prefix. + Automaton r = a.clone(); + reverse(r); + r.determinize(); + BytesRef ref = SpecialOperations.getCommonPrefixBytesRef(r); + reverseBytes(ref); + return ref; + } + + private static void reverseBytes(BytesRef ref) { + if (ref.length <= 1) return; + int num = ref.length >> 1; + for (int i = ref.offset; i < ( ref.offset + num ); i++) { + byte b = ref.bytes[i]; + ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1]; + ref.bytes[ref.offset * 2 + ref.length - i - 1] = b; + } + } + /** * Reverses the language of the given (non-singleton) automaton while returning * the set of new initial states. @@ -149,31 +196,4 @@ a.deterministic = false; return accept; } - - /** - * Intentionally use a unicode 3 reverse. - * This is because we are only going to reverse it again... - */ - private static String reverseUnicode3( final String input ){ - char[] charInput = input.toCharArray(); - reverseUnicode3(charInput, 0, charInput.length); - return new String(charInput); - } - - /** - * Intentionally use a unicode 3 reverse. - * This is because it is only used by getCommonSuffix(), - * which will reverse the entire FSM using code unit reversal, - * so we must then reverse its common prefix back using the - * same code unit reversal. 
- */ - private static void reverseUnicode3(char[] buffer, int start, int len){ - if (len <= 1) return; - int num = len>>1; - for (int i = start; i < ( start + num ); i++) { - char c = buffer[i]; - buffer[i] = buffer[start * 2 + len - i - 1]; - buffer[start * 2 + len - i - 1] = c; - } - } } Index: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py (revision 0) +++ lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py (revision 0) @@ -0,0 +1,351 @@ +import types +import os +import sys +import random + +MAX_UNICODE = 0x10FFFF + +# TODO +# - could be more minimal +# - eg when bracket lands on a utf8 boundary, like 3 - 2047 -- they can share the two * edges +# - also 3 2048 or 3 65536 -- it should not have an * down the red path, but it does + +# MASKS[0] is bottom 1-bit +# MASKS[1] is bottom 2-bits +# ... + +utf8Ranges = [(0, 127), + (128, 2047), + (2048, 65535), + (65536, 1114111)] + +typeToColor = {'startend': 'purple', + 'start': 'blue', + 'end': 'red'} + +class FSA: + + def __init__(self): + # maps fromNode -> (startUTF8, endUTF8, endNode) + self.states = {} + self.nodeUpto = 0 + + def run(self, bytes): + state = self.start + for b in bytes: + found = False + oldState = state + for label, s, e, n in self.states[state][1:]: + if b >= s and b <= e: + if found: + raise RuntimeError('state %s has ambiguous output for byte %s' % (oldState, b)) + state = n + found = True + if not found: + return -1 + + return state + + def addEdge(self, n1, n2, v1, v2, label): + """ + Adds edge from n1-n2, utf8 byte range v1-v2. 
+ """ + assert n1 in self.states + assert type(v1) is types.IntType + assert type(v2) is types.IntType + self.states[n1].append((label, v1, v2, n2)) + + def addNode(self, label=None): + try: + self.states[self.nodeUpto] = [label] + return self.nodeUpto + finally: + self.nodeUpto += 1 + + def toDOT(self, label): + __l = [] + w = __l.append + endNode = startNode = None + for id, details in self.states.items(): + name = details[0] + if name == 'end': + endNode = id + elif name == 'start': + startNode = id + + w('digraph %s {' % label) + w(' rankdir=LR;') + w(' size="8,5";') + w(' node [color=white label=""]; Ns;') + + w(' node [color=black];') + w(' node [shape=doublecircle, label=""]; N%s [label="%s"];' % (endNode, endNode)) + w(' node [shape=circle];') + + w(' N%s [label="%s"];' % (startNode, startNode)) + w(' Ns -> N%s;' % startNode) + for id, details in self.states.items(): + edges = details[1:] + w(' N%s [label="%s"];' % (id, id)) + for type, s, e, dest in edges: + c = typeToColor.get(type, 'black') + if type == 'all*': + # special case -- matches any utf8 byte at this point + label = '*' + elif s == e: + label = '%s' % binary(s) + else: + label = '%s-%s' % (binary(s), binary(e)) + w(' N%s -> N%s [label="%s" color="%s"];' % (id, dest, label, c)) + if name == 'end': + endNode = id + elif name == 'start': + startNode = id + w('}') + return '\n'.join(__l) + + def toPNG(self, label, pngOut): + open('tmp.dot', 'wb').write(self.toDOT(label)) + if os.system('dot -Tpng tmp.dot -o %s' % pngOut): + raise RuntimeException('dot failed') + + +MASKS = [] +v = 2 +for i in range(32): + MASKS.append(v-1) + v *= 2 + +def binary(x): + if x == 0: + return '00000000' + + l = [] + while x > 0: + if x & 1 == 1: + l.append('1') + else: + l.append('0') + x = x >> 1 + + # big endian! 
+ l.reverse() + + l2 = [] + while len(l) > 0: + s = ''.join(l[-8:]) + if len(s) < 8: + s = '0'*(8-len(s)) + s + l2.append(s) + del l[-8:] + + return ' '.join(l2) + +def getUTF8Rest(code, numBytes): + l = [] + for i in range(numBytes): + l.append((128 | (code & MASKS[5]), 6)) + code = code >> 6 + l.reverse() + return tuple(l) + +def toUTF8(code): + # code = Unicode code point + assert code >= 0 + assert code <= MAX_UNICODE + + if code < 128: + # 0xxxxxxx + bytes = ((code, 7),) + elif code < 2048: + # 110yyyxx 10xxxxxx + byte1 = (6 << 5) | (code >> 6) + bytes = ((byte1, 5),) + getUTF8Rest(code, 1) + elif code < 65536: + # 1110yyyy 10yyyyxx 10xxxxxx + len = 3 + byte1 = (14 << 4) | (code >> 12) + bytes = ((byte1, 4),) + getUTF8Rest(code, 2) + else: + # 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx + len = 4 + byte1 = (30 << 3) | (code >> 18) + bytes = ((byte1, 3),) + getUTF8Rest(code, 3) + + return bytes + +def all(fsa, startNode, endNode, startCode, endCode, left): + if len(left) == 0: + fsa.addEdge(startNode, endNode, startCode, endCode, 'all') + else: + lastN = fsa.addNode() + fsa.addEdge(startNode, lastN, startCode, endCode, 'all') + while len(left) > 1: + n = fsa.addNode() + fsa.addEdge(lastN, n, 128, 191, 'all*') + left = left[1:] + lastN = n + fsa.addEdge(lastN, endNode, 128, 191, 'all*') + +def start(fsa, startNode, endNode, utf8, doAll): + if len(utf8) == 1: + fsa.addEdge(startNode, endNode, utf8[0][0], utf8[0][0] | MASKS[utf8[0][1]-1], 'start') + else: + n = fsa.addNode() + fsa.addEdge(startNode, n, utf8[0][0], utf8[0][0], 'start') + start(fsa, n, endNode, utf8[1:], True) + end = utf8[0][0] | MASKS[utf8[0][1]-1] + if doAll and utf8[0][0] != end: + all(fsa, startNode, endNode, utf8[0][0]+1, end, utf8[1:]) + +def end(fsa, startNode, endNode, utf8, doAll): + if len(utf8) == 1: + fsa.addEdge(startNode, endNode, utf8[0][0] & ~MASKS[utf8[0][1]-1], utf8[0][0], 'end') + else: + if utf8[0][1] == 5: + # special case -- avoid created unused edges (utf8 doesn't accept certain byte 
sequences): + start = 194 + else: + start = utf8[0][0] & (~MASKS[utf8[0][1]-1]) + if doAll and utf8[0][0] != start: + all(fsa, startNode, endNode, start, utf8[0][0]-1, utf8[1:]) + n = fsa.addNode() + fsa.addEdge(startNode, n, utf8[0][0], utf8[0][0], 'end') + end(fsa, n, endNode, utf8[1:], True) + +def build(fsa, + startNode, endNode, + startUTF8, endUTF8): + + # Break into start, middle, end: + if startUTF8[0][0] == endUTF8[0][0]: + # Degen case: lead with the same byte: + if len(startUTF8) == 1 and len(endUTF8) == 1: + fsa.addEdge(startNode, endNode, startUTF8[0][0], endUTF8[0][0], 'startend') + return + else: + assert len(startUTF8) != 1 + assert len(endUTF8) != 1 + n = fsa.addNode() + # single value edge + fsa.addEdge(startNode, n, startUTF8[0][0], startUTF8[0][0], 'single') + build(fsa, n, endNode, startUTF8[1:], endUTF8[1:]) + elif len(startUTF8) == len(endUTF8): + if len(startUTF8) == 1: + fsa.addEdge(startNode, endNode, startUTF8[0][0], endUTF8[0][0], 'startend') + else: + start(fsa, startNode, endNode, startUTF8, False) + if endUTF8[0][0] - startUTF8[0][0] > 1: + all(fsa, startNode, endNode, startUTF8[0][0]+1, endUTF8[0][0]-1, startUTF8[1:]) + end(fsa, startNode, endNode, endUTF8, False) + else: + # start + start(fsa, startNode, endNode, startUTF8, True) + + # possibly middle + byteCount = 1+len(startUTF8) + while byteCount < len(endUTF8): + s = toUTF8(utf8Ranges[byteCount-1][0]) + e = toUTF8(utf8Ranges[byteCount-1][1]) + all(fsa, startNode, endNode, + s[0][0], + e[0][0], + s[1:]) + byteCount += 1 + + # end + end(fsa, startNode, endNode, endUTF8, True) + +def main(): + + if len(sys.argv) not in (3, 4): + print + print 'Usage: python %s startUTF32 endUTF32 [testCode]' % sys.argv[0] + print + sys.exit(1) + + utf32Start = int(sys.argv[1]) + utf32End = int(sys.argv[2]) + + if utf32Start > utf32End: + print 'ERROR: start must be <= end' + sys.exit(1) + + fsa = FSA() + fsa.start = fsa.addNode('start') + fsa.end = fsa.addNode('end') + + print 's=%s' % ' 
'.join([binary(x[0]) for x in toUTF8(utf32Start)]) + print 'e=%s' % ' '.join([binary(x[0]) for x in toUTF8(utf32End)]) + + if len(sys.argv) == 4: + print 't=%s [%s]' % \ + (' '.join([binary(x[0]) for x in toUTF8(int(sys.argv[3]))]), + ' '.join(['%2x' % x[0] for x in toUTF8(int(sys.argv[3]))])) + + build(fsa, fsa.start, fsa.end, + toUTF8(utf32Start), + toUTF8(utf32End)) + + fsa.toPNG('test', '/tmp/outpy.png') + print 'Saved to /tmp/outpy.png...' + + test(fsa, utf32Start, utf32End, 100000); + +def test(fsa, utf32Start, utf32End, count): + + # verify correct ints are accepted + for i in range(count): + r = random.randint(utf32Start, utf32End) + dest = fsa.run([tup[0] for tup in toUTF8(r)]) + if dest != fsa.end: + print 'FAILED: valid %s (%s) is not accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)])) + return False + + invalidRange = MAX_UNICODE - (utf32End - utf32Start + 1) + if invalidRange >= 0: + # verify invalid ints are not accepted + for i in range(count): + r = random.randint(0, invalidRange-1) + if r >= utf32Start: + r = utf32End + 1 + r - utf32Start + dest = fsa.run([tup[0] for tup in toUTF8(r)]) + if dest != -1: + print 'FAILED: invalid %s (%s) is accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)])) + return False + + return True + +def stress(): + + print 'Testing...' + + iter = 0 + while True: + if iter % 10 == 0: + print '%s...' 
% iter + iter += 1 + + v1 = random.randint(0, MAX_UNICODE) + v2 = random.randint(0, MAX_UNICODE) + if v2 < v1: + v1, v2 = v2, v1 + + utf32Start = v1 + utf32End = v2 + + fsa = FSA() + fsa.start = fsa.addNode('start') + fsa.end = fsa.addNode('end') + build(fsa, fsa.start, fsa.end, + toUTF8(utf32Start), + toUTF8(utf32End)) + + if not test(fsa, utf32Start, utf32End, 10000): + print 'FAILED on utf32Start=%s utf32End=%s' % (utf32Start, utf32End) + +if __name__ == '__main__': + if len(sys.argv) > 1: + main() + else: + stress() Property changes on: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 937269) +++ lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java (working copy) @@ -302,8 +302,8 @@ newstates.put(q, q); r = q; } - char min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; - char max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max; + int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; + int max = t1[n1].max < t2[n2].max ? 
t1[n1].max : t2[n2].max; p.s.transitions.add(new Transition(min, max, r.s)); } } @@ -348,10 +348,10 @@ int min1 = t1[n1].min, max1 = t1[n1].max; for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { if (t2[n2].min > min1) return false; - if (t2[n2].max < Character.MAX_VALUE) min1 = t2[n2].max + 1; + if (t2[n2].max < Character.MAX_CODE_POINT) min1 = t2[n2].max + 1; else { - min1 = Character.MAX_VALUE; - max1 = Character.MIN_VALUE; + min1 = Character.MAX_CODE_POINT; + max1 = Character.MIN_CODE_POINT; } StatePair q = new StatePair(t1[n1].to, t2[n2].to); if (!visited.contains(q)) { @@ -435,7 +435,7 @@ * Determinizes the given automaton using the given set of initial states. */ static void determinize(Automaton a, Set initialset) { - char[] points = a.getStartPoints(); + int[] points = a.getStartPoints(); // subset construction Map,Set> sets = new HashMap,Set>(); LinkedList> worklist = new LinkedList>(); @@ -463,10 +463,10 @@ newstate.put(p, new State()); } State q = newstate.get(p); - char min = points[n]; - char max; - if (n + 1 < points.length) max = (char) (points[n + 1] - 1); - else max = Character.MAX_VALUE; + int min = points[n]; + int max; + if (n + 1 < points.length) max = (points[n + 1] - 1); + else max = Character.MAX_CODE_POINT; r.transitions.add(new Transition(min, max, q)); } } @@ -563,8 +563,8 @@ if (a.isSingleton()) return false; if (a.initial.accept && a.initial.transitions.size() == 1) { Transition t = a.initial.transitions.iterator().next(); - return t.to == a.initial && t.min == Character.MIN_VALUE - && t.max == Character.MAX_VALUE; + return t.to == a.initial && t.min == Character.MIN_CODE_POINT + && t.max == Character.MAX_CODE_POINT; } return false; } @@ -580,8 +580,8 @@ if (a.isSingleton()) return s.equals(a.singleton); if (a.deterministic) { State p = a.initial; - for (int i = 0; i < s.length(); i++) { - State q = p.step(s.charAt(i)); + for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) { + State q = p.step(cp = 
s.charAt(i)); if (q == null) return false; p = q; } @@ -596,8 +596,8 @@ pp.add(a.initial); ArrayList dest = new ArrayList(); boolean accept = a.initial.accept; - for (int i = 0; i < s.length(); i++) { - char c = s.charAt(i); + for (int i = 0, c = 0; i < s.length(); i += Character.charCount(c)) { + c = s.codePointAt(i); accept = false; pp_other.clear(); bb_other.clear(); Index: lucene/src/java/org/apache/lucene/util/automaton/RegExp.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/RegExp.java (revision 937269) +++ lucene/src/java/org/apache/lucene/util/automaton/RegExp.java (working copy) @@ -366,9 +366,9 @@ Kind kind; RegExp exp1, exp2; String s; - char c; + int c; int min, max, digits; - char from, to; + int from, to; String b; int flags; @@ -625,10 +625,10 @@ b.append(")"); break; case REGEXP_CHAR: - b.append("\\").append(c); + b.append("\\").appendCodePoint(c); break; case REGEXP_CHAR_RANGE: - b.append("[\\").append(from).append("-\\").append(to).append("]"); + b.append("[\\").appendCodePoint(from).append("-\\").appendCodePoint(to).append("]"); break; case REGEXP_ANYCHAR: b.append("."); @@ -725,9 +725,9 @@ static private RegExp makeString(RegExp exp1, RegExp exp2) { StringBuilder b = new StringBuilder(); if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s); - else b.append(exp1.c); + else b.appendCodePoint(exp1.c); if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s); - else b.append(exp2.c); + else b.appendCodePoint(exp2.c); return makeString(b.toString()); } @@ -777,14 +777,14 @@ return r; } - static RegExp makeChar(char c) { + static RegExp makeChar(int c) { RegExp r = new RegExp(); r.kind = Kind.REGEXP_CHAR; r.c = c; return r; } - static RegExp makeCharRange(char from, char to) { + static RegExp makeCharRange(int from, int to) { RegExp r = new RegExp(); r.kind = Kind.REGEXP_CHAR_RANGE; r.from = from; @@ -834,13 +834,13 @@ } private boolean peek(String s) { - return more() && 
s.indexOf(b.charAt(pos)) != -1; + return more() && s.indexOf(b.codePointAt(pos)) != -1; } - private boolean match(char c) { + private boolean match(int c) { if (pos >= b.length()) return false; - if (b.charAt(pos) == c) { - pos++; + if (b.codePointAt(pos) == c) { + pos += Character.charCount(c); return true; } return false; @@ -850,9 +850,11 @@ return pos < b.length(); } - private char next() throws IllegalArgumentException { + private int next() throws IllegalArgumentException { if (!more()) throw new IllegalArgumentException("unexpected end-of-string"); - return b.charAt(pos++); + int ch = b.codePointAt(pos); + pos += Character.charCount(ch); + return ch; } private boolean check(int flag) { @@ -933,7 +935,7 @@ } final RegExp parseCharClass() throws IllegalArgumentException { - char c = parseCharExp(); + int c = parseCharExp(); if (match('-')) return makeCharRange(c, parseCharExp()); else return makeChar(c); } @@ -993,7 +995,7 @@ } else return makeChar(parseCharExp()); } - final char parseCharExp() throws IllegalArgumentException { + final int parseCharExp() throws IllegalArgumentException { match('\\'); return next(); } Index: lucene/src/java/org/apache/lucene/util/automaton/Transition.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/Transition.java (revision 937269) +++ lucene/src/java/org/apache/lucene/util/automaton/Transition.java (working copy) @@ -35,7 +35,7 @@ * Automaton transition. *

* A transition, which belongs to a source state, consists of a Unicode - * character interval and a destination state. + * codepoint interval and a destination state. * * @lucene.experimental */ @@ -45,18 +45,19 @@ * CLASS INVARIANT: min<=max */ - char min; - char max; + int min; + int max; State to; /** * Constructs a new singleton interval transition. * - * @param c transition character + * @param c transition codepoint * @param to destination state */ - public Transition(char c, State to) { + public Transition(int c, State to) { + assert c >= 0; min = max = c; this.to = to; } @@ -68,9 +69,11 @@ * @param max transition interval maximum * @param to destination state */ - public Transition(char min, char max, State to) { + public Transition(int min, int max, State to) { + assert min >= 0; + assert max >= 0; if (max < min) { - char t = max; + int t = max; max = min; min = t; } @@ -80,12 +83,12 @@ } /** Returns minimum of this transition interval. */ - public char getMin() { + public int getMin() { return min; } /** Returns maximum of this transition interval. 
*/ - public char getMax() { + public int getMax() { return max; } @@ -134,14 +137,18 @@ } } - static void appendCharString(char c, StringBuilder b) { - if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.append(c); + static void appendCharString(int c, StringBuilder b) { + if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.appendCodePoint(c); else { - b.append("\\u"); + b.append("\\\\U"); String s = Integer.toHexString(c); - if (c < 0x10) b.append("000").append(s); - else if (c < 0x100) b.append("00").append(s); - else if (c < 0x1000) b.append("0").append(s); + if (c < 0x10) b.append("0000000").append(s); + else if (c < 0x100) b.append("000000").append(s); + else if (c < 0x1000) b.append("00000").append(s); + else if (c < 0x10000) b.append("0000").append(s); + else if (c < 0x100000) b.append("000").append(s); + else if (c < 0x1000000) b.append("00").append(s); + else if (c < 0x10000000) b.append("0").append(s); else b.append(s); } } Index: lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java (revision 937269) +++ lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java (working copy) @@ -37,15 +37,16 @@ * * @lucene.experimental */ -public final class RunAutomaton implements Serializable { - +public abstract class RunAutomaton implements Serializable { + final int maxInterval; final int size; final boolean[] accept; final int initial; final int[] transitions; // delta(state,c) = transitions[state*points.length + // getCharClass(c)] - final char[] points; // char interval start points + final int[] points; // char interval start points final int[] classmap; // map from char number to class class + final Automaton automaton; /** * Returns a string representation of this automaton. 
@@ -61,10 +62,10 @@ for (int j = 0; j < points.length; j++) { int k = transitions[i * points.length + j]; if (k != -1) { - char min = points[j]; - char max; - if (j + 1 < points.length) max = (char) (points[j + 1] - 1); - else max = Character.MAX_VALUE; + int min = points[j]; + int max; + if (j + 1 < points.length) max = (points[j + 1] - 1); + else max = maxInterval; b.append(" "); Transition.appendCharString(min, b); if (min != max) { @@ -81,46 +82,54 @@ /** * Returns number of states in automaton. */ - public int getSize() { + public final int getSize() { return size; } /** * Returns acceptance status for given state. */ - public boolean isAccept(int state) { + public final boolean isAccept(int state) { return accept[state]; } /** * Returns initial state. */ - public int getInitialState() { + public final int getInitialState() { return initial; } /** - * Returns array of character class interval start points. The array should + * Returns array of codepoint class interval start points. The array should * not be modified by the caller. */ - public char[] getCharIntervals() { + public final int[] getCharIntervals() { return points.clone(); } /** - * Gets character class of given char. + * Gets character class of given codepoint */ - int getCharClass(char c) { + final int getCharClass(int c) { return SpecialOperations.findIndex(c, points); } /** + * @return the automaton + */ + public Automaton getAutomaton() { + return automaton; + } + + /** * Constructs a new RunAutomaton from a deterministic * Automaton. * * @param a an automaton */ - public RunAutomaton(Automaton a) { + public RunAutomaton(Automaton a, int maxInterval, boolean tableize) { + this.maxInterval = maxInterval; a.determinize(); points = a.getStartPoints(); Set states = a.getStates(); @@ -142,12 +151,18 @@ /* * Set alphabet table for optimal run performance. 
*/ - classmap = new int[Character.MAX_VALUE + 1]; - int i = 0; - for (int j = 0; j <= Character.MAX_VALUE; j++) { - if (i + 1 < points.length && j == points[i + 1]) i++; - classmap[j] = i; + if (tableize) { + classmap = new int[maxInterval + 1]; + int i = 0; + for (int j = 0; j <= maxInterval; j++) { + if (i + 1 < points.length && j == points[i + 1]) + i++; + classmap[j] = i; + } + } else { + classmap = null; } + this.automaton = a; } /** @@ -157,54 +172,10 @@ * if a dead state is entered in an equivalent automaton with a total * transition function.) */ - public int step(int state, char c) { - return transitions[state * points.length + classmap[c]]; + public final int step(int state, int c) { + if (classmap == null) + return transitions[state * points.length + getCharClass(c)]; + else + return transitions[state * points.length + classmap[c]]; } - - /** - * Returns true if the given string is accepted by this automaton. - */ - public boolean run(String s) { - int p = initial; - int l = s.length(); - for (int i = 0; i < l; i++) { - p = step(p, s.charAt(i)); - if (p == -1) return false; - } - return accept[p]; - } - - /** - * Returns true if the given string is accepted by this automaton - */ - public boolean run(char[] s, int offset, int length) { - int p = initial; - int l = offset + length; - for (int i = offset; i < l; i++) { - p = step(p, s[i]); - if (p == -1) return false; - } - return accept[p]; - } - - /** - * Returns the length of the longest accepted run of the given string starting - * at the given offset. - * - * @param s the string - * @param offset offset into s where the run starts - * @return length of the longest accepted run, -1 if no run is accepted - */ - public int run(String s, int offset) { - int p = initial; - int l = s.length(); - int max = -1; - for (int r = 0; offset <= l; offset++, r++) { - if (accept[p]) max = r; - if (offset == l) break; - p = step(p, s.charAt(offset)); - if (p == -1) break; - } - return max; - } }