Index: solr/src/test/org/apache/solr/util/ArraysUtilsTest.java =================================================================== --- solr/src/test/org/apache/solr/util/ArraysUtilsTest.java (revision 940218) +++ solr/src/test/org/apache/solr/util/ArraysUtilsTest.java (working copy) @@ -1,48 +0,0 @@ -package org.apache.solr.util; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import junit.framework.TestCase; - -public class ArraysUtilsTest extends TestCase { - - - public ArraysUtilsTest(String s) { - super(s); - } - - protected void setUp() { - } - - protected void tearDown() { - - } - - public void test() { - String left = "this is equal"; - String right = left; - char[] leftChars = left.toCharArray(); - char[] rightChars = right.toCharArray(); - assertTrue(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 0, rightChars, 0, left.length())); - - assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 0, left.length())); - assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 2, left.length())); - - assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 25, rightChars, 0, left.length())); - assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 12, rightChars, 0, left.length())); - } -} \ No newline at end of file Index: 
solr/src/java/org/apache/solr/util/ArraysUtils.java =================================================================== --- solr/src/java/org/apache/solr/util/ArraysUtils.java (revision 940218) +++ solr/src/java/org/apache/solr/util/ArraysUtils.java (working copy) @@ -1,51 +0,0 @@ -package org.apache.solr.util; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -/** - * - * - **/ -//Since Arrays.equals doesn't implement offsets for equals -public class ArraysUtils { - - /** - * See if two array slices are the same. - * - * @param left The left array to compare - * @param offsetLeft The offset into the array. Must be positive - * @param right The right array to compare - * @param offsetRight the offset into the right array. 
Must be positive - * @param length The length of the section of the array to compare - * @return true if the two arrays, starting at their respective offsets, are equal - * - * @see java.util.Arrays#equals(char[], char[]) - */ - public static boolean equals(char[] left, int offsetLeft, char[] right, int offsetRight, int length) { - if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) { - for (int i = 0; i < length; i++) { - if (left[offsetLeft + i] != right[offsetRight + i]) { - return false; - } - - } - return true; - } - return false; - } -} Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 940218) +++ lucene/CHANGES.txt (working copy) @@ -115,6 +115,11 @@ actual file's length if the file exists, and throws FileNotFoundException otherwise. Returning length=0 for a non-existent file is no longer allowed. If you relied on that, make sure to catch the exception. (Shai Erera) + +* LUCENE-2265: FuzzyQuery and WildcardQuery now operate on Unicode codepoints, + not unicode code units. For example, a Wildcard "?" represents any unicode + character. Furthermore, the rest of the automaton package and RegexpQuery use + true Unicode codepoint representation. (Robert Muir, Mike McCandless) Changes in runtime behavior Index: lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java (revision 940218) +++ lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java (working copy) @@ -56,6 +56,20 @@ static final float epsilon = 0.00001f; public void testFromTestData() throws Exception { + // TODO: randomize! 
+ assertFromTestData(new int[] { 0x40, 0x41 }); + assertFromTestData(new int[] { 0x40, 0x0195 }); + assertFromTestData(new int[] { 0x40, 0x0906 }); + assertFromTestData(new int[] { 0x40, 0x1040F }); + assertFromTestData(new int[] { 0x0194, 0x0195 }); + assertFromTestData(new int[] { 0x0194, 0x0906 }); + assertFromTestData(new int[] { 0x0194, 0x1040F }); + assertFromTestData(new int[] { 0x0905, 0x0906 }); + assertFromTestData(new int[] { 0x0905, 0x1040F }); + assertFromTestData(new int[] { 0x1040E, 0x1040F }); + } + + public void assertFromTestData(int codePointTable[]) throws Exception { InputStream stream = getClass().getResourceAsStream("fuzzyTestData.txt"); BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); @@ -71,7 +85,7 @@ doc.add(field); for (int i = 0; i < terms; i++) { - field.setValue(Integer.toBinaryString(i)); + field.setValue(mapInt(codePointTable, i)); writer.addDocument(doc); } @@ -82,7 +96,7 @@ String line; while ((line = reader.readLine()) != null) { String params[] = line.split(","); - String query = Integer.toBinaryString(Integer.parseInt(params[0])); + String query = mapInt(codePointTable, Integer.parseInt(params[0])); int prefix = Integer.parseInt(params[1]); int pqSize = Integer.parseInt(params[2]); float minScore = Float.parseFloat(params[3]); @@ -101,6 +115,15 @@ dir.close(); } + /* map bits to unicode codepoints */ + private static String mapInt(int codePointTable[], int i) { + StringBuilder sb = new StringBuilder(); + String binary = Integer.toBinaryString(i); + for (int j = 0; j < binary.length(); j++) + sb.appendCodePoint(codePointTable[binary.charAt(j) - '0']); + return sb.toString(); + } + /* Code to generate test data public static void main(String args[]) throws Exception { int bits = 3; Index: lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java =================================================================== --- 
lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java (revision 940218) +++ lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java (working copy) @@ -124,55 +124,4 @@ Automaton a = new RegExp("((\uD866\uDF05)|\uFB94).*").toAutomaton(); assertAutomatonHits(2, a); } - - /** - * Test that AutomatonQuery properly seeks to supplementary characters. - * Transitions are modeled as UTF-16 code units, so without special handling - * by default it will try to seek to a lead surrogate with some DFAs - */ - public void testSeekSurrogate() throws IOException { - Automaton a = new RegExp("\uD866[a\uDF05\uFB93][a-z]{0,5}[fl]").toAutomaton(); - assertAutomatonHits(1, a); - } - - /** - * Try seeking to an ending lead surrogate. - */ - public void testSeekSurrogate2() throws IOException { - Automaton a = new RegExp("\uD866(\uDF06ghijkl)?").toAutomaton(); - assertAutomatonHits(1, a); - } - - /** - * Try seeking to an starting trail surrogate. - */ - public void testSeekSurrogate3() throws IOException { - Automaton a = new RegExp("[\uDF06\uFB94]mnopqr").toAutomaton(); - assertAutomatonHits(1, a); - } - - /** - * Try seeking to an medial/final trail surrogate. - */ - public void testSeekSurrogate4() throws IOException { - Automaton a = new RegExp("a[\uDF06\uFB94]bc").toAutomaton(); - assertAutomatonHits(1, a); - } - - /** - * Ensure the 'constant suffix' does not contain a leading trail surrogate. - */ - public void testSurrogateSuffix() throws IOException { - Automaton a = new RegExp(".*[\uD865\uD866]\uDF06ghijkl").toAutomaton(); - assertAutomatonHits(1, a); - } - - /** - * Try when the constant suffix is only a leading trail surrogate. - * instead this must use an empty suffix. 
- */ - public void testSurrogateSuffix2() throws IOException { - Automaton a = new RegExp(".*\uDF05").toAutomaton(); - assertAutomatonHits(1, a); - } } Index: lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java (revision 940218) +++ lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java (working copy) @@ -145,10 +145,8 @@ .makeString("foobar")); assertEquals(a1, a2); - assertEquals(a1.hashCode(), a2.hashCode()); assertEquals(a1, a3); - assertEquals(a1.hashCode(), a3.hashCode()); assertEquals(a1.toString(), a3.toString()); Index: lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java (revision 940218) +++ lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java (working copy) @@ -31,9 +31,11 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util._TestUtil; import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.AutomatonTestUtil; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; -import org.apache.lucene.util.automaton.RunAutomaton; /** * Create an index with random unicode terms @@ -46,17 +48,17 @@ @Override protected void setUp() throws Exception { super.setUp(); - random = newRandom(System.nanoTime()); + random = newRandom(); RAMDirectory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); - Field field = new Field("field", "", Field.Store.YES, Field.Index.ANALYZED); + Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); 
doc.add(field); - for (int i = 0; i < 1000; i++) { - field.setValue(randomString()); + for (int i = 0; i < 2000; i++) { + field.setValue(_TestUtil.randomUnicodeString(random)); writer.addDocument(doc); } @@ -87,7 +89,7 @@ } private class SimpleAutomatonTermsEnum extends FilteredTermsEnum { - RunAutomaton runAutomaton = new RunAutomaton(automaton); + CharacterRunAutomaton runAutomaton = new CharacterRunAutomaton(automaton); UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); private SimpleAutomatonTermsEnum(IndexReader reader, String field) throws IOException { @@ -111,25 +113,14 @@ /** test a bunch of random regular expressions */ public void testRegexps() throws Exception { - for (int i = 0; i < 500; i++) - assertSame(randomRegex()); + for (int i = 0; i < 1000; i++) + assertSame(AutomatonTestUtil.randomRegexp(random).toString()); } /** check that the # of hits is the same as from a very * simple regexpquery implementation. */ - private void assertSame(String regexp) throws IOException { - // we will generate some illegal syntax regular expressions... 
- try { - new RegExp(regexp).toAutomaton(); - } catch (Exception e) { - return; - } - - // we will also generate some undefined unicode queries - if (!UnicodeUtil.validUTF16String(regexp)) - return; - + private void assertSame(String regexp) throws IOException { RegexpQuery smart = new RegexpQuery(new Term("field", regexp)); DumbRegexpQuery dumb = new DumbRegexpQuery(new Term("field", regexp)); @@ -143,79 +134,7 @@ TopDocs smartDocs = searcher.search(smart, 25); TopDocs dumbDocs = searcher.search(dumb, 25); - - assertEquals(dumbDocs.totalHits, smartDocs.totalHits); + + assertEquals("for re:" + regexp, dumbDocs.totalHits, smartDocs.totalHits); } - - char buffer[] = new char[20]; - - // start is inclusive and end is exclusive - public int nextInt(int start, int end) { - return start + random.nextInt(end - start); - } - - public String randomString() { - final int end = random.nextInt(20); - if (buffer.length < 1 + end) { - char[] newBuffer = new char[(int) ((1 + end) * 1.25)]; - System.arraycopy(buffer, 0, newBuffer, 0, buffer.length); - buffer = newBuffer; - } - for (int i = 0; i < end - 1; i++) { - int t = random.nextInt(6); - if (0 == t && i < end - 1) { - // Make a surrogate pair - // High surrogate - buffer[i++] = (char) nextInt(0xd800, 0xdc00); - // Low surrogate - buffer[i] = (char) nextInt(0xdc00, 0xe000); - } else if (t <= 1) buffer[i] = (char) random.nextInt(0x80); - else if (2 == t) buffer[i] = (char) nextInt(0x80, 0x800); - else if (3 == t) buffer[i] = (char) nextInt(0x800, 0xd800); - else if (4 == t) buffer[i] = (char) nextInt(0xe000, 0xffff); - else if (5 == t) { - // Illegal unpaired surrogate - if (random.nextBoolean()) buffer[i] = (char) nextInt(0xd800, 0xdc00); - else buffer[i] = (char) nextInt(0xdc00, 0xe000); - } - } - return new String(buffer, 0, end); - } - - // a random string biased towards populating a ton of operators - public String randomRegex() { - final int end = random.nextInt(20); - if (buffer.length < 1 + end) { - char[] newBuffer = 
new char[(int) ((1 + end) * 1.25)]; - System.arraycopy(buffer, 0, newBuffer, 0, buffer.length); - buffer = newBuffer; - } - for (int i = 0; i < end - 1; i++) { - int t = random.nextInt(10); - if (0 == t && i < end - 1) { - // Make a surrogate pair - // High surrogate - buffer[i++] = (char) nextInt(0xd800, 0xdc00); - // Low surrogate - buffer[i] = (char) nextInt(0xdc00, 0xe000); - } else if (t <= 1) buffer[i] = (char) random.nextInt(0x80); - else if (2 == t) buffer[i] = (char) nextInt(0x80, 0x800); - else if (3 == t) buffer[i] = (char) nextInt(0x800, 0xd800); - else if (4 == t) buffer[i] = (char) nextInt(0xe000, 0xffff); - else if (5 == t) { - // Illegal unpaired surrogate - if (random.nextBoolean()) buffer[i] = (char) nextInt(0xd800, 0xdc00); - else buffer[i] = (char) nextInt(0xdc00, 0xe000); - } else if (6 == t) { - buffer[i] = '.'; - } else if (7 == t) { - buffer[i] = '?'; - } else if (8 == t) { - buffer[i] = '*'; - } else if (9 == t) { - buffer[i] = '+'; - } - } - return new String(buffer, 0, end); - } } Index: lucene/src/test/org/apache/lucene/util/TestArrayUtil.java =================================================================== --- lucene/src/test/org/apache/lucene/util/TestArrayUtil.java (revision 940218) +++ lucene/src/test/org/apache/lucene/util/TestArrayUtil.java (working copy) @@ -102,4 +102,17 @@ } + public void testSliceEquals() { + String left = "this is equal"; + String right = left; + char[] leftChars = left.toCharArray(); + char[] rightChars = right.toCharArray(); + assertTrue(left + " does not equal: " + right, ArrayUtil.equals(leftChars, 0, rightChars, 0, left.length())); + + assertFalse(left + " does not equal: " + right, ArrayUtil.equals(leftChars, 1, rightChars, 0, left.length())); + assertFalse(left + " does not equal: " + right, ArrayUtil.equals(leftChars, 1, rightChars, 2, left.length())); + + assertFalse(left + " does not equal: " + right, ArrayUtil.equals(leftChars, 25, rightChars, 0, left.length())); + assertFalse(left + " does not 
equal: " + right, ArrayUtil.equals(leftChars, 12, rightChars, 0, left.length())); + } } Index: lucene/src/test/org/apache/lucene/util/_TestUtil.java =================================================================== --- lucene/src/test/org/apache/lucene/util/_TestUtil.java (revision 940218) +++ lucene/src/test/org/apache/lucene/util/_TestUtil.java (working copy) @@ -111,4 +111,33 @@ buf.append("]"); return buf.toString(); } + /** start and end are BOTH inclusive */ + public static int nextInt(Random r, int start, int end) { + return start + r.nextInt(end-start+1); + } + + /** Returns random string, including full unicode range. */ + public static String randomUnicodeString(Random r) { + final int end = r.nextInt(20); + if (end == 0) { + // allow 0 length + return ""; + } + final char[] buffer = new char[end]; + for (int i = 0; i < end; i++) { + int t = r.nextInt(5); + if (0 == t && i < end - 1) { + // Make a surrogate pair + // High surrogate + buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff); + // Low surrogate + buffer[i] = (char) nextInt(r, 0xdc00, 0xdfff); + } + else if (t <= 1) buffer[i] = (char) r.nextInt(0x80); + else if (2 == t) buffer[i] = (char) nextInt(r, 0x80, 0x800); + else if (3 == t) buffer[i] = (char) nextInt(r, 0x800, 0xd7ff); + else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xffff); + } + return new String(buffer, 0, end); + } } Index: lucene/src/test/org/apache/lucene/util/TestUnicodeUtil.java =================================================================== --- lucene/src/test/org/apache/lucene/util/TestUnicodeUtil.java (revision 940218) +++ lucene/src/test/org/apache/lucene/util/TestUnicodeUtil.java (working copy) @@ -17,7 +17,9 @@ * limitations under the License. 
*/ +import java.util.Random; + /* * Some of this code came from the excellent Unicode * conversion examples from: @@ -81,4 +83,47 @@ assertEquals("dogs\uE000", UnicodeUtil.nextValidUTF16String("dogs\uDC00")); assertEquals("\uE000", UnicodeUtil.nextValidUTF16String("\uDC00dogs")); } + + public void testCodePointCount() { + final Random r = newRandom(); + BytesRef utf8 = new BytesRef(20); + for(int i=0;i<50000;i++) { + final String s = _TestUtil.randomUnicodeString(r); + UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8); + assertEquals(s.codePointCount(0, s.length()), + UnicodeUtil.codePointCount(utf8)); + } + } + + public void testUTF8toUTF32() { + final Random r = newRandom(); + BytesRef utf8 = new BytesRef(20); + IntsRef utf32 = new IntsRef(20); + int[] codePoints = new int[20]; + for(int i=0;i<50000;i++) { + final String s = _TestUtil.randomUnicodeString(r); + UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8); + UnicodeUtil.UTF8toUTF32(utf8, utf32); + + int charUpto = 0; + int intUpto = 0; + while(charUpto < s.length()) { + final int cp = s.codePointAt(charUpto); + codePoints[intUpto++] = cp; + charUpto += Character.charCount(cp); + } + if (!ArrayUtil.equals(codePoints, 0, utf32.ints, utf32.offset, intUpto)) { + System.out.println("FAILED"); + for(int j=0;j automata = new ArrayList(); + private List terms = new ArrayList(); + private Random random; + + @Override + protected void setUp() throws Exception { + super.setUp(); + random = newRandom(); + for (int i = 0; i < 5000; i++) { + String randomString = _TestUtil.randomUnicodeString(random); + terms.add(randomString); + automata.add(BasicAutomata.makeString(randomString)); + } + } + + public void testLexicon() { + for (int i = 0; i < 3; i++) { + assertLexicon(); + } + } + + public void assertLexicon() { + Collections.shuffle(automata, random); + final Automaton lex = BasicOperations.union(automata); + lex.determinize(); + assertTrue(SpecialOperations.isFinite(lex)); + for (String s : terms) { + 
assertTrue(BasicOperations.run(lex, s)); + } + final ByteRunAutomaton lexByte = new ByteRunAutomaton(lex); + for (String s : terms) { + BytesRef termByte = new BytesRef(s); + assertTrue(lexByte.run(termByte.bytes, 0, termByte.length)); + } + } +} Property changes on: lucene/src/test/org/apache/lucene/util/automaton/TestDeterminizeLexicon.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java =================================================================== --- lucene/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java (revision 940218) +++ lucene/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java (working copy) @@ -26,7 +26,7 @@ Automaton other = BasicAutomata.makeCharRange('5', '7'); Automaton concat = BasicOperations.concatenate(singleton, other); assertTrue(concat.isDeterministic()); - assertEquals(BasicOperations.concatenate(expandedSingleton, other), concat); + assertTrue(BasicOperations.sameLanguage(BasicOperations.concatenate(expandedSingleton, other), concat)); } /** Test optimization to concatenate() to an NFA */ @@ -38,7 +38,7 @@ BasicAutomata.makeString("three")); Automaton concat = BasicOperations.concatenate(singleton, nfa); assertFalse(concat.isDeterministic()); - assertEquals(BasicOperations.concatenate(expandedSingleton, nfa), concat); + assertTrue(BasicOperations.sameLanguage(BasicOperations.concatenate(expandedSingleton, nfa), concat)); } /** Test optimization to concatenate() with empty String */ @@ -49,9 +49,9 @@ Automaton concat1 = BasicOperations.concatenate(expandedSingleton, other); Automaton concat2 = BasicOperations.concatenate(singleton, other); assertTrue(concat2.isDeterministic()); - assertEquals(concat1, concat2); - assertEquals(other, concat1); - assertEquals(other, concat2); + assertTrue(BasicOperations.sameLanguage(concat1, concat2)); + 
assertTrue(BasicOperations.sameLanguage(other, concat1)); + assertTrue(BasicOperations.sameLanguage(other, concat2)); } /** Test optimization to concatenate() with empty String to an NFA */ @@ -64,8 +64,19 @@ Automaton concat1 = BasicOperations.concatenate(expandedSingleton, nfa); Automaton concat2 = BasicOperations.concatenate(singleton, nfa); assertFalse(concat2.isDeterministic()); - assertEquals(concat1, concat2); - assertEquals(nfa, concat1); - assertEquals(nfa, concat2); + assertTrue(BasicOperations.sameLanguage(concat1, concat2)); + assertTrue(BasicOperations.sameLanguage(nfa, concat1)); + assertTrue(BasicOperations.sameLanguage(nfa, concat2)); } + + /** Test singletons work correctly */ + public void testSingleton() { + Automaton singleton = BasicAutomata.makeString("foobar"); + Automaton expandedSingleton = singleton.cloneExpanded(); + assertTrue(BasicOperations.sameLanguage(singleton, expandedSingleton)); + + singleton = BasicAutomata.makeString("\ud801\udc1c"); + expandedSingleton = singleton.cloneExpanded(); + //assertEquals(singleton, expandedSingleton); + } } Index: lucene/src/test/org/apache/lucene/util/automaton/AutomatonTestUtil.java =================================================================== --- lucene/src/test/org/apache/lucene/util/automaton/AutomatonTestUtil.java (revision 0) +++ lucene/src/test/org/apache/lucene/util/automaton/AutomatonTestUtil.java (revision 0) @@ -0,0 +1,68 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Random; + +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util._TestUtil; + +public class AutomatonTestUtil { + /** Returns random string, including full unicode range. */ + public static RegExp randomRegexp(Random r) { + while (true) { + String regexp = randomRegexpString(r); + // we will also generate some undefined unicode queries + if (!UnicodeUtil.validUTF16String(regexp)) + continue; + try { + return new RegExp(regexp, RegExp.NONE); + } catch (Exception e) {} + } + } + + private static String randomRegexpString(Random r) { + final int end = r.nextInt(20); + if (end == 0) { + // allow 0 length + return ""; + } + final char[] buffer = new char[end]; + for (int i = 0; i < end; i++) { + int t = r.nextInt(11); + if (0 == t && i < end - 1) { + // Make a surrogate pair + // High surrogate + buffer[i++] = (char) _TestUtil.nextInt(r, 0xd800, 0xdbff); + // Low surrogate + buffer[i] = (char) _TestUtil.nextInt(r, 0xdc00, 0xdfff); + } + else if (t <= 1) buffer[i] = (char) r.nextInt(0x80); + else if (2 == t) buffer[i] = (char) _TestUtil.nextInt(r, 0x80, 0x800); + else if (3 == t) buffer[i] = (char) _TestUtil.nextInt(r, 0x800, 0xd7ff); + else if (4 == t) buffer[i] = (char) _TestUtil.nextInt(r, 0xe000, 0xffff); + else if (5 == t) buffer[i] = '.'; + else if (6 == t) buffer[i] = '?'; + else if (7 == t) buffer[i] = '*'; + else if (8 == t) buffer[i] = '+'; + else if (9 == t) buffer[i] = '('; + else if (10 == t) buffer[i] = ')'; + } + return new String(buffer, 0, end); + } +} Property changes on: 
lucene/src/test/org/apache/lucene/util/automaton/AutomatonTestUtil.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java =================================================================== --- lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java (revision 0) +++ lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java (revision 0) @@ -0,0 +1,183 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; + +import java.util.Random; + +public class TestUTF32ToUTF8 extends LuceneTestCase { + private Random random; + + @Override + protected void setUp() throws Exception { + super.setUp(); + random = newRandom(); + } + + private static final int MAX_UNICODE = 0x10FFFF; + + final BytesRef b = new BytesRef(4); + + private boolean matches(ByteRunAutomaton a, int code) { + char[] chars = Character.toChars(code); + UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b); + return a.run(b.bytes, 0, b.length); + } + + private void testOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters) { + + // Verify correct ints are accepted + for(int iter=0;iter= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) | + (code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) { + iter--; + continue; + } + assertTrue("DFA for range " + startCode + "-" + endCode + " failed to match code=" + code, + matches(a, code)); + } + + // Verify invalid ints are not accepted + final int invalidRange = MAX_UNICODE - (endCode - startCode + 1); + if (invalidRange > 0) { + for(int iter=0;iter= startCode) { + code = endCode + 1 + x - startCode; + } else { + code = x; + } + if ((code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) | + (code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) { + iter--; + continue; + } + assertFalse("DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code, + matches(a, code)); + + } + } + } + + // Evenly picks random code point from the 4 "buckets" + // (bucket = same #bytes when encoded to utf8) + private int getCodeStart(Random r) { + switch(r.nextInt(4)) { + case 0: + return _TestUtil.nextInt(r, 0, 128); + case 1: + return _TestUtil.nextInt(r, 128, 2048); + case 2: + 
return _TestUtil.nextInt(r, 2048, 65536); + default: + return _TestUtil.nextInt(r, 65536, 1+MAX_UNICODE); + } + } + + public void testRandomRanges() throws Exception { + final Random r = random; + int ITERS = 10; + int ITERS_PER_DFA = 100; + for(int iter=0;iter 1.0f / (1.0f - minimumSimilarity)) { + String text = term.text(); + if (text.codePointCount(0, text.length()) > 1.0f / (1.0f - minimumSimilarity)) { this.termLongEnough = true; } Index: lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 940218) +++ lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (working copy) @@ -21,11 +21,9 @@ import java.util.Comparator; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; -import org.apache.lucene.util.automaton.RunAutomaton; +import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.SpecialOperations; import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; @@ -51,7 +49,7 @@ // the object-oriented form of the DFA private final Automaton automaton; // a tableized array-based form of the DFA - private final RunAutomaton runAutomaton; + private final ByteRunAutomaton runAutomaton; // common suffix of the automaton private final BytesRef commonSuffixRef; // true if the automaton accepts a finite language @@ -62,8 +60,6 @@ // visited the state; we use gens to avoid having to clear private final long[] visited; private long curGen; - // used for unicode conversion from BytesRef byte[] to char[] - private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); // the reference used for seeking forwards through the term dictionary private final 
BytesRef seekBytesRef = new BytesRef(10); // true if we are enumerating an infinite portion of the DFA. @@ -72,7 +68,6 @@ // of terms where we should simply do sequential reads instead. private boolean linear = false; private final BytesRef linearUpperBound = new BytesRef(10); - private final UnicodeUtil.UTF16Result linearUpperBoundUTF16 = new UnicodeUtil.UTF16Result(); private final Comparator termComp; /** @@ -80,39 +75,38 @@ * Construct an enumerator based upon an automaton, enumerating the specified * field, working on a supplied reader. *

- * @lucene.internal Use the public ctor instead. This constructor allows the - * (dangerous) option of passing in a pre-compiled RunAutomaton. If you use - * this ctor and compile your own RunAutomaton, you are responsible for - * ensuring it is in sync with the Automaton object, including internal - * State numbering, or you will get undefined behavior. + * @lucene.internal Use the public ctor instead. *

- * @param preCompiled optional pre-compiled RunAutomaton (can be null) + * @param runAutomaton pre-compiled ByteRunAutomaton * @param finite true if the automaton accepts a finite language */ - AutomatonTermsEnum(Automaton automaton, RunAutomaton preCompiled, - Term queryTerm, IndexReader reader, boolean finite) + AutomatonTermsEnum(ByteRunAutomaton runAutomaton, + String field, IndexReader reader, + boolean finite, BytesRef commonSuffixRef) throws IOException { - super(reader, queryTerm.field()); - this.automaton = automaton; + super(reader, field); + this.automaton = runAutomaton.getAutomaton(); this.finite = finite; - /* - * tableize the automaton. this also ensures it is deterministic, and has no - * transitions to dead states. it also invokes Automaton.setStateNumbers to - * number the original states (this is how they are tableized) - */ - if (preCompiled == null) - runAutomaton = new RunAutomaton(this.automaton); - else - runAutomaton = preCompiled; + this.runAutomaton = runAutomaton; + if (finite) { + // don't use suffix w/ finite DFAs + this.commonSuffixRef = null; + } else if (commonSuffixRef == null) { + // compute now + this.commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(automaton); + } else { + // precomputed + this.commonSuffixRef = commonSuffixRef; + } - commonSuffixRef = finite ? null : new BytesRef(getValidUTF16Suffix(SpecialOperations - .getCommonSuffix(automaton))); - // build a cache of sorted transitions for every state allTransitions = new Transition[runAutomaton.getSize()][]; - for (State state : this.automaton.getStates()) - allTransitions[state.getNumber()] = state.getSortedTransitionArray(false); + for (State state : this.automaton.getNumberedStates()) { + state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order); + state.trimTransitionsArray(); + allTransitions[state.getNumber()] = state.transitionsArray; + } // used for path tracking, where each bit is a numbered state. 
visited = new long[runAutomaton.getSize()]; @@ -126,9 +120,9 @@ *

* It will automatically calculate whether or not the automaton is finite */ - public AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader) - throws IOException { - this(automaton, null, queryTerm, reader, SpecialOperations.isFinite(automaton)); + public AutomatonTermsEnum(Automaton automaton, String field, IndexReader reader) + throws IOException { + this(new ByteRunAutomaton(automaton), field, reader, SpecialOperations.isFinite(automaton), null); } /** @@ -138,8 +132,7 @@ @Override protected AcceptStatus accept(final BytesRef term) { if (commonSuffixRef == null || term.endsWith(commonSuffixRef)) { - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); - if (runAutomaton.run(utf16.result, 0, utf16.length)) + if (runAutomaton.run(term.bytes, term.offset, term.length)) return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK; else return (linear && termComp.compare(term, linearUpperBound) < 0) ? @@ -153,15 +146,13 @@ @Override protected BytesRef nextSeekTerm(final BytesRef term) throws IOException { if (term == null) { + seekBytesRef.copy(""); // return the empty term, as its valid - if (runAutomaton.run("")) { - seekBytesRef.copy(""); + if (runAutomaton.run(seekBytesRef.bytes, seekBytesRef.offset, seekBytesRef.length)) { return seekBytesRef; } - - utf16.copyText(""); } else { - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); + seekBytesRef.copy(term); } // seek to the next possible string; @@ -169,8 +160,6 @@ // reposition if (linear) setLinear(infinitePosition); - UnicodeUtil.nextValidUTF16String(utf16); - UnicodeUtil.UTF16toUTF8(utf16.result, 0, utf16.length, seekBytesRef); return seekBytesRef; } // no more possible strings can match @@ -187,27 +176,28 @@ */ private void setLinear(int position) { int state = runAutomaton.getInitialState(); - char maxInterval = 0xffff; - for (int i = 0; i < position; i++) - state = runAutomaton.step(state, utf16.result[i]); + int maxInterval = 0xef; + for (int i = 0; 
i < position; i++) { + state = runAutomaton.step(state, seekBytesRef.bytes[i] & 0xff); + assert state >= 0: "state=" + state; + } for (int i = 0; i < allTransitions[state].length; i++) { Transition t = allTransitions[state][i]; - if (t.getMin() <= utf16.result[position] && utf16.result[position] <= t.getMax()) { + if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 && + compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) { maxInterval = t.getMax(); break; } } - // 0xffff terms don't get the optimization... not worth the trouble. - if (maxInterval < 0xffff) - maxInterval++; + // 0xef terms don't get the optimization... not worth the trouble. + if (maxInterval != 0xef) + maxInterval = incrementUTF16(maxInterval); int length = position + 1; /* position + maxTransition */ - if (linearUpperBoundUTF16.result.length < length) - linearUpperBoundUTF16.result = new char[length]; - System.arraycopy(utf16.result, 0, linearUpperBoundUTF16.result, 0, position); - linearUpperBoundUTF16.result[position] = maxInterval; - linearUpperBoundUTF16.setLength(length); - UnicodeUtil.nextValidUTF16String(linearUpperBoundUTF16); - UnicodeUtil.UTF16toUTF8(linearUpperBoundUTF16.result, 0, length, linearUpperBound); + if (linearUpperBound.bytes.length < length) + linearUpperBound.bytes = new byte[length]; + System.arraycopy(seekBytesRef.bytes, 0, linearUpperBound.bytes, 0, position); + linearUpperBound.bytes[position] = (byte) maxInterval; + linearUpperBound.length = length; } /** @@ -229,9 +219,9 @@ linear = false; state = runAutomaton.getInitialState(); // walk the automaton until a character is rejected. 
- for (pos = 0; pos < utf16.length; pos++) { + for (pos = 0; pos < seekBytesRef.length; pos++) { visited[state] = curGen; - int nextState = runAutomaton.step(state, utf16.result[pos]); + int nextState = runAutomaton.step(state, seekBytesRef.bytes[pos] & 0xff); if (nextState == -1) break; // we found a loop, record it for faster enumeration @@ -249,7 +239,7 @@ } else { /* no more solutions exist from this useful portion, backtrack */ if (!backtrack(pos)) /* no more solutions at all */ return false; - else if (runAutomaton.run(utf16.result, 0, utf16.length)) + else if (runAutomaton.run(seekBytesRef.bytes, 0, seekBytesRef.length)) /* String is good to go as-is */ return true; /* else advance further */ @@ -280,19 +270,18 @@ * the next lexicographic character must be greater than the existing * character, if it exists. */ - char c = 0; - if (position < utf16.length) { - c = utf16.result[position]; + int c = 0; + if (position < seekBytesRef.length) { + c = seekBytesRef.bytes[position] & 0xff; // if the next character is U+FFFF and is not part of the useful portion, // then by definition it puts us in a reject state, and therefore this // path is dead. there cannot be any higher transitions. backtrack. - if (c == '\uFFFF') + c = incrementUTF16(c); + if (c == -1) return false; - else - c++; } - utf16.setLength(position); + seekBytesRef.length = position; visited[state] = curGen; Transition transitions[] = allTransitions[state]; @@ -301,11 +290,12 @@ for (int i = 0; i < transitions.length; i++) { Transition transition = transitions[i]; - if (transition.getMax() >= c) { - char nextChar = (char) Math.max(c, transition.getMin()); + if (compareToUTF16(transition.getMax(), c) >= 0) { + int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? 
c : transition.getMin(); // append either the next sequential char, or the minimum transition - utf16.setLength(utf16.length + 1); - utf16.result[utf16.length - 1] = nextChar; + seekBytesRef.grow(seekBytesRef.length + 1); + seekBytesRef.length++; + seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) nextChar; state = transition.getDest().getNumber(); /* * as long as is possible, continue down the minimal path in @@ -323,11 +313,12 @@ // we found a loop, record it for faster enumeration if (!finite && !linear && visited[state] == curGen) { linear = true; - infinitePosition = utf16.length; + infinitePosition = seekBytesRef.length; } // append the minimum transition - utf16.setLength(utf16.length + 1); - utf16.result[utf16.length - 1] = transition.getMin(); + seekBytesRef.grow(seekBytesRef.length + 1); + seekBytesRef.length++; + seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin(); } return true; } @@ -345,33 +336,48 @@ */ private boolean backtrack(int position) { while (position > 0) { - char nextChar = utf16.result[position - 1]; - // if a character is U+FFFF its a dead-end too, + int nextChar = seekBytesRef.bytes[position - 1] & 0xff; + // if a character is 0xef its a dead-end too, // because there is no higher character in UTF-16 sort order. - if (nextChar != '\uFFFF') { - nextChar++; - utf16.result[position - 1] = nextChar; - utf16.setLength(position); + nextChar = incrementUTF16(nextChar); + if (nextChar != -1) { + seekBytesRef.bytes[position - 1] = (byte) nextChar; + seekBytesRef.length = position; return true; } position--; } return false; /* all solutions exhausted */ } + + /* return the next utf8 byte in utf16 order, or -1 if exhausted */ + private final int incrementUTF16(int utf8) { + switch(utf8) { + case 0xed: return 0xf0; + case 0xfd: return 0xee; + case 0xee: return 0xef; + case 0xef: return -1; + default: return utf8 + 1; + } + } - /** - * if the suffix starts with a low surrogate, remove it. 
- * This won't be quite as efficient, but can be converted to valid UTF-8 - * - * This isn't nearly as complex as cleanupPosition, because its not - * going to use this suffix to walk any path thru the terms. - * - */ - private String getValidUTF16Suffix(String suffix) { - if (suffix != null && suffix.length() > 0 && - Character.isLowSurrogate(suffix.charAt(0))) - return suffix.substring(1); - else - return suffix; + int compareToUTF16(int aByte, int bByte) { + if (aByte != bByte) { + // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order + + // We know the terms are not equal, but, we may + // have to carefully fixup the bytes at the + // difference to match UTF16's sort order: + if (aByte >= 0xee && bByte >= 0xee) { + if ((aByte & 0xfe) == 0xee) { + aByte += 0x10; + } + if ((bByte&0xfe) == 0xee) { + bByte += 0x10; + } + } + return aByte - bByte; + } + return 0; } } Index: lucene/src/java/org/apache/lucene/search/WildcardQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/WildcardQuery.java (revision 940218) +++ lucene/src/java/org/apache/lucene/search/WildcardQuery.java (working copy) @@ -63,8 +63,8 @@ String wildcardText = wildcardquery.text(); - for (int i = 0; i < wildcardText.length(); i++) { - final char c = wildcardText.charAt(i); + for (int i = 0; i < wildcardText.length();) { + final int c = wildcardText.codePointAt(i); switch(c) { case WILDCARD_STRING: automata.add(BasicAutomata.makeAnyString()); @@ -75,6 +75,7 @@ default: automata.add(BasicAutomata.makeChar(c)); } + i += Character.charCount(c); } return BasicOperations.concatenate(automata); Index: lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (revision 940218) +++ lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (working copy) @@ -24,12 
+24,13 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.BasicAutomata; import org.apache.lucene.util.automaton.BasicOperations; +import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.LevenshteinAutomata; -import org.apache.lucene.util.automaton.RunAutomaton; import java.io.IOException; import java.util.ArrayList; @@ -49,7 +50,7 @@ private final MultiTermQuery.BoostAttribute boostAtt = attributes().addAttribute(MultiTermQuery.BoostAttribute.class); - + private float bottom = boostAtt.getMaxNonCompetitiveBoost(); private final float minSimilarity; @@ -58,11 +59,12 @@ private final int termLength; private int maxEdits; - private List automata; - private List runAutomata; + + private List runAutomata; private final IndexReader reader; private final Term term; + private final int termText[]; private final int realPrefixLength; /** @@ -89,9 +91,16 @@ throw new IllegalArgumentException("prefixLength cannot be less than 0"); this.reader = reader; this.term = term; + + // convert the string into a utf32 int[] representation for fast comparisons + final String utf16 = term.text(); + this.termText = new int[utf16.codePointCount(0, utf16.length())]; + for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) + termText[j++] = cp = utf16.codePointAt(i); + this.termLength = termText.length; + //The prefix could be longer than the word. //It's kind of silly though. It means we must match the entire word. - this.termLength = term.text().length(); this.realPrefixLength = prefixLength > termLength ? 
termLength : prefixLength; this.minSimilarity = minSimilarity; this.scale_factor = 1.0f / (1.0f - minSimilarity); @@ -101,7 +110,7 @@ TermsEnum subEnum = getAutomatonEnum(maxEdits, null); setEnum(subEnum != null ? subEnum : - new LinearFuzzyTermsEnum(reader, term, minSimilarity, prefixLength)); + new LinearFuzzyTermsEnum()); } /** @@ -111,37 +120,35 @@ private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) throws IOException { initAutomata(editDistance); - if (automata != null && editDistance < automata.size()) { - return new AutomatonFuzzyTermsEnum(automata.get(editDistance), term, - reader, minSimilarity, runAutomata.subList(0, editDistance + 1) - .toArray(new RunAutomaton[0]), lastTerm); + if (runAutomata != null && editDistance < runAutomata.size()) { + return new AutomatonFuzzyTermsEnum(runAutomata.subList(0, editDistance + 1) + .toArray(new ByteRunAutomaton[0]), lastTerm); } else { return null; } } - + /** initialize levenshtein DFAs up to maxDistance, if possible */ private void initAutomata(int maxDistance) { - if (automata == null && + if (runAutomata == null && maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { LevenshteinAutomata builder = - new LevenshteinAutomata(term.text().substring(realPrefixLength)); - automata = new ArrayList(maxDistance); - runAutomata = new ArrayList(maxDistance); + new LevenshteinAutomata(new String(termText, realPrefixLength, termText.length - realPrefixLength)); + + runAutomata = new ArrayList(maxDistance); for (int i = 0; i <= maxDistance; i++) { Automaton a = builder.toAutomaton(i); // constant prefix if (realPrefixLength > 0) { Automaton prefix = BasicAutomata.makeString( - term.text().substring(0, realPrefixLength)); + new String(termText, 0, realPrefixLength)); a = BasicOperations.concatenate(prefix, a); } - automata.add(a); - runAutomata.add(new RunAutomaton(a)); + runAutomata.add(new ByteRunAutomaton(a)); } } } - + /** swap in a new actual enum to proxy to */ private void 
setEnum(TermsEnum actualEnum) { this.actualEnum = actualEnum; @@ -173,7 +180,7 @@ // itself: re-init maxDistances so the fast-fail happens for more terms due // to the now stricter constraints. } - + // for some raw min similarity and input term length, the maximum # of edits private int initialMaxDistance(float minimumSimilarity, int termLen) { return (int) ((1-minimumSimilarity) * termLen); @@ -242,314 +249,292 @@ public BytesRef term() throws IOException { return actualEnum.term(); } -} - -/** - * Implement fuzzy enumeration with automaton. - *

- * This is the fastest method as opposed to LinearFuzzyTermsEnum: - * as enumeration is logarithmic to the number of terms (instead of linear) - * and comparison is linear to length of the term (rather than quadratic) - */ -final class AutomatonFuzzyTermsEnum extends AutomatonTermsEnum { - private final RunAutomaton matchers[]; - // used for unicode conversion from BytesRef byte[] to char[] - private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); - private final float minimumSimilarity; - private final float scale_factor; - - private final int fullSearchTermLength; - private final BytesRef termRef; - - private final BytesRef lastTerm; - private final MultiTermQuery.BoostAttribute boostAtt = - attributes().addAttribute(MultiTermQuery.BoostAttribute.class); - - public AutomatonFuzzyTermsEnum(Automaton automaton, Term queryTerm, - IndexReader reader, float minSimilarity, RunAutomaton matchers[], BytesRef lastTerm) throws IOException { - super(automaton, matchers[matchers.length - 1], queryTerm, reader, true); - this.minimumSimilarity = minSimilarity; - this.scale_factor = 1.0f / (1.0f - minimumSimilarity); - this.matchers = matchers; - this.lastTerm = lastTerm; - termRef = new BytesRef(queryTerm.text()); - fullSearchTermLength = queryTerm.text().length(); - } - - /** finds the smallest Lev(n) DFA that accepts the term. 
*/ - @Override - protected AcceptStatus accept(BytesRef term) { - if (term.equals(termRef)) { // ed = 0 - boostAtt.setBoost(1.0F); - return AcceptStatus.YES_AND_SEEK; - } - - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); - - // TODO: benchmark doing this backwards - for (int i = 1; i < matchers.length; i++) - if (matchers[i].run(utf16.result, 0, utf16.length)) { - final float similarity = 1.0f - ((float) i / (float) - (Math.min(utf16.length, fullSearchTermLength))); - if (similarity > minimumSimilarity) { - boostAtt.setBoost((float) ((similarity - minimumSimilarity) * scale_factor)); - return AcceptStatus.YES_AND_SEEK; - } else { - return AcceptStatus.NO_AND_SEEK; - } - } - - return AcceptStatus.NO_AND_SEEK; - } - - /** defers to superclass, except can start at an arbitrary location */ - @Override - protected BytesRef nextSeekTerm(BytesRef term) throws IOException { - if (term == null) - term = lastTerm; - return super.nextSeekTerm(term); - } -} - -/** - * Implement fuzzy enumeration with linear brute force. - */ -final class LinearFuzzyTermsEnum extends FilteredTermsEnum { - - /* This should be somewhere around the average long word. - * If it is longer, we waste time and space. If it is shorter, we waste a - * little bit of time growing the array as we encounter longer words. - */ - private static final int TYPICAL_LONGEST_WORD_IN_INDEX = 19; - - /* Allows us save time required to create a new array - * every time similarity is called. 
- */ - private int[][] d; - - private final char[] text; - private final int prefixLen; - - private final float minimumSimilarity; - private final float scale_factor; - private final int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX]; - - private final MultiTermQuery.BoostAttribute boostAtt = - attributes().addAttribute(MultiTermQuery.BoostAttribute.class); - /** - * Constructor for enumeration of all terms from specified reader which share a prefix of - * length prefixLength with term and which have a fuzzy similarity > - * minSimilarity. - *

- * After calling the constructor the enumeration is already pointing to the first - * valid term if such a term exists. - * - * @param reader Delivers terms. - * @param term Pattern term. - * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f. - * @param prefixLength Length of required common prefix. Default value is 0. - * @throws IOException + * Finds and returns the smallest of three integers */ - public LinearFuzzyTermsEnum(IndexReader reader, Term term, final float minSimilarity, final int prefixLength) throws IOException { - super(reader, term.field()); - - if (minSimilarity >= 1.0f) - throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal to 1"); - else if (minSimilarity < 0.0f) - throw new IllegalArgumentException("minimumSimilarity cannot be less than 0"); - if(prefixLength < 0) - throw new IllegalArgumentException("prefixLength cannot be less than 0"); - - this.minimumSimilarity = minSimilarity; - this.scale_factor = 1.0f / (1.0f - minimumSimilarity); - - //The prefix could be longer than the word. - //It's kind of silly though. It means we must match the entire word. - final int fullSearchTermLength = term.text().length(); - final int realPrefixLength = prefixLength > fullSearchTermLength ? fullSearchTermLength : prefixLength; - - this.text = term.text().substring(realPrefixLength).toCharArray(); - final String prefix = term.text().substring(0, realPrefixLength); - prefixBytesRef = new BytesRef(prefix); - prefixLen = prefix.length(); - initializeMaxDistances(); - this.d = initDistanceArray(); - - setInitialSeekTerm(prefixBytesRef); + private static final int min(int a, int b, int c) { + final int t = (a < b) ? a : b; + return (t < c) ? 
t : c; } - - private final BytesRef prefixBytesRef; - // used for unicode conversion from BytesRef byte[] to char[] - private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); /** - * The termCompare method in FuzzyTermEnum uses Levenshtein distance to - * calculate the distance between the given term and the comparing term. + * Implement fuzzy enumeration with automaton. + *

+ * This is the fastest method as opposed to LinearFuzzyTermsEnum: + * as enumeration is logarithmic to the number of terms (instead of linear) + * and comparison is linear to length of the term (rather than quadratic) */ - @Override - protected final AcceptStatus accept(BytesRef term) { - if (term.startsWith(prefixBytesRef)) { - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); - final float similarity = similarity(utf16.result, prefixLen, utf16.length - prefixLen); - if (similarity > minimumSimilarity) { - boostAtt.setBoost((float)((similarity - minimumSimilarity) * scale_factor)); - return AcceptStatus.YES; - } else return AcceptStatus.NO; - } else { - return AcceptStatus.END; + private class AutomatonFuzzyTermsEnum extends AutomatonTermsEnum { + private final ByteRunAutomaton matchers[]; + + private final BytesRef termRef; + + private final BytesRef lastTerm; + private final MultiTermQuery.BoostAttribute boostAtt = + attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + + public AutomatonFuzzyTermsEnum(ByteRunAutomaton matchers[], + BytesRef lastTerm) throws IOException { + super(matchers[matchers.length - 1], term.field(), reader, true, null); + this.matchers = matchers; + this.lastTerm = lastTerm; + termRef = new BytesRef(term.text()); } + + /** finds the smallest Lev(n) DFA that accepts the term. */ + @Override + protected AcceptStatus accept(BytesRef term) { + if (term.equals(termRef)) { // ed = 0 + boostAtt.setBoost(1.0F); + return AcceptStatus.YES_AND_SEEK; + } + + int codePointCount = -1; + + // TODO: benchmark doing this backwards + for (int i = 1; i < matchers.length; i++) + if (matchers[i].run(term.bytes, 0, term.length)) { + // this sucks, we convert just to score based on length. 
+ if (codePointCount == -1) { + codePointCount = UnicodeUtil.codePointCount(term); + } + final float similarity = 1.0f - ((float) i / (float) + (Math.min(codePointCount, termLength))); + if (similarity > minSimilarity) { + boostAtt.setBoost((float) ((similarity - minSimilarity) * scale_factor)); + return AcceptStatus.YES_AND_SEEK; + } else { + return AcceptStatus.NO_AND_SEEK; + } + } + + return AcceptStatus.NO_AND_SEEK; + } + + /** defers to superclass, except can start at an arbitrary location */ + @Override + protected BytesRef nextSeekTerm(BytesRef term) throws IOException { + if (term == null) + term = lastTerm; + return super.nextSeekTerm(term); + } } - /****************************** - * Compute Levenshtein distance - ******************************/ - /** - * Finds and returns the smallest of three integers + * Implement fuzzy enumeration with linear brute force. */ - private static final int min(int a, int b, int c) { - final int t = (a < b) ? a : b; - return (t < c) ? t : c; - } + private class LinearFuzzyTermsEnum extends FilteredTermsEnum { + + /* This should be somewhere around the average long word. + * If it is longer, we waste time and space. If it is shorter, we waste a + * little bit of time growing the array as we encounter longer words. + */ + private static final int TYPICAL_LONGEST_WORD_IN_INDEX = 19; + + /* Allows us save time required to create a new array + * every time similarity is called. + */ + private int[][] d; + + // this is the text, minus the prefix + private final int[] text; + + private final int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX]; + + private final MultiTermQuery.BoostAttribute boostAtt = + attributes().addAttribute(MultiTermQuery.BoostAttribute.class); + + /** + * Constructor for enumeration of all terms from specified reader which share a prefix of + * length prefixLength with term and which have a fuzzy similarity > + * minSimilarity. + *

+ * After calling the constructor the enumeration is already pointing to the first + * valid term if such a term exists. + * + * @param reader Delivers terms. + * @param term Pattern term. + * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f. + * @param prefixLength Length of required common prefix. Default value is 0. + * @throws IOException + */ + public LinearFuzzyTermsEnum() throws IOException { + super(reader, term.field()); - private final int[][] initDistanceArray(){ - return new int[this.text.length + 1][TYPICAL_LONGEST_WORD_IN_INDEX]; - } - - /** - *

Similarity returns a number that is 1.0f or less (including negative numbers) - * based on how similar the Term is compared to a target term. It returns - * exactly 0.0f when - *

-   *    editDistance > maximumEditDistance
- * Otherwise it returns: - *
-   *    1 - (editDistance / length)
- * where length is the length of the shortest term (text or target) including a - * prefix that are identical and editDistance is the Levenshtein distance for - * the two words.

- * - *

Embedded within this algorithm is a fail-fast Levenshtein distance - * algorithm. The fail-fast algorithm differs from the standard Levenshtein - * distance algorithm in that it is aborted if it is discovered that the - * minimum distance between the words is greater than some threshold. - * - *

To calculate the maximum distance threshold we use the following formula: - *

-   *     (1 - minimumSimilarity) * length
- * where length is the shortest term including any prefix that is not part of the - * similarity comparison. This formula was derived by solving for what maximum value - * of distance returns false for the following statements: - *
-   *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
-   *   return (similarity > minimumSimilarity);
- * where distance is the Levenshtein distance for the two words. - *

- *

Levenshtein distance (also known as edit distance) is a measure of similarity - * between two strings where the distance is measured as the number of character - * deletions, insertions or substitutions required to transform one string to - * the other string. - * @param target the target word or phrase - * @return the similarity, 0.0 or less indicates that it matches less than the required - * threshold and 1.0 indicates that the text and target are identical - */ - private final float similarity(final char[] target, int offset, int length) { - final int m = length; - final int n = text.length; - if (n == 0) { - //we don't have anything to compare. That means if we just add - //the letters for m we get the new word - return prefixLen == 0 ? 0.0f : 1.0f - ((float) m / prefixLen); + this.text = new int[termLength - realPrefixLength]; + System.arraycopy(termText, realPrefixLength, text, 0, text.length); + final String prefix = new String(termText, 0, realPrefixLength); + prefixBytesRef = new BytesRef(prefix); + initializeMaxDistances(); + this.d = initDistanceArray(); + + setInitialSeekTerm(prefixBytesRef); } - if (m == 0) { - return prefixLen == 0 ? 0.0f : 1.0f - ((float) n / prefixLen); + + private final BytesRef prefixBytesRef; + // used for unicode conversion from BytesRef byte[] to int[] + private final IntsRef utf32 = new IntsRef(TYPICAL_LONGEST_WORD_IN_INDEX); + + /** + * The termCompare method in FuzzyTermEnum uses Levenshtein distance to + * calculate the distance between the given term and the comparing term. 
+ */ + @Override + protected final AcceptStatus accept(BytesRef term) { + if (term.startsWith(prefixBytesRef)) { + UnicodeUtil.UTF8toUTF32(term, utf32); + final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength); + if (similarity > minSimilarity) { + boostAtt.setBoost((float)((similarity - minSimilarity) * scale_factor)); + return AcceptStatus.YES; + } else return AcceptStatus.NO; + } else { + return AcceptStatus.END; + } } - - final int maxDistance = getMaxDistance(m); - - if (maxDistance < Math.abs(m-n)) { - //just adding the characters of m to n or vice-versa results in - //too many edits - //for example "pre" length is 3 and "prefixes" length is 8. We can see that - //given this optimal circumstance, the edit distance cannot be less than 5. - //which is 8-3 or more precisely Math.abs(3-8). - //if our maximum edit distance is 4, then we can discard this word - //without looking at it. - return 0.0f; + + /****************************** + * Compute Levenshtein distance + ******************************/ + + private final int[][] initDistanceArray(){ + return new int[this.text.length + 1][TYPICAL_LONGEST_WORD_IN_INDEX]; } - - //let's make sure we have enough room in our array to do the distance calculations. - if (d[0].length <= m) { - growDistanceArray(m); - } - - // init matrix d - for (int i = 0; i <= n; i++) d[i][0] = i; - for (int j = 0; j <= m; j++) d[0][j] = j; - // start computing edit distance - for (int i = 1; i <= n; i++) { - int bestPossibleEditDistance = m; - final char s_i = text[i - 1]; - for (int j = 1; j <= m; j++) { - if (s_i != target[offset+j-1]) { + /** + *

Similarity returns a number that is 1.0f or less (including negative numbers) + * based on how similar the Term is compared to a target term. It returns + * exactly 0.0f when + *

+     *    editDistance > maximumEditDistance
+ * Otherwise it returns: + *
+     *    1 - (editDistance / length)
+ * where length is the length of the shortest term (text or target) including a + * prefix that are identical and editDistance is the Levenshtein distance for + * the two words.

+ * + *

Embedded within this algorithm is a fail-fast Levenshtein distance + * algorithm. The fail-fast algorithm differs from the standard Levenshtein + * distance algorithm in that it is aborted if it is discovered that the + * minimum distance between the words is greater than some threshold. + * + *

To calculate the maximum distance threshold we use the following formula: + *

+     *     (1 - minimumSimilarity) * length
+ * where length is the shortest term including any prefix that is not part of the + * similarity comparison. This formula was derived by solving for what maximum value + * of distance returns false for the following statements: + *
+     *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
+     *   return (similarity > minimumSimilarity);
+ * where distance is the Levenshtein distance for the two words. + *

+ *

Levenshtein distance (also known as edit distance) is a measure of similarity + * between two strings where the distance is measured as the number of character + * deletions, insertions or substitutions required to transform one string to + * the other string. + * @param target the target word or phrase + * @return the similarity, 0.0 or less indicates that it matches less than the required + * threshold and 1.0 indicates that the text and target are identical + */ + private final float similarity(final int[] target, int offset, int length) { + final int m = length; + final int n = text.length; + if (n == 0) { + //we don't have anything to compare. That means if we just add + //the letters for m we get the new word + return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength); + } + if (m == 0) { + return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength); + } + + final int maxDistance = getMaxDistance(m); + + if (maxDistance < Math.abs(m-n)) { + //just adding the characters of m to n or vice-versa results in + //too many edits + //for example "pre" length is 3 and "prefixes" length is 8. We can see that + //given this optimal circumstance, the edit distance cannot be less than 5. + //which is 8-3 or more precisely Math.abs(3-8). + //if our maximum edit distance is 4, then we can discard this word + //without looking at it. + return 0.0f; + } + + //let's make sure we have enough room in our array to do the distance calculations. 
+ if (d[0].length <= m) { + growDistanceArray(m); + } + + // init matrix d + for (int i = 0; i <= n; i++) d[i][0] = i; + for (int j = 0; j <= m; j++) d[0][j] = j; + + // start computing edit distance + for (int i = 1; i <= n; i++) { + int bestPossibleEditDistance = m; + final int s_i = text[i - 1]; + for (int j = 1; j <= m; j++) { + if (s_i != target[offset+j-1]) { d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1; + } + else { + d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]); + } + bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i][j]); } - else { - d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]); + + //After calculating row i, the best possible edit distance + //can be found by found by finding the smallest value in a given column. + //If the bestPossibleEditDistance is greater than the max distance, abort. + + if (i > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater + //the closest the target can be to the text is just too far away. + //this target is leaving the party early. + return 0.0f; } - bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i][j]); } - - //After calculating row i, the best possible edit distance - //can be found by found by finding the smallest value in a given column. - //If the bestPossibleEditDistance is greater than the max distance, abort. - - if (i > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater - //the closest the target can be to the text is just too far away. - //this target is leaving the party early. - return 0.0f; + + // this will return less than 0.0 when the edit distance is + // greater than the number of characters in the shorter word. 
+ // but this was the formula that was previously used in FuzzyTermEnum, + // so it has not been changed (even though minimumSimilarity must be + // greater than 0.0) + return 1.0f - ((float)d[n][m] / (float) (realPrefixLength + Math.min(n, m))); + } + + /** + * Grow the second dimension of the array, so that we can calculate the + * Levenshtein difference. + */ + private void growDistanceArray(int m) { + for (int i = 0; i < d.length; i++) { + d[i] = new int[m+1]; } } - - // this will return less than 0.0 when the edit distance is - // greater than the number of characters in the shorter word. - // but this was the formula that was previously used in FuzzyTermEnum, - // so it has not been changed (even though minimumSimilarity must be - // greater than 0.0) - return 1.0f - ((float)d[n][m] / (float) (prefixLen + Math.min(n, m))); - } - - /** - * Grow the second dimension of the array, so that we can calculate the - * Levenshtein difference. - */ - private void growDistanceArray(int m) { - for (int i = 0; i < d.length; i++) { - d[i] = new int[m+1]; + + /** + * The max Distance is the maximum Levenshtein distance for the text + * compared to some other value that results in score that is + * better than the minimum similarity. + * @param m the length of the "other value" + * @return the maximum levenshtein distance that we care about + */ + private final int getMaxDistance(int m) { + return (m < maxDistances.length) ? maxDistances[m] : calculateMaxDistance(m); } - } - - /** - * The max Distance is the maximum Levenshtein distance for the text - * compared to some other value that results in score that is - * better than the minimum similarity. - * @param m the length of the "other value" - * @return the maximum levenshtein distance that we care about - */ - private final int getMaxDistance(int m) { - return (m < maxDistances.length) ? 
maxDistances[m] : calculateMaxDistance(m); - } - - private void initializeMaxDistances() { - for (int i = 0; i < maxDistances.length; i++) { - maxDistances[i] = calculateMaxDistance(i); + + private void initializeMaxDistances() { + for (int i = 0; i < maxDistances.length; i++) { + maxDistances[i] = calculateMaxDistance(i); + } } + + private int calculateMaxDistance(int m) { + return (int) ((1-minSimilarity) * (Math.min(text.length, m) + realPrefixLength)); + } } - - private int calculateMaxDistance(int m) { - return (int) ((1-minimumSimilarity) * (Math.min(text.length, m) + prefixLen)); - } } Index: lucene/src/java/org/apache/lucene/search/AutomatonQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/AutomatonQuery.java (revision 940218) +++ lucene/src/java/org/apache/lucene/search/AutomatonQuery.java (working copy) @@ -24,7 +24,9 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.MultiFields; import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunAutomaton; import org.apache.lucene.util.automaton.BasicAutomata; import org.apache.lucene.util.automaton.BasicOperations; import org.apache.lucene.util.automaton.MinimizationOperations; @@ -50,10 +52,14 @@ */ public class AutomatonQuery extends MultiTermQuery { /** the automaton to match index terms against */ - protected Automaton automaton; + protected final Automaton automaton; /** term containing the field, and possibly some pattern structure */ - protected Term term; + protected final Term term; + transient ByteRunAutomaton runAutomaton; + transient boolean isFinite; + transient BytesRef commonSuffixRef; + /** * Create a new AutomatonQuery from an {@link Automaton}. 
* @@ -69,6 +75,14 @@ MinimizationOperations.minimize(automaton); } + private void compileAutomaton() { + if (runAutomaton == null) { + runAutomaton = new ByteRunAutomaton(automaton); + isFinite = SpecialOperations.isFinite(automaton); + commonSuffixRef = isFinite ? null : SpecialOperations.getCommonSuffixBytesRef(runAutomaton.getAutomaton()); + } + } + @Override protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { // matches nothing @@ -85,28 +99,42 @@ String singleton = automaton.getSingleton(); if (singleton != null) return new SingleTermsEnum(reader, term.createTerm(singleton)); - + // matches a fixed string in expanded representation - String commonPrefix = SpecialOperations.getCommonPrefix(automaton); - if (automaton.equals(BasicAutomata.makeString(commonPrefix))) { - return new SingleTermsEnum(reader, term.createTerm(commonPrefix)); - } + final String commonPrefix = SpecialOperations.getCommonPrefix(automaton); + + if (commonPrefix.length() > 0) { + if (BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) { + return new SingleTermsEnum(reader, term.createTerm(commonPrefix)); + } - // matches a constant prefix - Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata - .makeString(commonPrefix), BasicAutomata.makeAnyString()); - if (automaton.equals(prefixAutomaton)) { - return new PrefixTermsEnum(reader, term.createTerm(commonPrefix)); + // matches a constant prefix + Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata + .makeString(commonPrefix), BasicAutomata.makeAnyString()); + if (BasicOperations.sameLanguage(automaton, prefixAutomaton)) { + return new PrefixTermsEnum(reader, term.createTerm(commonPrefix)); + } } + + compileAutomaton(); - return new AutomatonTermsEnum(automaton, term, reader); + return new AutomatonTermsEnum(runAutomaton, term.field(), reader, isFinite, commonSuffixRef); } - + @Override public int hashCode() { final int prime = 31; int result = 
super.hashCode(); - result = prime * result + ((automaton == null) ? 0 : automaton.hashCode()); + if (automaton != null) { + // we already minimized the automaton in the ctor, so + // this hash code will be the same for automata that + // are the same: + int automatonHashCode = automaton.getNumberOfStates() * 3 + automaton.getNumberOfTransitions() * 2; + if (automatonHashCode == 0) { + automatonHashCode = 1; + } + result = prime * result + automatonHashCode; + } result = prime * result + ((term == null) ? 0 : term.hashCode()); return result; } @@ -123,7 +151,7 @@ if (automaton == null) { if (other.automaton != null) return false; - } else if (!automaton.equals(other.automaton)) + } else if (!BasicOperations.sameLanguage(automaton, other.automaton)) return false; if (term == null) { if (other.term != null) Index: lucene/src/java/org/apache/lucene/util/automaton/Automaton.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/Automaton.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/automaton/Automaton.java (working copy) @@ -31,14 +31,17 @@ import java.io.Serializable; import java.util.Arrays; +import java.util.BitSet; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; -import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; + /** * Finite-state automaton with regular expression operations. *

@@ -93,7 +96,7 @@ /** * Hash code. Recomputed by {@link MinimizationOperations#minimize(Automaton)} */ - int hash_code; + //int hash_code; /** Singleton string. Null if not applicable. */ String singleton; @@ -116,14 +119,14 @@ * @see State * @see Transition */ - public Automaton() { - initial = new State(); + public Automaton(State initial) { + this.initial = initial; deterministic = true; singleton = null; } - - boolean isDebug() { - return System.getProperty("dk.brics.automaton.debug") != null; + + public Automaton() { + this(new State()); } /** @@ -198,10 +201,12 @@ * * @param s state */ + /* public void setInitialState(State s) { initial = s; singleton = null; } + */ /** * Gets initial state. @@ -252,34 +257,70 @@ public Object getInfo() { return info; } - - /** - * Returns the set of states that are reachable from the initial state. - * - * @return set of {@link State} objects - */ - public Set getStates() { - expandSingleton(); - Set visited; - if (isDebug()) visited = new LinkedHashSet(); - else visited = new HashSet(); - LinkedList worklist = new LinkedList(); - worklist.add(initial); - visited.add(initial); - while (worklist.size() > 0) { - State s = worklist.removeFirst(); - Collection tr; - if (isDebug()) tr = s.getSortedTransitions(false); - else tr = s.transitions; - for (Transition t : tr) - if (!visited.contains(t.to)) { - visited.add(t.to); - worklist.add(t.to); + + // cached + private State[] numberedStates; + + public State[] getNumberedStates() { + if (numberedStates == null) { + expandSingleton(); + final Set visited = new HashSet(); + final LinkedList worklist = new LinkedList(); + numberedStates = new State[4]; + int upto = 0; + worklist.add(initial); + visited.add(initial); + initial.number = upto; + numberedStates[upto] = initial; + upto++; + while (worklist.size() > 0) { + State s = worklist.removeFirst(); + for (int i=0;i 0) { State s = worklist.removeFirst(); if (s.accept) accepts.add(s); - for (Transition t : s.transitions) + for 
(Transition t : s.getTransitions()) if (!visited.contains(t.to)) { visited.add(t.to); worklist.add(t.to); @@ -305,32 +346,25 @@ } /** - * Assigns consecutive numbers to the given states. - */ - static void setStateNumbers(Set states) { - int number = 0; - for (State s : states) - s.number = number++; - } - - /** * Adds transitions to explicit crash state to ensure that transition function * is total. */ void totalize() { State s = new State(); - s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE, + s.addTransition(new Transition(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT, s)); - for (State p : getStates()) { - int maxi = Character.MIN_VALUE; - for (Transition t : p.getSortedTransitions(false)) { - if (t.min > maxi) p.transitions.add(new Transition((char) maxi, - (char) (t.min - 1), s)); + for (State p : getNumberedStates()) { + int maxi = Character.MIN_CODE_POINT; + p.sortTransitions(Transition.CompareByMinMaxThenDest); + for (Transition t : p.getTransitions()) { + if (t.min > maxi) p.addTransition(new Transition(maxi, + (t.min - 1), s)); if (t.max + 1 > maxi) maxi = t.max + 1; } - if (maxi <= Character.MAX_VALUE) p.transitions.add(new Transition( - (char) maxi, Character.MAX_VALUE, s)); + if (maxi <= Character.MAX_CODE_POINT) p.addTransition(new Transition( + maxi, Character.MAX_CODE_POINT, s)); } + clearNumberedStates(); } /** @@ -349,52 +383,28 @@ * and adjacent edge intervals with same destination. 
*/ public void reduce() { + final State[] states = getNumberedStates(); if (isSingleton()) return; - Set states = getStates(); - setStateNumbers(states); - for (State s : states) { - List st = s.getSortedTransitions(true); - s.resetTransitions(); - State p = null; - int min = -1, max = -1; - for (Transition t : st) { - if (p == t.to) { - if (t.min <= max + 1) { - if (t.max > max) max = t.max; - } else { - if (p != null) s.transitions.add(new Transition((char) min, - (char) max, p)); - min = t.min; - max = t.max; - } - } else { - if (p != null) s.transitions.add(new Transition((char) min, - (char) max, p)); - p = t.to; - min = t.min; - max = t.max; - } - } - if (p != null) s.transitions - .add(new Transition((char) min, (char) max, p)); - } + for (State s : states) + s.reduce(); } /** * Returns sorted array of all interval start points. */ - char[] getStartPoints() { - Set pointset = new HashSet(); - for (State s : getStates()) { - pointset.add(Character.MIN_VALUE); - for (Transition t : s.transitions) { + int[] getStartPoints() { + final State[] states = getNumberedStates(); + Set pointset = new HashSet(); + pointset.add(Character.MIN_CODE_POINT); + for (State s : states) { + for (Transition t : s.getTransitions()) { pointset.add(t.min); - if (t.max < Character.MAX_VALUE) pointset.add((char) (t.max + 1)); + if (t.max < Character.MAX_CODE_POINT) pointset.add((t.max + 1)); } } - char[] points = new char[pointset.size()]; + int[] points = new int[pointset.size()]; int n = 0; - for (Character m : pointset) + for (Integer m : pointset) points[n++] = m; Arrays.sort(points); return points; @@ -406,47 +416,71 @@ * * @return set of {@link State} objects */ - public Set getLiveStates() { - expandSingleton(); - return getLiveStates(getStates()); - } - - private Set getLiveStates(Set states) { - HashMap> map = new HashMap>(); - for (State s : states) - map.put(s, new HashSet()); - for (State s : states) - for (Transition t : s.transitions) - map.get(t.to).add(s); - Set live = 
new HashSet(getAcceptStates()); + private State[] getLiveStates() { + final State[] states = getNumberedStates(); + Set live = new HashSet(); + for (State q : states) { + if (q.isAccept()) { + live.add(q); + } + } + // map> + Set map[] = new Set[states.length]; + for (int i = 0; i < map.length; i++) + map[i] = new HashSet(); + for (State s : states) { + for(int i=0;i worklist = new LinkedList(live); while (worklist.size() > 0) { State s = worklist.removeFirst(); - for (State p : map.get(s)) + for (State p : map[s.number]) if (!live.contains(p)) { live.add(p); worklist.add(p); } } - return live; + + return live.toArray(new State[live.size()]); } - + /** - * Removes transitions to dead states and calls {@link #reduce()} and - * {@link #clearHashCode()}. (A state is "dead" if no accept state is + * Removes transitions to dead states and calls {@link #reduce()}. + * (A state is "dead" if no accept state is * reachable from it.) */ public void removeDeadTransitions() { - clearHashCode(); + final State[] states = getNumberedStates(); + //clearHashCode(); if (isSingleton()) return; - Set states = getStates(); - Set live = getLiveStates(states); + State[] live = getLiveStates(); + + BitSet liveSet = new BitSet(states.length); + for (State s : live) + liveSet.set(s.number); + for (State s : states) { - Set st = s.transitions; - s.resetTransitions(); - for (Transition t : st) - if (live.contains(t.to)) s.transitions.add(t); + // filter out transitions to dead states: + int upto = 0; + for(int i=0;i 0) { + setNumberedStates(live); + } else { + // sneaky corner case -- if machine accepts no strings + clearNumberedStates(); + } reduce(); } @@ -454,11 +488,15 @@ * Returns a sorted array of transitions for each state (and sets state * numbers). 
*/ - static Transition[][] getSortedTransitions(Set states) { - setStateNumbers(states); - Transition[][] transitions = new Transition[states.size()][]; - for (State s : states) - transitions[s.number] = s.getSortedTransitionArray(false); + Transition[][] getSortedTransitions() { + final State[] states = getNumberedStates(); + Transition[][] transitions = new Transition[states.length][]; + for (State s : states) { + s.sortTransitions(Transition.CompareByMinMaxThenDest); + s.trimTransitionsArray(); + transitions[s.number] = s.transitionsArray; + assert s.transitionsArray != null; + } return transitions; } @@ -470,9 +508,9 @@ if (isSingleton()) { State p = new State(); initial = p; - for (int i = 0; i < singleton.length(); i++) { + for (int i = 0, cp = 0; i < singleton.length(); i += Character.charCount(cp)) { State q = new State(); - p.transitions.add(new Transition(singleton.charAt(i), q)); + p.addTransition(new Transition(cp = singleton.codePointAt(i), q)); p = q; } p.accept = true; @@ -485,8 +523,8 @@ * Returns the number of states in this automaton. */ public int getNumberOfStates() { - if (isSingleton()) return singleton.length() + 1; - return getStates().size(); + if (isSingleton()) return singleton.codePointCount(0, singleton.length()) + 1; + return getNumberedStates().length; } /** @@ -494,45 +532,31 @@ * as the total number of edges, where one edge may be a character interval. */ public int getNumberOfTransitions() { - if (isSingleton()) return singleton.length(); + if (isSingleton()) return singleton.codePointCount(0, singleton.length()); int c = 0; - for (State s : getStates()) - c += s.transitions.size(); + for (State s : getNumberedStates()) + c += s.numTransitions(); return c; } - /** - * Returns true if the language of this automaton is equal to the language of - * the given automaton. Implemented using hashCode and - * subsetOf. 
- */ @Override public boolean equals(Object obj) { - if (obj == this) return true; - if (!(obj instanceof Automaton)) return false; - Automaton a = (Automaton) obj; - if (isSingleton() && a.isSingleton()) return singleton.equals(a.singleton); - return hashCode() == a.hashCode() && BasicOperations.subsetOf(this, a) - && BasicOperations.subsetOf(a, this); + throw new UnsupportedOperationException("use BasicOperations.sameLanguage instead"); } - - /** - * Returns hash code for this automaton. The hash code is based on the number - * of states and transitions in the minimized automaton. Invoking this method - * may involve minimizing the automaton. - */ + @Override public int hashCode() { - if (hash_code == 0) MinimizationOperations.minimize(this); - return hash_code; + throw new UnsupportedOperationException(); } /** * Must be invoked when the stored hash code may no longer be valid. */ + /* void clearHashCode() { hash_code = 0; } + */ /** * Returns a string representation of this automaton. 
@@ -542,12 +566,15 @@ StringBuilder b = new StringBuilder(); if (isSingleton()) { b.append("singleton: "); - for (char c : singleton.toCharArray()) + int length = singleton.codePointCount(0, singleton.length()); + int codepoints[] = new int[length]; + for (int i = 0, j = 0, cp = 0; i < singleton.length(); i += Character.charCount(cp)) + codepoints[j++] = cp = singleton.codePointAt(i); + for (int c : codepoints) Transition.appendCharString(c, b); b.append("\n"); } else { - Set states = getStates(); - setStateNumbers(states); + State[] states = getNumberedStates(); b.append("initial state: ").append(initial.number).append("\n"); for (State s : states) b.append(s.toString()); @@ -562,8 +589,7 @@ public String toDot() { StringBuilder b = new StringBuilder("digraph Automaton {\n"); b.append(" rankdir = LR;\n"); - Set states = getStates(); - setStateNumbers(states); + State[] states = getNumberedStates(); for (State s : states) { b.append(" ").append(s.number); if (s.accept) b.append(" [shape=doublecircle,label=\"\"];\n"); @@ -572,7 +598,7 @@ b.append(" initial [shape=plaintext,label=\"\"];\n"); b.append(" initial -> ").append(s.number).append("\n"); } - for (Transition t : s.transitions) { + for (Transition t : s.getTransitions()) { b.append(" ").append(s.number); t.appendDot(b); } @@ -609,17 +635,18 @@ Automaton a = (Automaton) super.clone(); if (!isSingleton()) { HashMap m = new HashMap(); - Set states = getStates(); + State[] states = getNumberedStates(); for (State s : states) m.put(s, new State()); for (State s : states) { State p = m.get(s); p.accept = s.accept; if (s == initial) a.initial = p; - for (Transition t : s.transitions) - p.transitions.add(new Transition(t.min, t.max, m.get(t.to))); + for (Transition t : s.getTransitions()) + p.addTransition(new Transition(t.min, t.max, m.get(t.to))); } } + a.clearNumberedStates(); return a; } catch (CloneNotSupportedException e) { throw new RuntimeException(e); Index: 
lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java (revision 0) @@ -0,0 +1,51 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class CharacterRunAutomaton extends RunAutomaton { + + public CharacterRunAutomaton(Automaton a) { + super(a, Character.MAX_CODE_POINT, false); + } + + /** + * Returns true if the given string is accepted by this automaton. 
+ */ + public boolean run(String s) { + int p = initial; + int l = s.length(); + for (int i = 0, cp = 0; i < l; i += Character.charCount(cp)) { + p = step(p, cp = s.codePointAt(i)); + if (p == -1) return false; + } + return accept[p]; + } + + /** + * Returns true if the given string is accepted by this automaton + */ + public boolean run(char[] s, int offset, int length) { + int p = initial; + int l = offset + length; + for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) { + p = step(p, cp = Character.codePointAt(s, i, l)); + if (p == -1) return false; + } + return accept[p]; + } +} Property changes on: lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (working copy) @@ -31,7 +31,6 @@ import java.util.ArrayList; import java.util.LinkedList; -import java.util.Set; /** * Operations for minimizing automata. @@ -41,7 +40,7 @@ final public class MinimizationOperations { private MinimizationOperations() {} - + /** * Minimizes (and determinizes if not already deterministic) the given * automaton. 
@@ -53,8 +52,8 @@ minimizeHopcroft(a); } // recompute hash code - a.hash_code = a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2; - if (a.hash_code == 0) a.hash_code = 1; + //a.hash_code = a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2; + //if (a.hash_code == 0) a.hash_code = 1; } private static void initialize(ArrayList list, int size) { @@ -67,24 +66,18 @@ */ public static void minimizeHopcroft(Automaton a) { a.determinize(); - Set tr = a.initial.getTransitions(); - if (tr.size() == 1) { - Transition t = tr.iterator().next(); - if (t.to == a.initial && t.min == Character.MIN_VALUE - && t.max == Character.MAX_VALUE) return; + if (a.initial.numTransitions == 1) { + Transition t = a.initial.transitionsArray[0]; + if (t.to == a.initial && t.min == Character.MIN_CODE_POINT + && t.max == Character.MAX_CODE_POINT) return; } a.totalize(); - // make arrays for numbered states and effective alphabet - Set ss = a.getStates(); - State[] states = new State[ss.size()]; - int number = 0; - for (State q : ss) { - states[number] = q; - q.number = number++; - } - char[] sigma = a.getStartPoints(); + + int[] sigma = a.getStartPoints(); // initialize data structures ArrayList>> reverse = new ArrayList>>(); + final State[] states = a.getNumberedStates(); + for (int q = 0; q < states.length; q++) { ArrayList> v = new ArrayList>(); initialize(v, sigma.length); @@ -121,7 +114,7 @@ partition.get(j).add(qq); block[qq.number] = j; for (int x = 0; x < sigma.length; x++) { - char y = sigma[x]; + int y = sigma[x]; State p = qq.step(y); reverse.get(p.number).get(x).add(qq); reverse_nonempty[p.number][x] = true; @@ -218,9 +211,10 @@ for (int n = 0; n < newstates.length; n++) { State s = newstates[n]; s.accept = states[s.number].accept; - for (Transition t : states[s.number].transitions) - s.transitions.add(new Transition(t.min, t.max, newstates[t.to.number])); + for (Transition t : states[s.number].getTransitions()) + s.addTransition(new Transition(t.min, t.max, 
newstates[t.to.number])); } + a.clearNumberedStates(); a.removeDeadTransitions(); } Index: lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (working copy) @@ -70,42 +70,42 @@ State s = new State(); a.initial = s; s.accept = true; - s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE, + s.addTransition(new Transition(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT, s)); a.deterministic = true; return a; } /** - * Returns a new (deterministic) automaton that accepts any single character. + * Returns a new (deterministic) automaton that accepts any single codepoint. */ public static Automaton makeAnyChar() { - return makeCharRange(Character.MIN_VALUE, Character.MAX_VALUE); + return makeCharRange(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); } /** - * Returns a new (deterministic) automaton that accepts a single character of + * Returns a new (deterministic) automaton that accepts a single codepoint of * the given value. */ - public static Automaton makeChar(char c) { + public static Automaton makeChar(int c) { Automaton a = new Automaton(); - a.singleton = Character.toString(c); + a.singleton = new String(Character.toChars(c)); a.deterministic = true; return a; } /** - * Returns a new (deterministic) automaton that accepts a single char whose + * Returns a new (deterministic) automaton that accepts a single codepoint whose * value is in the given interval (including both end points). 
*/ - public static Automaton makeCharRange(char min, char max) { + public static Automaton makeCharRange(int min, int max) { if (min == max) return makeChar(min); Automaton a = new Automaton(); State s1 = new State(); State s2 = new State(); a.initial = s1; s2.accept = true; - if (min <= max) s1.transitions.add(new Transition(min, max, s2)); + if (min <= max) s1.addTransition(new Transition(min, max, s2)); a.deterministic = true; return a; } Index: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java (revision 0) @@ -0,0 +1,326 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.automaton.State; +import org.apache.lucene.util.automaton.Transition; + +import java.util.List; +import java.util.ArrayList; + +// TODO +// - do we really need the .bits...? 
if not we can make util in UnicodeUtil to convert 1 char into a BytesRef + +final class UTF32ToUTF8 { + + // Unicode boundaries for UTF8 bytes 1,2,3,4 + private static final int[] startCodes = new int[] {0, 128, 2048, 65536}; + private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111}; + + static int[] MASKS = new int[32]; + static { + int v = 2; + for(int i=0;i<32;i++) { + MASKS[i] = v-1; + v *= 2; + } + } + + // Represents one of the N utf8 bytes that (in sequence) + // define a code point. value is the byte value; bits is + // how many bits are "used" by utf8 at that byte + private static class UTF8Byte { + int value; // TODO: change to byte + byte bits; + } + + // Holds a single code point, as a sequence of 1-4 utf8 bytes: + // TODO: maybe move to UnicodeUtil? + private static class UTF8Sequence { + private final UTF8Byte[] bytes; + private int len; + + public UTF8Sequence() { + bytes = new UTF8Byte[4]; + for(int i=0;i<4;i++) { + bytes[i] = new UTF8Byte(); + } + } + + public int byteAt(int idx) { + return bytes[idx].value; + } + + public int numBits(int idx) { + return bytes[idx].bits; + } + + private void set(int code) { + if (code < 128) { + // 0xxxxxxx + bytes[0].value = code; + bytes[0].bits = 7; + len = 1; + } else if (code < 2048) { + // 110yyyxx 10xxxxxx + bytes[0].value = (6 << 5) | (code >> 6); + bytes[0].bits = 5; + setRest(code, 1); + len = 2; + } else if (code < 65536) { + // 1110yyyy 10yyyyxx 10xxxxxx + bytes[0].value = (14 << 4) | (code >> 12); + bytes[0].bits = 4; + setRest(code, 2); + len = 3; + } else { + // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx + bytes[0].value = (30 << 3) | (code >> 18); + bytes[0].bits = 3; + setRest(code, 3); + len = 4; + } + } + + private void setRest(int code, int numBytes) { + for(int i=0;i> 6; + } + } + + public String toString() { + StringBuilder b = new StringBuilder(); + for(int i=0;i 0) { + b.append(' '); + } + b.append(Integer.toBinaryString(bytes[i].value)); + } + return b.toString(); + } + } + + 
private final UTF8Sequence startUTF8 = new UTF8Sequence(); + private final UTF8Sequence endUTF8 = new UTF8Sequence(); + + private final UTF8Sequence tmpUTF8a = new UTF8Sequence(); + private final UTF8Sequence tmpUTF8b = new UTF8Sequence(); + + // Builds necessary utf8 edges between start & end + void convertOneEdge(State start, State end, int startCodePoint, int endCodePoint) { + startUTF8.set(startCodePoint); + endUTF8.set(endCodePoint); + //System.out.println("start = " + startUTF8); + //System.out.println(" end = " + endUTF8); + build(start, end, startUTF8, endUTF8, 0); + } + + private void build(State start, State end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) { + + // Break into start, middle, end: + if (startUTF8.byteAt(upto) == endUTF8.byteAt(upto)) { + // Degen case: lead with the same byte: + if (upto == startUTF8.len-1 && upto == endUTF8.len-1) { + // Super degen: just single edge, one UTF8 byte: + start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); + return; + } else { + assert startUTF8.len > upto+1; + assert endUTF8.len > upto+1; + State n = newUTF8State(); + + // Single value leading edge + start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single + + // Recurse for the rest + build(n, end, startUTF8, endUTF8, 1+upto); + } + } else if (startUTF8.len == endUTF8.len) { + if (upto == startUTF8.len-1) { + start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); // type=startend + } else { + start(start, end, startUTF8, upto, false); + if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) { + // There is a middle + all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1); + } + end(start, end, endUTF8, upto, false); + } + } else { + + // start + start(start, end, startUTF8, upto, true); + + // possibly middle, spanning multiple num bytes + int byteCount = 1+startUTF8.len-upto; + final int limit = endUTF8.len-upto; + while 
(byteCount < limit) { + // wasteful: we only need first byte, and, we should + // statically encode this first byte: + tmpUTF8a.set(startCodes[byteCount-1]); + tmpUTF8b.set(endCodes[byteCount-1]); + all(start, end, + tmpUTF8a.byteAt(0), + tmpUTF8b.byteAt(0), + tmpUTF8a.len - 1); + byteCount++; + } + + // end + end(start, end, endUTF8, upto, true); + } + } + + private void start(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) { + if (upto == utf8.len-1) { + // Done recursing + start.addTransition(new Transition(utf8.byteAt(upto), utf8.byteAt(upto) | MASKS[utf8.numBits(upto)-1], end)); // type=start + } else { + State n = newUTF8State(); + start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=start + start(n, end, utf8, 1+upto, true); + int endCode = utf8.byteAt(upto) | MASKS[utf8.numBits(upto)-1]; + if (doAll && utf8.byteAt(upto) != endCode) { + all(start, end, utf8.byteAt(upto)+1, endCode, utf8.len-upto-1); + } + } + } + + private void end(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) { + if (upto == utf8.len-1) { + // Done recursing + start.addTransition(new Transition(utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto)-1]), utf8.byteAt(upto), end)); // type=end + } else { + final int startCode; + if (utf8.numBits(upto) == 5) { + // special case -- avoid created unused edges (utf8 + // doesn't accept certain byte sequences) -- there + // are other cases we could optimize too: + startCode = 194; + } else { + startCode = utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto)-1]); + } + if (doAll && utf8.byteAt(upto) != startCode) { + all(start, end, startCode, utf8.byteAt(upto)-1, utf8.len-upto-1); + } + State n = newUTF8State(); + start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=end + end(n, end, utf8, 1+upto, true); + } + } + + private void all(State start, State end, int startCode, int endCode, int left) { + if (left == 0) { + start.addTransition(new Transition(startCode, endCode, end)); // type=all + } 
else { + State lastN = newUTF8State(); + start.addTransition(new Transition(startCode, endCode, lastN)); // type=all + while (left > 1) { + State n = newUTF8State(); + lastN.addTransition(new Transition(128, 191, n)); // type=all* + left--; + lastN = n; + } + lastN.addTransition(new Transition(128, 191, end)); // type = all* + } + } + + private State[] utf8States; + private int utf8StateCount; + + /** Converts an incoming utf32 automaton to an equivalent + * utf8 one. The incoming automaton need not be + * deterministic. Note that the returned automaton will + * not in general be deterministic, so you must + * determinize it if that's needed. */ + public Automaton convert(Automaton utf32) { + if (utf32.isSingleton()) { + utf32 = utf32.cloneExpanded(); + } + + State[] map = new State[utf32.getNumberedStates().length]; + List pending = new ArrayList(); + State utf32State = utf32.getInitialState(); + pending.add(utf32State); + Automaton utf8 = new Automaton(); + utf8.setDeterministic(false); + + State utf8State = utf8.getInitialState(); + + utf8States = new State[5]; + utf8StateCount = 0; + utf8State.number = utf8StateCount; + utf8States[utf8StateCount] = utf8State; + utf8StateCount++; + + utf8State.setAccept(utf32State.isAccept()); + + map[utf32State.number] = utf8State; + + while(pending.size() != 0) { + utf32State = pending.remove(pending.size()-1); + utf8State = map[utf32State.number]; + for(int i=0;i set = new TreeSet(); + SortedSet set = new TreeSet(); for (int i = 0; i < word.length; i++) set.add(word[i]); - alphabet = new char[set.size()]; - Iterator iterator = set.iterator(); + alphabet = new int[set.size()]; + Iterator iterator = set.iterator(); for (int i = 0; i < alphabet.length; i++) alphabet[i] = iterator.next(); - rangeLower = new char[alphabet.length + 2]; - rangeUpper = new char[alphabet.length + 2]; + rangeLower = new int[alphabet.length + 2]; + rangeUpper = new int[alphabet.length + 2]; // calculate the unicode range intervals that exclude the 
alphabet // these are the ranges for all unicode characters not in the alphabet int lower = 0; for (int i = 0; i < alphabet.length; i++) { - char higher = alphabet[i]; + int higher = alphabet[i]; if (higher > lower) { - rangeLower[numRanges] = (char) lower; - rangeUpper[numRanges] = (char) (higher - 1); + rangeLower[numRanges] = lower; + rangeUpper[numRanges] = higher - 1; numRanges++; } lower = higher + 1; } /* add the final endpoint */ - if (lower <= 0xFFFF) { - rangeLower[numRanges] = (char) lower; - rangeUpper[numRanges] = '\uFFFF'; + if (lower <= Character.MAX_CODE_POINT) { + rangeLower[numRanges] = lower; + rangeUpper[numRanges] = Character.MAX_CODE_POINT; numRanges++; } descriptions = new ParametricDescription[] { null, /* for n=0, we do not need to go through the trouble */ - new Lev1ParametricDescription(input.length()), - new Lev2ParametricDescription(input.length()), + new Lev1ParametricDescription(word.length), + new Lev2ParametricDescription(word.length), }; } @@ -119,6 +123,7 @@ // create all states, and mark as accept states if appropriate for (int i = 0; i < states.length; i++) { states[i] = new State(); + states[i].number = i; states[i].setAccept(description.isAccept(i)); } // create transitions from state to state @@ -129,7 +134,7 @@ final int end = xpos + Math.min(word.length - xpos, range); for (int x = 0; x < alphabet.length; x++) { - final char ch = alphabet[x]; + final int ch = alphabet[x]; // get the characteristic vector at this position wrt ch final int cvec = getVector(ch, xpos, end); int dest = description.transition(k, xpos, cvec); @@ -143,13 +148,15 @@ if (dest >= 0) for (int r = 0; r < numRanges; r++) states[k].addTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest])); + // reduce the state: this doesn't appear to help anything + //states[k].reduce(); } - Automaton a = new Automaton(); - a.setInitialState(states[0]); + Automaton a = new Automaton(states[0]); a.setDeterministic(true); + a.setNumberedStates(states); // 
we need not trim transitions to dead states, as they are not created. - // a.restoreInvariant(); + //a.restoreInvariant(); return a; } @@ -157,7 +164,7 @@ * Get the characteristic vector X(x, V) * where V is substring(pos, end) */ - int getVector(char x, int pos, int end) { + int getVector(int x, int pos, int end) { int vector = 0; for (int i = pos; i < end; i++) { vector <<= 1; Index: lucene/src/java/org/apache/lucene/util/automaton/State.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/State.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/automaton/State.java (working copy) @@ -28,13 +28,15 @@ */ package org.apache.lucene.util.automaton; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; import java.io.Serializable; import java.util.Arrays; import java.util.Collection; -import java.util.HashSet; -import java.util.List; +import java.util.Comparator; import java.util.Set; +import java.util.Iterator; /** * Automaton state. @@ -44,7 +46,8 @@ public class State implements Serializable, Comparable { boolean accept; - Set transitions; + public Transition[] transitionsArray; + public int numTransitions; int number; @@ -63,8 +66,26 @@ * Resets transition set. */ final void resetTransitions() { - transitions = new HashSet(); + transitionsArray = new Transition[0]; + numTransitions = 0; } + + private class TransitionsIterable implements Iterable { + public Iterator iterator() { + return new Iterator() { + int upto; + public boolean hasNext() { + return upto < numTransitions; + } + public Transition next() { + return transitionsArray[upto++]; + } + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + } /** * Returns the set of outgoing transitions. 
Subsequent changes are reflected @@ -72,9 +93,18 @@ * * @return transition set */ - public Set getTransitions() { - return transitions; + public Iterable getTransitions() { + return new TransitionsIterable(); } + + public int numTransitions() { + return numTransitions; + } + + public void setTransitions(Transition[] transitions) { + this.numTransitions = transitions.length; + this.transitionsArray = transitions; + } /** * Adds an outgoing transition. @@ -82,7 +112,12 @@ * @param t transition */ public void addTransition(Transition t) { - transitions.add(t); + if (numTransitions == transitionsArray.length) { + final Transition[] newArray = new Transition[ArrayUtil.oversize(1+numTransitions, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; + System.arraycopy(transitionsArray, 0, newArray, 0, numTransitions); + transitionsArray = newArray; + } + transitionsArray[numTransitions++] = t; } /** @@ -106,44 +141,88 @@ /** * Performs lookup in transitions, assuming determinism. * - * @param c character to look up + * @param c codepoint to look up * @return destination state, null if no matching outgoing transition - * @see #step(char, Collection) + * @see #step(int, Collection) */ - public State step(char c) { - for (Transition t : transitions) + public State step(int c) { + assert c >= 0; + for (int i=0;i dest) { - for (Transition t : transitions) + public void step(int c, Collection dest) { + for (int i=0;i max) max = t.max; + } else { + if (p != null) { + transitionsArray[upto++] = new Transition(min, max, p); + } + min = t.min; + max = t.max; + } + } else { + if (p != null) { + transitionsArray[upto++] = new Transition(min, max, p); + } + p = t.to; + min = t.min; + max = t.max; + } + } + + if (p != null) { + transitionsArray[upto++] = new Transition(min, max, p); + } + numTransitions = upto; } - + /** * Returns sorted list of outgoing transitions. 
* @@ -151,11 +230,12 @@ * reverse max, to) * @return transition list */ - public List getSortedTransitions(boolean to_first) { - return Arrays.asList(getSortedTransitionArray(to_first)); + + /** Sorts transitions array in-place. */ + public void sortTransitions(Comparator comparator) { + Arrays.sort(transitionsArray, 0, numTransitions, comparator); } - /** * Return this state's number. *

@@ -178,7 +258,7 @@ if (accept) b.append(" [accept]"); else b.append(" [reject]"); b.append(":\n"); - for (Transition t : transitions) + for (Transition t : getTransitions()) b.append(" ").append(t.toString()).append("\n"); return b.toString(); } @@ -190,20 +270,4 @@ public int compareTo(State s) { return s.id - id; } - - /** - * See {@link java.lang.Object#equals(java.lang.Object)}. - */ - @Override - public boolean equals(Object obj) { - return super.equals(obj); - } - - /** - * See {@link java.lang.Object#hashCode()}. - */ - @Override - public int hashCode() { - return super.hashCode(); - } } Index: lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java (revision 0) @@ -0,0 +1,38 @@ +package org.apache.lucene.util.automaton; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class ByteRunAutomaton extends RunAutomaton { + + public ByteRunAutomaton(Automaton a) { + super(new UTF32ToUTF8().convert(a), 256, true); + } + + /** + * Returns true if the given byte array is accepted by this automaton + */ + public boolean run(byte[] s, int offset, int length) { + int p = initial; + int l = offset + length; + for (int i = offset; i < l; i++) { + p = step(p, s[i] & 0xFF); + if (p == -1) return false; + } + return accept[p]; + } +} Property changes on: lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java (working copy) @@ -1,75 +0,0 @@ -/* - * dk.brics.automaton - * - * Copyright (c) 2001-2009 Anders Moeller - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -package org.apache.lucene.util.automaton; - -import java.io.Serializable; -import java.util.Comparator; - -/** - * Comparator for state {@link Transition}s that orders unicode char range - * transitions in lexicographic order. - * - * @lucene.experimental - */ -class TransitionComparator implements Comparator, Serializable { - - boolean to_first; - - TransitionComparator(boolean to_first) { - this.to_first = to_first; - } - - /** - * Compares by (min, reverse max, to) or (to, min, reverse max). 
- */ - public int compare(Transition t1, Transition t2) { - if (to_first) { - if (t1.to != t2.to) { - if (t1.to == null) return -1; - else if (t2.to == null) return 1; - else if (t1.to.number < t2.to.number) return -1; - else if (t1.to.number > t2.to.number) return 1; - } - } - if (t1.min < t2.min) return -1; - if (t1.min > t2.min) return 1; - if (t1.max > t2.max) return -1; - if (t1.max < t2.max) return 1; - if (!to_first) { - if (t1.to != t2.to) { - if (t1.to == null) return -1; - else if (t2.to == null) return 1; - else if (t1.to.number < t2.to.number) return -1; - else if (t1.to.number > t2.to.number) return 1; - } - } - return 0; - } -} Index: lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (working copy) @@ -33,6 +33,8 @@ import java.util.HashSet; import java.util.Set; +import org.apache.lucene.util.BytesRef; + /** * Special automata operations. * @@ -46,7 +48,7 @@ * Finds the largest entry whose value is less than or equal to c, or 0 if * there is no such entry. */ - static int findIndex(char c, char[] points) { + static int findIndex(int c, int[] points) { int a = 0; int b = points.length; while (b - a > 1) { @@ -70,9 +72,11 @@ * Checks whether there is a loop containing s. (This is sufficient since * there are never transitions to dead states.) */ + // TODO: not great that this is recursive... 
in theory a + // large automata could exceed java's stack private static boolean isFinite(State s, HashSet path) { path.add(s); - for (Transition t : s.transitions) + for (Transition t : s.getTransitions()) if (path.contains(t.to) || !isFinite(t.to, path)) return false; path.remove(s); return true; @@ -93,10 +97,10 @@ do { done = true; visited.add(s); - if (!s.accept && s.transitions.size() == 1) { - Transition t = s.transitions.iterator().next(); + if (!s.accept && s.numTransitions() == 1) { + Transition t = s.getTransitions().iterator().next(); if (t.min == t.max && !visited.contains(t.to)) { - b.append(t.min); + b.appendCodePoint(t.min); s = t.to; done = false; } @@ -105,6 +109,31 @@ return b.toString(); } + // TODO: this currently requites a determinized machine, + // but it need not -- we can speed it up by walking the + // NFA instead. it'd still be fail fast. + public static BytesRef getCommonPrefixBytesRef(Automaton a) { + if (a.isSingleton()) return new BytesRef(a.singleton); + BytesRef ref = new BytesRef(10); + HashSet visited = new HashSet(); + State s = a.initial; + boolean done; + do { + done = true; + visited.add(s); + if (!s.accept && s.numTransitions() == 1) { + Transition t = s.getTransitions().iterator().next(); + if (t.min == t.max && !visited.contains(t.to)) { + ref.grow(++ref.length); + ref.bytes[ref.length - 1] = (byte)t.min; + s = t.to; + done = false; + } + } + } while (!done); + return ref; + } + /** * Returns the longest string that is a suffix of all accepted strings and * visits each state at most once. @@ -119,9 +148,32 @@ Automaton r = a.clone(); reverse(r); r.determinize(); - return reverseUnicode3(SpecialOperations.getCommonPrefix(r)); + return new StringBuilder(SpecialOperations.getCommonPrefix(r)).reverse().toString(); } + public static BytesRef getCommonSuffixBytesRef(Automaton a) { + if (a.isSingleton()) // if singleton, the suffix is the string itself. 
+ return new BytesRef(a.singleton); + + // reverse the language of the automaton, then reverse its common prefix. + Automaton r = a.clone(); + reverse(r); + r.determinize(); + BytesRef ref = SpecialOperations.getCommonPrefixBytesRef(r); + reverseBytes(ref); + return ref; + } + + private static void reverseBytes(BytesRef ref) { + if (ref.length <= 1) return; + int num = ref.length >> 1; + for (int i = ref.offset; i < ( ref.offset + num ); i++) { + byte b = ref.bytes[i]; + ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1]; + ref.bytes[ref.offset * 2 + ref.length - i - 1] = b; + } + } + /** * Reverses the language of the given (non-singleton) automaton while returning * the set of new initial states. @@ -130,8 +182,11 @@ a.expandSingleton(); // reverse all edges HashMap> m = new HashMap>(); - Set states = a.getStates(); - Set accept = a.getAcceptStates(); + State[] states = a.getNumberedStates(); + Set accept = new HashSet(); + for (State s : states) + if (s.isAccept()) + accept.add(s); for (State r : states) { m.put(r, new HashSet()); r.accept = false; @@ -139,41 +194,17 @@ for (State r : states) for (Transition t : r.getTransitions()) m.get(t.to).add(new Transition(t.min, t.max, r)); - for (State r : states) - r.transitions = m.get(r); + for (State r : states) { + Set tr = m.get(r); + r.setTransitions(tr.toArray(new Transition[tr.size()])); + } // make new initial+final states a.initial.accept = true; a.initial = new State(); for (State r : accept) a.initial.addEpsilon(r); // ensures that all initial states are reachable a.deterministic = false; + a.clearNumberedStates(); return accept; } - - /** - * Intentionally use a unicode 3 reverse. - * This is because we are only going to reverse it again... - */ - private static String reverseUnicode3( final String input ){ - char[] charInput = input.toCharArray(); - reverseUnicode3(charInput, 0, charInput.length); - return new String(charInput); - } - - /** - * Intentionally use a unicode 3 reverse. 
- * This is because it is only used by getCommonSuffix(), - * which will reverse the entire FSM using code unit reversal, - * so we must then reverse its common prefix back using the - * same code unit reversal. - */ - private static void reverseUnicode3(char[] buffer, int start, int len){ - if (len <= 1) return; - int num = len>>1; - for (int i = start; i < ( start + num ); i++) { - char c = buffer[i]; - buffer[i] = buffer[start * 2 + len - i - 1]; - buffer[start * 2 + len - i - 1] = c; - } - } } Index: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py (revision 0) +++ lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py (revision 0) @@ -0,0 +1,351 @@ +import types +import os +import sys +import random + +MAX_UNICODE = 0x10FFFF + +# TODO +# - could be more minimal +# - eg when bracket lands on a utf8 boundary, like 3 - 2047 -- they can share the two * edges +# - also 3 2048 or 3 65536 -- it should not have an * down the red path, but it does + +# MASKS[0] is bottom 1-bit +# MASKS[1] is bottom 2-bits +# ... + +utf8Ranges = [(0, 127), + (128, 2047), + (2048, 65535), + (65536, 1114111)] + +typeToColor = {'startend': 'purple', + 'start': 'blue', + 'end': 'red'} + +class FSA: + + def __init__(self): + # maps fromNode -> (startUTF8, endUTF8, endNode) + self.states = {} + self.nodeUpto = 0 + + def run(self, bytes): + state = self.start + for b in bytes: + found = False + oldState = state + for label, s, e, n in self.states[state][1:]: + if b >= s and b <= e: + if found: + raise RuntimeError('state %s has ambiguous output for byte %s' % (oldState, b)) + state = n + found = True + if not found: + return -1 + + return state + + def addEdge(self, n1, n2, v1, v2, label): + """ + Adds edge from n1-n2, utf8 byte range v1-v2. 
+ """ + assert n1 in self.states + assert type(v1) is types.IntType + assert type(v2) is types.IntType + self.states[n1].append((label, v1, v2, n2)) + + def addNode(self, label=None): + try: + self.states[self.nodeUpto] = [label] + return self.nodeUpto + finally: + self.nodeUpto += 1 + + def toDOT(self, label): + __l = [] + w = __l.append + endNode = startNode = None + for id, details in self.states.items(): + name = details[0] + if name == 'end': + endNode = id + elif name == 'start': + startNode = id + + w('digraph %s {' % label) + w(' rankdir=LR;') + w(' size="8,5";') + w(' node [color=white label=""]; Ns;') + + w(' node [color=black];') + w(' node [shape=doublecircle, label=""]; N%s [label="%s"];' % (endNode, endNode)) + w(' node [shape=circle];') + + w(' N%s [label="%s"];' % (startNode, startNode)) + w(' Ns -> N%s;' % startNode) + for id, details in self.states.items(): + edges = details[1:] + w(' N%s [label="%s"];' % (id, id)) + for type, s, e, dest in edges: + c = typeToColor.get(type, 'black') + if type == 'all*': + # special case -- matches any utf8 byte at this point + label = '*' + elif s == e: + label = '%s' % binary(s) + else: + label = '%s-%s' % (binary(s), binary(e)) + w(' N%s -> N%s [label="%s" color="%s"];' % (id, dest, label, c)) + if name == 'end': + endNode = id + elif name == 'start': + startNode = id + w('}') + return '\n'.join(__l) + + def toPNG(self, label, pngOut): + open('tmp.dot', 'wb').write(self.toDOT(label)) + if os.system('dot -Tpng tmp.dot -o %s' % pngOut): + raise RuntimeException('dot failed') + + +MASKS = [] +v = 2 +for i in range(32): + MASKS.append(v-1) + v *= 2 + +def binary(x): + if x == 0: + return '00000000' + + l = [] + while x > 0: + if x & 1 == 1: + l.append('1') + else: + l.append('0') + x = x >> 1 + + # big endian! 
+ l.reverse() + + l2 = [] + while len(l) > 0: + s = ''.join(l[-8:]) + if len(s) < 8: + s = '0'*(8-len(s)) + s + l2.append(s) + del l[-8:] + + return ' '.join(l2) + +def getUTF8Rest(code, numBytes): + l = [] + for i in range(numBytes): + l.append((128 | (code & MASKS[5]), 6)) + code = code >> 6 + l.reverse() + return tuple(l) + +def toUTF8(code): + # code = Unicode code point + assert code >= 0 + assert code <= MAX_UNICODE + + if code < 128: + # 0xxxxxxx + bytes = ((code, 7),) + elif code < 2048: + # 110yyyxx 10xxxxxx + byte1 = (6 << 5) | (code >> 6) + bytes = ((byte1, 5),) + getUTF8Rest(code, 1) + elif code < 65536: + # 1110yyyy 10yyyyxx 10xxxxxx + len = 3 + byte1 = (14 << 4) | (code >> 12) + bytes = ((byte1, 4),) + getUTF8Rest(code, 2) + else: + # 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx + len = 4 + byte1 = (30 << 3) | (code >> 18) + bytes = ((byte1, 3),) + getUTF8Rest(code, 3) + + return bytes + +def all(fsa, startNode, endNode, startCode, endCode, left): + if len(left) == 0: + fsa.addEdge(startNode, endNode, startCode, endCode, 'all') + else: + lastN = fsa.addNode() + fsa.addEdge(startNode, lastN, startCode, endCode, 'all') + while len(left) > 1: + n = fsa.addNode() + fsa.addEdge(lastN, n, 128, 191, 'all*') + left = left[1:] + lastN = n + fsa.addEdge(lastN, endNode, 128, 191, 'all*') + +def start(fsa, startNode, endNode, utf8, doAll): + if len(utf8) == 1: + fsa.addEdge(startNode, endNode, utf8[0][0], utf8[0][0] | MASKS[utf8[0][1]-1], 'start') + else: + n = fsa.addNode() + fsa.addEdge(startNode, n, utf8[0][0], utf8[0][0], 'start') + start(fsa, n, endNode, utf8[1:], True) + end = utf8[0][0] | MASKS[utf8[0][1]-1] + if doAll and utf8[0][0] != end: + all(fsa, startNode, endNode, utf8[0][0]+1, end, utf8[1:]) + +def end(fsa, startNode, endNode, utf8, doAll): + if len(utf8) == 1: + fsa.addEdge(startNode, endNode, utf8[0][0] & ~MASKS[utf8[0][1]-1], utf8[0][0], 'end') + else: + if utf8[0][1] == 5: + # special case -- avoid created unused edges (utf8 doesn't accept certain byte 
sequences): + start = 194 + else: + start = utf8[0][0] & (~MASKS[utf8[0][1]-1]) + if doAll and utf8[0][0] != start: + all(fsa, startNode, endNode, start, utf8[0][0]-1, utf8[1:]) + n = fsa.addNode() + fsa.addEdge(startNode, n, utf8[0][0], utf8[0][0], 'end') + end(fsa, n, endNode, utf8[1:], True) + +def build(fsa, + startNode, endNode, + startUTF8, endUTF8): + + # Break into start, middle, end: + if startUTF8[0][0] == endUTF8[0][0]: + # Degen case: lead with the same byte: + if len(startUTF8) == 1 and len(endUTF8) == 1: + fsa.addEdge(startNode, endNode, startUTF8[0][0], endUTF8[0][0], 'startend') + return + else: + assert len(startUTF8) != 1 + assert len(endUTF8) != 1 + n = fsa.addNode() + # single value edge + fsa.addEdge(startNode, n, startUTF8[0][0], startUTF8[0][0], 'single') + build(fsa, n, endNode, startUTF8[1:], endUTF8[1:]) + elif len(startUTF8) == len(endUTF8): + if len(startUTF8) == 1: + fsa.addEdge(startNode, endNode, startUTF8[0][0], endUTF8[0][0], 'startend') + else: + start(fsa, startNode, endNode, startUTF8, False) + if endUTF8[0][0] - startUTF8[0][0] > 1: + all(fsa, startNode, endNode, startUTF8[0][0]+1, endUTF8[0][0]-1, startUTF8[1:]) + end(fsa, startNode, endNode, endUTF8, False) + else: + # start + start(fsa, startNode, endNode, startUTF8, True) + + # possibly middle + byteCount = 1+len(startUTF8) + while byteCount < len(endUTF8): + s = toUTF8(utf8Ranges[byteCount-1][0]) + e = toUTF8(utf8Ranges[byteCount-1][1]) + all(fsa, startNode, endNode, + s[0][0], + e[0][0], + s[1:]) + byteCount += 1 + + # end + end(fsa, startNode, endNode, endUTF8, True) + +def main(): + + if len(sys.argv) not in (3, 4): + print + print 'Usage: python %s startUTF32 endUTF32 [testCode]' % sys.argv[0] + print + sys.exit(1) + + utf32Start = int(sys.argv[1]) + utf32End = int(sys.argv[2]) + + if utf32Start > utf32End: + print 'ERROR: start must be <= end' + sys.exit(1) + + fsa = FSA() + fsa.start = fsa.addNode('start') + fsa.end = fsa.addNode('end') + + print 's=%s' % ' 
'.join([binary(x[0]) for x in toUTF8(utf32Start)]) + print 'e=%s' % ' '.join([binary(x[0]) for x in toUTF8(utf32End)]) + + if len(sys.argv) == 4: + print 't=%s [%s]' % \ + (' '.join([binary(x[0]) for x in toUTF8(int(sys.argv[3]))]), + ' '.join(['%2x' % x[0] for x in toUTF8(int(sys.argv[3]))])) + + build(fsa, fsa.start, fsa.end, + toUTF8(utf32Start), + toUTF8(utf32End)) + + fsa.toPNG('test', '/tmp/outpy.png') + print 'Saved to /tmp/outpy.png...' + + test(fsa, utf32Start, utf32End, 100000); + +def test(fsa, utf32Start, utf32End, count): + + # verify correct ints are accepted + for i in range(count): + r = random.randint(utf32Start, utf32End) + dest = fsa.run([tup[0] for tup in toUTF8(r)]) + if dest != fsa.end: + print 'FAILED: valid %s (%s) is not accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)])) + return False + + invalidRange = MAX_UNICODE - (utf32End - utf32Start + 1) + if invalidRange >= 0: + # verify invalid ints are not accepted + for i in range(count): + r = random.randint(0, invalidRange-1) + if r >= utf32Start: + r = utf32End + 1 + r - utf32Start + dest = fsa.run([tup[0] for tup in toUTF8(r)]) + if dest != -1: + print 'FAILED: invalid %s (%s) is accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)])) + return False + + return True + +def stress(): + + print 'Testing...' + + iter = 0 + while True: + if iter % 10 == 0: + print '%s...' 
% iter + iter += 1 + + v1 = random.randint(0, MAX_UNICODE) + v2 = random.randint(0, MAX_UNICODE) + if v2 < v1: + v1, v2 = v2, v1 + + utf32Start = v1 + utf32End = v2 + + fsa = FSA() + fsa.start = fsa.addNode('start') + fsa.end = fsa.addNode('end') + build(fsa, fsa.start, fsa.end, + toUTF8(utf32Start), + toUTF8(utf32End)) + + if not test(fsa, utf32Start, utf32End, 10000): + print 'FAILED on utf32Start=%s utf32End=%s' % (utf32Start, utf32End) + +if __name__ == '__main__': + if len(sys.argv) > 1: + main() + else: + stress() Property changes on: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java (working copy) @@ -29,8 +29,12 @@ package org.apache.lucene.util.automaton; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; + import java.util.ArrayList; import java.util.BitSet; +import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; @@ -73,7 +77,8 @@ s.addEpsilon(a2.initial); } a1.deterministic = deterministic; - a1.clearHashCode(); + //a1.clearHashCode(); + a1.clearNumberedStates(); a1.checkMinimizeAlways(); return a1; } @@ -125,7 +130,8 @@ ac = ns; } b.deterministic = false; - b.clearHashCode(); + //b.clearHashCode(); + b.clearNumberedStates(); b.checkMinimizeAlways(); return b; } @@ -144,7 +150,8 @@ s.accept = true; a.initial = s; a.deterministic = false; - a.clearHashCode(); + //a.clearHashCode(); + a.clearNumberedStates(); a.checkMinimizeAlways(); return a; } @@ -165,7 +172,8 @@ p.addEpsilon(s); a.initial = s; a.deterministic = false; - a.clearHashCode(); + 
//a.clearHashCode(); + a.clearNumberedStates(); a.checkMinimizeAlways(); return a; } @@ -217,7 +225,8 @@ for (State p : b.getAcceptStates()) p.addEpsilon(d.initial); b.deterministic = false; - b.clearHashCode(); + //b.clearHashCode(); + b.clearNumberedStates(); b.checkMinimizeAlways(); } return b; @@ -233,7 +242,7 @@ a = a.cloneExpandedIfRequired(); a.determinize(); a.totalize(); - for (State p : a.getStates()) + for (State p : a.getNumberedStates()) p.accept = !p.accept; a.removeDeadTransitions(); return a; @@ -274,10 +283,8 @@ else return BasicAutomata.makeEmpty(); } if (a1 == a2) return a1.cloneIfRequired(); - Transition[][] transitions1 = Automaton - .getSortedTransitions(a1.getStates()); - Transition[][] transitions2 = Automaton - .getSortedTransitions(a2.getStates()); + Transition[][] transitions1 = a1.getSortedTransitions(); + Transition[][] transitions2 = a2.getSortedTransitions(); Automaton c = new Automaton(); LinkedList worklist = new LinkedList(); HashMap newstates = new HashMap(); @@ -302,9 +309,9 @@ newstates.put(q, q); r = q; } - char min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; - char max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max; - p.s.transitions.add(new Transition(min, max, r.s)); + int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; + int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max; + p.s.addTransition(new Transition(min, max, r.s)); } } } @@ -313,6 +320,24 @@ c.checkMinimizeAlways(); return c; } + + /** Returns true if these two auotomata accept exactly the + * same language. This is a costly computation! Note + * also that a1 and a2 will be determinized as a side + * effect. 
*/ + public static boolean sameLanguage(Automaton a1, Automaton a2) { + if (a1 == a2) { + return true; + } + if (a1.isSingleton() && a2.isSingleton()) { + return a1.singleton.equals(a2.singleton); + } else if (a1.isSingleton()) { + // subsetOf is faster if the first automaton is a singleton + return subsetOf(a1, a2) && subsetOf(a2, a1); + } else { + return subsetOf(a2, a1) && subsetOf(a1, a2); + } + } /** * Returns true if the language of a1 is a subset of the language @@ -328,10 +353,8 @@ return BasicOperations.run(a2, a1.singleton); } a2.determinize(); - Transition[][] transitions1 = Automaton - .getSortedTransitions(a1.getStates()); - Transition[][] transitions2 = Automaton - .getSortedTransitions(a2.getStates()); + Transition[][] transitions1 = a1.getSortedTransitions(); + Transition[][] transitions2 = a2.getSortedTransitions(); LinkedList worklist = new LinkedList(); HashSet visited = new HashSet(); StatePair p = new StatePair(a1.initial, a2.initial); @@ -339,19 +362,24 @@ visited.add(p); while (worklist.size() > 0) { p = worklist.removeFirst(); - if (p.s1.accept && !p.s2.accept) return false; + if (p.s1.accept && !p.s2.accept) { + return false; + } Transition[] t1 = transitions1[p.s1.number]; Transition[] t2 = transitions2[p.s2.number]; for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { while (b2 < t2.length && t2[b2].max < t1[n1].min) b2++; int min1 = t1[n1].min, max1 = t1[n1].max; + for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { - if (t2[n2].min > min1) return false; - if (t2[n2].max < Character.MAX_VALUE) min1 = t2[n2].max + 1; + if (t2[n2].min > min1) { + return false; + } + if (t2[n2].max < Character.MAX_CODE_POINT) min1 = t2[n2].max + 1; else { - min1 = Character.MAX_VALUE; - max1 = Character.MIN_VALUE; + min1 = Character.MAX_CODE_POINT; + max1 = Character.MIN_CODE_POINT; } StatePair q = new StatePair(t1[n1].to, t2[n2].to); if (!visited.contains(q)) { @@ -359,7 +387,9 @@ visited.add(q); } } - if (min1 <= max1) return false; + if 
(min1 <= max1) { + return false; + } } } return true; @@ -387,7 +417,8 @@ s.addEpsilon(a2.initial); a1.initial = s; a1.deterministic = false; - a1.clearHashCode(); + //a1.clearHashCode(); + a1.clearNumberedStates(); a1.checkMinimizeAlways(); return a1; } @@ -414,64 +445,257 @@ Automaton a = new Automaton(); a.initial = s; a.deterministic = false; - a.clearHashCode(); + //a.clearHashCode(); + a.clearNumberedStates(); a.checkMinimizeAlways(); return a; } - + + // Simple custom ArrayList + private final static class TransitionList { + Transition[] transitions = new Transition[2]; + int count; + + public void add(Transition t) { + if (transitions.length == count) { + Transition[] newArray = new Transition[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; + System.arraycopy(transitions, 0, newArray, 0, count); + transitions = newArray; + } + transitions[count++] = t; + } + } + + // Holds all transitions that start on this int point, or + // end at this point-1 + private final static class PointTransitions implements Comparable { + int point; + final TransitionList ends = new TransitionList(); + final TransitionList starts = new TransitionList(); + public int compareTo(PointTransitions other) { + return point - other.point; + } + + public void reset(int point) { + this.point = point; + ends.count = 0; + starts.count = 0; + } + + public boolean equals(Object other) { + return ((PointTransitions) other).point == point; + } + + public int hashCode() { + return point; + } + } + + private final static class PointTransitionSet { + int count; + PointTransitions[] points = new PointTransitions[5]; + + private final static int HASHMAP_CUTOVER = 30; + private final HashMap map = new HashMap(); + private boolean useHash = false; + + private PointTransitions next(int point) { + // 1st time we are seeing this point + if (count == points.length) { + final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, 
RamUsageEstimator.NUM_BYTES_OBJ_REF)]; + System.arraycopy(points, 0, newArray, 0, count); + points = newArray; + } + PointTransitions points0 = points[count]; + if (points0 == null) { + points0 = points[count] = new PointTransitions(); + } + points0.reset(point); + count++; + return points0; + } + + private PointTransitions find(int point) { + if (useHash) { + final Integer pi = point; + PointTransitions p = map.get(pi); + if (p == null) { + p = next(point); + map.put(pi, p); + } + return p; + } else { + for(int i=0;i 1) { + Arrays.sort(points, 0, count); + } + } + + public void add(Transition t) { + find(t.min).starts.add(t); + find(1+t.max).ends.add(t); + } + + public String toString() { + StringBuilder s = new StringBuilder(); + for(int i=0;i 0) { + s.append(' '); + } + s.append(points[i].point).append(':').append(points[i].starts.count).append(',').append(points[i].ends.count); + } + return s.toString(); + } + } + /** * Determinizes the given automaton. *

- * Complexity: exponential in number of states. + * Worst case complexity: exponential in number of states. */ - public static void determinize(Automaton a) { - if (a.deterministic || a.isSingleton()) return; - Set initialset = new HashSet(); - initialset.add(a.initial); - determinize(a, initialset); - } - - /** - * Determinizes the given automaton using the given set of initial states. - */ - static void determinize(Automaton a, Set initialset) { - char[] points = a.getStartPoints(); + static void determinize(Automaton a) { + if (a.deterministic || a.isSingleton()) { + return; + } + + final State[] allStates = a.getNumberedStates(); + // subset construction - Map,Set> sets = new HashMap,Set>(); - LinkedList> worklist = new LinkedList>(); - Map,State> newstate = new HashMap,State>(); - sets.put(initialset, initialset); + final boolean initAccept = a.initial.accept; + final int initNumber = a.initial.number; + a.initial = new State(); + SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(initNumber, a.initial); + + LinkedList worklist = new LinkedList(); + Map newstate = new HashMap(); + worklist.add(initialset); - a.initial = new State(); + + a.initial.accept = initAccept; newstate.put(initialset, a.initial); + + int newStateUpto = 0; + State[] newStatesArray = new State[5]; + newStatesArray[newStateUpto] = a.initial; + a.initial.number = newStateUpto; + newStateUpto++; + + // like Set + final PointTransitionSet points = new PointTransitionSet(); + + // like SortedMap + final SortedIntSet statesSet = new SortedIntSet(5); + while (worklist.size() > 0) { - Set s = worklist.removeFirst(); - State r = newstate.get(s); - for (State q : s) - if (q.accept) { - r.accept = true; - break; + SortedIntSet.FrozenIntSet s = worklist.removeFirst(); + + // Collate all outgoing transitions by min/1+max: + for(int i=0;i p = new HashSet(); - for (State q : s) - for (Transition t : q.transitions) - if (t.min <= points[n] && points[n] <= t.max) p.add(t.to); - if 
(!sets.containsKey(p)) { - sets.put(p, p); - worklist.add(p); - newstate.put(p, new State()); + } + + if (points.count == 0) { + // No outgoing transitions -- skip it + continue; + } + + points.sort(); + + int lastPoint = -1; + int accCount = 0; + + final State r = s.state; + for(int i=0;i 0) { + assert lastPoint != -1; + + statesSet.computeHash(); + + State q = newstate.get(statesSet); + if (q == null) { + q = new State(); + final SortedIntSet.FrozenIntSet p = statesSet.freeze(q); + worklist.add(p); + if (newStateUpto == newStatesArray.length) { + final State[] newArray = new State[ArrayUtil.oversize(1+newStateUpto, RamUsageEstimator.NUM_BYTES_OBJ_REF)]; + System.arraycopy(newStatesArray, 0, newArray, 0, newStateUpto); + newStatesArray = newArray; + } + newStatesArray[newStateUpto] = q; + q.number = newStateUpto; + newStateUpto++; + q.accept = accCount > 0; + newstate.put(p, q); + } else { + assert (accCount > 0 ? true:false) == q.accept: "accCount=" + accCount + " vs existing accept=" + q.accept + " states=" + statesSet; + } + + r.addTransition(new Transition(lastPoint, point-1, q)); } - State q = newstate.get(p); - char min = points[n]; - char max; - if (n + 1 < points.length) max = (char) (points[n + 1] - 1); - else max = Character.MAX_VALUE; - r.transitions.add(new Transition(min, max, q)); + + // process transitions that end on this point + // (closes an overlapping interval) + Transition[] transitions = points.points[i].ends.transitions; + int limit = points.points[i].ends.count; + for(int j=0;j map = new TreeMap(); + + private boolean useTreeMap; + + State state; + + public SortedIntSet(int capacity) { + values = new int[capacity]; + counts = new int[capacity]; + } + + // Adds this state to the set + public void incr(int num) { + if (useTreeMap) { + final Integer key = num; + Integer val = map.get(key); + if (val == null) { + map.put(key, 1); + } else { + map.put(key, 1+val); + } + return; + } + + if (upto == values.length) { + values = 
ArrayUtil.grow(values, 1+upto); + counts = ArrayUtil.grow(counts, 1+upto); + } + + for(int i=0;i= i) { + values[1+j] = values[j]; + counts[1+j] = counts[j]; + j--; + } + values[i] = num; + counts[i] = 1; + upto++; + return; + } + } + + // append + values[upto] = num; + counts[upto] = 1; + upto++; + + if (upto == TREE_MAP_CUTOVER) { + useTreeMap = true; + for(int i=0;i values.length) { + final int size = ArrayUtil.oversize(map.size(), RamUsageEstimator.NUM_BYTES_INT); + values = new int[size]; + counts = new int[size]; + } + hashCode = map.size(); + upto = 0; + for(int state : map.keySet()) { + hashCode = 683*hashCode + state; + values[upto++] = state; + } + } else { + hashCode = upto; + for(int i=0;i 0) { + sb.append(' '); + } + sb.append(values[i]).append(':').append(counts[i]); + } + sb.append(']'); + return sb.toString(); + } + + public final static class FrozenIntSet { + final int[] values; + final int hashCode; + final State state; + + public FrozenIntSet(int[] values, int hashCode, State state) { + this.values = values; + this.hashCode = hashCode; + this.state = state; + } + + public FrozenIntSet(int num, State state) { + this.values = new int[] {num}; + this.state = state; + this.hashCode = 683+num; + } + + public int hashCode() { + return hashCode; + } + + public boolean equals(Object _other) { + if (_other == null) { + return false; + } + if (_other instanceof FrozenIntSet) { + FrozenIntSet other = (FrozenIntSet) _other; + if (hashCode != other.hashCode) { + return false; + } + if (other.values.length != values.length) { + return false; + } + for(int i=0;i 0) { + sb.append(' '); + } + sb.append(values[i]); + } + sb.append(']'); + return sb.toString(); + } + } +} + Property changes on: lucene/src/java/org/apache/lucene/util/automaton/SortedIntSet.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/util/automaton/RegExp.java 
=================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/RegExp.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/automaton/RegExp.java (working copy) @@ -366,9 +366,9 @@ Kind kind; RegExp exp1, exp2; String s; - char c; + int c; int min, max, digits; - char from, to; + int from, to; String b; int flags; @@ -625,10 +625,10 @@ b.append(")"); break; case REGEXP_CHAR: - b.append("\\").append(c); + b.append("\\").appendCodePoint(c); break; case REGEXP_CHAR_RANGE: - b.append("[\\").append(from).append("-\\").append(to).append("]"); + b.append("[\\").appendCodePoint(from).append("-\\").appendCodePoint(to).append("]"); break; case REGEXP_ANYCHAR: b.append("."); @@ -725,9 +725,9 @@ static private RegExp makeString(RegExp exp1, RegExp exp2) { StringBuilder b = new StringBuilder(); if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s); - else b.append(exp1.c); + else b.appendCodePoint(exp1.c); if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s); - else b.append(exp2.c); + else b.appendCodePoint(exp2.c); return makeString(b.toString()); } @@ -777,14 +777,14 @@ return r; } - static RegExp makeChar(char c) { + static RegExp makeChar(int c) { RegExp r = new RegExp(); r.kind = Kind.REGEXP_CHAR; r.c = c; return r; } - static RegExp makeCharRange(char from, char to) { + static RegExp makeCharRange(int from, int to) { RegExp r = new RegExp(); r.kind = Kind.REGEXP_CHAR_RANGE; r.from = from; @@ -834,13 +834,13 @@ } private boolean peek(String s) { - return more() && s.indexOf(b.charAt(pos)) != -1; + return more() && s.indexOf(b.codePointAt(pos)) != -1; } - private boolean match(char c) { + private boolean match(int c) { if (pos >= b.length()) return false; - if (b.charAt(pos) == c) { - pos++; + if (b.codePointAt(pos) == c) { + pos += Character.charCount(c); return true; } return false; @@ -850,9 +850,11 @@ return pos < b.length(); } - private char next() throws IllegalArgumentException { + private int 
next() throws IllegalArgumentException { if (!more()) throw new IllegalArgumentException("unexpected end-of-string"); - return b.charAt(pos++); + int ch = b.codePointAt(pos); + pos += Character.charCount(ch); + return ch; } private boolean check(int flag) { @@ -933,7 +935,7 @@ } final RegExp parseCharClass() throws IllegalArgumentException { - char c = parseCharExp(); + int c = parseCharExp(); if (match('-')) return makeCharRange(c, parseCharExp()); else return makeChar(c); } @@ -993,7 +995,7 @@ } else return makeChar(parseCharExp()); } - final char parseCharExp() throws IllegalArgumentException { + final int parseCharExp() throws IllegalArgumentException { match('\\'); return next(); } Index: lucene/src/java/org/apache/lucene/util/automaton/Transition.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/Transition.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/automaton/Transition.java (working copy) @@ -30,12 +30,13 @@ package org.apache.lucene.util.automaton; import java.io.Serializable; +import java.util.Comparator; /** * Automaton transition. *

* A transition, which belongs to a source state, consists of a Unicode - * character interval and a destination state. + * codepoint interval and a destination state. * * @lucene.experimental */ @@ -45,18 +46,18 @@ * CLASS INVARIANT: min<=max */ - char min; - char max; + final int min; + final int max; + final State to; - State to; - /** * Constructs a new singleton interval transition. * - * @param c transition character + * @param c transition codepoint * @param to destination state */ - public Transition(char c, State to) { + public Transition(int c, State to) { + assert c >= 0; min = max = c; this.to = to; } @@ -68,9 +69,11 @@ * @param max transition interval maximum * @param to destination state */ - public Transition(char min, char max, State to) { + public Transition(int min, int max, State to) { + assert min >= 0; + assert max >= 0; if (max < min) { - char t = max; + int t = max; max = min; min = t; } @@ -80,12 +83,12 @@ } /** Returns minimum of this transition interval. */ - public char getMin() { + public int getMin() { return min; } /** Returns maximum of this transition interval. 
*/ - public char getMax() { + public int getMax() { return max; } @@ -134,14 +137,18 @@ } } - static void appendCharString(char c, StringBuilder b) { - if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.append(c); + static void appendCharString(int c, StringBuilder b) { + if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.appendCodePoint(c); else { - b.append("\\u"); + b.append("\\\\U"); String s = Integer.toHexString(c); - if (c < 0x10) b.append("000").append(s); - else if (c < 0x100) b.append("00").append(s); - else if (c < 0x1000) b.append("0").append(s); + if (c < 0x10) b.append("0000000").append(s); + else if (c < 0x100) b.append("000000").append(s); + else if (c < 0x1000) b.append("00000").append(s); + else if (c < 0x10000) b.append("0000").append(s); + else if (c < 0x100000) b.append("000").append(s); + else if (c < 0x1000000) b.append("00").append(s); + else if (c < 0x10000000) b.append("0").append(s); else b.append(s); } } @@ -171,4 +178,96 @@ } b.append("\"]\n"); } + + private static final class CompareByDestThenMinMaxSingle implements Comparator { + public int compare(Transition t1, Transition t2) { + if (t1.to != t2.to) { + if (t1.to.number < t2.to.number) return -1; + else if (t1.to.number > t2.to.number) return 1; + } + if (t1.min < t2.min) return -1; + if (t1.min > t2.min) return 1; + if (t1.max > t2.max) return -1; + if (t1.max < t2.max) return 1; + return 0; + } + } + + public static final Comparator CompareByDestThenMinMax = new CompareByDestThenMinMaxSingle(); + + private static final class CompareByMinMaxThenDestSingle implements Comparator { + public int compare(Transition t1, Transition t2) { + if (t1.min < t2.min) return -1; + if (t1.min > t2.min) return 1; + if (t1.max > t2.max) return -1; + if (t1.max < t2.max) return 1; + if (t1.to != t2.to) { + if (t1.to.number < t2.to.number) return -1; + if (t1.to.number > t2.to.number) return 1; + } + return 0; + } + } + + public static final Comparator CompareByMinMaxThenDest = new 
CompareByMinMaxThenDestSingle(); + + private static class UTF8InUTF16Order { + protected int compareCodePoint(int aByte, int bByte) { + if (aByte != bByte) { + // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order + + // We know the terms are not equal, but, we may + // have to carefully fixup the bytes at the + // difference to match UTF16's sort order: + if (aByte >= 0xee && bByte >= 0xee) { + if ((aByte & 0xfe) == 0xee) { + aByte += 0x10; + } + if ((bByte&0xfe) == 0xee) { + bByte += 0x10; + } + } + return aByte - bByte; + } + return 0; + } + } + + private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator { + public int compare(Transition t1, Transition t2) { + if (t1.to != t2.to) { + if (t1.to == null) return -1; + else if (t2.to == null) return 1; + else if (t1.to.number < t2.to.number) return -1; + else if (t1.to.number > t2.to.number) return 1; + } + int minComp = compareCodePoint(t1.min, t2.min); + if (minComp != 0) return minComp; + int maxComp = compareCodePoint(t1.max, t2.max); + if (maxComp != 0) return maxComp; + return 0; + } + } + + public static final Comparator CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle(); + + private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator { + public int compare(Transition t1, Transition t2) { + int minComp = compareCodePoint(t1.min, t2.min); + if (minComp != 0) return minComp; + int maxComp = compareCodePoint(t1.max, t2.max); + if (maxComp != 0) return maxComp; + if (t1.to != t2.to) { + if (t1.to == null) return -1; + else if (t2.to == null) return 1; + else if (t1.to.number < t2.to.number) return -1; + else if (t1.to.number > t2.to.number) return 1; + } + return 0; + } + } + + public static final Comparator CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle(); + + } Index: 
lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java =================================================================== --- lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java (working copy) @@ -30,22 +30,22 @@ package org.apache.lucene.util.automaton; import java.io.Serializable; -import java.util.Set; /** * Finite-state automaton with fast run operation. * * @lucene.experimental */ -public final class RunAutomaton implements Serializable { - +public abstract class RunAutomaton implements Serializable { + final int maxInterval; final int size; final boolean[] accept; final int initial; final int[] transitions; // delta(state,c) = transitions[state*points.length + // getCharClass(c)] - final char[] points; // char interval start points + final int[] points; // char interval start points final int[] classmap; // map from char number to class class + final Automaton automaton; /** * Returns a string representation of this automaton. @@ -61,10 +61,10 @@ for (int j = 0; j < points.length; j++) { int k = transitions[i * points.length + j]; if (k != -1) { - char min = points[j]; - char max; - if (j + 1 < points.length) max = (char) (points[j + 1] - 1); - else max = Character.MAX_VALUE; + int min = points[j]; + int max; + if (j + 1 < points.length) max = (points[j + 1] - 1); + else max = maxInterval; b.append(" "); Transition.appendCharString(min, b); if (min != max) { @@ -81,52 +81,59 @@ /** * Returns number of states in automaton. */ - public int getSize() { + public final int getSize() { return size; } /** * Returns acceptance status for given state. */ - public boolean isAccept(int state) { + public final boolean isAccept(int state) { return accept[state]; } /** * Returns initial state. */ - public int getInitialState() { + public final int getInitialState() { return initial; } /** - * Returns array of character class interval start points. 
The array should + * Returns array of codepoint class interval start points. The array should * not be modified by the caller. */ - public char[] getCharIntervals() { + public final int[] getCharIntervals() { return points.clone(); } /** - * Gets character class of given char. + * Gets character class of given codepoint */ - int getCharClass(char c) { + final int getCharClass(int c) { return SpecialOperations.findIndex(c, points); } /** + * @return the automaton + */ + public Automaton getAutomaton() { + return automaton; + } + + /** * Constructs a new RunAutomaton from a deterministic * Automaton. * * @param a an automaton */ - public RunAutomaton(Automaton a) { + public RunAutomaton(Automaton a, int maxInterval, boolean tableize) { + this.maxInterval = maxInterval; a.determinize(); points = a.getStartPoints(); - Set states = a.getStates(); - Automaton.setStateNumbers(states); initial = a.initial.number; - size = states.size(); + final State[] states = a.getNumberedStates(); + size = states.length; accept = new boolean[size]; transitions = new int[size * points.length]; for (int n = 0; n < size * points.length; n++) @@ -142,12 +149,18 @@ /* * Set alphabet table for optimal run performance. */ - classmap = new int[Character.MAX_VALUE + 1]; - int i = 0; - for (int j = 0; j <= Character.MAX_VALUE; j++) { - if (i + 1 < points.length && j == points[i + 1]) i++; - classmap[j] = i; + if (tableize) { + classmap = new int[maxInterval + 1]; + int i = 0; + for (int j = 0; j <= maxInterval; j++) { + if (i + 1 < points.length && j == points[i + 1]) + i++; + classmap[j] = i; + } + } else { + classmap = null; } + this.automaton = a; } /** @@ -157,54 +170,10 @@ * if a dead state is entered in an equivalent automaton with a total * transition function.) 
*/ - public int step(int state, char c) { - return transitions[state * points.length + classmap[c]]; + public final int step(int state, int c) { + if (classmap == null) + return transitions[state * points.length + getCharClass(c)]; + else + return transitions[state * points.length + classmap[c]]; } - - /** - * Returns true if the given string is accepted by this automaton. - */ - public boolean run(String s) { - int p = initial; - int l = s.length(); - for (int i = 0; i < l; i++) { - p = step(p, s.charAt(i)); - if (p == -1) return false; - } - return accept[p]; - } - - /** - * Returns true if the given string is accepted by this automaton - */ - public boolean run(char[] s, int offset, int length) { - int p = initial; - int l = offset + length; - for (int i = offset; i < l; i++) { - p = step(p, s[i]); - if (p == -1) return false; - } - return accept[p]; - } - - /** - * Returns the length of the longest accepted run of the given string starting - * at the given offset. - * - * @param s the string - * @param offset offset into s where the run starts - * @return length of the longest accepted run, -1 if no run is accepted - */ - public int run(String s, int offset) { - int p = initial; - int l = s.length(); - int max = -1; - for (int r = 0; offset <= l; offset++, r++) { - if (accept[p]) max = r; - if (offset == l) break; - p = step(p, s.charAt(offset)); - if (p == -1) break; - } - return max; - } } Index: lucene/src/java/org/apache/lucene/util/IntsRef.java =================================================================== --- lucene/src/java/org/apache/lucene/util/IntsRef.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/IntsRef.java (working copy) @@ -30,6 +30,10 @@ public IntsRef() { } + public IntsRef(int capacity) { + ints = new int[capacity]; + } + public IntsRef(int[] ints, int offset, int length) { this.ints = ints; this.offset = offset; Index: lucene/src/java/org/apache/lucene/util/ArrayUtil.java 
=================================================================== --- lucene/src/java/org/apache/lucene/util/ArrayUtil.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/ArrayUtil.java (working copy) @@ -368,4 +368,57 @@ code = code * 31 + array[i]; return code; } + + + // Since Arrays.equals doesn't implement offsets for equals + /** + * See if two array slices are the same. + * + * @param left The left array to compare + * @param offsetLeft The offset into the array. Must be positive + * @param right The right array to compare + * @param offsetRight the offset into the right array. Must be positive + * @param length The length of the section of the array to compare + * @return true if the two arrays, starting at their respective offsets, are equal + * + * @see java.util.Arrays#equals(char[], char[]) + */ + public static boolean equals(char[] left, int offsetLeft, char[] right, int offsetRight, int length) { + if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) { + for (int i = 0; i < length; i++) { + if (left[offsetLeft + i] != right[offsetRight + i]) { + return false; + } + + } + return true; + } + return false; + } + + // Since Arrays.equals doesn't implement offsets for equals + /** + * See if two array slices are the same. + * + * @param left The left array to compare + * @param offsetLeft The offset into the array. Must be positive + * @param right The right array to compare + * @param offsetRight the offset into the right array. 
Must be positive + * @param length The length of the section of the array to compare + * @return true if the two arrays, starting at their respective offsets, are equal + * + * @see java.util.Arrays#equals(char[], char[]) + */ + public static boolean equals(int[] left, int offsetLeft, int[] right, int offsetRight, int length) { + if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) { + for (int i = 0; i < length; i++) { + if (left[offsetLeft + i] != right[offsetRight + i]) { + return false; + } + + } + return true; + } + return false; + } } Index: lucene/src/java/org/apache/lucene/util/UnicodeUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 940218) +++ lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) @@ -491,4 +491,92 @@ return true; } + + // Borrowed from Python's 3.1.2 sources, + // Objects/unicodeobject.c, and modified (see commented + // out section, and the -1s) to disallow the reserved for + // future (RFC 3629) 5/6 byte sequence characters, and + // invalid 0xFE and 0xFF bytes. + + /* Map UTF-8 encoded prefix byte to sequence length. -1 (0xFF) + * means illegal prefix. 
see RFC 2279 for details */ + static byte[] utf8CodeLength = new byte[] { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4 //, 5, 5, 5, 5, 6, 6, 0, 0 + }; + + + /** Returns the number of code points in this utf8 + * sequence. 
Behavior is undefined if the utf8 sequence + * is invalid.*/ + public static final int codePointCount(BytesRef utf8) { + int upto = utf8.offset; + final int limit = utf8.offset + utf8.length; + final byte[] bytes = utf8.bytes; + int codePointCount = 0; + while (upto < limit) { + codePointCount++; + upto += utf8CodeLength[bytes[upto]&0xFF]; + } + return codePointCount; + } + + public static void UTF8toUTF32(final BytesRef utf8, final IntsRef utf32) { + // pre-alloc for worst case + if (utf32.ints == null || utf32.ints.length < utf8.length) { + utf32.ints = new int[utf8.length]; + } + int utf32Count = 0; + int utf8Upto = utf8.offset; + final int[] ints = utf32.ints; + final byte[] bytes = utf8.bytes; + final int utf8Limit = utf8.offset + utf8.length; + while(utf8Upto < utf8Limit) { + final int numBytes = utf8CodeLength[bytes[utf8Upto]&0xFF]; + int v = 0; + switch(numBytes) { + case 1: + ints[utf32Count++] = bytes[utf8Upto++]; + continue; + case 2: + // 5 useful bits + v = bytes[utf8Upto++] & 31; + break; + case 3: + // 4 useful bits + v = bytes[utf8Upto++] & 15; + break; + case 4: + // 3 useful bits + v = bytes[utf8Upto++] & 7; + break; + default : + throw new IllegalStateException("invalid utf8"); + } + + final int limit = utf8Upto + numBytes-1; + + while(utf8Upto < limit) { + v = v << 6 | bytes[utf8Upto++]&63; + } + ints[utf32Count++] = v; + } + + utf32.offset = 0; + utf32.length = utf32Count; + } } Index: lucene/LICENSE.txt =================================================================== --- lucene/LICENSE.txt (revision 940218) +++ lucene/LICENSE.txt (working copy) @@ -237,6 +237,12 @@ http://www.python.org/download/releases/2.4.2/license/ +Some code in src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from Python 3.1.2 sources available at +http://www.python.org. 
Full license is here: + + http://www.python.org/download/releases/3.1.2/license/ + Some code in src/java/org/apache/lucene/util/automaton was derived from Brics automaton sources available at www.brics.dk/automaton/. Here is the copyright from those sources: