Index: solr/src/test/org/apache/solr/util/ArraysUtilsTest.java
===================================================================
--- solr/src/test/org/apache/solr/util/ArraysUtilsTest.java (revision 940218)
+++ solr/src/test/org/apache/solr/util/ArraysUtilsTest.java (working copy)
@@ -1,48 +0,0 @@
-package org.apache.solr.util;
-
-/**
- * Copyright 2004 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import junit.framework.TestCase;
-
-public class ArraysUtilsTest extends TestCase {
-
-
- public ArraysUtilsTest(String s) {
- super(s);
- }
-
- protected void setUp() {
- }
-
- protected void tearDown() {
-
- }
-
- public void test() {
- String left = "this is equal";
- String right = left;
- char[] leftChars = left.toCharArray();
- char[] rightChars = right.toCharArray();
- assertTrue(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 0, rightChars, 0, left.length()));
-
- assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 0, left.length()));
- assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 1, rightChars, 2, left.length()));
-
- assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 25, rightChars, 0, left.length()));
- assertFalse(left + " does not equal: " + right, ArraysUtils.equals(leftChars, 12, rightChars, 0, left.length()));
- }
-}
\ No newline at end of file
Index: solr/src/java/org/apache/solr/util/ArraysUtils.java
===================================================================
--- solr/src/java/org/apache/solr/util/ArraysUtils.java (revision 940218)
+++ solr/src/java/org/apache/solr/util/ArraysUtils.java (working copy)
@@ -1,51 +0,0 @@
-package org.apache.solr.util;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-/**
- *
- *
- **/
-//Since Arrays.equals doesn't implement offsets for equals
-public class ArraysUtils {
-
- /**
- * See if two array slices are the same.
- *
- * @param left The left array to compare
- * @param offsetLeft The offset into the array. Must be positive
- * @param right The right array to compare
- * @param offsetRight the offset into the right array. Must be positive
- * @param length The length of the section of the array to compare
- * @return true if the two arrays, starting at their respective offsets, are equal
- *
- * @see java.util.Arrays#equals(char[], char[])
- */
- public static boolean equals(char[] left, int offsetLeft, char[] right, int offsetRight, int length) {
- if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) {
- for (int i = 0; i < length; i++) {
- if (left[offsetLeft + i] != right[offsetRight + i]) {
- return false;
- }
-
- }
- return true;
- }
- return false;
- }
-}
Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt (revision 940218)
+++ lucene/CHANGES.txt (working copy)
@@ -115,6 +115,11 @@
actual file's length if the file exists, and throws FileNotFoundException
otherwise. Returning length=0 for a non-existent file is no longer allowed. If
you relied on that, make sure to catch the exception. (Shai Erera)
+
+* LUCENE-2265: FuzzyQuery and WildcardQuery now operate on Unicode codepoints,
+ not Unicode code units. For example, a wildcard "?" represents any Unicode
+ character. Furthermore, the rest of the automaton package and RegexpQuery use
+ a true Unicode codepoint representation. (Robert Muir, Mike McCandless)
Changes in runtime behavior
Index: lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java (revision 940218)
+++ lucene/src/test/org/apache/lucene/search/TestFuzzyQuery2.java (working copy)
@@ -56,6 +56,20 @@
static final float epsilon = 0.00001f;
public void testFromTestData() throws Exception {
+ // TODO: randomize!
+ assertFromTestData(new int[] { 0x40, 0x41 });
+ assertFromTestData(new int[] { 0x40, 0x0195 });
+ assertFromTestData(new int[] { 0x40, 0x0906 });
+ assertFromTestData(new int[] { 0x40, 0x1040F });
+ assertFromTestData(new int[] { 0x0194, 0x0195 });
+ assertFromTestData(new int[] { 0x0194, 0x0906 });
+ assertFromTestData(new int[] { 0x0194, 0x1040F });
+ assertFromTestData(new int[] { 0x0905, 0x0906 });
+ assertFromTestData(new int[] { 0x0905, 0x1040F });
+ assertFromTestData(new int[] { 0x1040E, 0x1040F });
+ }
+
+ public void assertFromTestData(int codePointTable[]) throws Exception {
InputStream stream = getClass().getResourceAsStream("fuzzyTestData.txt");
BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
@@ -71,7 +85,7 @@
doc.add(field);
for (int i = 0; i < terms; i++) {
- field.setValue(Integer.toBinaryString(i));
+ field.setValue(mapInt(codePointTable, i));
writer.addDocument(doc);
}
@@ -82,7 +96,7 @@
String line;
while ((line = reader.readLine()) != null) {
String params[] = line.split(",");
- String query = Integer.toBinaryString(Integer.parseInt(params[0]));
+ String query = mapInt(codePointTable, Integer.parseInt(params[0]));
int prefix = Integer.parseInt(params[1]);
int pqSize = Integer.parseInt(params[2]);
float minScore = Float.parseFloat(params[3]);
@@ -101,6 +115,15 @@
dir.close();
}
+ /* map the binary digits of i to Unicode code points from the table */
+ private static String mapInt(int codePointTable[], int i) {
+ StringBuilder sb = new StringBuilder();
+ String binary = Integer.toBinaryString(i);
+ for (int j = 0; j < binary.length(); j++)
+ sb.appendCodePoint(codePointTable[binary.charAt(j) - '0']);
+ return sb.toString();
+ }
+
/* Code to generate test data
public static void main(String args[]) throws Exception {
int bits = 3;
Index: lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java (revision 940218)
+++ lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java (working copy)
@@ -124,55 +124,4 @@
Automaton a = new RegExp("((\uD866\uDF05)|\uFB94).*").toAutomaton();
assertAutomatonHits(2, a);
}
-
- /**
- * Test that AutomatonQuery properly seeks to supplementary characters.
- * Transitions are modeled as UTF-16 code units, so without special handling
- * by default it will try to seek to a lead surrogate with some DFAs
- */
- public void testSeekSurrogate() throws IOException {
- Automaton a = new RegExp("\uD866[a\uDF05\uFB93][a-z]{0,5}[fl]").toAutomaton();
- assertAutomatonHits(1, a);
- }
-
- /**
- * Try seeking to an ending lead surrogate.
- */
- public void testSeekSurrogate2() throws IOException {
- Automaton a = new RegExp("\uD866(\uDF06ghijkl)?").toAutomaton();
- assertAutomatonHits(1, a);
- }
-
- /**
- * Try seeking to an starting trail surrogate.
- */
- public void testSeekSurrogate3() throws IOException {
- Automaton a = new RegExp("[\uDF06\uFB94]mnopqr").toAutomaton();
- assertAutomatonHits(1, a);
- }
-
- /**
- * Try seeking to an medial/final trail surrogate.
- */
- public void testSeekSurrogate4() throws IOException {
- Automaton a = new RegExp("a[\uDF06\uFB94]bc").toAutomaton();
- assertAutomatonHits(1, a);
- }
-
- /**
- * Ensure the 'constant suffix' does not contain a leading trail surrogate.
- */
- public void testSurrogateSuffix() throws IOException {
- Automaton a = new RegExp(".*[\uD865\uD866]\uDF06ghijkl").toAutomaton();
- assertAutomatonHits(1, a);
- }
-
- /**
- * Try when the constant suffix is only a leading trail surrogate.
- * instead this must use an empty suffix.
- */
- public void testSurrogateSuffix2() throws IOException {
- Automaton a = new RegExp(".*\uDF05").toAutomaton();
- assertAutomatonHits(1, a);
- }
}
Index: lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java (revision 940218)
+++ lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java (working copy)
@@ -145,10 +145,8 @@
.makeString("foobar"));
assertEquals(a1, a2);
- assertEquals(a1.hashCode(), a2.hashCode());
assertEquals(a1, a3);
- assertEquals(a1.hashCode(), a3.hashCode());
assertEquals(a1.toString(), a3.toString());
Index: lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java (revision 940218)
+++ lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java (working copy)
@@ -31,9 +31,11 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
-import org.apache.lucene.util.automaton.RunAutomaton;
/**
* Create an index with random unicode terms
@@ -46,17 +48,17 @@
@Override
protected void setUp() throws Exception {
super.setUp();
- random = newRandom(System.nanoTime());
+ random = newRandom();
RAMDirectory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new KeywordAnalyzer(),
IndexWriter.MaxFieldLength.UNLIMITED);
Document doc = new Document();
- Field field = new Field("field", "", Field.Store.YES, Field.Index.ANALYZED);
+ Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED);
doc.add(field);
- for (int i = 0; i < 1000; i++) {
- field.setValue(randomString());
+ for (int i = 0; i < 2000; i++) {
+ field.setValue(_TestUtil.randomUnicodeString(random));
writer.addDocument(doc);
}
@@ -87,7 +89,7 @@
}
private class SimpleAutomatonTermsEnum extends FilteredTermsEnum {
- RunAutomaton runAutomaton = new RunAutomaton(automaton);
+ CharacterRunAutomaton runAutomaton = new CharacterRunAutomaton(automaton);
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
private SimpleAutomatonTermsEnum(IndexReader reader, String field) throws IOException {
@@ -111,25 +113,14 @@
/** test a bunch of random regular expressions */
public void testRegexps() throws Exception {
- for (int i = 0; i < 500; i++)
- assertSame(randomRegex());
+ for (int i = 0; i < 1000; i++)
+ assertSame(AutomatonTestUtil.randomRegexp(random).toString());
}
/** check that the # of hits is the same as from a very
* simple regexpquery implementation.
*/
- private void assertSame(String regexp) throws IOException {
- // we will generate some illegal syntax regular expressions...
- try {
- new RegExp(regexp).toAutomaton();
- } catch (Exception e) {
- return;
- }
-
- // we will also generate some undefined unicode queries
- if (!UnicodeUtil.validUTF16String(regexp))
- return;
-
+ private void assertSame(String regexp) throws IOException {
RegexpQuery smart = new RegexpQuery(new Term("field", regexp));
DumbRegexpQuery dumb = new DumbRegexpQuery(new Term("field", regexp));
@@ -143,79 +134,7 @@
TopDocs smartDocs = searcher.search(smart, 25);
TopDocs dumbDocs = searcher.search(dumb, 25);
-
- assertEquals(dumbDocs.totalHits, smartDocs.totalHits);
+
+ assertEquals("for re:" + regexp, dumbDocs.totalHits, smartDocs.totalHits);
}
-
- char buffer[] = new char[20];
-
- // start is inclusive and end is exclusive
- public int nextInt(int start, int end) {
- return start + random.nextInt(end - start);
- }
-
- public String randomString() {
- final int end = random.nextInt(20);
- if (buffer.length < 1 + end) {
- char[] newBuffer = new char[(int) ((1 + end) * 1.25)];
- System.arraycopy(buffer, 0, newBuffer, 0, buffer.length);
- buffer = newBuffer;
- }
- for (int i = 0; i < end - 1; i++) {
- int t = random.nextInt(6);
- if (0 == t && i < end - 1) {
- // Make a surrogate pair
- // High surrogate
- buffer[i++] = (char) nextInt(0xd800, 0xdc00);
- // Low surrogate
- buffer[i] = (char) nextInt(0xdc00, 0xe000);
- } else if (t <= 1) buffer[i] = (char) random.nextInt(0x80);
- else if (2 == t) buffer[i] = (char) nextInt(0x80, 0x800);
- else if (3 == t) buffer[i] = (char) nextInt(0x800, 0xd800);
- else if (4 == t) buffer[i] = (char) nextInt(0xe000, 0xffff);
- else if (5 == t) {
- // Illegal unpaired surrogate
- if (random.nextBoolean()) buffer[i] = (char) nextInt(0xd800, 0xdc00);
- else buffer[i] = (char) nextInt(0xdc00, 0xe000);
- }
- }
- return new String(buffer, 0, end);
- }
-
- // a random string biased towards populating a ton of operators
- public String randomRegex() {
- final int end = random.nextInt(20);
- if (buffer.length < 1 + end) {
- char[] newBuffer = new char[(int) ((1 + end) * 1.25)];
- System.arraycopy(buffer, 0, newBuffer, 0, buffer.length);
- buffer = newBuffer;
- }
- for (int i = 0; i < end - 1; i++) {
- int t = random.nextInt(10);
- if (0 == t && i < end - 1) {
- // Make a surrogate pair
- // High surrogate
- buffer[i++] = (char) nextInt(0xd800, 0xdc00);
- // Low surrogate
- buffer[i] = (char) nextInt(0xdc00, 0xe000);
- } else if (t <= 1) buffer[i] = (char) random.nextInt(0x80);
- else if (2 == t) buffer[i] = (char) nextInt(0x80, 0x800);
- else if (3 == t) buffer[i] = (char) nextInt(0x800, 0xd800);
- else if (4 == t) buffer[i] = (char) nextInt(0xe000, 0xffff);
- else if (5 == t) {
- // Illegal unpaired surrogate
- if (random.nextBoolean()) buffer[i] = (char) nextInt(0xd800, 0xdc00);
- else buffer[i] = (char) nextInt(0xdc00, 0xe000);
- } else if (6 == t) {
- buffer[i] = '.';
- } else if (7 == t) {
- buffer[i] = '?';
- } else if (8 == t) {
- buffer[i] = '*';
- } else if (9 == t) {
- buffer[i] = '+';
- }
- }
- return new String(buffer, 0, end);
- }
}
Index: lucene/src/test/org/apache/lucene/util/TestArrayUtil.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/TestArrayUtil.java (revision 940218)
+++ lucene/src/test/org/apache/lucene/util/TestArrayUtil.java (working copy)
@@ -102,4 +102,17 @@
}
+ public void testSliceEquals() {
+ String left = "this is equal";
+ String right = left;
+ char[] leftChars = left.toCharArray();
+ char[] rightChars = right.toCharArray();
+ assertTrue(left + " does not equal: " + right, ArrayUtil.equals(leftChars, 0, rightChars, 0, left.length()));
+
+ assertFalse(left + " does not equal: " + right, ArrayUtil.equals(leftChars, 1, rightChars, 0, left.length()));
+ assertFalse(left + " does not equal: " + right, ArrayUtil.equals(leftChars, 1, rightChars, 2, left.length()));
+
+ assertFalse(left + " does not equal: " + right, ArrayUtil.equals(leftChars, 25, rightChars, 0, left.length()));
+ assertFalse(left + " does not equal: " + right, ArrayUtil.equals(leftChars, 12, rightChars, 0, left.length()));
+ }
}
Index: lucene/src/test/org/apache/lucene/util/_TestUtil.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/_TestUtil.java (revision 940218)
+++ lucene/src/test/org/apache/lucene/util/_TestUtil.java (working copy)
@@ -111,4 +111,33 @@
buf.append("]");
return buf.toString();
}
+ /** start and end are BOTH inclusive */
+ public static int nextInt(Random r, int start, int end) {
+ return start + r.nextInt(end-start+1);
+ }
+
+ /** Returns random string, including full unicode range. */
+ public static String randomUnicodeString(Random r) {
+ final int end = r.nextInt(20);
+ if (end == 0) {
+ // allow 0 length
+ return "";
+ }
+ final char[] buffer = new char[end];
+ for (int i = 0; i < end; i++) {
+ int t = r.nextInt(5);
+ if (0 == t && i < end - 1) {
+ // Make a surrogate pair
+ // High surrogate
+ buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff);
+ // Low surrogate
+ buffer[i] = (char) nextInt(r, 0xdc00, 0xdfff);
+ }
+ else if (t <= 1) buffer[i] = (char) r.nextInt(0x80);
+ else if (2 == t) buffer[i] = (char) nextInt(r, 0x80, 0x800);
+ else if (3 == t) buffer[i] = (char) nextInt(r, 0x800, 0xd7ff);
+ else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xffff);
+ }
+ return new String(buffer, 0, end);
+ }
}
Index: lucene/src/test/org/apache/lucene/util/TestUnicodeUtil.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/TestUnicodeUtil.java (revision 940218)
+++ lucene/src/test/org/apache/lucene/util/TestUnicodeUtil.java (working copy)
@@ -17,7 +17,9 @@
* limitations under the License.
*/
+import java.util.Random;
+
/*
* Some of this code came from the excellent Unicode
* conversion examples from:
@@ -81,4 +83,47 @@
assertEquals("dogs\uE000", UnicodeUtil.nextValidUTF16String("dogs\uDC00"));
assertEquals("\uE000", UnicodeUtil.nextValidUTF16String("\uDC00dogs"));
}
+
+ public void testCodePointCount() {
+ final Random r = newRandom();
+ BytesRef utf8 = new BytesRef(20);
+ for(int i=0;i<50000;i++) {
+ final String s = _TestUtil.randomUnicodeString(r);
+ UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
+ assertEquals(s.codePointCount(0, s.length()),
+ UnicodeUtil.codePointCount(utf8));
+ }
+ }
+
+ public void testUTF8toUTF32() {
+ final Random r = newRandom();
+ BytesRef utf8 = new BytesRef(20);
+ IntsRef utf32 = new IntsRef(20);
+ int[] codePoints = new int[20];
+ for(int i=0;i<50000;i++) {
+ final String s = _TestUtil.randomUnicodeString(r);
+ UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
+ UnicodeUtil.UTF8toUTF32(utf8, utf32);
+
+ int charUpto = 0;
+ int intUpto = 0;
+ while(charUpto < s.length()) {
+ final int cp = s.codePointAt(charUpto);
+ codePoints[intUpto++] = cp;
+ charUpto += Character.charCount(cp);
+ }
+ if (!ArrayUtil.equals(codePoints, 0, utf32.ints, utf32.offset, intUpto)) {
+ System.out.println("FAILED");
+ for(int j=0;j automata = new ArrayList();
+ private List terms = new ArrayList();
+ private Random random;
+
+ @Override
+ protected void setUp() throws Exception {
+ super.setUp();
+ random = newRandom();
+ for (int i = 0; i < 5000; i++) {
+ String randomString = _TestUtil.randomUnicodeString(random);
+ terms.add(randomString);
+ automata.add(BasicAutomata.makeString(randomString));
+ }
+ }
+
+ public void testLexicon() {
+ for (int i = 0; i < 3; i++) {
+ assertLexicon();
+ }
+ }
+
+ public void assertLexicon() {
+ Collections.shuffle(automata, random);
+ final Automaton lex = BasicOperations.union(automata);
+ lex.determinize();
+ assertTrue(SpecialOperations.isFinite(lex));
+ for (String s : terms) {
+ assertTrue(BasicOperations.run(lex, s));
+ }
+ final ByteRunAutomaton lexByte = new ByteRunAutomaton(lex);
+ for (String s : terms) {
+ BytesRef termByte = new BytesRef(s);
+ assertTrue(lexByte.run(termByte.bytes, 0, termByte.length));
+ }
+ }
+}
Property changes on: lucene/src/test/org/apache/lucene/util/automaton/TestDeterminizeLexicon.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java (revision 940218)
+++ lucene/src/test/org/apache/lucene/util/automaton/TestBasicOperations.java (working copy)
@@ -26,7 +26,7 @@
Automaton other = BasicAutomata.makeCharRange('5', '7');
Automaton concat = BasicOperations.concatenate(singleton, other);
assertTrue(concat.isDeterministic());
- assertEquals(BasicOperations.concatenate(expandedSingleton, other), concat);
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.concatenate(expandedSingleton, other), concat));
}
/** Test optimization to concatenate() to an NFA */
@@ -38,7 +38,7 @@
BasicAutomata.makeString("three"));
Automaton concat = BasicOperations.concatenate(singleton, nfa);
assertFalse(concat.isDeterministic());
- assertEquals(BasicOperations.concatenate(expandedSingleton, nfa), concat);
+ assertTrue(BasicOperations.sameLanguage(BasicOperations.concatenate(expandedSingleton, nfa), concat));
}
/** Test optimization to concatenate() with empty String */
@@ -49,9 +49,9 @@
Automaton concat1 = BasicOperations.concatenate(expandedSingleton, other);
Automaton concat2 = BasicOperations.concatenate(singleton, other);
assertTrue(concat2.isDeterministic());
- assertEquals(concat1, concat2);
- assertEquals(other, concat1);
- assertEquals(other, concat2);
+ assertTrue(BasicOperations.sameLanguage(concat1, concat2));
+ assertTrue(BasicOperations.sameLanguage(other, concat1));
+ assertTrue(BasicOperations.sameLanguage(other, concat2));
}
/** Test optimization to concatenate() with empty String to an NFA */
@@ -64,8 +64,19 @@
Automaton concat1 = BasicOperations.concatenate(expandedSingleton, nfa);
Automaton concat2 = BasicOperations.concatenate(singleton, nfa);
assertFalse(concat2.isDeterministic());
- assertEquals(concat1, concat2);
- assertEquals(nfa, concat1);
- assertEquals(nfa, concat2);
+ assertTrue(BasicOperations.sameLanguage(concat1, concat2));
+ assertTrue(BasicOperations.sameLanguage(nfa, concat1));
+ assertTrue(BasicOperations.sameLanguage(nfa, concat2));
}
+
+ /** Test singletons work correctly */
+ public void testSingleton() {
+ Automaton singleton = BasicAutomata.makeString("foobar");
+ Automaton expandedSingleton = singleton.cloneExpanded();
+ assertTrue(BasicOperations.sameLanguage(singleton, expandedSingleton));
+
+ singleton = BasicAutomata.makeString("\ud801\udc1c");
+ expandedSingleton = singleton.cloneExpanded();
+ //assertEquals(singleton, expandedSingleton);
+ }
}
Index: lucene/src/test/org/apache/lucene/util/automaton/AutomatonTestUtil.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/automaton/AutomatonTestUtil.java (revision 0)
+++ lucene/src/test/org/apache/lucene/util/automaton/AutomatonTestUtil.java (revision 0)
@@ -0,0 +1,68 @@
+package org.apache.lucene.util.automaton;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Random;
+
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util._TestUtil;
+
+public class AutomatonTestUtil {
+ /** Returns random string, including full unicode range. */
+ public static RegExp randomRegexp(Random r) {
+ while (true) {
+ String regexp = randomRegexpString(r);
+ // we will also generate some undefined unicode queries
+ if (!UnicodeUtil.validUTF16String(regexp))
+ continue;
+ try {
+ return new RegExp(regexp, RegExp.NONE);
+ } catch (Exception e) {}
+ }
+ }
+
+ private static String randomRegexpString(Random r) {
+ final int end = r.nextInt(20);
+ if (end == 0) {
+ // allow 0 length
+ return "";
+ }
+ final char[] buffer = new char[end];
+ for (int i = 0; i < end; i++) {
+ int t = r.nextInt(11);
+ if (0 == t && i < end - 1) {
+ // Make a surrogate pair
+ // High surrogate
+ buffer[i++] = (char) _TestUtil.nextInt(r, 0xd800, 0xdbff);
+ // Low surrogate
+ buffer[i] = (char) _TestUtil.nextInt(r, 0xdc00, 0xdfff);
+ }
+ else if (t <= 1) buffer[i] = (char) r.nextInt(0x80);
+ else if (2 == t) buffer[i] = (char) _TestUtil.nextInt(r, 0x80, 0x800);
+ else if (3 == t) buffer[i] = (char) _TestUtil.nextInt(r, 0x800, 0xd7ff);
+ else if (4 == t) buffer[i] = (char) _TestUtil.nextInt(r, 0xe000, 0xffff);
+ else if (5 == t) buffer[i] = '.';
+ else if (6 == t) buffer[i] = '?';
+ else if (7 == t) buffer[i] = '*';
+ else if (8 == t) buffer[i] = '+';
+ else if (9 == t) buffer[i] = '(';
+ else if (10 == t) buffer[i] = ')';
+ }
+ return new String(buffer, 0, end);
+ }
+}
Property changes on: lucene/src/test/org/apache/lucene/util/automaton/AutomatonTestUtil.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java (revision 0)
+++ lucene/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java (revision 0)
@@ -0,0 +1,183 @@
+package org.apache.lucene.util.automaton;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+
+import java.util.Random;
+
+public class TestUTF32ToUTF8 extends LuceneTestCase {
+ private Random random;
+
+ @Override
+ protected void setUp() throws Exception {
+ super.setUp();
+ random = newRandom();
+ }
+
+ private static final int MAX_UNICODE = 0x10FFFF;
+
+ final BytesRef b = new BytesRef(4);
+
+ private boolean matches(ByteRunAutomaton a, int code) {
+ char[] chars = Character.toChars(code);
+ UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b);
+ return a.run(b.bytes, 0, b.length);
+ }
+
+ private void testOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters) {
+
+ // Verify correct ints are accepted
+ for(int iter=0;iter= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) |
+ (code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) {
+ iter--;
+ continue;
+ }
+ assertTrue("DFA for range " + startCode + "-" + endCode + " failed to match code=" + code,
+ matches(a, code));
+ }
+
+ // Verify invalid ints are not accepted
+ final int invalidRange = MAX_UNICODE - (endCode - startCode + 1);
+ if (invalidRange > 0) {
+ for(int iter=0;iter= startCode) {
+ code = endCode + 1 + x - startCode;
+ } else {
+ code = x;
+ }
+ if ((code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) |
+ (code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) {
+ iter--;
+ continue;
+ }
+ assertFalse("DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code,
+ matches(a, code));
+
+ }
+ }
+ }
+
+ // Evenly picks random code point from the 4 "buckets"
+ // (bucket = same #bytes when encoded to utf8)
+ private int getCodeStart(Random r) {
+ switch(r.nextInt(4)) {
+ case 0:
+ return _TestUtil.nextInt(r, 0, 128);
+ case 1:
+ return _TestUtil.nextInt(r, 128, 2048);
+ case 2:
+ return _TestUtil.nextInt(r, 2048, 65536);
+ default:
+ return _TestUtil.nextInt(r, 65536, 1+MAX_UNICODE);
+ }
+ }
+
+ public void testRandomRanges() throws Exception {
+ final Random r = random;
+ int ITERS = 10;
+ int ITERS_PER_DFA = 100;
+ for(int iter=0;iter 1.0f / (1.0f - minimumSimilarity)) {
+ String text = term.text();
+ if (text.codePointCount(0, text.length()) > 1.0f / (1.0f - minimumSimilarity)) {
this.termLongEnough = true;
}
Index: lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (working copy)
@@ -21,11 +21,9 @@
import java.util.Comparator;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.RunAutomaton;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
@@ -51,7 +49,7 @@
// the object-oriented form of the DFA
private final Automaton automaton;
// a tableized array-based form of the DFA
- private final RunAutomaton runAutomaton;
+ private final ByteRunAutomaton runAutomaton;
// common suffix of the automaton
private final BytesRef commonSuffixRef;
// true if the automaton accepts a finite language
@@ -62,8 +60,6 @@
// visited the state; we use gens to avoid having to clear
private final long[] visited;
private long curGen;
- // used for unicode conversion from BytesRef byte[] to char[]
- private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
// the reference used for seeking forwards through the term dictionary
private final BytesRef seekBytesRef = new BytesRef(10);
// true if we are enumerating an infinite portion of the DFA.
@@ -72,7 +68,6 @@
// of terms where we should simply do sequential reads instead.
private boolean linear = false;
private final BytesRef linearUpperBound = new BytesRef(10);
- private final UnicodeUtil.UTF16Result linearUpperBoundUTF16 = new UnicodeUtil.UTF16Result();
private final Comparator termComp;
/**
@@ -80,39 +75,38 @@
* Construct an enumerator based upon an automaton, enumerating the specified
* field, working on a supplied reader.
*
- * @lucene.internal Use the public ctor instead. This constructor allows the
- * (dangerous) option of passing in a pre-compiled RunAutomaton. If you use
- * this ctor and compile your own RunAutomaton, you are responsible for
- * ensuring it is in sync with the Automaton object, including internal
- * State numbering, or you will get undefined behavior.
+ * @lucene.internal Use the public ctor instead.
*
- * @param preCompiled optional pre-compiled RunAutomaton (can be null)
+ * @param runAutomaton pre-compiled ByteRunAutomaton
* @param finite true if the automaton accepts a finite language
*/
- AutomatonTermsEnum(Automaton automaton, RunAutomaton preCompiled,
- Term queryTerm, IndexReader reader, boolean finite)
+ AutomatonTermsEnum(ByteRunAutomaton runAutomaton,
+ String field, IndexReader reader,
+ boolean finite, BytesRef commonSuffixRef)
throws IOException {
- super(reader, queryTerm.field());
- this.automaton = automaton;
+ super(reader, field);
+ this.automaton = runAutomaton.getAutomaton();
this.finite = finite;
- /*
- * tableize the automaton. this also ensures it is deterministic, and has no
- * transitions to dead states. it also invokes Automaton.setStateNumbers to
- * number the original states (this is how they are tableized)
- */
- if (preCompiled == null)
- runAutomaton = new RunAutomaton(this.automaton);
- else
- runAutomaton = preCompiled;
+ this.runAutomaton = runAutomaton;
+ if (finite) {
+ // don't use suffix w/ finite DFAs
+ this.commonSuffixRef = null;
+ } else if (commonSuffixRef == null) {
+ // compute now
+ this.commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(automaton);
+ } else {
+ // precomputed
+ this.commonSuffixRef = commonSuffixRef;
+ }
- commonSuffixRef = finite ? null : new BytesRef(getValidUTF16Suffix(SpecialOperations
- .getCommonSuffix(automaton)));
-
// build a cache of sorted transitions for every state
allTransitions = new Transition[runAutomaton.getSize()][];
- for (State state : this.automaton.getStates())
- allTransitions[state.getNumber()] = state.getSortedTransitionArray(false);
+ for (State state : this.automaton.getNumberedStates()) {
+ state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order);
+ state.trimTransitionsArray();
+ allTransitions[state.getNumber()] = state.transitionsArray;
+ }
// used for path tracking, where each bit is a numbered state.
visited = new long[runAutomaton.getSize()];
@@ -126,9 +120,9 @@
*
* It will automatically calculate whether or not the automaton is finite
*/
- public AutomatonTermsEnum(Automaton automaton, Term queryTerm, IndexReader reader)
- throws IOException {
- this(automaton, null, queryTerm, reader, SpecialOperations.isFinite(automaton));
+ public AutomatonTermsEnum(Automaton automaton, String field, IndexReader reader)
+ throws IOException {
+ this(new ByteRunAutomaton(automaton), field, reader, SpecialOperations.isFinite(automaton), null);
}
/**
@@ -138,8 +132,7 @@
@Override
protected AcceptStatus accept(final BytesRef term) {
if (commonSuffixRef == null || term.endsWith(commonSuffixRef)) {
- UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
- if (runAutomaton.run(utf16.result, 0, utf16.length))
+ if (runAutomaton.run(term.bytes, term.offset, term.length))
return linear ? AcceptStatus.YES : AcceptStatus.YES_AND_SEEK;
else
return (linear && termComp.compare(term, linearUpperBound) < 0) ?
@@ -153,15 +146,13 @@
@Override
protected BytesRef nextSeekTerm(final BytesRef term) throws IOException {
if (term == null) {
+ seekBytesRef.copy("");
// return the empty term, as its valid
- if (runAutomaton.run("")) {
- seekBytesRef.copy("");
+ if (runAutomaton.run(seekBytesRef.bytes, seekBytesRef.offset, seekBytesRef.length)) {
return seekBytesRef;
}
-
- utf16.copyText("");
} else {
- UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
+ seekBytesRef.copy(term);
}
// seek to the next possible string;
@@ -169,8 +160,6 @@
// reposition
if (linear)
setLinear(infinitePosition);
- UnicodeUtil.nextValidUTF16String(utf16);
- UnicodeUtil.UTF16toUTF8(utf16.result, 0, utf16.length, seekBytesRef);
return seekBytesRef;
}
// no more possible strings can match
@@ -187,27 +176,28 @@
*/
private void setLinear(int position) {
int state = runAutomaton.getInitialState();
- char maxInterval = 0xffff;
- for (int i = 0; i < position; i++)
- state = runAutomaton.step(state, utf16.result[i]);
+ int maxInterval = 0xef;
+ for (int i = 0; i < position; i++) {
+ state = runAutomaton.step(state, seekBytesRef.bytes[i] & 0xff);
+ assert state >= 0: "state=" + state;
+ }
for (int i = 0; i < allTransitions[state].length; i++) {
Transition t = allTransitions[state][i];
- if (t.getMin() <= utf16.result[position] && utf16.result[position] <= t.getMax()) {
+ if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 &&
+ compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) {
maxInterval = t.getMax();
break;
}
}
- // 0xffff terms don't get the optimization... not worth the trouble.
- if (maxInterval < 0xffff)
- maxInterval++;
+ // 0xef terms don't get the optimization... not worth the trouble.
+ if (maxInterval != 0xef)
+ maxInterval = incrementUTF16(maxInterval);
int length = position + 1; /* position + maxTransition */
- if (linearUpperBoundUTF16.result.length < length)
- linearUpperBoundUTF16.result = new char[length];
- System.arraycopy(utf16.result, 0, linearUpperBoundUTF16.result, 0, position);
- linearUpperBoundUTF16.result[position] = maxInterval;
- linearUpperBoundUTF16.setLength(length);
- UnicodeUtil.nextValidUTF16String(linearUpperBoundUTF16);
- UnicodeUtil.UTF16toUTF8(linearUpperBoundUTF16.result, 0, length, linearUpperBound);
+ if (linearUpperBound.bytes.length < length)
+ linearUpperBound.bytes = new byte[length];
+ System.arraycopy(seekBytesRef.bytes, 0, linearUpperBound.bytes, 0, position);
+ linearUpperBound.bytes[position] = (byte) maxInterval;
+ linearUpperBound.length = length;
}
/**
@@ -229,9 +219,9 @@
linear = false;
state = runAutomaton.getInitialState();
// walk the automaton until a character is rejected.
- for (pos = 0; pos < utf16.length; pos++) {
+ for (pos = 0; pos < seekBytesRef.length; pos++) {
visited[state] = curGen;
- int nextState = runAutomaton.step(state, utf16.result[pos]);
+ int nextState = runAutomaton.step(state, seekBytesRef.bytes[pos] & 0xff);
if (nextState == -1)
break;
// we found a loop, record it for faster enumeration
@@ -249,7 +239,7 @@
} else { /* no more solutions exist from this useful portion, backtrack */
if (!backtrack(pos)) /* no more solutions at all */
return false;
- else if (runAutomaton.run(utf16.result, 0, utf16.length))
+ else if (runAutomaton.run(seekBytesRef.bytes, 0, seekBytesRef.length))
/* String is good to go as-is */
return true;
/* else advance further */
@@ -280,19 +270,18 @@
* the next lexicographic character must be greater than the existing
* character, if it exists.
*/
- char c = 0;
- if (position < utf16.length) {
- c = utf16.result[position];
+ int c = 0;
+ if (position < seekBytesRef.length) {
+ c = seekBytesRef.bytes[position] & 0xff;
// if the next character is U+FFFF and is not part of the useful portion,
// then by definition it puts us in a reject state, and therefore this
// path is dead. there cannot be any higher transitions. backtrack.
- if (c == '\uFFFF')
+ c = incrementUTF16(c);
+ if (c == -1)
return false;
- else
- c++;
}
- utf16.setLength(position);
+ seekBytesRef.length = position;
visited[state] = curGen;
Transition transitions[] = allTransitions[state];
@@ -301,11 +290,12 @@
for (int i = 0; i < transitions.length; i++) {
Transition transition = transitions[i];
- if (transition.getMax() >= c) {
- char nextChar = (char) Math.max(c, transition.getMin());
+ if (compareToUTF16(transition.getMax(), c) >= 0) {
+ int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin();
// append either the next sequential char, or the minimum transition
- utf16.setLength(utf16.length + 1);
- utf16.result[utf16.length - 1] = nextChar;
+ seekBytesRef.grow(seekBytesRef.length + 1);
+ seekBytesRef.length++;
+ seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) nextChar;
state = transition.getDest().getNumber();
/*
* as long as is possible, continue down the minimal path in
@@ -323,11 +313,12 @@
// we found a loop, record it for faster enumeration
if (!finite && !linear && visited[state] == curGen) {
linear = true;
- infinitePosition = utf16.length;
+ infinitePosition = seekBytesRef.length;
}
// append the minimum transition
- utf16.setLength(utf16.length + 1);
- utf16.result[utf16.length - 1] = transition.getMin();
+ seekBytesRef.grow(seekBytesRef.length + 1);
+ seekBytesRef.length++;
+ seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin();
}
return true;
}
@@ -345,33 +336,48 @@
*/
private boolean backtrack(int position) {
while (position > 0) {
- char nextChar = utf16.result[position - 1];
- // if a character is U+FFFF its a dead-end too,
+ int nextChar = seekBytesRef.bytes[position - 1] & 0xff;
+ // if a character is 0xef it's a dead-end too,
// because there is no higher character in UTF-16 sort order.
- if (nextChar != '\uFFFF') {
- nextChar++;
- utf16.result[position - 1] = nextChar;
- utf16.setLength(position);
+ nextChar = incrementUTF16(nextChar);
+ if (nextChar != -1) {
+ seekBytesRef.bytes[position - 1] = (byte) nextChar;
+ seekBytesRef.length = position;
return true;
}
position--;
}
return false; /* all solutions exhausted */
}
+
+ /* Returns the next UTF-8 byte in UTF-16 sort order, or -1 if exhausted. */
+ private final int incrementUTF16(int utf8) {
+ switch(utf8) {
+ case 0xed: return 0xf0;
+ case 0xfd: return 0xee;
+ case 0xee: return 0xef;
+ case 0xef: return -1;
+ default: return utf8 + 1;
+ }
+ }
- /**
- * if the suffix starts with a low surrogate, remove it.
- * This won't be quite as efficient, but can be converted to valid UTF-8
- *
- * This isn't nearly as complex as cleanupPosition, because its not
- * going to use this suffix to walk any path thru the terms.
- *
- */
- private String getValidUTF16Suffix(String suffix) {
- if (suffix != null && suffix.length() > 0 &&
- Character.isLowSurrogate(suffix.charAt(0)))
- return suffix.substring(1);
- else
- return suffix;
+ int compareToUTF16(int aByte, int bByte) {
+ if (aByte != bByte) {
+ // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
+
+ // We know the terms are not equal, but, we may
+ // have to carefully fixup the bytes at the
+ // difference to match UTF16's sort order:
+ if (aByte >= 0xee && bByte >= 0xee) {
+ if ((aByte & 0xfe) == 0xee) {
+ aByte += 0x10;
+ }
+ if ((bByte&0xfe) == 0xee) {
+ bByte += 0x10;
+ }
+ }
+ return aByte - bByte;
+ }
+ return 0;
}
}
Index: lucene/src/java/org/apache/lucene/search/WildcardQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/WildcardQuery.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/search/WildcardQuery.java (working copy)
@@ -63,8 +63,8 @@
String wildcardText = wildcardquery.text();
- for (int i = 0; i < wildcardText.length(); i++) {
- final char c = wildcardText.charAt(i);
+ for (int i = 0; i < wildcardText.length();) {
+ final int c = wildcardText.codePointAt(i);
switch(c) {
case WILDCARD_STRING:
automata.add(BasicAutomata.makeAnyString());
@@ -75,6 +75,7 @@
default:
automata.add(BasicAutomata.makeChar(c));
}
+ i += Character.charCount(c);
}
return BasicOperations.concatenate(automata);
Index: lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (working copy)
@@ -24,12 +24,13 @@
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
-import org.apache.lucene.util.automaton.RunAutomaton;
import java.io.IOException;
import java.util.ArrayList;
@@ -49,7 +50,7 @@
private final MultiTermQuery.BoostAttribute boostAtt =
attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
-
+
private float bottom = boostAtt.getMaxNonCompetitiveBoost();
private final float minSimilarity;
@@ -58,11 +59,12 @@
private final int termLength;
private int maxEdits;
- private List automata;
- private List runAutomata;
+
+ private List runAutomata;
private final IndexReader reader;
private final Term term;
+ private final int termText[];
private final int realPrefixLength;
/**
@@ -89,9 +91,16 @@
throw new IllegalArgumentException("prefixLength cannot be less than 0");
this.reader = reader;
this.term = term;
+
+ // convert the string into a utf32 int[] representation for fast comparisons
+ final String utf16 = term.text();
+ this.termText = new int[utf16.codePointCount(0, utf16.length())];
+ for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
+ termText[j++] = cp = utf16.codePointAt(i);
+ this.termLength = termText.length;
+
//The prefix could be longer than the word.
//It's kind of silly though. It means we must match the entire word.
- this.termLength = term.text().length();
this.realPrefixLength = prefixLength > termLength ? termLength : prefixLength;
this.minSimilarity = minSimilarity;
this.scale_factor = 1.0f / (1.0f - minSimilarity);
@@ -101,7 +110,7 @@
TermsEnum subEnum = getAutomatonEnum(maxEdits, null);
setEnum(subEnum != null ? subEnum :
- new LinearFuzzyTermsEnum(reader, term, minSimilarity, prefixLength));
+ new LinearFuzzyTermsEnum());
}
/**
@@ -111,37 +120,35 @@
private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm)
throws IOException {
initAutomata(editDistance);
- if (automata != null && editDistance < automata.size()) {
- return new AutomatonFuzzyTermsEnum(automata.get(editDistance), term,
- reader, minSimilarity, runAutomata.subList(0, editDistance + 1)
- .toArray(new RunAutomaton[0]), lastTerm);
+ if (runAutomata != null && editDistance < runAutomata.size()) {
+ return new AutomatonFuzzyTermsEnum(runAutomata.subList(0, editDistance + 1)
+ .toArray(new ByteRunAutomaton[0]), lastTerm);
} else {
return null;
}
}
-
+
/** initialize levenshtein DFAs up to maxDistance, if possible */
private void initAutomata(int maxDistance) {
- if (automata == null &&
+ if (runAutomata == null &&
maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
LevenshteinAutomata builder =
- new LevenshteinAutomata(term.text().substring(realPrefixLength));
- automata = new ArrayList(maxDistance);
- runAutomata = new ArrayList(maxDistance);
+ new LevenshteinAutomata(new String(termText, realPrefixLength, termText.length - realPrefixLength));
+
+ runAutomata = new ArrayList(maxDistance);
for (int i = 0; i <= maxDistance; i++) {
Automaton a = builder.toAutomaton(i);
// constant prefix
if (realPrefixLength > 0) {
Automaton prefix = BasicAutomata.makeString(
- term.text().substring(0, realPrefixLength));
+ new String(termText, 0, realPrefixLength));
a = BasicOperations.concatenate(prefix, a);
}
- automata.add(a);
- runAutomata.add(new RunAutomaton(a));
+ runAutomata.add(new ByteRunAutomaton(a));
}
}
}
-
+
/** swap in a new actual enum to proxy to */
private void setEnum(TermsEnum actualEnum) {
this.actualEnum = actualEnum;
@@ -173,7 +180,7 @@
// itself: re-init maxDistances so the fast-fail happens for more terms due
// to the now stricter constraints.
}
-
+
// for some raw min similarity and input term length, the maximum # of edits
private int initialMaxDistance(float minimumSimilarity, int termLen) {
return (int) ((1-minimumSimilarity) * termLen);
@@ -242,314 +249,292 @@
public BytesRef term() throws IOException {
return actualEnum.term();
}
-}
-
-/**
- * Implement fuzzy enumeration with automaton.
- *
- * This is the fastest method as opposed to LinearFuzzyTermsEnum:
- * as enumeration is logarithmic to the number of terms (instead of linear)
- * and comparison is linear to length of the term (rather than quadratic)
- */
-final class AutomatonFuzzyTermsEnum extends AutomatonTermsEnum {
- private final RunAutomaton matchers[];
- // used for unicode conversion from BytesRef byte[] to char[]
- private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
- private final float minimumSimilarity;
- private final float scale_factor;
-
- private final int fullSearchTermLength;
- private final BytesRef termRef;
-
- private final BytesRef lastTerm;
- private final MultiTermQuery.BoostAttribute boostAtt =
- attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
-
- public AutomatonFuzzyTermsEnum(Automaton automaton, Term queryTerm,
- IndexReader reader, float minSimilarity, RunAutomaton matchers[], BytesRef lastTerm) throws IOException {
- super(automaton, matchers[matchers.length - 1], queryTerm, reader, true);
- this.minimumSimilarity = minSimilarity;
- this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
- this.matchers = matchers;
- this.lastTerm = lastTerm;
- termRef = new BytesRef(queryTerm.text());
- fullSearchTermLength = queryTerm.text().length();
- }
-
- /** finds the smallest Lev(n) DFA that accepts the term. */
- @Override
- protected AcceptStatus accept(BytesRef term) {
- if (term.equals(termRef)) { // ed = 0
- boostAtt.setBoost(1.0F);
- return AcceptStatus.YES_AND_SEEK;
- }
-
- UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
-
- // TODO: benchmark doing this backwards
- for (int i = 1; i < matchers.length; i++)
- if (matchers[i].run(utf16.result, 0, utf16.length)) {
- final float similarity = 1.0f - ((float) i / (float)
- (Math.min(utf16.length, fullSearchTermLength)));
- if (similarity > minimumSimilarity) {
- boostAtt.setBoost((float) ((similarity - minimumSimilarity) * scale_factor));
- return AcceptStatus.YES_AND_SEEK;
- } else {
- return AcceptStatus.NO_AND_SEEK;
- }
- }
-
- return AcceptStatus.NO_AND_SEEK;
- }
-
- /** defers to superclass, except can start at an arbitrary location */
- @Override
- protected BytesRef nextSeekTerm(BytesRef term) throws IOException {
- if (term == null)
- term = lastTerm;
- return super.nextSeekTerm(term);
- }
-}
-
-/**
- * Implement fuzzy enumeration with linear brute force.
- */
-final class LinearFuzzyTermsEnum extends FilteredTermsEnum {
-
- /* This should be somewhere around the average long word.
- * If it is longer, we waste time and space. If it is shorter, we waste a
- * little bit of time growing the array as we encounter longer words.
- */
- private static final int TYPICAL_LONGEST_WORD_IN_INDEX = 19;
-
- /* Allows us save time required to create a new array
- * every time similarity is called.
- */
- private int[][] d;
-
- private final char[] text;
- private final int prefixLen;
-
- private final float minimumSimilarity;
- private final float scale_factor;
- private final int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX];
-
- private final MultiTermQuery.BoostAttribute boostAtt =
- attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
-
/**
- * Constructor for enumeration of all terms from specified reader which share a prefix of
- * length prefixLength with term and which have a fuzzy similarity >
- * minSimilarity.
- *
- * After calling the constructor the enumeration is already pointing to the first
- * valid term if such a term exists.
- *
- * @param reader Delivers terms.
- * @param term Pattern term.
- * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
- * @param prefixLength Length of required common prefix. Default value is 0.
- * @throws IOException
+ * Finds and returns the smallest of three integers
*/
- public LinearFuzzyTermsEnum(IndexReader reader, Term term, final float minSimilarity, final int prefixLength) throws IOException {
- super(reader, term.field());
-
- if (minSimilarity >= 1.0f)
- throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal to 1");
- else if (minSimilarity < 0.0f)
- throw new IllegalArgumentException("minimumSimilarity cannot be less than 0");
- if(prefixLength < 0)
- throw new IllegalArgumentException("prefixLength cannot be less than 0");
-
- this.minimumSimilarity = minSimilarity;
- this.scale_factor = 1.0f / (1.0f - minimumSimilarity);
-
- //The prefix could be longer than the word.
- //It's kind of silly though. It means we must match the entire word.
- final int fullSearchTermLength = term.text().length();
- final int realPrefixLength = prefixLength > fullSearchTermLength ? fullSearchTermLength : prefixLength;
-
- this.text = term.text().substring(realPrefixLength).toCharArray();
- final String prefix = term.text().substring(0, realPrefixLength);
- prefixBytesRef = new BytesRef(prefix);
- prefixLen = prefix.length();
- initializeMaxDistances();
- this.d = initDistanceArray();
-
- setInitialSeekTerm(prefixBytesRef);
+ private static final int min(int a, int b, int c) {
+ final int t = (a < b) ? a : b;
+ return (t < c) ? t : c;
}
-
- private final BytesRef prefixBytesRef;
- // used for unicode conversion from BytesRef byte[] to char[]
- private final UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
/**
- * The termCompare method in FuzzyTermEnum uses Levenshtein distance to
- * calculate the distance between the given term and the comparing term.
+ * Implement fuzzy enumeration with automaton.
+ *
+ * This is the fastest method as opposed to LinearFuzzyTermsEnum:
+ * as enumeration is logarithmic to the number of terms (instead of linear)
+ * and comparison is linear to length of the term (rather than quadratic)
*/
- @Override
- protected final AcceptStatus accept(BytesRef term) {
- if (term.startsWith(prefixBytesRef)) {
- UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
- final float similarity = similarity(utf16.result, prefixLen, utf16.length - prefixLen);
- if (similarity > minimumSimilarity) {
- boostAtt.setBoost((float)((similarity - minimumSimilarity) * scale_factor));
- return AcceptStatus.YES;
- } else return AcceptStatus.NO;
- } else {
- return AcceptStatus.END;
+ private class AutomatonFuzzyTermsEnum extends AutomatonTermsEnum {
+ private final ByteRunAutomaton matchers[];
+
+ private final BytesRef termRef;
+
+ private final BytesRef lastTerm;
+ private final MultiTermQuery.BoostAttribute boostAtt =
+ attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
+
+ public AutomatonFuzzyTermsEnum(ByteRunAutomaton matchers[],
+ BytesRef lastTerm) throws IOException {
+ super(matchers[matchers.length - 1], term.field(), reader, true, null);
+ this.matchers = matchers;
+ this.lastTerm = lastTerm;
+ termRef = new BytesRef(term.text());
}
+
+ /** finds the smallest Lev(n) DFA that accepts the term. */
+ @Override
+ protected AcceptStatus accept(BytesRef term) {
+ if (term.equals(termRef)) { // ed = 0
+ boostAtt.setBoost(1.0F);
+ return AcceptStatus.YES_AND_SEEK;
+ }
+
+ int codePointCount = -1;
+
+ // TODO: benchmark doing this backwards
+ for (int i = 1; i < matchers.length; i++)
+ if (matchers[i].run(term.bytes, 0, term.length)) {
+ // TODO: costly -- we count code points here only to score by term length.
+ if (codePointCount == -1) {
+ codePointCount = UnicodeUtil.codePointCount(term);
+ }
+ final float similarity = 1.0f - ((float) i / (float)
+ (Math.min(codePointCount, termLength)));
+ if (similarity > minSimilarity) {
+ boostAtt.setBoost((float) ((similarity - minSimilarity) * scale_factor));
+ return AcceptStatus.YES_AND_SEEK;
+ } else {
+ return AcceptStatus.NO_AND_SEEK;
+ }
+ }
+
+ return AcceptStatus.NO_AND_SEEK;
+ }
+
+ /** defers to superclass, except can start at an arbitrary location */
+ @Override
+ protected BytesRef nextSeekTerm(BytesRef term) throws IOException {
+ if (term == null)
+ term = lastTerm;
+ return super.nextSeekTerm(term);
+ }
}
- /******************************
- * Compute Levenshtein distance
- ******************************/
-
/**
- * Finds and returns the smallest of three integers
+ * Implement fuzzy enumeration with linear brute force.
*/
- private static final int min(int a, int b, int c) {
- final int t = (a < b) ? a : b;
- return (t < c) ? t : c;
- }
+ private class LinearFuzzyTermsEnum extends FilteredTermsEnum {
+
+ /* This should be somewhere around the average long word.
+ * If it is longer, we waste time and space. If it is shorter, we waste a
+ * little bit of time growing the array as we encounter longer words.
+ */
+ private static final int TYPICAL_LONGEST_WORD_IN_INDEX = 19;
+
+ /* Allows us save time required to create a new array
+ * every time similarity is called.
+ */
+ private int[][] d;
+
+ // this is the text, minus the prefix
+ private final int[] text;
+
+ private final int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX];
+
+ private final MultiTermQuery.BoostAttribute boostAtt =
+ attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
+
+ /**
+ * Constructor for enumeration of all terms from specified reader which share a prefix of
+ * length prefixLength with term and which have a fuzzy similarity >
+ * minSimilarity.
+ *
+ * After calling the constructor the enumeration is already pointing to the first
+ * valid term if such a term exists.
+ *
+ * @param reader Delivers terms.
+ * @param term Pattern term.
+ * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f.
+ * @param prefixLength Length of required common prefix. Default value is 0.
+ * @throws IOException
+ */
+ public LinearFuzzyTermsEnum() throws IOException {
+ super(reader, term.field());
- private final int[][] initDistanceArray(){
- return new int[this.text.length + 1][TYPICAL_LONGEST_WORD_IN_INDEX];
- }
-
- /**
- *
Similarity returns a number that is 1.0f or less (including negative numbers)
- * based on how similar the Term is compared to a target term. It returns
- * exactly 0.0f when
- *
- * editDistance > maximumEditDistance
- * Otherwise it returns:
- *
- * 1 - (editDistance / length)
- * where length is the length of the shortest term (text or target) including a
- * prefix that are identical and editDistance is the Levenshtein distance for
- * the two words.
- *
- * Embedded within this algorithm is a fail-fast Levenshtein distance
- * algorithm. The fail-fast algorithm differs from the standard Levenshtein
- * distance algorithm in that it is aborted if it is discovered that the
- * minimum distance between the words is greater than some threshold.
- *
- *
To calculate the maximum distance threshold we use the following formula:
- *
- * (1 - minimumSimilarity) * length
- * where length is the shortest term including any prefix that is not part of the
- * similarity comparison. This formula was derived by solving for what maximum value
- * of distance returns false for the following statements:
- *
- * similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
- * return (similarity > minimumSimilarity);
- * where distance is the Levenshtein distance for the two words.
- *
- * Levenshtein distance (also known as edit distance) is a measure of similarity
- * between two strings where the distance is measured as the number of character
- * deletions, insertions or substitutions required to transform one string to
- * the other string.
- * @param target the target word or phrase
- * @return the similarity, 0.0 or less indicates that it matches less than the required
- * threshold and 1.0 indicates that the text and target are identical
- */
- private final float similarity(final char[] target, int offset, int length) {
- final int m = length;
- final int n = text.length;
- if (n == 0) {
- //we don't have anything to compare. That means if we just add
- //the letters for m we get the new word
- return prefixLen == 0 ? 0.0f : 1.0f - ((float) m / prefixLen);
+ this.text = new int[termLength - realPrefixLength];
+ System.arraycopy(termText, realPrefixLength, text, 0, text.length);
+ final String prefix = new String(termText, 0, realPrefixLength);
+ prefixBytesRef = new BytesRef(prefix);
+ initializeMaxDistances();
+ this.d = initDistanceArray();
+
+ setInitialSeekTerm(prefixBytesRef);
}
- if (m == 0) {
- return prefixLen == 0 ? 0.0f : 1.0f - ((float) n / prefixLen);
+
+ private final BytesRef prefixBytesRef;
+ // used for unicode conversion from BytesRef byte[] to int[]
+ private final IntsRef utf32 = new IntsRef(TYPICAL_LONGEST_WORD_IN_INDEX);
+
+ /**
+ * The termCompare method in FuzzyTermEnum uses Levenshtein distance to
+ * calculate the distance between the given term and the comparing term.
+ */
+ @Override
+ protected final AcceptStatus accept(BytesRef term) {
+ if (term.startsWith(prefixBytesRef)) {
+ UnicodeUtil.UTF8toUTF32(term, utf32);
+ final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
+ if (similarity > minSimilarity) {
+ boostAtt.setBoost((float)((similarity - minSimilarity) * scale_factor));
+ return AcceptStatus.YES;
+ } else return AcceptStatus.NO;
+ } else {
+ return AcceptStatus.END;
+ }
}
-
- final int maxDistance = getMaxDistance(m);
-
- if (maxDistance < Math.abs(m-n)) {
- //just adding the characters of m to n or vice-versa results in
- //too many edits
- //for example "pre" length is 3 and "prefixes" length is 8. We can see that
- //given this optimal circumstance, the edit distance cannot be less than 5.
- //which is 8-3 or more precisely Math.abs(3-8).
- //if our maximum edit distance is 4, then we can discard this word
- //without looking at it.
- return 0.0f;
+
+ /******************************
+ * Compute Levenshtein distance
+ ******************************/
+
+ private final int[][] initDistanceArray(){
+ return new int[this.text.length + 1][TYPICAL_LONGEST_WORD_IN_INDEX];
}
-
- //let's make sure we have enough room in our array to do the distance calculations.
- if (d[0].length <= m) {
- growDistanceArray(m);
- }
-
- // init matrix d
- for (int i = 0; i <= n; i++) d[i][0] = i;
- for (int j = 0; j <= m; j++) d[0][j] = j;
- // start computing edit distance
- for (int i = 1; i <= n; i++) {
- int bestPossibleEditDistance = m;
- final char s_i = text[i - 1];
- for (int j = 1; j <= m; j++) {
- if (s_i != target[offset+j-1]) {
+ /**
+ *
Similarity returns a number that is 1.0f or less (including negative numbers)
+ * based on how similar the Term is compared to a target term. It returns
+ * exactly 0.0f when
+ *
+ * editDistance > maximumEditDistance
+ * Otherwise it returns:
+ *
+ * 1 - (editDistance / length)
+ * where length is the length of the shortest term (text or target) including a
+ * prefix that are identical and editDistance is the Levenshtein distance for
+ * the two words.
+ *
+ * Embedded within this algorithm is a fail-fast Levenshtein distance
+ * algorithm. The fail-fast algorithm differs from the standard Levenshtein
+ * distance algorithm in that it is aborted if it is discovered that the
+ * minimum distance between the words is greater than some threshold.
+ *
+ *
To calculate the maximum distance threshold we use the following formula:
+ *
+ * (1 - minimumSimilarity) * length
+ * where length is the shortest term including any prefix that is not part of the
+ * similarity comparison. This formula was derived by solving for what maximum value
+ * of distance returns false for the following statements:
+ *
+ * similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
+ * return (similarity > minimumSimilarity);
+ * where distance is the Levenshtein distance for the two words.
+ *
+ * Levenshtein distance (also known as edit distance) is a measure of similarity
+ * between two strings where the distance is measured as the number of character
+ * deletions, insertions or substitutions required to transform one string to
+ * the other string.
+ * @param target the target word or phrase
+ * @return the similarity, 0.0 or less indicates that it matches less than the required
+ * threshold and 1.0 indicates that the text and target are identical
+ */
+ private final float similarity(final int[] target, int offset, int length) {
+ final int m = length;
+ final int n = text.length;
+ if (n == 0) {
+ //we don't have anything to compare. That means if we just add
+ //the letters for m we get the new word
+ return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength);
+ }
+ if (m == 0) {
+ return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength);
+ }
+
+ final int maxDistance = getMaxDistance(m);
+
+ if (maxDistance < Math.abs(m-n)) {
+ //just adding the characters of m to n or vice-versa results in
+ //too many edits
+ //for example "pre" length is 3 and "prefixes" length is 8. We can see that
+ //given this optimal circumstance, the edit distance cannot be less than 5.
+ //which is 8-3 or more precisely Math.abs(3-8).
+ //if our maximum edit distance is 4, then we can discard this word
+ //without looking at it.
+ return 0.0f;
+ }
+
+ //let's make sure we have enough room in our array to do the distance calculations.
+ if (d[0].length <= m) {
+ growDistanceArray(m);
+ }
+
+ // init matrix d
+ for (int i = 0; i <= n; i++) d[i][0] = i;
+ for (int j = 0; j <= m; j++) d[0][j] = j;
+
+ // start computing edit distance
+ for (int i = 1; i <= n; i++) {
+ int bestPossibleEditDistance = m;
+ final int s_i = text[i - 1];
+ for (int j = 1; j <= m; j++) {
+ if (s_i != target[offset+j-1]) {
d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1;
+ }
+ else {
+ d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
+ }
+ bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i][j]);
}
- else {
- d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]);
+
+ //After calculating row i, the best possible edit distance
+ //can be found by found by finding the smallest value in a given column.
+ //If the bestPossibleEditDistance is greater than the max distance, abort.
+
+ if (i > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
+ //the closest the target can be to the text is just too far away.
+ //this target is leaving the party early.
+ return 0.0f;
}
- bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i][j]);
}
-
- //After calculating row i, the best possible edit distance
- //can be found by found by finding the smallest value in a given column.
- //If the bestPossibleEditDistance is greater than the max distance, abort.
-
- if (i > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
- //the closest the target can be to the text is just too far away.
- //this target is leaving the party early.
- return 0.0f;
+
+ // this will return less than 0.0 when the edit distance is
+ // greater than the number of characters in the shorter word.
+ // but this was the formula that was previously used in FuzzyTermEnum,
+ // so it has not been changed (even though minimumSimilarity must be
+ // greater than 0.0)
+ return 1.0f - ((float)d[n][m] / (float) (realPrefixLength + Math.min(n, m)));
+ }
+
+ /**
+ * Grow the second dimension of the array, so that we can calculate the
+ * Levenshtein difference.
+ */
+ private void growDistanceArray(int m) {
+ for (int i = 0; i < d.length; i++) {
+ d[i] = new int[m+1];
}
}
-
- // this will return less than 0.0 when the edit distance is
- // greater than the number of characters in the shorter word.
- // but this was the formula that was previously used in FuzzyTermEnum,
- // so it has not been changed (even though minimumSimilarity must be
- // greater than 0.0)
- return 1.0f - ((float)d[n][m] / (float) (prefixLen + Math.min(n, m)));
- }
-
- /**
- * Grow the second dimension of the array, so that we can calculate the
- * Levenshtein difference.
- */
- private void growDistanceArray(int m) {
- for (int i = 0; i < d.length; i++) {
- d[i] = new int[m+1];
+
+ /**
+ * The max Distance is the maximum Levenshtein distance for the text
+ * compared to some other value that results in score that is
+ * better than the minimum similarity.
+ * @param m the length of the "other value"
+ * @return the maximum levenshtein distance that we care about
+ */
+ private final int getMaxDistance(int m) {
+ return (m < maxDistances.length) ? maxDistances[m] : calculateMaxDistance(m);
}
- }
-
- /**
- * The max Distance is the maximum Levenshtein distance for the text
- * compared to some other value that results in score that is
- * better than the minimum similarity.
- * @param m the length of the "other value"
- * @return the maximum levenshtein distance that we care about
- */
- private final int getMaxDistance(int m) {
- return (m < maxDistances.length) ? maxDistances[m] : calculateMaxDistance(m);
- }
-
- private void initializeMaxDistances() {
- for (int i = 0; i < maxDistances.length; i++) {
- maxDistances[i] = calculateMaxDistance(i);
+
+ private void initializeMaxDistances() {
+ for (int i = 0; i < maxDistances.length; i++) {
+ maxDistances[i] = calculateMaxDistance(i);
+ }
}
+
+ private int calculateMaxDistance(int m) {
+ return (int) ((1-minSimilarity) * (Math.min(text.length, m) + realPrefixLength));
+ }
}
-
- private int calculateMaxDistance(int m) {
- return (int) ((1-minimumSimilarity) * (Math.min(text.length, m) + prefixLen));
- }
}
Index: lucene/src/java/org/apache/lucene/search/AutomatonQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/AutomatonQuery.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/search/AutomatonQuery.java (working copy)
@@ -24,7 +24,9 @@
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.MinimizationOperations;
@@ -50,10 +52,14 @@
*/
public class AutomatonQuery extends MultiTermQuery {
/** the automaton to match index terms against */
- protected Automaton automaton;
+ protected final Automaton automaton;
/** term containing the field, and possibly some pattern structure */
- protected Term term;
+ protected final Term term;
+ transient ByteRunAutomaton runAutomaton;
+ transient boolean isFinite;
+ transient BytesRef commonSuffixRef;
+
/**
* Create a new AutomatonQuery from an {@link Automaton}.
*
@@ -69,6 +75,14 @@
MinimizationOperations.minimize(automaton);
}
+ private void compileAutomaton() {
+ if (runAutomaton == null) {
+ runAutomaton = new ByteRunAutomaton(automaton);
+ isFinite = SpecialOperations.isFinite(automaton);
+ commonSuffixRef = isFinite ? null : SpecialOperations.getCommonSuffixBytesRef(runAutomaton.getAutomaton());
+ }
+ }
+
@Override
protected TermsEnum getTermsEnum(IndexReader reader) throws IOException {
// matches nothing
@@ -85,28 +99,42 @@
String singleton = automaton.getSingleton();
if (singleton != null)
return new SingleTermsEnum(reader, term.createTerm(singleton));
-
+
// matches a fixed string in expanded representation
- String commonPrefix = SpecialOperations.getCommonPrefix(automaton);
- if (automaton.equals(BasicAutomata.makeString(commonPrefix))) {
- return new SingleTermsEnum(reader, term.createTerm(commonPrefix));
- }
+ final String commonPrefix = SpecialOperations.getCommonPrefix(automaton);
+
+ if (commonPrefix.length() > 0) {
+ if (BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
+ return new SingleTermsEnum(reader, term.createTerm(commonPrefix));
+ }
- // matches a constant prefix
- Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata
- .makeString(commonPrefix), BasicAutomata.makeAnyString());
- if (automaton.equals(prefixAutomaton)) {
- return new PrefixTermsEnum(reader, term.createTerm(commonPrefix));
+ // matches a constant prefix
+ Automaton prefixAutomaton = BasicOperations.concatenate(BasicAutomata
+ .makeString(commonPrefix), BasicAutomata.makeAnyString());
+ if (BasicOperations.sameLanguage(automaton, prefixAutomaton)) {
+ return new PrefixTermsEnum(reader, term.createTerm(commonPrefix));
+ }
}
+
+ compileAutomaton();
- return new AutomatonTermsEnum(automaton, term, reader);
+ return new AutomatonTermsEnum(runAutomaton, term.field(), reader, isFinite, commonSuffixRef);
}
-
+
@Override
public int hashCode() {
final int prime = 31;
int result = super.hashCode();
- result = prime * result + ((automaton == null) ? 0 : automaton.hashCode());
+ if (automaton != null) {
+ // we already minimized the automaton in the ctor, so
+ // this hash code will be the same for automata that
+ // are the same:
+ int automatonHashCode = automaton.getNumberOfStates() * 3 + automaton.getNumberOfTransitions() * 2;
+ if (automatonHashCode == 0) {
+ automatonHashCode = 1;
+ }
+ result = prime * result + automatonHashCode;
+ }
result = prime * result + ((term == null) ? 0 : term.hashCode());
return result;
}
@@ -123,7 +151,7 @@
if (automaton == null) {
if (other.automaton != null)
return false;
- } else if (!automaton.equals(other.automaton))
+ } else if (!BasicOperations.sameLanguage(automaton, other.automaton))
return false;
if (term == null) {
if (other.term != null)
Index: lucene/src/java/org/apache/lucene/util/automaton/Automaton.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/Automaton.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/automaton/Automaton.java (working copy)
@@ -31,14 +31,17 @@
import java.io.Serializable;
import java.util.Arrays;
+import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
/**
* Finite-state automaton with regular expression operations.
*
@@ -93,7 +96,7 @@
/**
* Hash code. Recomputed by {@link MinimizationOperations#minimize(Automaton)}
*/
- int hash_code;
+ //int hash_code;
/** Singleton string. Null if not applicable. */
String singleton;
@@ -116,14 +119,14 @@
* @see State
* @see Transition
*/
- public Automaton() {
- initial = new State();
+ public Automaton(State initial) {
+ this.initial = initial;
deterministic = true;
singleton = null;
}
-
- boolean isDebug() {
- return System.getProperty("dk.brics.automaton.debug") != null;
+
+ public Automaton() {
+ this(new State());
}
/**
@@ -198,10 +201,12 @@
*
* @param s state
*/
+ /*
public void setInitialState(State s) {
initial = s;
singleton = null;
}
+ */
/**
* Gets initial state.
@@ -252,34 +257,70 @@
public Object getInfo() {
return info;
}
-
- /**
- * Returns the set of states that are reachable from the initial state.
- *
- * @return set of {@link State} objects
- */
- public Set<State> getStates() {
- expandSingleton();
- Set<State> visited;
- if (isDebug()) visited = new LinkedHashSet<State>();
- else visited = new HashSet<State>();
- LinkedList<State> worklist = new LinkedList<State>();
- worklist.add(initial);
- visited.add(initial);
- while (worklist.size() > 0) {
- State s = worklist.removeFirst();
- Collection<Transition> tr;
- if (isDebug()) tr = s.getSortedTransitions(false);
- else tr = s.transitions;
- for (Transition t : tr)
- if (!visited.contains(t.to)) {
- visited.add(t.to);
- worklist.add(t.to);
+
+ // cached
+ private State[] numberedStates;
+
+ public State[] getNumberedStates() {
+ if (numberedStates == null) {
+ expandSingleton();
+ final Set<State> visited = new HashSet<State>();
+ final LinkedList<State> worklist = new LinkedList<State>();
+ numberedStates = new State[4];
+ int upto = 0;
+ worklist.add(initial);
+ visited.add(initial);
+ initial.number = upto;
+ numberedStates[upto] = initial;
+ upto++;
+ while (worklist.size() > 0) {
+ State s = worklist.removeFirst();
+ for (int i=0;i 0) {
State s = worklist.removeFirst();
if (s.accept) accepts.add(s);
- for (Transition t : s.transitions)
+ for (Transition t : s.getTransitions())
if (!visited.contains(t.to)) {
visited.add(t.to);
worklist.add(t.to);
@@ -305,32 +346,25 @@
}
/**
- * Assigns consecutive numbers to the given states.
- */
- static void setStateNumbers(Set states) {
- int number = 0;
- for (State s : states)
- s.number = number++;
- }
-
- /**
* Adds transitions to explicit crash state to ensure that transition function
* is total.
*/
void totalize() {
State s = new State();
- s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+ s.addTransition(new Transition(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT,
s));
- for (State p : getStates()) {
- int maxi = Character.MIN_VALUE;
- for (Transition t : p.getSortedTransitions(false)) {
- if (t.min > maxi) p.transitions.add(new Transition((char) maxi,
- (char) (t.min - 1), s));
+ for (State p : getNumberedStates()) {
+ int maxi = Character.MIN_CODE_POINT;
+ p.sortTransitions(Transition.CompareByMinMaxThenDest);
+ for (Transition t : p.getTransitions()) {
+ if (t.min > maxi) p.addTransition(new Transition(maxi,
+ (t.min - 1), s));
if (t.max + 1 > maxi) maxi = t.max + 1;
}
- if (maxi <= Character.MAX_VALUE) p.transitions.add(new Transition(
- (char) maxi, Character.MAX_VALUE, s));
+ if (maxi <= Character.MAX_CODE_POINT) p.addTransition(new Transition(
+ maxi, Character.MAX_CODE_POINT, s));
}
+ clearNumberedStates();
}
/**
@@ -349,52 +383,28 @@
* and adjacent edge intervals with same destination.
*/
public void reduce() {
+ final State[] states = getNumberedStates();
if (isSingleton()) return;
- Set states = getStates();
- setStateNumbers(states);
- for (State s : states) {
- List st = s.getSortedTransitions(true);
- s.resetTransitions();
- State p = null;
- int min = -1, max = -1;
- for (Transition t : st) {
- if (p == t.to) {
- if (t.min <= max + 1) {
- if (t.max > max) max = t.max;
- } else {
- if (p != null) s.transitions.add(new Transition((char) min,
- (char) max, p));
- min = t.min;
- max = t.max;
- }
- } else {
- if (p != null) s.transitions.add(new Transition((char) min,
- (char) max, p));
- p = t.to;
- min = t.min;
- max = t.max;
- }
- }
- if (p != null) s.transitions
- .add(new Transition((char) min, (char) max, p));
- }
+ for (State s : states)
+ s.reduce();
}
/**
* Returns sorted array of all interval start points.
*/
- char[] getStartPoints() {
- Set pointset = new HashSet();
- for (State s : getStates()) {
- pointset.add(Character.MIN_VALUE);
- for (Transition t : s.transitions) {
+ int[] getStartPoints() {
+ final State[] states = getNumberedStates();
+ Set<Integer> pointset = new HashSet<Integer>();
+ pointset.add(Character.MIN_CODE_POINT);
+ for (State s : states) {
+ for (Transition t : s.getTransitions()) {
pointset.add(t.min);
- if (t.max < Character.MAX_VALUE) pointset.add((char) (t.max + 1));
+ if (t.max < Character.MAX_CODE_POINT) pointset.add((t.max + 1));
}
}
- char[] points = new char[pointset.size()];
+ int[] points = new int[pointset.size()];
int n = 0;
- for (Character m : pointset)
+ for (Integer m : pointset)
points[n++] = m;
Arrays.sort(points);
return points;
@@ -406,47 +416,71 @@
*
* @return set of {@link State} objects
*/
- public Set<State> getLiveStates() {
- expandSingleton();
- return getLiveStates(getStates());
- }
-
- private Set<State> getLiveStates(Set<State> states) {
- HashMap<State, Set<State>> map = new HashMap<State, Set<State>>();
- for (State s : states)
- map.put(s, new HashSet<State>());
- for (State s : states)
- for (Transition t : s.transitions)
- map.get(t.to).add(s);
- Set<State> live = new HashSet<State>(getAcceptStates());
+ private State[] getLiveStates() {
+ final State[] states = getNumberedStates();
+ Set<State> live = new HashSet<State>();
+ for (State q : states) {
+ if (q.isAccept()) {
+ live.add(q);
+ }
+ }
+ // map<state, set<state>>
+ Set<State> map[] = new Set[states.length];
+ for (int i = 0; i < map.length; i++)
+ map[i] = new HashSet<State>();
+ for (State s : states) {
+ for(int i=0;i<s.numTransitions;i++) {
+ map[s.transitionsArray[i].to.number].add(s);
+ }
+ }
+ LinkedList<State> worklist = new LinkedList<State>(live);
while (worklist.size() > 0) {
State s = worklist.removeFirst();
- for (State p : map.get(s))
+ for (State p : map[s.number])
if (!live.contains(p)) {
live.add(p);
worklist.add(p);
}
}
- return live;
+
+ return live.toArray(new State[live.size()]);
}
-
+
/**
- * Removes transitions to dead states and calls {@link #reduce()} and
- * {@link #clearHashCode()}. (A state is "dead" if no accept state is
+ * Removes transitions to dead states and calls {@link #reduce()}.
+ * (A state is "dead" if no accept state is
* reachable from it.)
*/
public void removeDeadTransitions() {
- clearHashCode();
+ final State[] states = getNumberedStates();
+ //clearHashCode();
if (isSingleton()) return;
- Set states = getStates();
- Set live = getLiveStates(states);
+ State[] live = getLiveStates();
+
+ BitSet liveSet = new BitSet(states.length);
+ for (State s : live)
+ liveSet.set(s.number);
+
for (State s : states) {
- Set st = s.transitions;
- s.resetTransitions();
- for (Transition t : st)
- if (live.contains(t.to)) s.transitions.add(t);
+ // filter out transitions to dead states:
+ int upto = 0;
+ for(int i=0;i<s.numTransitions;i++) {
+ final Transition t = s.transitionsArray[i];
+ if (liveSet.get(t.to.number)) {
+ s.transitionsArray[upto++] = t;
+ }
+ }
+ s.numTransitions = upto;
+ }
+ if (live.length > 0) {
+ setNumberedStates(live);
+ } else {
+ // sneaky corner case -- if machine accepts no strings
+ clearNumberedStates();
+ }
reduce();
}
@@ -454,11 +488,15 @@
* Returns a sorted array of transitions for each state (and sets state
* numbers).
*/
- static Transition[][] getSortedTransitions(Set states) {
- setStateNumbers(states);
- Transition[][] transitions = new Transition[states.size()][];
- for (State s : states)
- transitions[s.number] = s.getSortedTransitionArray(false);
+ Transition[][] getSortedTransitions() {
+ final State[] states = getNumberedStates();
+ Transition[][] transitions = new Transition[states.length][];
+ for (State s : states) {
+ s.sortTransitions(Transition.CompareByMinMaxThenDest);
+ s.trimTransitionsArray();
+ transitions[s.number] = s.transitionsArray;
+ assert s.transitionsArray != null;
+ }
return transitions;
}
@@ -470,9 +508,9 @@
if (isSingleton()) {
State p = new State();
initial = p;
- for (int i = 0; i < singleton.length(); i++) {
+ for (int i = 0, cp = 0; i < singleton.length(); i += Character.charCount(cp)) {
State q = new State();
- p.transitions.add(new Transition(singleton.charAt(i), q));
+ p.addTransition(new Transition(cp = singleton.codePointAt(i), q));
p = q;
}
p.accept = true;
@@ -485,8 +523,8 @@
* Returns the number of states in this automaton.
*/
public int getNumberOfStates() {
- if (isSingleton()) return singleton.length() + 1;
- return getStates().size();
+ if (isSingleton()) return singleton.codePointCount(0, singleton.length()) + 1;
+ return getNumberedStates().length;
}
/**
@@ -494,45 +532,31 @@
* as the total number of edges, where one edge may be a character interval.
*/
public int getNumberOfTransitions() {
- if (isSingleton()) return singleton.length();
+ if (isSingleton()) return singleton.codePointCount(0, singleton.length());
int c = 0;
- for (State s : getStates())
- c += s.transitions.size();
+ for (State s : getNumberedStates())
+ c += s.numTransitions();
return c;
}
- /**
- * Returns true if the language of this automaton is equal to the language of
- * the given automaton. Implemented using hashCode and
- * subsetOf.
- */
@Override
public boolean equals(Object obj) {
- if (obj == this) return true;
- if (!(obj instanceof Automaton)) return false;
- Automaton a = (Automaton) obj;
- if (isSingleton() && a.isSingleton()) return singleton.equals(a.singleton);
- return hashCode() == a.hashCode() && BasicOperations.subsetOf(this, a)
- && BasicOperations.subsetOf(a, this);
+ throw new UnsupportedOperationException("use BasicOperations.sameLanguage instead");
}
-
- /**
- * Returns hash code for this automaton. The hash code is based on the number
- * of states and transitions in the minimized automaton. Invoking this method
- * may involve minimizing the automaton.
- */
+
@Override
public int hashCode() {
- if (hash_code == 0) MinimizationOperations.minimize(this);
- return hash_code;
+ throw new UnsupportedOperationException();
}
/**
* Must be invoked when the stored hash code may no longer be valid.
*/
+ /*
void clearHashCode() {
hash_code = 0;
}
+ */
/**
* Returns a string representation of this automaton.
@@ -542,12 +566,15 @@
StringBuilder b = new StringBuilder();
if (isSingleton()) {
b.append("singleton: ");
- for (char c : singleton.toCharArray())
+ int length = singleton.codePointCount(0, singleton.length());
+ int codepoints[] = new int[length];
+ for (int i = 0, j = 0, cp = 0; i < singleton.length(); i += Character.charCount(cp))
+ codepoints[j++] = cp = singleton.codePointAt(i);
+ for (int c : codepoints)
Transition.appendCharString(c, b);
b.append("\n");
} else {
- Set states = getStates();
- setStateNumbers(states);
+ State[] states = getNumberedStates();
b.append("initial state: ").append(initial.number).append("\n");
for (State s : states)
b.append(s.toString());
@@ -562,8 +589,7 @@
public String toDot() {
StringBuilder b = new StringBuilder("digraph Automaton {\n");
b.append(" rankdir = LR;\n");
- Set states = getStates();
- setStateNumbers(states);
+ State[] states = getNumberedStates();
for (State s : states) {
b.append(" ").append(s.number);
if (s.accept) b.append(" [shape=doublecircle,label=\"\"];\n");
@@ -572,7 +598,7 @@
b.append(" initial [shape=plaintext,label=\"\"];\n");
b.append(" initial -> ").append(s.number).append("\n");
}
- for (Transition t : s.transitions) {
+ for (Transition t : s.getTransitions()) {
b.append(" ").append(s.number);
t.appendDot(b);
}
@@ -609,17 +635,18 @@
Automaton a = (Automaton) super.clone();
if (!isSingleton()) {
HashMap m = new HashMap();
- Set states = getStates();
+ State[] states = getNumberedStates();
for (State s : states)
m.put(s, new State());
for (State s : states) {
State p = m.get(s);
p.accept = s.accept;
if (s == initial) a.initial = p;
- for (Transition t : s.transitions)
- p.transitions.add(new Transition(t.min, t.max, m.get(t.to)));
+ for (Transition t : s.getTransitions())
+ p.addTransition(new Transition(t.min, t.max, m.get(t.to)));
}
}
+ a.clearNumberedStates();
return a;
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
Index: lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java (revision 0)
+++ lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java (revision 0)
@@ -0,0 +1,51 @@
+package org.apache.lucene.util.automaton;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class CharacterRunAutomaton extends RunAutomaton {
+
+ public CharacterRunAutomaton(Automaton a) {
+ super(a, Character.MAX_CODE_POINT, false);
+ }
+
+ /**
+ * Returns true if the given string is accepted by this automaton.
+ */
+ public boolean run(String s) {
+ int p = initial;
+ int l = s.length();
+ for (int i = 0, cp = 0; i < l; i += Character.charCount(cp)) {
+ p = step(p, cp = s.codePointAt(i));
+ if (p == -1) return false;
+ }
+ return accept[p];
+ }
+
+ /**
+ * Returns true if the given string is accepted by this automaton
+ */
+ public boolean run(char[] s, int offset, int length) {
+ int p = initial;
+ int l = offset + length;
+ for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
+ p = step(p, cp = Character.codePointAt(s, i, l));
+ if (p == -1) return false;
+ }
+ return accept[p];
+ }
+}
Property changes on: lucene/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/automaton/MinimizationOperations.java (working copy)
@@ -31,7 +31,6 @@
import java.util.ArrayList;
import java.util.LinkedList;
-import java.util.Set;
/**
* Operations for minimizing automata.
@@ -41,7 +40,7 @@
final public class MinimizationOperations {
private MinimizationOperations() {}
-
+
/**
* Minimizes (and determinizes if not already deterministic) the given
* automaton.
@@ -53,8 +52,8 @@
minimizeHopcroft(a);
}
// recompute hash code
- a.hash_code = a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2;
- if (a.hash_code == 0) a.hash_code = 1;
+ //a.hash_code = a.getNumberOfStates() * 3 + a.getNumberOfTransitions() * 2;
+ //if (a.hash_code == 0) a.hash_code = 1;
}
private static void initialize(ArrayList list, int size) {
@@ -67,24 +66,18 @@
*/
public static void minimizeHopcroft(Automaton a) {
a.determinize();
- Set tr = a.initial.getTransitions();
- if (tr.size() == 1) {
- Transition t = tr.iterator().next();
- if (t.to == a.initial && t.min == Character.MIN_VALUE
- && t.max == Character.MAX_VALUE) return;
+ if (a.initial.numTransitions == 1) {
+ Transition t = a.initial.transitionsArray[0];
+ if (t.to == a.initial && t.min == Character.MIN_CODE_POINT
+ && t.max == Character.MAX_CODE_POINT) return;
}
a.totalize();
- // make arrays for numbered states and effective alphabet
- Set ss = a.getStates();
- State[] states = new State[ss.size()];
- int number = 0;
- for (State q : ss) {
- states[number] = q;
- q.number = number++;
- }
- char[] sigma = a.getStartPoints();
+
+ int[] sigma = a.getStartPoints();
// initialize data structures
ArrayList<ArrayList<LinkedList<State>>> reverse = new ArrayList<ArrayList<LinkedList<State>>>();
+ final State[] states = a.getNumberedStates();
+
for (int q = 0; q < states.length; q++) {
ArrayList<LinkedList<State>> v = new ArrayList<LinkedList<State>>();
initialize(v, sigma.length);
@@ -121,7 +114,7 @@
partition.get(j).add(qq);
block[qq.number] = j;
for (int x = 0; x < sigma.length; x++) {
- char y = sigma[x];
+ int y = sigma[x];
State p = qq.step(y);
reverse.get(p.number).get(x).add(qq);
reverse_nonempty[p.number][x] = true;
@@ -218,9 +211,10 @@
for (int n = 0; n < newstates.length; n++) {
State s = newstates[n];
s.accept = states[s.number].accept;
- for (Transition t : states[s.number].transitions)
- s.transitions.add(new Transition(t.min, t.max, newstates[t.to.number]));
+ for (Transition t : states[s.number].getTransitions())
+ s.addTransition(new Transition(t.min, t.max, newstates[t.to.number]));
}
+ a.clearNumberedStates();
a.removeDeadTransitions();
}
Index: lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/automaton/BasicAutomata.java (working copy)
@@ -70,42 +70,42 @@
State s = new State();
a.initial = s;
s.accept = true;
- s.transitions.add(new Transition(Character.MIN_VALUE, Character.MAX_VALUE,
+ s.addTransition(new Transition(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT,
s));
a.deterministic = true;
return a;
}
/**
- * Returns a new (deterministic) automaton that accepts any single character.
+ * Returns a new (deterministic) automaton that accepts any single codepoint.
*/
public static Automaton makeAnyChar() {
- return makeCharRange(Character.MIN_VALUE, Character.MAX_VALUE);
+ return makeCharRange(Character.MIN_CODE_POINT, Character.MAX_CODE_POINT);
}
/**
- * Returns a new (deterministic) automaton that accepts a single character of
+ * Returns a new (deterministic) automaton that accepts a single codepoint of
* the given value.
*/
- public static Automaton makeChar(char c) {
+ public static Automaton makeChar(int c) {
Automaton a = new Automaton();
- a.singleton = Character.toString(c);
+ a.singleton = new String(Character.toChars(c));
a.deterministic = true;
return a;
}
/**
- * Returns a new (deterministic) automaton that accepts a single char whose
+ * Returns a new (deterministic) automaton that accepts a single codepoint whose
* value is in the given interval (including both end points).
*/
- public static Automaton makeCharRange(char min, char max) {
+ public static Automaton makeCharRange(int min, int max) {
if (min == max) return makeChar(min);
Automaton a = new Automaton();
State s1 = new State();
State s2 = new State();
a.initial = s1;
s2.accept = true;
- if (min <= max) s1.transitions.add(new Transition(min, max, s2));
+ if (min <= max) s1.addTransition(new Transition(min, max, s2));
a.deterministic = true;
return a;
}
Index: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java (revision 0)
+++ lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java (revision 0)
@@ -0,0 +1,326 @@
+package org.apache.lucene.util.automaton;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+
+import java.util.List;
+import java.util.ArrayList;
+
+// TODO
+// - do we really need the .bits...? if not we can make util in UnicodeUtil to convert 1 char into a BytesRef
+
+final class UTF32ToUTF8 {
+
+ // Unicode boundaries for UTF8 bytes 1,2,3,4
+ private static final int[] startCodes = new int[] {0, 128, 2048, 65536};
+ private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111};
+
+ static int[] MASKS = new int[32];
+ static {
+ int v = 2;
+ for(int i=0;i<32;i++) {
+ MASKS[i] = v-1;
+ v *= 2;
+ }
+ }
+
+ // Represents one of the N utf8 bytes that (in sequence)
+ // define a code point. value is the byte value; bits is
+ // how many bits are "used" by utf8 at that byte
+ private static class UTF8Byte {
+ int value; // TODO: change to byte
+ byte bits;
+ }
+
+ // Holds a single code point, as a sequence of 1-4 utf8 bytes:
+ // TODO: maybe move to UnicodeUtil?
+ private static class UTF8Sequence {
+ private final UTF8Byte[] bytes;
+ private int len;
+
+ public UTF8Sequence() {
+ bytes = new UTF8Byte[4];
+ for(int i=0;i<4;i++) {
+ bytes[i] = new UTF8Byte();
+ }
+ }
+
+ public int byteAt(int idx) {
+ return bytes[idx].value;
+ }
+
+ public int numBits(int idx) {
+ return bytes[idx].bits;
+ }
+
+ private void set(int code) {
+ if (code < 128) {
+ // 0xxxxxxx
+ bytes[0].value = code;
+ bytes[0].bits = 7;
+ len = 1;
+ } else if (code < 2048) {
+ // 110yyyxx 10xxxxxx
+ bytes[0].value = (6 << 5) | (code >> 6);
+ bytes[0].bits = 5;
+ setRest(code, 1);
+ len = 2;
+ } else if (code < 65536) {
+ // 1110yyyy 10yyyyxx 10xxxxxx
+ bytes[0].value = (14 << 4) | (code >> 12);
+ bytes[0].bits = 4;
+ setRest(code, 2);
+ len = 3;
+ } else {
+ // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
+ bytes[0].value = (30 << 3) | (code >> 18);
+ bytes[0].bits = 3;
+ setRest(code, 3);
+ len = 4;
+ }
+ }
+
+ private void setRest(int code, int numBytes) {
+ for(int i=0;i<numBytes;i++) {
+ bytes[numBytes-i].value = 128 | (code & MASKS[5]);
+ bytes[numBytes-i].bits = 6;
+ code = code >> 6;
+ }
+ }
+
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ for(int i=0;i<len;i++) {
+ if (i > 0) {
+ b.append(' ');
+ }
+ b.append(Integer.toBinaryString(bytes[i].value));
+ }
+ return b.toString();
+ }
+ }
+
+ private final UTF8Sequence startUTF8 = new UTF8Sequence();
+ private final UTF8Sequence endUTF8 = new UTF8Sequence();
+
+ private final UTF8Sequence tmpUTF8a = new UTF8Sequence();
+ private final UTF8Sequence tmpUTF8b = new UTF8Sequence();
+
+ // Builds necessary utf8 edges between start & end
+ void convertOneEdge(State start, State end, int startCodePoint, int endCodePoint) {
+ startUTF8.set(startCodePoint);
+ endUTF8.set(endCodePoint);
+ //System.out.println("start = " + startUTF8);
+ //System.out.println(" end = " + endUTF8);
+ build(start, end, startUTF8, endUTF8, 0);
+ }
+
+ private void build(State start, State end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) {
+
+ // Break into start, middle, end:
+ if (startUTF8.byteAt(upto) == endUTF8.byteAt(upto)) {
+ // Degen case: lead with the same byte:
+ if (upto == startUTF8.len-1 && upto == endUTF8.len-1) {
+ // Super degen: just single edge, one UTF8 byte:
+ start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end));
+ return;
+ } else {
+ assert startUTF8.len > upto+1;
+ assert endUTF8.len > upto+1;
+ State n = newUTF8State();
+
+ // Single value leading edge
+ start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single
+
+ // Recurse for the rest
+ build(n, end, startUTF8, endUTF8, 1+upto);
+ }
+ } else if (startUTF8.len == endUTF8.len) {
+ if (upto == startUTF8.len-1) {
+ start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); // type=startend
+ } else {
+ start(start, end, startUTF8, upto, false);
+ if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) {
+ // There is a middle
+ all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1);
+ }
+ end(start, end, endUTF8, upto, false);
+ }
+ } else {
+
+ // start
+ start(start, end, startUTF8, upto, true);
+
+ // possibly middle, spanning multiple num bytes
+ int byteCount = 1+startUTF8.len-upto;
+ final int limit = endUTF8.len-upto;
+ while (byteCount < limit) {
+ // wasteful: we only need first byte, and, we should
+ // statically encode this first byte:
+ tmpUTF8a.set(startCodes[byteCount-1]);
+ tmpUTF8b.set(endCodes[byteCount-1]);
+ all(start, end,
+ tmpUTF8a.byteAt(0),
+ tmpUTF8b.byteAt(0),
+ tmpUTF8a.len - 1);
+ byteCount++;
+ }
+
+ // end
+ end(start, end, endUTF8, upto, true);
+ }
+ }
+
+ private void start(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) {
+ if (upto == utf8.len-1) {
+ // Done recursing
+ start.addTransition(new Transition(utf8.byteAt(upto), utf8.byteAt(upto) | MASKS[utf8.numBits(upto)-1], end)); // type=start
+ } else {
+ State n = newUTF8State();
+ start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=start
+ start(n, end, utf8, 1+upto, true);
+ int endCode = utf8.byteAt(upto) | MASKS[utf8.numBits(upto)-1];
+ if (doAll && utf8.byteAt(upto) != endCode) {
+ all(start, end, utf8.byteAt(upto)+1, endCode, utf8.len-upto-1);
+ }
+ }
+ }
+
+ private void end(State start, State end, UTF8Sequence utf8, int upto, boolean doAll) {
+ if (upto == utf8.len-1) {
+ // Done recursing
+ start.addTransition(new Transition(utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto)-1]), utf8.byteAt(upto), end)); // type=end
+ } else {
+ final int startCode;
+ if (utf8.numBits(upto) == 5) {
+ // special case -- avoid created unused edges (utf8
+ // doesn't accept certain byte sequences) -- there
+ // are other cases we could optimize too:
+ startCode = 194;
+ } else {
+ startCode = utf8.byteAt(upto) & (~MASKS[utf8.numBits(upto)-1]);
+ }
+ if (doAll && utf8.byteAt(upto) != startCode) {
+ all(start, end, startCode, utf8.byteAt(upto)-1, utf8.len-upto-1);
+ }
+ State n = newUTF8State();
+ start.addTransition(new Transition(utf8.byteAt(upto), n)); // type=end
+ end(n, end, utf8, 1+upto, true);
+ }
+ }
+
+ private void all(State start, State end, int startCode, int endCode, int left) {
+ if (left == 0) {
+ start.addTransition(new Transition(startCode, endCode, end)); // type=all
+ } else {
+ State lastN = newUTF8State();
+ start.addTransition(new Transition(startCode, endCode, lastN)); // type=all
+ while (left > 1) {
+ State n = newUTF8State();
+ lastN.addTransition(new Transition(128, 191, n)); // type=all*
+ left--;
+ lastN = n;
+ }
+ lastN.addTransition(new Transition(128, 191, end)); // type = all*
+ }
+ }
+
+ private State[] utf8States;
+ private int utf8StateCount;
+
+ /** Converts an incoming utf32 automaton to an equivalent
+ * utf8 one. The incoming automaton need not be
+ * deterministic. Note that the returned automaton will
+ * not in general be deterministic, so you must
+ * determinize it if that's needed. */
+ public Automaton convert(Automaton utf32) {
+ if (utf32.isSingleton()) {
+ utf32 = utf32.cloneExpanded();
+ }
+
+ State[] map = new State[utf32.getNumberedStates().length];
+ List<State> pending = new ArrayList<State>();
+ State utf32State = utf32.getInitialState();
+ pending.add(utf32State);
+ Automaton utf8 = new Automaton();
+ utf8.setDeterministic(false);
+
+ State utf8State = utf8.getInitialState();
+
+ utf8States = new State[5];
+ utf8StateCount = 0;
+ utf8State.number = utf8StateCount;
+ utf8States[utf8StateCount] = utf8State;
+ utf8StateCount++;
+
+ utf8State.setAccept(utf32State.isAccept());
+
+ map[utf32State.number] = utf8State;
+
+ while(pending.size() != 0) {
+ utf32State = pending.remove(pending.size()-1);
+ utf8State = map[utf32State.number];
+ for(int i=0;i set = new TreeSet();
+ SortedSet<Integer> set = new TreeSet<Integer>();
for (int i = 0; i < word.length; i++)
set.add(word[i]);
- alphabet = new char[set.size()];
- Iterator<Character> iterator = set.iterator();
+ alphabet = new int[set.size()];
+ Iterator<Integer> iterator = set.iterator();
for (int i = 0; i < alphabet.length; i++)
alphabet[i] = iterator.next();
- rangeLower = new char[alphabet.length + 2];
- rangeUpper = new char[alphabet.length + 2];
+ rangeLower = new int[alphabet.length + 2];
+ rangeUpper = new int[alphabet.length + 2];
// calculate the unicode range intervals that exclude the alphabet
// these are the ranges for all unicode characters not in the alphabet
int lower = 0;
for (int i = 0; i < alphabet.length; i++) {
- char higher = alphabet[i];
+ int higher = alphabet[i];
if (higher > lower) {
- rangeLower[numRanges] = (char) lower;
- rangeUpper[numRanges] = (char) (higher - 1);
+ rangeLower[numRanges] = lower;
+ rangeUpper[numRanges] = higher - 1;
numRanges++;
}
lower = higher + 1;
}
/* add the final endpoint */
- if (lower <= 0xFFFF) {
- rangeLower[numRanges] = (char) lower;
- rangeUpper[numRanges] = '\uFFFF';
+ if (lower <= Character.MAX_CODE_POINT) {
+ rangeLower[numRanges] = lower;
+ rangeUpper[numRanges] = Character.MAX_CODE_POINT;
numRanges++;
}
descriptions = new ParametricDescription[] {
null, /* for n=0, we do not need to go through the trouble */
- new Lev1ParametricDescription(input.length()),
- new Lev2ParametricDescription(input.length()),
+ new Lev1ParametricDescription(word.length),
+ new Lev2ParametricDescription(word.length),
};
}
@@ -119,6 +123,7 @@
// create all states, and mark as accept states if appropriate
for (int i = 0; i < states.length; i++) {
states[i] = new State();
+ states[i].number = i;
states[i].setAccept(description.isAccept(i));
}
// create transitions from state to state
@@ -129,7 +134,7 @@
final int end = xpos + Math.min(word.length - xpos, range);
for (int x = 0; x < alphabet.length; x++) {
- final char ch = alphabet[x];
+ final int ch = alphabet[x];
// get the characteristic vector at this position wrt ch
final int cvec = getVector(ch, xpos, end);
int dest = description.transition(k, xpos, cvec);
@@ -143,13 +148,15 @@
if (dest >= 0)
for (int r = 0; r < numRanges; r++)
states[k].addTransition(new Transition(rangeLower[r], rangeUpper[r], states[dest]));
+ // reduce the state: this doesn't appear to help anything
+ //states[k].reduce();
}
- Automaton a = new Automaton();
- a.setInitialState(states[0]);
+ Automaton a = new Automaton(states[0]);
a.setDeterministic(true);
+ a.setNumberedStates(states);
// we need not trim transitions to dead states, as they are not created.
- // a.restoreInvariant();
+ //a.restoreInvariant();
return a;
}
@@ -157,7 +164,7 @@
* Get the characteristic vector X(x, V)
* where V is substring(pos, end)
*/
- int getVector(char x, int pos, int end) {
+ int getVector(int x, int pos, int end) {
int vector = 0;
for (int i = pos; i < end; i++) {
vector <<= 1;
Index: lucene/src/java/org/apache/lucene/util/automaton/State.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/State.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/automaton/State.java (working copy)
@@ -28,13 +28,15 @@
*/
package org.apache.lucene.util.automaton;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
+import java.util.Comparator;
import java.util.Set;
+import java.util.Iterator;
/**
* Automaton state.
@@ -44,7 +46,8 @@
public class State implements Serializable, Comparable<State> {
boolean accept;
- Set<Transition> transitions;
+ public Transition[] transitionsArray;
+ public int numTransitions;
int number;
@@ -63,8 +66,26 @@
* Resets transition set.
*/
final void resetTransitions() {
- transitions = new HashSet<Transition>();
+ transitionsArray = new Transition[0];
+ numTransitions = 0;
}
+
+ private class TransitionsIterable implements Iterable<Transition> {
+ public Iterator<Transition> iterator() {
+ return new Iterator<Transition>() {
+ int upto;
+ public boolean hasNext() {
+ return upto < numTransitions;
+ }
+ public Transition next() {
+ return transitionsArray[upto++];
+ }
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+ }
/**
* Returns the set of outgoing transitions. Subsequent changes are reflected
@@ -72,9 +93,18 @@
*
* @return transition set
*/
- public Set<Transition> getTransitions() {
- return transitions;
+ public Iterable<Transition> getTransitions() {
+ return new TransitionsIterable();
}
+
+ public int numTransitions() {
+ return numTransitions;
+ }
+
+ public void setTransitions(Transition[] transitions) {
+ this.numTransitions = transitions.length;
+ this.transitionsArray = transitions;
+ }
/**
* Adds an outgoing transition.
@@ -82,7 +112,12 @@
* @param t transition
*/
public void addTransition(Transition t) {
- transitions.add(t);
+ if (numTransitions == transitionsArray.length) {
+ final Transition[] newArray = new Transition[ArrayUtil.oversize(1+numTransitions, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
+ System.arraycopy(transitionsArray, 0, newArray, 0, numTransitions);
+ transitionsArray = newArray;
+ }
+ transitionsArray[numTransitions++] = t;
}
/**
@@ -106,44 +141,88 @@
/**
* Performs lookup in transitions, assuming determinism.
*
- * @param c character to look up
+ * @param c codepoint to look up
* @return destination state, null if no matching outgoing transition
- * @see #step(char, Collection)
+ * @see #step(int, Collection)
*/
- public State step(char c) {
- for (Transition t : transitions)
+ public State step(int c) {
+ assert c >= 0;
+ for (int i=0;i dest) {
- for (Transition t : transitions)
+ public void step(int c, Collection dest) {
+ for (int i=0;i max) max = t.max;
+ } else {
+ if (p != null) {
+ transitionsArray[upto++] = new Transition(min, max, p);
+ }
+ min = t.min;
+ max = t.max;
+ }
+ } else {
+ if (p != null) {
+ transitionsArray[upto++] = new Transition(min, max, p);
+ }
+ p = t.to;
+ min = t.min;
+ max = t.max;
+ }
+ }
+
+ if (p != null) {
+ transitionsArray[upto++] = new Transition(min, max, p);
+ }
+ numTransitions = upto;
}
-
+
/**
* Returns sorted list of outgoing transitions.
*
@@ -151,11 +230,12 @@
* reverse max, to)
* @return transition list
*/
- public List<Transition> getSortedTransitions(boolean to_first) {
- return Arrays.asList(getSortedTransitionArray(to_first));
+
+ /** Sorts transitions array in-place. */
+ public void sortTransitions(Comparator<Transition> comparator) {
+ Arrays.sort(transitionsArray, 0, numTransitions, comparator);
}
-
/**
* Return this state's number.
*
@@ -178,7 +258,7 @@
if (accept) b.append(" [accept]");
else b.append(" [reject]");
b.append(":\n");
- for (Transition t : transitions)
+ for (Transition t : getTransitions())
b.append(" ").append(t.toString()).append("\n");
return b.toString();
}
@@ -190,20 +270,4 @@
public int compareTo(State s) {
return s.id - id;
}
-
- /**
- * See {@link java.lang.Object#equals(java.lang.Object)}.
- */
- @Override
- public boolean equals(Object obj) {
- return super.equals(obj);
- }
-
- /**
- * See {@link java.lang.Object#hashCode()}.
- */
- @Override
- public int hashCode() {
- return super.hashCode();
- }
}
Index: lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java (revision 0)
+++ lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java (revision 0)
@@ -0,0 +1,38 @@
+package org.apache.lucene.util.automaton;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class ByteRunAutomaton extends RunAutomaton {
+
+ public ByteRunAutomaton(Automaton a) {
+ super(new UTF32ToUTF8().convert(a), 256, true);
+ }
+
+ /**
+ * Returns true if the given byte array is accepted by this automaton
+ */
+ public boolean run(byte[] s, int offset, int length) {
+ int p = initial;
+ int l = offset + length;
+ for (int i = offset; i < l; i++) {
+ p = step(p, s[i] & 0xFF);
+ if (p == -1) return false;
+ }
+ return accept[p];
+ }
+}
Property changes on: lucene/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/automaton/TransitionComparator.java (working copy)
@@ -1,75 +0,0 @@
-/*
- * dk.brics.automaton
- *
- * Copyright (c) 2001-2009 Anders Moeller
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the author may not be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-package org.apache.lucene.util.automaton;
-
-import java.io.Serializable;
-import java.util.Comparator;
-
-/**
- * Comparator for state {@link Transition}s that orders unicode char range
- * transitions in lexicographic order.
- *
- * @lucene.experimental
- */
-class TransitionComparator implements Comparator<Transition>, Serializable {
-
- boolean to_first;
-
- TransitionComparator(boolean to_first) {
- this.to_first = to_first;
- }
-
- /**
- * Compares by (min, reverse max, to) or (to, min, reverse max).
- */
- public int compare(Transition t1, Transition t2) {
- if (to_first) {
- if (t1.to != t2.to) {
- if (t1.to == null) return -1;
- else if (t2.to == null) return 1;
- else if (t1.to.number < t2.to.number) return -1;
- else if (t1.to.number > t2.to.number) return 1;
- }
- }
- if (t1.min < t2.min) return -1;
- if (t1.min > t2.min) return 1;
- if (t1.max > t2.max) return -1;
- if (t1.max < t2.max) return 1;
- if (!to_first) {
- if (t1.to != t2.to) {
- if (t1.to == null) return -1;
- else if (t2.to == null) return 1;
- else if (t1.to.number < t2.to.number) return -1;
- else if (t1.to.number > t2.to.number) return 1;
- }
- }
- return 0;
- }
-}
Index: lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/automaton/SpecialOperations.java (working copy)
@@ -33,6 +33,8 @@
import java.util.HashSet;
import java.util.Set;
+import org.apache.lucene.util.BytesRef;
+
/**
* Special automata operations.
*
@@ -46,7 +48,7 @@
* Finds the largest entry whose value is less than or equal to c, or 0 if
* there is no such entry.
*/
- static int findIndex(char c, char[] points) {
+ static int findIndex(int c, int[] points) {
int a = 0;
int b = points.length;
while (b - a > 1) {
@@ -70,9 +72,11 @@
* Checks whether there is a loop containing s. (This is sufficient since
* there are never transitions to dead states.)
*/
+ // TODO: not great that this is recursive... in theory a
+ // large automata could exceed java's stack
private static boolean isFinite(State s, HashSet<State> path) {
path.add(s);
- for (Transition t : s.transitions)
+ for (Transition t : s.getTransitions())
if (path.contains(t.to) || !isFinite(t.to, path)) return false;
path.remove(s);
return true;
@@ -93,10 +97,10 @@
do {
done = true;
visited.add(s);
- if (!s.accept && s.transitions.size() == 1) {
- Transition t = s.transitions.iterator().next();
+ if (!s.accept && s.numTransitions() == 1) {
+ Transition t = s.getTransitions().iterator().next();
if (t.min == t.max && !visited.contains(t.to)) {
- b.append(t.min);
+ b.appendCodePoint(t.min);
s = t.to;
done = false;
}
@@ -105,6 +109,31 @@
return b.toString();
}
+ // TODO: this currently requites a determinized machine,
+ // but it need not -- we can speed it up by walking the
+ // NFA instead. it'd still be fail fast.
+ public static BytesRef getCommonPrefixBytesRef(Automaton a) {
+ if (a.isSingleton()) return new BytesRef(a.singleton);
+ BytesRef ref = new BytesRef(10);
+ HashSet<State> visited = new HashSet<State>();
+ State s = a.initial;
+ boolean done;
+ do {
+ done = true;
+ visited.add(s);
+ if (!s.accept && s.numTransitions() == 1) {
+ Transition t = s.getTransitions().iterator().next();
+ if (t.min == t.max && !visited.contains(t.to)) {
+ ref.grow(++ref.length);
+ ref.bytes[ref.length - 1] = (byte)t.min;
+ s = t.to;
+ done = false;
+ }
+ }
+ } while (!done);
+ return ref;
+ }
+
/**
* Returns the longest string that is a suffix of all accepted strings and
* visits each state at most once.
@@ -119,9 +148,32 @@
Automaton r = a.clone();
reverse(r);
r.determinize();
- return reverseUnicode3(SpecialOperations.getCommonPrefix(r));
+ return new StringBuilder(SpecialOperations.getCommonPrefix(r)).reverse().toString();
}
+ public static BytesRef getCommonSuffixBytesRef(Automaton a) {
+ if (a.isSingleton()) // if singleton, the suffix is the string itself.
+ return new BytesRef(a.singleton);
+
+ // reverse the language of the automaton, then reverse its common prefix.
+ Automaton r = a.clone();
+ reverse(r);
+ r.determinize();
+ BytesRef ref = SpecialOperations.getCommonPrefixBytesRef(r);
+ reverseBytes(ref);
+ return ref;
+ }
+
+ private static void reverseBytes(BytesRef ref) {
+ if (ref.length <= 1) return;
+ int num = ref.length >> 1;
+ for (int i = ref.offset; i < ( ref.offset + num ); i++) {
+ byte b = ref.bytes[i];
+ ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1];
+ ref.bytes[ref.offset * 2 + ref.length - i - 1] = b;
+ }
+ }
+
/**
* Reverses the language of the given (non-singleton) automaton while returning
* the set of new initial states.
@@ -130,8 +182,11 @@
a.expandSingleton();
// reverse all edges
HashMap<State, HashSet<Transition>> m = new HashMap<State, HashSet<Transition>>();
- Set<State> states = a.getStates();
- Set<State> accept = a.getAcceptStates();
+ State[] states = a.getNumberedStates();
+ Set<State> accept = new HashSet<State>();
+ for (State s : states)
+ if (s.isAccept())
+ accept.add(s);
for (State r : states) {
m.put(r, new HashSet<Transition>());
r.accept = false;
@@ -139,41 +194,17 @@
for (State r : states)
for (Transition t : r.getTransitions())
m.get(t.to).add(new Transition(t.min, t.max, r));
- for (State r : states)
- r.transitions = m.get(r);
+ for (State r : states) {
+ Set<Transition> tr = m.get(r);
+ r.setTransitions(tr.toArray(new Transition[tr.size()]));
+ }
// make new initial+final states
a.initial.accept = true;
a.initial = new State();
for (State r : accept)
a.initial.addEpsilon(r); // ensures that all initial states are reachable
a.deterministic = false;
+ a.clearNumberedStates();
return accept;
}
-
- /**
- * Intentionally use a unicode 3 reverse.
- * This is because we are only going to reverse it again...
- */
- private static String reverseUnicode3( final String input ){
- char[] charInput = input.toCharArray();
- reverseUnicode3(charInput, 0, charInput.length);
- return new String(charInput);
- }
-
- /**
- * Intentionally use a unicode 3 reverse.
- * This is because it is only used by getCommonSuffix(),
- * which will reverse the entire FSM using code unit reversal,
- * so we must then reverse its common prefix back using the
- * same code unit reversal.
- */
- private static void reverseUnicode3(char[] buffer, int start, int len){
- if (len <= 1) return;
- int num = len>>1;
- for (int i = start; i < ( start + num ); i++) {
- char c = buffer[i];
- buffer[i] = buffer[start * 2 + len - i - 1];
- buffer[start * 2 + len - i - 1] = c;
- }
- }
}
Index: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py (revision 0)
+++ lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py (revision 0)
@@ -0,0 +1,351 @@
+import types
+import os
+import sys
+import random
+
+MAX_UNICODE = 0x10FFFF
+
+# TODO
+# - could be more minimal
+# - eg when bracket lands on a utf8 boundary, like 3 - 2047 -- they can share the two * edges
+# - also 3 2048 or 3 65536 -- it should not have an * down the red path, but it does
+
+# MASKS[0] is bottom 1-bit
+# MASKS[1] is bottom 2-bits
+# ...
+
+utf8Ranges = [(0, 127),
+ (128, 2047),
+ (2048, 65535),
+ (65536, 1114111)]
+
+typeToColor = {'startend': 'purple',
+ 'start': 'blue',
+ 'end': 'red'}
+
+class FSA:
+
+ def __init__(self):
+ # maps fromNode -> (startUTF8, endUTF8, endNode)
+ self.states = {}
+ self.nodeUpto = 0
+
+ def run(self, bytes):
+ state = self.start
+ for b in bytes:
+ found = False
+ oldState = state
+ for label, s, e, n in self.states[state][1:]:
+ if b >= s and b <= e:
+ if found:
+ raise RuntimeError('state %s has ambiguous output for byte %s' % (oldState, b))
+ state = n
+ found = True
+ if not found:
+ return -1
+
+ return state
+
+ def addEdge(self, n1, n2, v1, v2, label):
+ """
+ Adds edge from n1-n2, utf8 byte range v1-v2.
+ """
+ assert n1 in self.states
+ assert type(v1) is types.IntType
+ assert type(v2) is types.IntType
+ self.states[n1].append((label, v1, v2, n2))
+
+ def addNode(self, label=None):
+ try:
+ self.states[self.nodeUpto] = [label]
+ return self.nodeUpto
+ finally:
+ self.nodeUpto += 1
+
+ def toDOT(self, label):
+ __l = []
+ w = __l.append
+ endNode = startNode = None
+ for id, details in self.states.items():
+ name = details[0]
+ if name == 'end':
+ endNode = id
+ elif name == 'start':
+ startNode = id
+
+ w('digraph %s {' % label)
+ w(' rankdir=LR;')
+ w(' size="8,5";')
+ w(' node [color=white label=""]; Ns;')
+
+ w(' node [color=black];')
+ w(' node [shape=doublecircle, label=""]; N%s [label="%s"];' % (endNode, endNode))
+ w(' node [shape=circle];')
+
+ w(' N%s [label="%s"];' % (startNode, startNode))
+ w(' Ns -> N%s;' % startNode)
+ for id, details in self.states.items():
+ edges = details[1:]
+ w(' N%s [label="%s"];' % (id, id))
+ for type, s, e, dest in edges:
+ c = typeToColor.get(type, 'black')
+ if type == 'all*':
+ # special case -- matches any utf8 byte at this point
+ label = '*'
+ elif s == e:
+ label = '%s' % binary(s)
+ else:
+ label = '%s-%s' % (binary(s), binary(e))
+ w(' N%s -> N%s [label="%s" color="%s"];' % (id, dest, label, c))
+ if name == 'end':
+ endNode = id
+ elif name == 'start':
+ startNode = id
+ w('}')
+ return '\n'.join(__l)
+
+ def toPNG(self, label, pngOut):
+ open('tmp.dot', 'wb').write(self.toDOT(label))
+ if os.system('dot -Tpng tmp.dot -o %s' % pngOut):
+ raise RuntimeException('dot failed')
+
+
+MASKS = []
+v = 2
+for i in range(32):
+ MASKS.append(v-1)
+ v *= 2
+
+def binary(x):
+ if x == 0:
+ return '00000000'
+
+ l = []
+ while x > 0:
+ if x & 1 == 1:
+ l.append('1')
+ else:
+ l.append('0')
+ x = x >> 1
+
+ # big endian!
+ l.reverse()
+
+ l2 = []
+ while len(l) > 0:
+ s = ''.join(l[-8:])
+ if len(s) < 8:
+ s = '0'*(8-len(s)) + s
+ l2.append(s)
+ del l[-8:]
+
+ return ' '.join(l2)
+
+def getUTF8Rest(code, numBytes):
+ l = []
+ for i in range(numBytes):
+ l.append((128 | (code & MASKS[5]), 6))
+ code = code >> 6
+ l.reverse()
+ return tuple(l)
+
+def toUTF8(code):
+ # code = Unicode code point
+ assert code >= 0
+ assert code <= MAX_UNICODE
+
+ if code < 128:
+ # 0xxxxxxx
+ bytes = ((code, 7),)
+ elif code < 2048:
+ # 110yyyxx 10xxxxxx
+ byte1 = (6 << 5) | (code >> 6)
+ bytes = ((byte1, 5),) + getUTF8Rest(code, 1)
+ elif code < 65536:
+ # 1110yyyy 10yyyyxx 10xxxxxx
+ len = 3
+ byte1 = (14 << 4) | (code >> 12)
+ bytes = ((byte1, 4),) + getUTF8Rest(code, 2)
+ else:
+ # 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
+ len = 4
+ byte1 = (30 << 3) | (code >> 18)
+ bytes = ((byte1, 3),) + getUTF8Rest(code, 3)
+
+ return bytes
+
+def all(fsa, startNode, endNode, startCode, endCode, left):
+ if len(left) == 0:
+ fsa.addEdge(startNode, endNode, startCode, endCode, 'all')
+ else:
+ lastN = fsa.addNode()
+ fsa.addEdge(startNode, lastN, startCode, endCode, 'all')
+ while len(left) > 1:
+ n = fsa.addNode()
+ fsa.addEdge(lastN, n, 128, 191, 'all*')
+ left = left[1:]
+ lastN = n
+ fsa.addEdge(lastN, endNode, 128, 191, 'all*')
+
+def start(fsa, startNode, endNode, utf8, doAll):
+ if len(utf8) == 1:
+ fsa.addEdge(startNode, endNode, utf8[0][0], utf8[0][0] | MASKS[utf8[0][1]-1], 'start')
+ else:
+ n = fsa.addNode()
+ fsa.addEdge(startNode, n, utf8[0][0], utf8[0][0], 'start')
+ start(fsa, n, endNode, utf8[1:], True)
+ end = utf8[0][0] | MASKS[utf8[0][1]-1]
+ if doAll and utf8[0][0] != end:
+ all(fsa, startNode, endNode, utf8[0][0]+1, end, utf8[1:])
+
+def end(fsa, startNode, endNode, utf8, doAll):
+ if len(utf8) == 1:
+ fsa.addEdge(startNode, endNode, utf8[0][0] & ~MASKS[utf8[0][1]-1], utf8[0][0], 'end')
+ else:
+ if utf8[0][1] == 5:
+ # special case -- avoid created unused edges (utf8 doesn't accept certain byte sequences):
+ start = 194
+ else:
+ start = utf8[0][0] & (~MASKS[utf8[0][1]-1])
+ if doAll and utf8[0][0] != start:
+ all(fsa, startNode, endNode, start, utf8[0][0]-1, utf8[1:])
+ n = fsa.addNode()
+ fsa.addEdge(startNode, n, utf8[0][0], utf8[0][0], 'end')
+ end(fsa, n, endNode, utf8[1:], True)
+
+def build(fsa,
+ startNode, endNode,
+ startUTF8, endUTF8):
+
+ # Break into start, middle, end:
+ if startUTF8[0][0] == endUTF8[0][0]:
+ # Degen case: lead with the same byte:
+ if len(startUTF8) == 1 and len(endUTF8) == 1:
+ fsa.addEdge(startNode, endNode, startUTF8[0][0], endUTF8[0][0], 'startend')
+ return
+ else:
+ assert len(startUTF8) != 1
+ assert len(endUTF8) != 1
+ n = fsa.addNode()
+ # single value edge
+ fsa.addEdge(startNode, n, startUTF8[0][0], startUTF8[0][0], 'single')
+ build(fsa, n, endNode, startUTF8[1:], endUTF8[1:])
+ elif len(startUTF8) == len(endUTF8):
+ if len(startUTF8) == 1:
+ fsa.addEdge(startNode, endNode, startUTF8[0][0], endUTF8[0][0], 'startend')
+ else:
+ start(fsa, startNode, endNode, startUTF8, False)
+ if endUTF8[0][0] - startUTF8[0][0] > 1:
+ all(fsa, startNode, endNode, startUTF8[0][0]+1, endUTF8[0][0]-1, startUTF8[1:])
+ end(fsa, startNode, endNode, endUTF8, False)
+ else:
+ # start
+ start(fsa, startNode, endNode, startUTF8, True)
+
+ # possibly middle
+ byteCount = 1+len(startUTF8)
+ while byteCount < len(endUTF8):
+ s = toUTF8(utf8Ranges[byteCount-1][0])
+ e = toUTF8(utf8Ranges[byteCount-1][1])
+ all(fsa, startNode, endNode,
+ s[0][0],
+ e[0][0],
+ s[1:])
+ byteCount += 1
+
+ # end
+ end(fsa, startNode, endNode, endUTF8, True)
+
+def main():
+
+ if len(sys.argv) not in (3, 4):
+ print
+ print 'Usage: python %s startUTF32 endUTF32 [testCode]' % sys.argv[0]
+ print
+ sys.exit(1)
+
+ utf32Start = int(sys.argv[1])
+ utf32End = int(sys.argv[2])
+
+ if utf32Start > utf32End:
+ print 'ERROR: start must be <= end'
+ sys.exit(1)
+
+ fsa = FSA()
+ fsa.start = fsa.addNode('start')
+ fsa.end = fsa.addNode('end')
+
+ print 's=%s' % ' '.join([binary(x[0]) for x in toUTF8(utf32Start)])
+ print 'e=%s' % ' '.join([binary(x[0]) for x in toUTF8(utf32End)])
+
+ if len(sys.argv) == 4:
+ print 't=%s [%s]' % \
+ (' '.join([binary(x[0]) for x in toUTF8(int(sys.argv[3]))]),
+ ' '.join(['%2x' % x[0] for x in toUTF8(int(sys.argv[3]))]))
+
+ build(fsa, fsa.start, fsa.end,
+ toUTF8(utf32Start),
+ toUTF8(utf32End))
+
+ fsa.toPNG('test', '/tmp/outpy.png')
+ print 'Saved to /tmp/outpy.png...'
+
+ test(fsa, utf32Start, utf32End, 100000);
+
+def test(fsa, utf32Start, utf32End, count):
+
+ # verify correct ints are accepted
+ for i in range(count):
+ r = random.randint(utf32Start, utf32End)
+ dest = fsa.run([tup[0] for tup in toUTF8(r)])
+ if dest != fsa.end:
+ print 'FAILED: valid %s (%s) is not accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)]))
+ return False
+
+ invalidRange = MAX_UNICODE - (utf32End - utf32Start + 1)
+ if invalidRange >= 0:
+ # verify invalid ints are not accepted
+ for i in range(count):
+ r = random.randint(0, invalidRange-1)
+ if r >= utf32Start:
+ r = utf32End + 1 + r - utf32Start
+ dest = fsa.run([tup[0] for tup in toUTF8(r)])
+ if dest != -1:
+ print 'FAILED: invalid %s (%s) is accepted' % (r, ' '.join([binary(x[0]) for x in toUTF8(r)]))
+ return False
+
+ return True
+
+def stress():
+
+ print 'Testing...'
+
+ iter = 0
+ while True:
+ if iter % 10 == 0:
+ print '%s...' % iter
+ iter += 1
+
+ v1 = random.randint(0, MAX_UNICODE)
+ v2 = random.randint(0, MAX_UNICODE)
+ if v2 < v1:
+ v1, v2 = v2, v1
+
+ utf32Start = v1
+ utf32End = v2
+
+ fsa = FSA()
+ fsa.start = fsa.addNode('start')
+ fsa.end = fsa.addNode('end')
+ build(fsa, fsa.start, fsa.end,
+ toUTF8(utf32Start),
+ toUTF8(utf32End))
+
+ if not test(fsa, utf32Start, utf32End, 10000):
+ print 'FAILED on utf32Start=%s utf32End=%s' % (utf32Start, utf32End)
+
+if __name__ == '__main__':
+ if len(sys.argv) > 1:
+ main()
+ else:
+ stress()
Property changes on: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/automaton/BasicOperations.java (working copy)
@@ -29,8 +29,12 @@
package org.apache.lucene.util.automaton;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
import java.util.ArrayList;
import java.util.BitSet;
+import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@@ -73,7 +77,8 @@
s.addEpsilon(a2.initial);
}
a1.deterministic = deterministic;
- a1.clearHashCode();
+ //a1.clearHashCode();
+ a1.clearNumberedStates();
a1.checkMinimizeAlways();
return a1;
}
@@ -125,7 +130,8 @@
ac = ns;
}
b.deterministic = false;
- b.clearHashCode();
+ //b.clearHashCode();
+ b.clearNumberedStates();
b.checkMinimizeAlways();
return b;
}
@@ -144,7 +150,8 @@
s.accept = true;
a.initial = s;
a.deterministic = false;
- a.clearHashCode();
+ //a.clearHashCode();
+ a.clearNumberedStates();
a.checkMinimizeAlways();
return a;
}
@@ -165,7 +172,8 @@
p.addEpsilon(s);
a.initial = s;
a.deterministic = false;
- a.clearHashCode();
+ //a.clearHashCode();
+ a.clearNumberedStates();
a.checkMinimizeAlways();
return a;
}
@@ -217,7 +225,8 @@
for (State p : b.getAcceptStates())
p.addEpsilon(d.initial);
b.deterministic = false;
- b.clearHashCode();
+ //b.clearHashCode();
+ b.clearNumberedStates();
b.checkMinimizeAlways();
}
return b;
@@ -233,7 +242,7 @@
a = a.cloneExpandedIfRequired();
a.determinize();
a.totalize();
- for (State p : a.getStates())
+ for (State p : a.getNumberedStates())
p.accept = !p.accept;
a.removeDeadTransitions();
return a;
@@ -274,10 +283,8 @@
else return BasicAutomata.makeEmpty();
}
if (a1 == a2) return a1.cloneIfRequired();
- Transition[][] transitions1 = Automaton
- .getSortedTransitions(a1.getStates());
- Transition[][] transitions2 = Automaton
- .getSortedTransitions(a2.getStates());
+ Transition[][] transitions1 = a1.getSortedTransitions();
+ Transition[][] transitions2 = a2.getSortedTransitions();
Automaton c = new Automaton();
LinkedList worklist = new LinkedList();
HashMap newstates = new HashMap();
@@ -302,9 +309,9 @@
newstates.put(q, q);
r = q;
}
- char min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
- char max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
- p.s.transitions.add(new Transition(min, max, r.s));
+ int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min;
+ int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max;
+ p.s.addTransition(new Transition(min, max, r.s));
}
}
}
@@ -313,6 +320,24 @@
c.checkMinimizeAlways();
return c;
}
+
+ /** Returns true if these two automata accept exactly the
+ * same language. This is a costly computation! Note
+ * also that a1 and a2 will be determinized as a side
+ * effect. */
+ public static boolean sameLanguage(Automaton a1, Automaton a2) {
+ if (a1 == a2) {
+ return true;
+ }
+ if (a1.isSingleton() && a2.isSingleton()) {
+ return a1.singleton.equals(a2.singleton);
+ } else if (a1.isSingleton()) {
+ // subsetOf is faster if the first automaton is a singleton
+ return subsetOf(a1, a2) && subsetOf(a2, a1);
+ } else {
+ return subsetOf(a2, a1) && subsetOf(a1, a2);
+ }
+ }
/**
* Returns true if the language of a1 is a subset of the language
@@ -328,10 +353,8 @@
return BasicOperations.run(a2, a1.singleton);
}
a2.determinize();
- Transition[][] transitions1 = Automaton
- .getSortedTransitions(a1.getStates());
- Transition[][] transitions2 = Automaton
- .getSortedTransitions(a2.getStates());
+ Transition[][] transitions1 = a1.getSortedTransitions();
+ Transition[][] transitions2 = a2.getSortedTransitions();
LinkedList worklist = new LinkedList();
HashSet visited = new HashSet();
StatePair p = new StatePair(a1.initial, a2.initial);
@@ -339,19 +362,24 @@
visited.add(p);
while (worklist.size() > 0) {
p = worklist.removeFirst();
- if (p.s1.accept && !p.s2.accept) return false;
+ if (p.s1.accept && !p.s2.accept) {
+ return false;
+ }
Transition[] t1 = transitions1[p.s1.number];
Transition[] t2 = transitions2[p.s2.number];
for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) {
while (b2 < t2.length && t2[b2].max < t1[n1].min)
b2++;
int min1 = t1[n1].min, max1 = t1[n1].max;
+
for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) {
- if (t2[n2].min > min1) return false;
- if (t2[n2].max < Character.MAX_VALUE) min1 = t2[n2].max + 1;
+ if (t2[n2].min > min1) {
+ return false;
+ }
+ if (t2[n2].max < Character.MAX_CODE_POINT) min1 = t2[n2].max + 1;
else {
- min1 = Character.MAX_VALUE;
- max1 = Character.MIN_VALUE;
+ min1 = Character.MAX_CODE_POINT;
+ max1 = Character.MIN_CODE_POINT;
}
StatePair q = new StatePair(t1[n1].to, t2[n2].to);
if (!visited.contains(q)) {
@@ -359,7 +387,9 @@
visited.add(q);
}
}
- if (min1 <= max1) return false;
+ if (min1 <= max1) {
+ return false;
+ }
}
}
return true;
@@ -387,7 +417,8 @@
s.addEpsilon(a2.initial);
a1.initial = s;
a1.deterministic = false;
- a1.clearHashCode();
+ //a1.clearHashCode();
+ a1.clearNumberedStates();
a1.checkMinimizeAlways();
return a1;
}
@@ -414,64 +445,257 @@
Automaton a = new Automaton();
a.initial = s;
a.deterministic = false;
- a.clearHashCode();
+ //a.clearHashCode();
+ a.clearNumberedStates();
a.checkMinimizeAlways();
return a;
}
-
+
+ // Simple custom ArrayList
+ private final static class TransitionList {
+ Transition[] transitions = new Transition[2];
+ int count;
+
+ public void add(Transition t) {
+ if (transitions.length == count) {
+ Transition[] newArray = new Transition[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
+ System.arraycopy(transitions, 0, newArray, 0, count);
+ transitions = newArray;
+ }
+ transitions[count++] = t;
+ }
+ }
+
+ // Holds all transitions that start on this int point, or
+ // end at this point-1
+ private final static class PointTransitions implements Comparable {
+ int point;
+ final TransitionList ends = new TransitionList();
+ final TransitionList starts = new TransitionList();
+ public int compareTo(PointTransitions other) {
+ return point - other.point;
+ }
+
+ public void reset(int point) {
+ this.point = point;
+ ends.count = 0;
+ starts.count = 0;
+ }
+
+ public boolean equals(Object other) {
+ return ((PointTransitions) other).point == point;
+ }
+
+ public int hashCode() {
+ return point;
+ }
+ }
+
+ private final static class PointTransitionSet {
+ int count;
+ PointTransitions[] points = new PointTransitions[5];
+
+ private final static int HASHMAP_CUTOVER = 30;
+ private final HashMap map = new HashMap();
+ private boolean useHash = false;
+
+ private PointTransitions next(int point) {
+ // 1st time we are seeing this point
+ if (count == points.length) {
+ final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
+ System.arraycopy(points, 0, newArray, 0, count);
+ points = newArray;
+ }
+ PointTransitions points0 = points[count];
+ if (points0 == null) {
+ points0 = points[count] = new PointTransitions();
+ }
+ points0.reset(point);
+ count++;
+ return points0;
+ }
+
+ private PointTransitions find(int point) {
+ if (useHash) {
+ final Integer pi = point;
+ PointTransitions p = map.get(pi);
+ if (p == null) {
+ p = next(point);
+ map.put(pi, p);
+ }
+ return p;
+ } else {
+ for(int i=0;i 1) {
+ Arrays.sort(points, 0, count);
+ }
+ }
+
+ public void add(Transition t) {
+ find(t.min).starts.add(t);
+ find(1+t.max).ends.add(t);
+ }
+
+ public String toString() {
+ StringBuilder s = new StringBuilder();
+ for(int i=0;i 0) {
+ s.append(' ');
+ }
+ s.append(points[i].point).append(':').append(points[i].starts.count).append(',').append(points[i].ends.count);
+ }
+ return s.toString();
+ }
+ }
+
/**
* Determinizes the given automaton.
*
- * Complexity: exponential in number of states.
+ * Worst case complexity: exponential in number of states.
*/
- public static void determinize(Automaton a) {
- if (a.deterministic || a.isSingleton()) return;
- Set initialset = new HashSet();
- initialset.add(a.initial);
- determinize(a, initialset);
- }
-
- /**
- * Determinizes the given automaton using the given set of initial states.
- */
- static void determinize(Automaton a, Set initialset) {
- char[] points = a.getStartPoints();
+ static void determinize(Automaton a) {
+ if (a.deterministic || a.isSingleton()) {
+ return;
+ }
+
+ final State[] allStates = a.getNumberedStates();
+
// subset construction
- Map,Set> sets = new HashMap,Set>();
- LinkedList> worklist = new LinkedList>();
- Map,State> newstate = new HashMap,State>();
- sets.put(initialset, initialset);
+ final boolean initAccept = a.initial.accept;
+ final int initNumber = a.initial.number;
+ a.initial = new State();
+ SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(initNumber, a.initial);
+
+ LinkedList worklist = new LinkedList();
+ Map newstate = new HashMap();
+
worklist.add(initialset);
- a.initial = new State();
+
+ a.initial.accept = initAccept;
newstate.put(initialset, a.initial);
+
+ int newStateUpto = 0;
+ State[] newStatesArray = new State[5];
+ newStatesArray[newStateUpto] = a.initial;
+ a.initial.number = newStateUpto;
+ newStateUpto++;
+
+ // like Set
+ final PointTransitionSet points = new PointTransitionSet();
+
+ // like SortedMap
+ final SortedIntSet statesSet = new SortedIntSet(5);
+
while (worklist.size() > 0) {
- Set s = worklist.removeFirst();
- State r = newstate.get(s);
- for (State q : s)
- if (q.accept) {
- r.accept = true;
- break;
+ SortedIntSet.FrozenIntSet s = worklist.removeFirst();
+
+ // Collate all outgoing transitions by min/1+max:
+ for(int i=0;i p = new HashSet();
- for (State q : s)
- for (Transition t : q.transitions)
- if (t.min <= points[n] && points[n] <= t.max) p.add(t.to);
- if (!sets.containsKey(p)) {
- sets.put(p, p);
- worklist.add(p);
- newstate.put(p, new State());
+ }
+
+ if (points.count == 0) {
+ // No outgoing transitions -- skip it
+ continue;
+ }
+
+ points.sort();
+
+ int lastPoint = -1;
+ int accCount = 0;
+
+ final State r = s.state;
+ for(int i=0;i 0) {
+ assert lastPoint != -1;
+
+ statesSet.computeHash();
+
+ State q = newstate.get(statesSet);
+ if (q == null) {
+ q = new State();
+ final SortedIntSet.FrozenIntSet p = statesSet.freeze(q);
+ worklist.add(p);
+ if (newStateUpto == newStatesArray.length) {
+ final State[] newArray = new State[ArrayUtil.oversize(1+newStateUpto, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
+ System.arraycopy(newStatesArray, 0, newArray, 0, newStateUpto);
+ newStatesArray = newArray;
+ }
+ newStatesArray[newStateUpto] = q;
+ q.number = newStateUpto;
+ newStateUpto++;
+ q.accept = accCount > 0;
+ newstate.put(p, q);
+ } else {
+ assert (accCount > 0 ? true:false) == q.accept: "accCount=" + accCount + " vs existing accept=" + q.accept + " states=" + statesSet;
+ }
+
+ r.addTransition(new Transition(lastPoint, point-1, q));
}
- State q = newstate.get(p);
- char min = points[n];
- char max;
- if (n + 1 < points.length) max = (char) (points[n + 1] - 1);
- else max = Character.MAX_VALUE;
- r.transitions.add(new Transition(min, max, q));
+
+ // process transitions that end on this point
+ // (closes an overlapping interval)
+ Transition[] transitions = points.points[i].ends.transitions;
+ int limit = points.points[i].ends.count;
+ for(int j=0;j states = a.getStates();
- Automaton.setStateNumbers(states);
+ State[] states = a.getNumberedStates();
LinkedList pp = new LinkedList();
LinkedList pp_other = new LinkedList();
- BitSet bb = new BitSet(states.size());
- BitSet bb_other = new BitSet(states.size());
+ BitSet bb = new BitSet(states.length);
+ BitSet bb_other = new BitSet(states.length);
pp.add(a.initial);
ArrayList dest = new ArrayList();
boolean accept = a.initial.accept;
- for (int i = 0; i < s.length(); i++) {
- char c = s.charAt(i);
+ for (int i = 0, c = 0; i < s.length(); i += Character.charCount(c)) {
+ c = s.codePointAt(i);
accept = false;
pp_other.clear();
bb_other.clear();
Index: lucene/src/java/org/apache/lucene/util/automaton/SortedIntSet.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/SortedIntSet.java (revision 0)
+++ lucene/src/java/org/apache/lucene/util/automaton/SortedIntSet.java (revision 0)
@@ -0,0 +1,272 @@
+package org.apache.lucene.util.automaton;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.TreeMap;
+import java.util.Map;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
+// Just holds a set of int[] states, plus a corresponding
+// int[] count per state. Used by
+// BasicOperations.determinize
+final class SortedIntSet {
+ int[] values;
+ int[] counts;
+ int upto;
+ private int hashCode;
+
+ // If we hold more than this many states, we switch from
+ // O(N^2) linear ops to O(N log(N)) TreeMap
+ private final static int TREE_MAP_CUTOVER = 30;
+
+ private final Map map = new TreeMap();
+
+ private boolean useTreeMap;
+
+ State state;
+
+ public SortedIntSet(int capacity) {
+ values = new int[capacity];
+ counts = new int[capacity];
+ }
+
+ // Adds this state to the set
+ public void incr(int num) {
+ if (useTreeMap) {
+ final Integer key = num;
+ Integer val = map.get(key);
+ if (val == null) {
+ map.put(key, 1);
+ } else {
+ map.put(key, 1+val);
+ }
+ return;
+ }
+
+ if (upto == values.length) {
+ values = ArrayUtil.grow(values, 1+upto);
+ counts = ArrayUtil.grow(counts, 1+upto);
+ }
+
+ for(int i=0;i= i) {
+ values[1+j] = values[j];
+ counts[1+j] = counts[j];
+ j--;
+ }
+ values[i] = num;
+ counts[i] = 1;
+ upto++;
+ return;
+ }
+ }
+
+ // append
+ values[upto] = num;
+ counts[upto] = 1;
+ upto++;
+
+ if (upto == TREE_MAP_CUTOVER) {
+ useTreeMap = true;
+ for(int i=0;i values.length) {
+ final int size = ArrayUtil.oversize(map.size(), RamUsageEstimator.NUM_BYTES_INT);
+ values = new int[size];
+ counts = new int[size];
+ }
+ hashCode = map.size();
+ upto = 0;
+ for(int state : map.keySet()) {
+ hashCode = 683*hashCode + state;
+ values[upto++] = state;
+ }
+ } else {
+ hashCode = upto;
+ for(int i=0;i 0) {
+ sb.append(' ');
+ }
+ sb.append(values[i]).append(':').append(counts[i]);
+ }
+ sb.append(']');
+ return sb.toString();
+ }
+
+ public final static class FrozenIntSet {
+ final int[] values;
+ final int hashCode;
+ final State state;
+
+ public FrozenIntSet(int[] values, int hashCode, State state) {
+ this.values = values;
+ this.hashCode = hashCode;
+ this.state = state;
+ }
+
+ public FrozenIntSet(int num, State state) {
+ this.values = new int[] {num};
+ this.state = state;
+ this.hashCode = 683+num;
+ }
+
+ public int hashCode() {
+ return hashCode;
+ }
+
+ public boolean equals(Object _other) {
+ if (_other == null) {
+ return false;
+ }
+ if (_other instanceof FrozenIntSet) {
+ FrozenIntSet other = (FrozenIntSet) _other;
+ if (hashCode != other.hashCode) {
+ return false;
+ }
+ if (other.values.length != values.length) {
+ return false;
+ }
+ for(int i=0;i 0) {
+ sb.append(' ');
+ }
+ sb.append(values[i]);
+ }
+ sb.append(']');
+ return sb.toString();
+ }
+ }
+}
+
Property changes on: lucene/src/java/org/apache/lucene/util/automaton/SortedIntSet.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/java/org/apache/lucene/util/automaton/RegExp.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/RegExp.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/automaton/RegExp.java (working copy)
@@ -366,9 +366,9 @@
Kind kind;
RegExp exp1, exp2;
String s;
- char c;
+ int c;
int min, max, digits;
- char from, to;
+ int from, to;
String b;
int flags;
@@ -625,10 +625,10 @@
b.append(")");
break;
case REGEXP_CHAR:
- b.append("\\").append(c);
+ b.append("\\").appendCodePoint(c);
break;
case REGEXP_CHAR_RANGE:
- b.append("[\\").append(from).append("-\\").append(to).append("]");
+ b.append("[\\").appendCodePoint(from).append("-\\").appendCodePoint(to).append("]");
break;
case REGEXP_ANYCHAR:
b.append(".");
@@ -725,9 +725,9 @@
static private RegExp makeString(RegExp exp1, RegExp exp2) {
StringBuilder b = new StringBuilder();
if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
- else b.append(exp1.c);
+ else b.appendCodePoint(exp1.c);
if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
- else b.append(exp2.c);
+ else b.appendCodePoint(exp2.c);
return makeString(b.toString());
}
@@ -777,14 +777,14 @@
return r;
}
- static RegExp makeChar(char c) {
+ static RegExp makeChar(int c) {
RegExp r = new RegExp();
r.kind = Kind.REGEXP_CHAR;
r.c = c;
return r;
}
- static RegExp makeCharRange(char from, char to) {
+ static RegExp makeCharRange(int from, int to) {
RegExp r = new RegExp();
r.kind = Kind.REGEXP_CHAR_RANGE;
r.from = from;
@@ -834,13 +834,13 @@
}
private boolean peek(String s) {
- return more() && s.indexOf(b.charAt(pos)) != -1;
+ return more() && s.indexOf(b.codePointAt(pos)) != -1;
}
- private boolean match(char c) {
+ private boolean match(int c) {
if (pos >= b.length()) return false;
- if (b.charAt(pos) == c) {
- pos++;
+ if (b.codePointAt(pos) == c) {
+ pos += Character.charCount(c);
return true;
}
return false;
@@ -850,9 +850,11 @@
return pos < b.length();
}
- private char next() throws IllegalArgumentException {
+ private int next() throws IllegalArgumentException {
if (!more()) throw new IllegalArgumentException("unexpected end-of-string");
- return b.charAt(pos++);
+ int ch = b.codePointAt(pos);
+ pos += Character.charCount(ch);
+ return ch;
}
private boolean check(int flag) {
@@ -933,7 +935,7 @@
}
final RegExp parseCharClass() throws IllegalArgumentException {
- char c = parseCharExp();
+ int c = parseCharExp();
if (match('-')) return makeCharRange(c, parseCharExp());
else return makeChar(c);
}
@@ -993,7 +995,7 @@
} else return makeChar(parseCharExp());
}
- final char parseCharExp() throws IllegalArgumentException {
+ final int parseCharExp() throws IllegalArgumentException {
match('\\');
return next();
}
Index: lucene/src/java/org/apache/lucene/util/automaton/Transition.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/Transition.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/automaton/Transition.java (working copy)
@@ -30,12 +30,13 @@
package org.apache.lucene.util.automaton;
import java.io.Serializable;
+import java.util.Comparator;
/**
* Automaton transition.
*
* A transition, which belongs to a source state, consists of a Unicode
- * character interval and a destination state.
+ * codepoint interval and a destination state.
*
* @lucene.experimental
*/
@@ -45,18 +46,18 @@
* CLASS INVARIANT: min<=max
*/
- char min;
- char max;
+ final int min;
+ final int max;
+ final State to;
- State to;
-
/**
* Constructs a new singleton interval transition.
*
- * @param c transition character
+ * @param c transition codepoint
* @param to destination state
*/
- public Transition(char c, State to) {
+ public Transition(int c, State to) {
+ assert c >= 0;
min = max = c;
this.to = to;
}
@@ -68,9 +69,11 @@
* @param max transition interval maximum
* @param to destination state
*/
- public Transition(char min, char max, State to) {
+ public Transition(int min, int max, State to) {
+ assert min >= 0;
+ assert max >= 0;
if (max < min) {
- char t = max;
+ int t = max;
max = min;
min = t;
}
@@ -80,12 +83,12 @@
}
/** Returns minimum of this transition interval. */
- public char getMin() {
+ public int getMin() {
return min;
}
/** Returns maximum of this transition interval. */
- public char getMax() {
+ public int getMax() {
return max;
}
@@ -134,14 +137,18 @@
}
}
- static void appendCharString(char c, StringBuilder b) {
- if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.append(c);
+ static void appendCharString(int c, StringBuilder b) {
+ if (c >= 0x21 && c <= 0x7e && c != '\\' && c != '"') b.appendCodePoint(c);
else {
- b.append("\\u");
+ b.append("\\\\U");
String s = Integer.toHexString(c);
- if (c < 0x10) b.append("000").append(s);
- else if (c < 0x100) b.append("00").append(s);
- else if (c < 0x1000) b.append("0").append(s);
+ if (c < 0x10) b.append("0000000").append(s);
+ else if (c < 0x100) b.append("000000").append(s);
+ else if (c < 0x1000) b.append("00000").append(s);
+ else if (c < 0x10000) b.append("0000").append(s);
+ else if (c < 0x100000) b.append("000").append(s);
+ else if (c < 0x1000000) b.append("00").append(s);
+ else if (c < 0x10000000) b.append("0").append(s);
else b.append(s);
}
}
@@ -171,4 +178,96 @@
}
b.append("\"]\n");
}
+
+ private static final class CompareByDestThenMinMaxSingle implements Comparator {
+ public int compare(Transition t1, Transition t2) {
+ if (t1.to != t2.to) {
+ if (t1.to.number < t2.to.number) return -1;
+ else if (t1.to.number > t2.to.number) return 1;
+ }
+ if (t1.min < t2.min) return -1;
+ if (t1.min > t2.min) return 1;
+ if (t1.max > t2.max) return -1;
+ if (t1.max < t2.max) return 1;
+ return 0;
+ }
+ }
+
+ public static final Comparator CompareByDestThenMinMax = new CompareByDestThenMinMaxSingle();
+
+ private static final class CompareByMinMaxThenDestSingle implements Comparator {
+ public int compare(Transition t1, Transition t2) {
+ if (t1.min < t2.min) return -1;
+ if (t1.min > t2.min) return 1;
+ if (t1.max > t2.max) return -1;
+ if (t1.max < t2.max) return 1;
+ if (t1.to != t2.to) {
+ if (t1.to.number < t2.to.number) return -1;
+ if (t1.to.number > t2.to.number) return 1;
+ }
+ return 0;
+ }
+ }
+
+ public static final Comparator CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle();
+
+ private static class UTF8InUTF16Order {
+ protected int compareCodePoint(int aByte, int bByte) {
+ if (aByte != bByte) {
+ // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
+
+ // We know the terms are not equal, but, we may
+ // have to carefully fixup the bytes at the
+ // difference to match UTF16's sort order:
+ if (aByte >= 0xee && bByte >= 0xee) {
+ if ((aByte & 0xfe) == 0xee) {
+ aByte += 0x10;
+ }
+ if ((bByte&0xfe) == 0xee) {
+ bByte += 0x10;
+ }
+ }
+ return aByte - bByte;
+ }
+ return 0;
+ }
+ }
+
+ private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator {
+ public int compare(Transition t1, Transition t2) {
+ if (t1.to != t2.to) {
+ if (t1.to == null) return -1;
+ else if (t2.to == null) return 1;
+ else if (t1.to.number < t2.to.number) return -1;
+ else if (t1.to.number > t2.to.number) return 1;
+ }
+ int minComp = compareCodePoint(t1.min, t2.min);
+ if (minComp != 0) return minComp;
+ int maxComp = compareCodePoint(t1.max, t2.max);
+ if (maxComp != 0) return maxComp;
+ return 0;
+ }
+ }
+
+ public static final Comparator CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle();
+
+ private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator {
+ public int compare(Transition t1, Transition t2) {
+ int minComp = compareCodePoint(t1.min, t2.min);
+ if (minComp != 0) return minComp;
+ int maxComp = compareCodePoint(t1.max, t2.max);
+ if (maxComp != 0) return maxComp;
+ if (t1.to != t2.to) {
+ if (t1.to == null) return -1;
+ else if (t2.to == null) return 1;
+ else if (t1.to.number < t2.to.number) return -1;
+ else if (t1.to.number > t2.to.number) return 1;
+ }
+ return 0;
+ }
+ }
+
+ public static final Comparator CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle();
+
+
}
Index: lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/automaton/RunAutomaton.java (working copy)
@@ -30,22 +30,22 @@
package org.apache.lucene.util.automaton;
import java.io.Serializable;
-import java.util.Set;
/**
* Finite-state automaton with fast run operation.
*
* @lucene.experimental
*/
-public final class RunAutomaton implements Serializable {
-
+public abstract class RunAutomaton implements Serializable {
+ final int maxInterval;
final int size;
final boolean[] accept;
final int initial;
final int[] transitions; // delta(state,c) = transitions[state*points.length +
// getCharClass(c)]
- final char[] points; // char interval start points
+ final int[] points; // char interval start points
final int[] classmap; // map from char number to class class
+ final Automaton automaton;
/**
* Returns a string representation of this automaton.
@@ -61,10 +61,10 @@
for (int j = 0; j < points.length; j++) {
int k = transitions[i * points.length + j];
if (k != -1) {
- char min = points[j];
- char max;
- if (j + 1 < points.length) max = (char) (points[j + 1] - 1);
- else max = Character.MAX_VALUE;
+ int min = points[j];
+ int max;
+ if (j + 1 < points.length) max = (points[j + 1] - 1);
+ else max = maxInterval;
b.append(" ");
Transition.appendCharString(min, b);
if (min != max) {
@@ -81,52 +81,59 @@
/**
* Returns number of states in automaton.
*/
- public int getSize() {
+ public final int getSize() {
return size;
}
/**
* Returns acceptance status for given state.
*/
- public boolean isAccept(int state) {
+ public final boolean isAccept(int state) {
return accept[state];
}
/**
* Returns initial state.
*/
- public int getInitialState() {
+ public final int getInitialState() {
return initial;
}
/**
- * Returns array of character class interval start points. The array should
+ * Returns array of codepoint class interval start points. The array should
* not be modified by the caller.
*/
- public char[] getCharIntervals() {
+ public final int[] getCharIntervals() {
return points.clone();
}
/**
- * Gets character class of given char.
+ * Gets the character class of the given codepoint.
*/
- int getCharClass(char c) {
+ final int getCharClass(int c) {
return SpecialOperations.findIndex(c, points);
}
/**
+ * @return the automaton
+ */
+ public Automaton getAutomaton() {
+ return automaton;
+ }
+
+ /**
* Constructs a new RunAutomaton from a deterministic
* Automaton.
*
* @param a an automaton
*/
- public RunAutomaton(Automaton a) {
+ public RunAutomaton(Automaton a, int maxInterval, boolean tableize) {
+ this.maxInterval = maxInterval;
a.determinize();
points = a.getStartPoints();
- Set states = a.getStates();
- Automaton.setStateNumbers(states);
initial = a.initial.number;
- size = states.size();
+ final State[] states = a.getNumberedStates();
+ size = states.length;
accept = new boolean[size];
transitions = new int[size * points.length];
for (int n = 0; n < size * points.length; n++)
@@ -142,12 +149,18 @@
/*
* Set alphabet table for optimal run performance.
*/
- classmap = new int[Character.MAX_VALUE + 1];
- int i = 0;
- for (int j = 0; j <= Character.MAX_VALUE; j++) {
- if (i + 1 < points.length && j == points[i + 1]) i++;
- classmap[j] = i;
+ if (tableize) {
+ classmap = new int[maxInterval + 1];
+ int i = 0;
+ for (int j = 0; j <= maxInterval; j++) {
+ if (i + 1 < points.length && j == points[i + 1])
+ i++;
+ classmap[j] = i;
+ }
+ } else {
+ classmap = null;
}
+ this.automaton = a;
}
/**
@@ -157,54 +170,10 @@
* if a dead state is entered in an equivalent automaton with a total
* transition function.)
*/
- public int step(int state, char c) {
- return transitions[state * points.length + classmap[c]];
+ public final int step(int state, int c) {
+ if (classmap == null)
+ return transitions[state * points.length + getCharClass(c)];
+ else
+ return transitions[state * points.length + classmap[c]];
}
-
- /**
- * Returns true if the given string is accepted by this automaton.
- */
- public boolean run(String s) {
- int p = initial;
- int l = s.length();
- for (int i = 0; i < l; i++) {
- p = step(p, s.charAt(i));
- if (p == -1) return false;
- }
- return accept[p];
- }
-
- /**
- * Returns true if the given string is accepted by this automaton
- */
- public boolean run(char[] s, int offset, int length) {
- int p = initial;
- int l = offset + length;
- for (int i = offset; i < l; i++) {
- p = step(p, s[i]);
- if (p == -1) return false;
- }
- return accept[p];
- }
-
- /**
- * Returns the length of the longest accepted run of the given string starting
- * at the given offset.
- *
- * @param s the string
- * @param offset offset into s where the run starts
- * @return length of the longest accepted run, -1 if no run is accepted
- */
- public int run(String s, int offset) {
- int p = initial;
- int l = s.length();
- int max = -1;
- for (int r = 0; offset <= l; offset++, r++) {
- if (accept[p]) max = r;
- if (offset == l) break;
- p = step(p, s.charAt(offset));
- if (p == -1) break;
- }
- return max;
- }
}
Index: lucene/src/java/org/apache/lucene/util/IntsRef.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/IntsRef.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/IntsRef.java (working copy)
@@ -30,6 +30,10 @@
public IntsRef() {
}
+ public IntsRef(int capacity) {
+ ints = new int[capacity];
+ }
+
public IntsRef(int[] ints, int offset, int length) {
this.ints = ints;
this.offset = offset;
Index: lucene/src/java/org/apache/lucene/util/ArrayUtil.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/ArrayUtil.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/ArrayUtil.java (working copy)
@@ -368,4 +368,57 @@
code = code * 31 + array[i];
return code;
}
+
+
+ // Since Arrays.equals doesn't implement offsets for equals
+ /**
+ * See if two array slices are the same.
+ *
+ * @param left The left array to compare
+ * @param offsetLeft The offset into the left array. Must be positive
+ * @param right The right array to compare
+ * @param offsetRight The offset into the right array. Must be positive
+ * @param length The length of the section of the array to compare
+ * @return true if the two arrays, starting at their respective offsets, are equal
+ *
+ * @see java.util.Arrays#equals(char[], char[])
+ */
+ public static boolean equals(char[] left, int offsetLeft, char[] right, int offsetRight, int length) {
+ if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) {
+ for (int i = 0; i < length; i++) {
+ if (left[offsetLeft + i] != right[offsetRight + i]) {
+ return false;
+ }
+
+ }
+ return true;
+ }
+ return false;
+ }
+
+ // Since Arrays.equals doesn't implement offsets for equals
+ /**
+ * See if two array slices are the same.
+ *
+ * @param left The left array to compare
+ * @param offsetLeft The offset into the left array. Must be positive
+ * @param right The right array to compare
+ * @param offsetRight The offset into the right array. Must be positive
+ * @param length The length of the section of the array to compare
+ * @return true if the two arrays, starting at their respective offsets, are equal
+ *
+ * @see java.util.Arrays#equals(char[], char[])
+ */
+ public static boolean equals(int[] left, int offsetLeft, int[] right, int offsetRight, int length) {
+ if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) {
+ for (int i = 0; i < length; i++) {
+ if (left[offsetLeft + i] != right[offsetRight + i]) {
+ return false;
+ }
+
+ }
+ return true;
+ }
+ return false;
+ }
}
Index: lucene/src/java/org/apache/lucene/util/UnicodeUtil.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 940218)
+++ lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy)
@@ -491,4 +491,92 @@
return true;
}
+
+ // Borrowed from Python's 3.1.2 sources,
+ // Objects/unicodeobject.c, and modified (see commented
+ // out section, and the -1s) to disallow the reserved for
+ // future (RFC 3629) 5/6 byte sequence characters, and
+ // invalid 0xFE and 0xFF bytes.
+
+ /* Map UTF-8 encoded prefix byte to sequence length. -1 (0xFF)
+ * means illegal prefix. see RFC 2279 for details */
+ // Indexed by the lead byte treated as an unsigned value (0..255).
+ // Continuation bytes (0x80-0xBF) are also -1: they are never legal
+ // as the first byte of a sequence.
+ static byte[] utf8CodeLength = new byte[] {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00-0x7F: ASCII, 1 byte
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x80-0xBF: continuation bytes, illegal as lead
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0-0xDF: 2-byte sequences
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0-0xEF: 3-byte sequences
+ 4, 4, 4, 4, 4, 4, 4, 4 //, 5, 5, 5, 5, 6, 6, 0, 0 -- 0xF0-0xF7: 4-byte; 5/6-byte forms disallowed per RFC 3629
+ };
+
+
+ /** Returns the number of code points in this utf8
+ * sequence. Behavior is undefined if the utf8 sequence
+ * is invalid.*/
+ public static final int codePointCount(BytesRef utf8) {
+ int upto = utf8.offset;
+ final int limit = utf8.offset + utf8.length;
+ final byte[] bytes = utf8.bytes;
+ int codePointCount = 0;
+ while (upto < limit) {
+ codePointCount++;
+ // & 0xFF widens the signed byte to an unsigned table index; the
+ // table entry is the full sequence length, so each iteration skips
+ // one whole code point. An illegal lead byte yields -1 (undefined
+ // behavior, as documented above).
+ upto += utf8CodeLength[bytes[upto]&0xFF];
+ }
+ return codePointCount;
+ }
+
+ /** Decodes a UTF-8 byte sequence into UTF-32 code points, writing
+ * the result into {@code utf32}: its int[] is (re)allocated as
+ * needed, its offset is set to 0 and its length to the number of
+ * code points decoded. Throws IllegalStateException on a lead byte
+ * whose table length is not 1-4; other malformed input is not
+ * detected. */
+ public static void UTF8toUTF32(final BytesRef utf8, final IntsRef utf32) {
+ // pre-alloc for worst case
+ // (every code point consumes at least one byte, so utf8.length ints suffice)
+ if (utf32.ints == null || utf32.ints.length < utf8.length) {
+ utf32.ints = new int[utf8.length];
+ }
+ int utf32Count = 0;
+ int utf8Upto = utf8.offset;
+ final int[] ints = utf32.ints;
+ final byte[] bytes = utf8.bytes;
+ final int utf8Limit = utf8.offset + utf8.length;
+ while(utf8Upto < utf8Limit) {
+ // & 0xFF treats the lead byte as unsigned when indexing the length table
+ final int numBytes = utf8CodeLength[bytes[utf8Upto]&0xFF];
+ int v = 0;
+ switch(numBytes) {
+ case 1:
+ // ASCII: the byte is the code point (high bit clear, so no sign-extension issue)
+ ints[utf32Count++] = bytes[utf8Upto++];
+ continue;
+ case 2:
+ // 5 useful bits
+ v = bytes[utf8Upto++] & 31;
+ break;
+ case 3:
+ // 4 useful bits
+ v = bytes[utf8Upto++] & 15;
+ break;
+ case 4:
+ // 3 useful bits
+ v = bytes[utf8Upto++] & 7;
+ break;
+ default :
+ // -1 from the table (continuation byte, 0xFE/0xFF, or 5/6-byte form)
+ throw new IllegalStateException("invalid utf8");
+ }
+
+ // utf8Upto already points past the lead byte, so this consumes the
+ // remaining numBytes-1 continuation bytes, 6 payload bits each.
+ final int limit = utf8Upto + numBytes-1;
+
+ while(utf8Upto < limit) {
+ v = v << 6 | bytes[utf8Upto++]&63;
+ }
+ ints[utf32Count++] = v;
+ }
+
+ utf32.offset = 0;
+ utf32.length = utf32Count;
+ }
}
Index: lucene/LICENSE.txt
===================================================================
--- lucene/LICENSE.txt (revision 940218)
+++ lucene/LICENSE.txt (working copy)
@@ -237,6 +237,12 @@
http://www.python.org/download/releases/2.4.2/license/
+Some code in src/java/org/apache/lucene/util/UnicodeUtil.java was
+derived from Python 3.1.2 sources available at
+http://www.python.org. Full license is here:
+
+ http://www.python.org/download/releases/3.1.2/license/
+
Some code in src/java/org/apache/lucene/util/automaton was
derived from Brics automaton sources available at
www.brics.dk/automaton/. Here is the copyright from those sources: