+ * true by default.
+ */
+ public void setPreservePositionIncrements(boolean enablePositionIncrements) {
+ this.preservePositionIncrements = enablePositionIncrements;
+ }
+
+ private static class Position implements RollingBuffer.Resettable {
+ // Any tokens that ended at our position arrive to this state:
+ State arriving;
+
+ // Any tokens that start at our position leave from this state:
+ State leaving;
+
+ @Override
+ public void reset() {
+ arriving = null;
+ leaving = null;
+ }
+ }
+
+ private static class Positions extends RollingBuffer<Position> {
+ @Override
+ protected Position newInstance() {
+ return new Position();
+ }
+ }
* NOTE: This suggester does not boost suggestions that
@@ -177,6 +178,10 @@
// to be log weights or something ...
Automaton levA = toLevenshteinAutomata(lookupAutomaton);
+ if (isUnicodeAware()) {
+ levA = new UTF32ToUTF8().convert(levA);
+ BasicOperations.determinize(levA);
+ }
/*
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
w.write(levA.toDot());
@@ -203,7 +208,7 @@
// to allow the trailing dedup bytes to be
// edited... but then 0 byte is "in general" allowed
// on input (but not in UTF8).
- LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
+ LevenshteinAutomata lev = new LevenshteinAutomata(ints, isUnicodeAware() ? Character.MAX_CODE_POINT : 255, transpositions);
Automaton levAutomaton = lev.toAutomaton(maxEdits);
Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
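Taken together, the two FuzzySuggester hunks above move fuzzy matching from UTF-8 bytes to Unicode code points: the Levenshtein automaton is built over an alphabet capped at Character.MAX_CODE_POINT instead of 255, then lowered back to UTF-8 transitions (UTF32ToUTF8 + determinize) so it can still intersect the byte-based suggest FST. A minimal standalone sketch of that pipeline, using only the Lucene 4.x automaton calls already present in the hunks (the class name and sample query are illustrative):

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

public class UnicodeAwareLevDemo {
  public static void main(String[] args) {
    String query = "люси";
    // Decode the query into code points: one automaton symbol per letter.
    int[] ints = new int[query.codePointCount(0, query.length())];
    for (int i = 0, j = 0; i < query.length(); j++) {
      ints[j] = query.codePointAt(i);
      i += Character.charCount(ints[j]);
    }
    // alphaMax = Character.MAX_CODE_POINT: one edit now costs one letter, not one byte.
    LevenshteinAutomata lev = new LevenshteinAutomata(ints, Character.MAX_CODE_POINT, true);
    Automaton levA = lev.toAutomaton(1);
    // Lower back to UTF-8 byte transitions for intersection with the FST.
    Automaton utf8 = new UTF32ToUTF8().convert(levA);
    BasicOperations.determinize(utf8);
    System.out.println("deterministic states: " + utf8.getNumberOfStates());
  }
}

With the old alphaMax of 255 the automaton edited raw bytes, so replacing one two-byte Cyrillic letter consumed two of the allowed edits; over code points it costs one.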
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (date 1372194687000)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (date 1372232814000)
@@ -24,7 +24,6 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
-import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -48,8 +47,6 @@
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.TermFreq;
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
@@ -594,7 +591,7 @@
}
}
- private static char SEP = '\uFFFF';
+ private static char SEP = '\u001F';
public void testRandom() throws Exception {
@@ -615,6 +612,7 @@
}
boolean preserveSep = random().nextBoolean();
+ boolean unicodeAware = random().nextBoolean();
final int numStopChars = random().nextInt(10);
final boolean preserveHoles = random().nextBoolean();
@@ -641,7 +639,7 @@
if (token > 0) {
key += " ";
}
- if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != SEP) {
+ if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointBefore(analyzedKey.length()) != 0x1F : analyzedKey.charAt(analyzedKey.length()-1) != SEP)) {
analyzedKey += SEP;
}
key += s;
@@ -702,8 +700,14 @@
}
Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
- AnalyzingSuggester suggester = new AnalyzingSuggester(a, a,
- preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1);
+ int options = 0;
+ if (preserveSep) {
+ options |= AnalyzingSuggester.PRESERVE_SEP;
+ }
+ if (unicodeAware) {
+ options |= AnalyzingSuggester.UNICODE_AWARE;
+ }
+ AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, options, 256, -1);
if (doPayloads) {
suggester.build(new TermFreqPayloadArrayIterator(payloadKeys));
} else {
@@ -838,7 +842,7 @@
int tokenStreamCounter = 0;
final TokenStream[] tokenStreams = new TokenStream[] {
new CannedBinaryTokenStream(new BinaryToken[] {
- token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
+ token(new BytesRef(new byte[] {0x61, (byte) 0x1F, 0x61})),
}),
new CannedTokenStream(new Token[] {
token("a",1,1),
@@ -849,7 +853,7 @@
token("a",1,1)
}),
new CannedBinaryTokenStream(new BinaryToken[] {
- token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
+ token(new BytesRef(new byte[] {0x61, (byte) 0x1F, 0x61})),
})
};
@@ -1192,5 +1196,19 @@
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, 1);
suggester.build(new TermFreqArrayIterator(new TermFreq[] {new TermFreq("a", 1)}));
assertEquals("[a/1]", suggester.lookup("a", false, 1).toString());
+ }
+
+ public void testIllegalLookupArgument() throws Exception {
+ Analyzer a = new MockAnalyzer(random());
+ AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.UNICODE_AWARE, 256, -1);
+ suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+ new TermFreq("а где Люси?", 7),
+ }));
+ try {
+ suggester.lookup("а\u001E", false, 3);
+ fail("should throw IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected
+ }
}
}
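The two new tests pin down the user-visible contract: UNICODE_AWARE is just another bit in the options mask, and lookup keys must not contain the control characters the suggester reserves internally (U+001F as the position separator, U+001E for holes). A hedged usage sketch, written as it would sit inside a LuceneTestCase since it borrows MockAnalyzer and the constructor arity from the tests above:

Analyzer a = new MockAnalyzer(random());
int options = AnalyzingSuggester.PRESERVE_SEP | AnalyzingSuggester.UNICODE_AWARE;
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, options, 256, -1);
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
    new TermFreq("а где Люси?", 7),
}));
// Keys containing U+001E/U+001F are rejected with IllegalArgumentException
// (see testIllegalLookupArgument above).
List<LookupResult> results = suggester.lookup("а где", false, 3);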
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (date 1372194687000)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (date 1372232814000)
@@ -37,6 +37,7 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.TokenStreamToUnicodeAutomaton;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -48,7 +49,9 @@
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.UTF32ToUTF8;
import org.apache.lucene.util.fst.Util;
public class FuzzySuggesterTest extends LuceneTestCase {
@@ -60,7 +63,9 @@
keys.add(new TermFreq("boo" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
}
keys.add(new TermFreq("foo bar boo far", 12));
- FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
+ MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
+ FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
+ 0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH);
suggester.build(new TermFreqArrayIterator(keys));
int numIters = atLeast(10);
for (int i = 0; i < numIters; i++) {
@@ -72,6 +77,27 @@
}
}
+ public void testNonLatinRandomEdits() throws IOException {
+ List<TermFreq> keys = new ArrayList<TermFreq>();
+ /**
+ * If <code>true</code>, maxEdits, minFuzzyLength and nonFuzzyPrefix
+ * will be measured in Unicode code points (actual letters) instead of bytes.
+ */
+ public static final String UNICODE_AWARE = "unicodeAware";
+
+ /**
* When multiple suggestions collide to the same analyzed form, this is the limit of
* how many unique surface forms we keep.
*/
@@ -91,12 +97,19 @@
? Boolean.valueOf(params.get(PRESERVE_SEP).toString())
: true;
+ boolean unicodeAware = params.get(UNICODE_AWARE) != null
+ ? Boolean.valueOf(params.get(UNICODE_AWARE).toString())
+ : false;
+
int flags = 0;
if (exactMatchFirst) {
flags |= AnalyzingSuggester.EXACT_FIRST;
}
if (preserveSep) {
flags |= AnalyzingSuggester.PRESERVE_SEP;
+ }
+ if (unicodeAware) {
+ flags |= AnalyzingSuggester.UNICODE_AWARE;
}
int maxSurfaceFormsPerAnalyzedForm = params.get(MAX_SURFACE_FORMS) != null
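A hedged sketch of the factory-side plumbing: the NamedList below stands in for the <lst> Solr hands the factory from solrconfig.xml (hand-built here purely for illustration), and the resolution mirrors the hunk above, with an absent key defaulting to false:

NamedList<Object> params = new NamedList<Object>();
params.add(AnalyzingLookupFactory.UNICODE_AWARE, "true"); // key string is "unicodeAware"

boolean unicodeAware = params.get(AnalyzingLookupFactory.UNICODE_AWARE) != null
    ? Boolean.valueOf(params.get(AnalyzingLookupFactory.UNICODE_AWARE).toString())
    : false;

int flags = 0;
if (unicodeAware) {
  flags |= AnalyzingSuggester.UNICODE_AWARE;
}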
Index: solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java (date 1372194687000)
+++ solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java (date 1372232814000)
@@ -80,12 +80,19 @@
? Boolean.valueOf(params.get(AnalyzingLookupFactory.PRESERVE_SEP).toString())
: true;
+ boolean unicodeAware = (params.get(AnalyzingLookupFactory.UNICODE_AWARE) != null)
+ ? Boolean.valueOf(params.get(AnalyzingLookupFactory.UNICODE_AWARE).toString())
+ : false;
+
int options = 0;
if (exactMatchFirst) {
options |= FuzzySuggester.EXACT_FIRST;
}
if (preserveSep) {
options |= FuzzySuggester.PRESERVE_SEP;
+ }
+ if (unicodeAware) {
+ options |= FuzzySuggester.UNICODE_AWARE;
}
int maxSurfaceFormsPerAnalyzedForm = (params.get(AnalyzingLookupFactory.MAX_SURFACE_FORMS) != null)
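Outside the patch proper, a short illustration of why byte-measured distances shortchange non-Latin scripts, which is what UNICODE_AWARE corrects per the javadoc above: each Cyrillic letter occupies two UTF-8 bytes, so a single-letter substitution costs two byte edits but only one code-point edit.

import java.nio.charset.Charset;

String typed = "кирил"; // illustrative user input
byte[] utf8 = typed.getBytes(Charset.forName("UTF-8"));
System.out.println(typed.codePointCount(0, typed.length())); // 5 letters
System.out.println(utf8.length);                             // 10 bytes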