Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision 1498685) +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (working copy) @@ -32,7 +32,8 @@ // TODO: maybe also toFST? then we can translate atts into FST outputs/weights /** Consumes a TokenStream and creates an {@link Automaton} - * where the transition labels are UTF8 bytes from the {@link + * where the transition labels are UTF8 bytes (or Unicode + * code points if unicodeArcs is true) from the {@link * TermToBytesRefAttribute}. Between tokens we insert * POS_SEP and for holes we insert HOLE. * @@ -40,6 +41,7 @@ public class TokenStreamToAutomaton { private boolean preservePositionIncrements; + private boolean unicodeArcs; /** Sole constructor. */ public TokenStreamToAutomaton() { @@ -51,6 +53,12 @@ this.preservePositionIncrements = enablePositionIncrements; } + /** Whether to make transition labels Unicode code points instead of UTF8 bytes, + * false by default */ + public void setUnicodeArcs(boolean unicodeArcs) { + this.unicodeArcs = unicodeArcs; + } + private static class Position implements RollingBuffer.Resettable { // Any tokens that ended at our position arrive to this state: State arriving; @@ -80,15 +88,16 @@ } /** We create transition between two adjacent tokens. */ - public static final int POS_SEP = 256; + public static final int POS_SEP = 0x001f; /** We add this arc to represent a hole. */ - public static final int HOLE = 257; + public static final int HOLE = 0x001e; /** Pulls the graph (including {@link * PositionLengthAttribute}) from the provided {@link * TokenStream}, and creates the corresponding - * automaton where arcs are bytes from each term. */ + * automaton where arcs are bytes (or Unicode code points + * if unicodeArcs = true) from each term. 
*/ public Automaton toAutomaton(TokenStream in) throws IOException { final Automaton a = new Automaton(); boolean deterministic = true; @@ -156,16 +165,34 @@ final int endPos = pos + posLengthAtt.getPositionLength(); termBytesAtt.fillBytesRef(); - final BytesRef term2 = changeToken(term); + final BytesRef termUTF8 = changeToken(term); + int[] termUnicode = null; final Position endPosData = positions.get(endPos); if (endPosData.arriving == null) { endPosData.arriving = new State(); } State state = posData.leaving; - for(int byteIDX=0;byteIDX keys = new ArrayList(); + int numTerms = atLeast(100); + for (int i = 0; i < numTerms; i++) { + keys.add(new TermFreq("буу" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100))); + } + keys.add(new TermFreq("фуу бар буу фар", 12)); + MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false); + FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP | FuzzySuggester.FUZZY_UNICODE_AWARE, 256, -1, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS, + 0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH); + suggester.build(new TermFreqArrayIterator(keys)); + int numIters = atLeast(10); + for (int i = 0; i < numIters; i++) { + String addRandomEdit = addRandomEdit("фуу бар буу", 0); + List results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2); + assertEquals(addRandomEdit, 1, results.size()); + assertEquals("фуу бар буу фар", results.get(0).key.toString()); + assertEquals(12, results.get(0).value, 0.01F); + } + } + /** this is basically the WFST test ported to KeywordAnalyzer. 
so it acts the same */ public void testKeyword() throws Exception { TermFreq keys[] = new TermFreq[] { @@ -580,12 +605,13 @@ TermFreq[] keys = new TermFreq[numQueries]; boolean preserveSep = random().nextBoolean(); + boolean unicodeAware = random().nextBoolean(); final int numStopChars = random().nextInt(10); final boolean preserveHoles = random().nextBoolean(); if (VERBOSE) { - System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles); + System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " ; unicodeAware=" + unicodeAware + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles); } for (int i = 0; i < numQueries; i++) { @@ -606,7 +632,7 @@ if (token > 0) { key += " "; } - if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != ' ') { + if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointAt(analyzedKey.codePointCount(0, analyzedKey.length())-1) != ' ' : analyzedKey.charAt(analyzedKey.length()-1) != ' ')) { analyzedKey += " "; } key += s; @@ -658,8 +684,14 @@ } Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles); - FuzzySuggester suggester = new FuzzySuggester(a, a, - preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3); + int options = 0; + if (preserveSep) { + options |= AnalyzingSuggester.PRESERVE_SEP; + } + if (unicodeAware) { + options |= AnalyzingSuggester.FUZZY_UNICODE_AWARE; + } + FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, false, 1, 3); suggester.build(new TermFreqArrayIterator(keys)); for (String prefix : allPrefixes) { @@ -729,6 +761,10 @@ // suggester.toLevA ...) ... 
but testRandom2() fixes // this: Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey)); + if (unicodeAware) { + automaton = new UTF32ToUTF8().convert(automaton); + BasicOperations.determinize(automaton); + } assertTrue(automaton.isDeterministic()); // TODO: could be faster... but its slowCompletor for a reason BytesRef spare = new BytesRef(); @@ -878,7 +914,8 @@ // NOTE: can only use ascii here so that, in // UTF8 byte space it's still a single // insertion: - int x = random().nextInt(128); + // bytes 0x1e and 0x1f are reserved + int x = random().nextBoolean() ? random().nextInt(30) : 32 + random().nextInt(128 - 32); builder.append((char) x); for (int j = i; j < input.length; j++) { builder.append(input[j]); Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (revision 1498685) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (working copy) @@ -24,7 +24,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.Reader; -import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -48,8 +47,6 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; -import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.search.suggest.TermFreq; import org.apache.lucene.search.suggest.TermFreqArrayIterator; @@ -594,7 +591,7 @@ } } - private static char SEP = '\uFFFF'; + private static char SEP = 
'\u001F'; public void testRandom() throws Exception { @@ -615,12 +612,13 @@ } boolean preserveSep = random().nextBoolean(); + boolean fuzzyUnicodeAware = random().nextBoolean(); final int numStopChars = random().nextInt(10); final boolean preserveHoles = random().nextBoolean(); if (VERBOSE) { - System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles); + System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + "; fuzzyUnicodeAware=" + fuzzyUnicodeAware + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles); } for (int i = 0; i < numQueries; i++) { @@ -702,8 +700,14 @@ } Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles); - AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, - preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1); + int options = 0; + if (preserveSep) { + options |= AnalyzingSuggester.PRESERVE_SEP; + } + if (fuzzyUnicodeAware) { + options |= AnalyzingSuggester.FUZZY_UNICODE_AWARE; + } + AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, options, 256, -1); if (doPayloads) { suggester.build(new TermFreqPayloadArrayIterator(payloadKeys)); } else { @@ -822,70 +826,6 @@ } } - public void testStolenBytes() throws Exception { - - // First time w/ preserveSep, second time without: - for(int i=0;i<2;i++) { - - final Analyzer analyzer = new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); - - // TokenStream stream = new SynonymFilter(tokenizer, map, true); - // return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream)); - return new TokenStreamComponents(tokenizer) { - int tokenStreamCounter = 0; - final TokenStream[] tokenStreams = new TokenStream[] { - new CannedBinaryTokenStream(new BinaryToken[] { - token(new 
BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})), - }), - new CannedTokenStream(new Token[] { - token("a",1,1), - token("a",1,1) - }), - new CannedTokenStream(new Token[] { - token("a",1,1), - token("a",1,1) - }), - new CannedBinaryTokenStream(new BinaryToken[] { - token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})), - }) - }; - - @Override - public TokenStream getTokenStream() { - TokenStream result = tokenStreams[tokenStreamCounter]; - tokenStreamCounter++; - return result; - } - - @Override - protected void setReader(final Reader reader) throws IOException { - } - }; - } - }; - - TermFreq keys[] = new TermFreq[] { - new TermFreq("a a", 50), - new TermFreq("a b", 50), - }; - - AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer, analyzer, AnalyzingSuggester.EXACT_FIRST | (i==0 ? AnalyzingSuggester.PRESERVE_SEP : 0), 256, -1); - suggester.build(new TermFreqArrayIterator(keys)); - List results = suggester.lookup("a a", false, 5); - assertEquals(1, results.size()); - assertEquals("a b", results.get(0).key); - assertEquals(50, results.get(0).value); - - results = suggester.lookup("a a", false, 5); - assertEquals(1, results.size()); - assertEquals("a a", results.get(0).key); - assertEquals(50, results.get(0).value); - } - } - public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception { Analyzer a = new MockAnalyzer(random()); AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 2, -1); @@ -1193,4 +1133,24 @@ suggester.build(new TermFreqArrayIterator(new TermFreq[] {new TermFreq("a", 1)})); assertEquals("[a/1]", suggester.lookup("a", false, 1).toString()); } + + public void testIllegalLookupArgument() throws Exception { + Analyzer a = new MockAnalyzer(random()); + AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.FUZZY_UNICODE_AWARE, 256, -1); + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("а где Люси?", 7), + })); + try { + suggester.lookup("а\u001E", false, 3); + fail("should 
throw IllegalArgumentException"); + } catch (IllegalArgumentException e) { + System.out.println(e.getMessage()); + } + try { + suggester.lookup("а\u001F", false, 3); + fail("should throw IllegalArgumentException"); + } catch (IllegalArgumentException e) { + System.out.println(e.getMessage()); + } + } } Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1498685) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy) @@ -53,6 +53,7 @@ import org.apache.lucene.util.automaton.SpecialOperations; import org.apache.lucene.util.automaton.State; import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.UTF32ToUTF8; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST.BytesReader; @@ -146,6 +147,12 @@ */ private final boolean preserveSep; + /** + * True if {@link FuzzySuggester} operates on Unicode code + * points instead of bytes. + */ + private final boolean fuzzyUnicodeAware; + /** Include this flag in the options parameter to {@link * #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)} to always * return the exact match first, regardless of score. This @@ -158,9 +165,17 @@ * token separators when matching. */ public static final int PRESERVE_SEP = 2; + /** Include this flag in the options parameter to {@link + * FuzzySuggester#FuzzySuggester(org.apache.lucene.analysis.Analyzer, + * org.apache.lucene.analysis.Analyzer, int, int, int, int, boolean, int, int)} + * to measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix + * parameters in Unicode code points (actual letters) + * instead of bytes.
*/ + public static final int FUZZY_UNICODE_AWARE = 4; + /** Represents the separation between tokens, if * PRESERVE_SEP was specified */ - private static final int SEP_LABEL = 0xff; + private static final int SEP_LABEL = '\u001F'; /** Marks end of the analyzed input and start of dedup * byte. */ @@ -213,7 +228,8 @@ * analyzing suggestions while building the index. * @param queryAnalyzer Analyzer that will be used for * analyzing query text during lookup - * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} + * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}, + * {@link #FUZZY_UNICODE_AWARE} * @param maxSurfaceFormsPerAnalyzedForm Maximum number of * surface forms to keep for a single analyzed form. * When there are too many surface forms we discard the @@ -225,11 +241,12 @@ public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions) { this.indexAnalyzer = indexAnalyzer; this.queryAnalyzer = queryAnalyzer; - if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) { - throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options); + if ((options & ~(EXACT_FIRST | PRESERVE_SEP | FUZZY_UNICODE_AWARE)) != 0) { + throw new IllegalArgumentException("options should only contain EXACT_FIRST, PRESERVE_SEP and FUZZY_UNICODE_AWARE; got " + options); } this.exactFirst = (options & EXACT_FIRST) != 0; this.preserveSep = (options & PRESERVE_SEP) != 0; + this.fuzzyUnicodeAware = (options & FUZZY_UNICODE_AWARE) != 0; // NOTE: this is just an implementation limitation; if // somehow this is a problem we could fix it by using @@ -307,45 +324,10 @@ } } - /** Just escapes the 0xff byte (which we still for SEP). 
*/ - private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { - - final BytesRef spare = new BytesRef(); - - @Override - protected BytesRef changeToken(BytesRef in) { - int upto = 0; - for(int i=0;i results = new ArrayList(); - List>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst); + List>> prefixPaths = FSTUtil.intersectPrefixPaths(this.fuzzyUnicodeAware ? utf8lookupAutomaton : lookupAutomaton, fst); if (exactFirst) { @@ -866,6 +861,11 @@ replaceSep(automaton); + if (fuzzyUnicodeAware) { + automaton = new UTF32ToUTF8().convert(automaton); + BasicOperations.determinize(automaton); + } + assert SpecialOperations.isFinite(automaton); // Get all paths from the automaton (there can be @@ -899,7 +899,13 @@ return automaton; } - + /** + * Returns the fuzzyUnicodeAware option + * @return the fuzzyUnicodeAware option + */ + public boolean isFuzzyUnicodeAware() { + return fuzzyUnicodeAware; + } /** * Returns the weight associated with an input string, Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (revision 1498685) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (working copy) @@ -15,10 +15,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -import java.io.FileOutputStream; + import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; import java.util.Arrays; import java.util.List; import java.util.Set; @@ -33,6 +31,7 @@ import org.apache.lucene.util.automaton.BasicOperations; import org.apache.lucene.util.automaton.LevenshteinAutomata; import org.apache.lucene.util.automaton.SpecialOperations; +import org.apache.lucene.util.automaton.UTF32ToUTF8; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PairOutputs.Pair; @@ -54,6 +53,8 @@ * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be * edited. We allow up to 1 (@link * #DEFAULT_MAX_EDITS} edit. + * If the {@link #FUZZY_UNICODE_AWARE} option is set, maxEdits, minFuzzyLength, transpositions + * and nonFuzzyPrefix are measured in Unicode code points (actual letters) instead of bytes. * *

* NOTE: This suggester does not boost suggestions that @@ -177,6 +178,10 @@ // to be log weights or something ... Automaton levA = toLevenshteinAutomata(lookupAutomaton); + if (isFuzzyUnicodeAware()) { + levA = new UTF32ToUTF8().convert(levA); + BasicOperations.determinize(levA); + } /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); w.write(levA.toDot()); @@ -203,7 +208,7 @@ // to allow the trailing dedup bytes to be // edited... but then 0 byte is "in general" allowed // on input (but not in UTF8). - LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions); + LevenshteinAutomata lev = new LevenshteinAutomata(ints, isFuzzyUnicodeAware() ? Character.MAX_CODE_POINT : 255, transpositions); Automaton levAutomaton = lev.toAutomaton(maxEdits); Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton)); combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already Index: solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java (revision 1498685) +++ solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java (working copy) @@ -43,6 +43,13 @@ public static final String PRESERVE_SEP = "preserveSep"; /** + * If true, maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix in + * {@link FuzzyLookupFactory} will be measured in Unicode code points (actual letters) + * instead of bytes. + */ + public static final String FUZZY_UNICODE_AWARE = "fuzzyUnicodeAware"; + + /** * When multiple suggestions collide to the same analyzed form, this is the limit of * how many unique surface forms we keep. */ @@ -91,6 +98,10 @@ ? 
Boolean.valueOf(params.get(PRESERVE_SEP).toString()) : true; + boolean fuzzyUnicodeAware = params.get(FUZZY_UNICODE_AWARE) != null + ? Boolean.valueOf(params.get(FUZZY_UNICODE_AWARE).toString()) + : false; + int flags = 0; if (exactMatchFirst) { flags |= AnalyzingSuggester.EXACT_FIRST; @@ -98,6 +109,9 @@ if (preserveSep) { flags |= AnalyzingSuggester.PRESERVE_SEP; } + if (fuzzyUnicodeAware) { + flags |= AnalyzingSuggester.FUZZY_UNICODE_AWARE; + } int maxSurfaceFormsPerAnalyzedForm = params.get(MAX_SURFACE_FORMS) != null ? Integer.parseInt(params.get(MAX_SURFACE_FORMS).toString()) Index: solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java (revision 1498685) +++ solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java (working copy) @@ -34,22 +34,26 @@ /** * Maximum number of edits allowed, used by {@link LevenshteinAutomata#toAutomaton(int)} + * in bytes or Unicode code points (if fuzzyUnicodeAware option is set to true). */ public static final String MAX_EDITS = "maxEdits"; /** * If transpositions are allowed, Fuzzy suggestions will be computed based on a primitive * edit operation. If it is false, it will be based on the classic Levenshtein algorithm. + * Transpositions of bytes or Unicode code points (if fuzzyUnicodeAware option is set to true). */ public static final String TRANSPOSITIONS = "transpositions"; /** * Length of common (non-fuzzy) prefix for the suggestions + * in bytes or Unicode code points (if fuzzyUnicodeAware option is set to true). */ public static final String NON_FUZZY_PREFIX = "nonFuzzyPrefix"; /** * Minimum length of lookup key before any edits are allowed for the suggestions + * in bytes or Unicode code points (if fuzzyUnicodeAware option is set to true). */ public static final String MIN_FUZZY_LENGTH = "minFuzzyLength"; @@ -80,6 +84,10 @@ ?
Boolean.valueOf(params.get(AnalyzingLookupFactory.PRESERVE_SEP).toString()) : true; + boolean fuzzyUnicodeAware = (params.get(AnalyzingLookupFactory.FUZZY_UNICODE_AWARE) != null) + ? Boolean.valueOf(params.get(AnalyzingLookupFactory.FUZZY_UNICODE_AWARE).toString()) + : false; + int options = 0; if (exactMatchFirst) { options |= FuzzySuggester.EXACT_FIRST; @@ -87,6 +95,9 @@ if (preserveSep) { options |= FuzzySuggester.PRESERVE_SEP; } + if (fuzzyUnicodeAware) { + options |= FuzzySuggester.FUZZY_UNICODE_AWARE; + } int maxSurfaceFormsPerAnalyzedForm = (params.get(AnalyzingLookupFactory.MAX_SURFACE_FORMS) != null) ? Integer.parseInt(params.get(AnalyzingLookupFactory.MAX_SURFACE_FORMS).toString())