Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (date 1372194687000)
+++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (date 1372232814000)
@@ -80,10 +80,10 @@
   }
 
   /** We create transition between two adjacent tokens. */
-  public static final int POS_SEP = 256;
+  public static final int POS_SEP = 0x001f;
 
   /** We add this arc to represent a hole. */
-  public static final int HOLE = 257;
+  public static final int HOLE = 0x001e;
 
   /** Pulls the graph (including {@link
    *  PositionLengthAttribute}) from the provided {@link
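A note on why these particular values (a reading of the change, not stated in the patch): 256 and 257 were out-of-band only for a byte-labeled automaton; in a code-point-labeled automaton they would be indistinguishable from the real characters U+0100 and U+0101 once converted to UTF-8. U+001F (unit separator) and U+001E (record separator) are C0 control characters that virtually never occur in analyzed text and encode to a single UTF-8 byte each, so the same sentinels work in code point space and in byte space. A stand-alone check of that last property (plain JDK, illustrative only, not part of the patch):

    import java.nio.charset.StandardCharsets;

    public class SentinelCheck {
      public static void main(String[] args) {
        for (int sentinel : new int[] {0x001f, 0x001e}) {
          // Each sentinel stays a single byte after UTF-8 encoding, so it can
          // pass through a UTF32-to-UTF8 automaton conversion without being
          // split across multiple byte labels:
          byte[] utf8 = new String(Character.toChars(sentinel)).getBytes(StandardCharsets.UTF_8);
          System.out.println(String.format("U+%04X -> %d byte(s)", sentinel, utf8.length));  // 1 byte each
        }
      }
    }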
Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java (date 1372232814000)
+++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToUnicodeAutomaton.java (date 1372232814000)
@@ -0,0 +1,247 @@
+package org.apache.lucene.analysis;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.RollingBuffer;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.Transition;
+
+import java.io.IOException;
+
+// TODO: maybe also toFST?  then we can translate atts into FST outputs/weights
+
+/**
+ * Consumes a TokenStream and creates an {@link org.apache.lucene.util.automaton.Automaton}
+ * where the transition labels are Unicode code points from the {@link
+ * org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute}.  Between tokens we insert
+ * POS_SEP and for holes we insert HOLE.
+ *
+ * @lucene.experimental
+ */
+public class TokenStreamToUnicodeAutomaton {
+
+  private boolean preservePositionIncrements;
+
+  /**
+   * Sole constructor.
+   */
+  public TokenStreamToUnicodeAutomaton() {
+    this.preservePositionIncrements = true;
+  }
+
+  /**
+   * Whether to generate holes in the automaton for missing positions, true by default.
+   */
+  public void setPreservePositionIncrements(boolean enablePositionIncrements) {
+    this.preservePositionIncrements = enablePositionIncrements;
+  }
+
+  private static class Position implements RollingBuffer.Resettable {
+    // Any tokens that ended at our position arrive to this state:
+    State arriving;
+
+    // Any tokens that start at our position leave from this state:
+    State leaving;
+
+    @Override
+    public void reset() {
+      arriving = null;
+      leaving = null;
+    }
+  }
+
+  private static class Positions extends RollingBuffer<Position> {
+    @Override
+    protected Position newInstance() {
+      return new Position();
+    }
+  }
+
+  /**
+   * Subclass & implement this if you need to change the
+   * token (such as escaping certain bytes) before it's
+   * turned into a graph.
+   */
+  protected BytesRef changeToken(BytesRef in) {
+    return in;
+  }
+
+  /**
+   * Pulls the graph (including {@link
+   * org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute}) from the provided {@link
+   * org.apache.lucene.analysis.TokenStream}, and creates the corresponding
+   * automaton where arcs are Unicode code points from each term.
+   */
+  public Automaton toAutomaton(TokenStream in) throws IOException {
+    final Automaton a = new Automaton();
+    boolean deterministic = true;
+
+    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
+    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
+    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+    final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
+
+    final BytesRef term = termBytesAtt.getBytesRef();
+
+    in.reset();
+
+    // Only temporarily holds states ahead of our current
+    // position:
+    final RollingBuffer<Position> positions = new Positions();
+
+    int pos = -1;
+    Position posData = null;
+    int maxOffset = 0;
+    while (in.incrementToken()) {
+      int posInc = posIncAtt.getPositionIncrement();
+      if (!preservePositionIncrements && posInc > 1) {
+        posInc = 1;
+      }
+      assert pos > -1 || posInc > 0;
+
+      if (posInc > 0) {
+
+        // New node:
+        pos += posInc;
+
+        posData = positions.get(pos);
+        assert posData.leaving == null;
+
+        if (posData.arriving == null) {
+          // No token ever arrived to this position
+          if (pos == 0) {
+            // OK: this is the first token
+            posData.leaving = a.getInitialState();
+          } else {
+            // This means there's a hole (eg, StopFilter
+            // does this):
+            posData.leaving = new State();
+            addHoles(a.getInitialState(), positions, pos);
+          }
+        } else {
+          posData.leaving = new State();
+          posData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, posData.leaving));
+          if (posInc > 1) {
+            // A token spanned over a hole; add holes
+            // "under" it:
+            addHoles(a.getInitialState(), positions, pos);
+          }
+        }
+        positions.freeBefore(pos);
+      } else {
+        // note: this isn't necessarily true. its just that we aren't surely det.
+        // we could optimize this further (e.g. buffer and sort synonyms at a position)
+        // but thats probably overkill. this is cheap and dirty
+        deterministic = false;
+      }
+
+      final int endPos = pos + posLengthAtt.getPositionLength();
+
+      termBytesAtt.fillBytesRef();
+      final String utf16 = changeToken(term).utf8ToString();
+      final int[] term2 = new int[utf16.codePointCount(0, utf16.length())];
+      for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp))
+        term2[j++] = cp = utf16.codePointAt(i);
+
+      final Position endPosData = positions.get(endPos);
+      if (endPosData.arriving == null) {
+        endPosData.arriving = new State();
+      }
+
+      State state = posData.leaving;
+      for (int charIDX = 0; charIDX < term2.length; charIDX++) {
+        final State nextState = charIDX == term2.length - 1 ? endPosData.arriving : new State();
+        state.addTransition(new Transition(term2[charIDX], nextState));
+        state = nextState;
+      }
+
+      maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
+    }
+
+    in.end();
+    State endState = null;
+    if (offsetAtt.endOffset() > maxOffset) {
+      endState = new State();
+      endState.setAccept(true);
+    }
+
+    pos++;
+    while (pos <= positions.getMaxPos()) {
+      posData = positions.get(pos);
+      if (posData.arriving != null) {
+        if (endState != null) {
+          posData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, endState));
+        } else {
+          posData.arriving.setAccept(true);
+        }
+      }
+      pos++;
+    }
+
+    //toDot(a);
+    a.setDeterministic(deterministic);
+    return a;
+  }
+
+  // for debugging!
+  /*
+  private static void toDot(Automaton a) throws IOException {
+    final String s = a.toDot();
+    Writer w = new OutputStreamWriter(new FileOutputStream("/tmp/out.dot"));
+    w.write(s);
+    w.close();
+    System.out.println("TEST: saved to /tmp/out.dot");
+  }
+  */
+
+  private static void addHoles(State startState, RollingBuffer<Position> positions, int pos) {
+    Position posData = positions.get(pos);
+    Position prevPosData = positions.get(pos - 1);
+
+    while (posData.arriving == null || prevPosData.leaving == null) {
+      if (posData.arriving == null) {
+        posData.arriving = new State();
+        posData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, posData.leaving));
+      }
+      if (prevPosData.leaving == null) {
+        if (pos == 1) {
+          prevPosData.leaving = startState;
+        } else {
+          prevPosData.leaving = new State();
+        }
+        if (prevPosData.arriving != null) {
+          prevPosData.arriving.addTransition(new Transition(TokenStreamToAutomaton.POS_SEP, prevPosData.leaving));
+        }
+      }
+      prevPosData.leaving.addTransition(new Transition(TokenStreamToAutomaton.HOLE, posData.arriving));
+      pos--;
+      if (pos <= 0) {
+        break;
+      }
+      posData = prevPosData;
+      prevPosData = positions.get(pos - 1);
+    }
+  }
+}
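A usage sketch for the new class, mirroring what AnalyzingSuggester does with it further down (the analyzer choice and the sample input are assumptions for illustration, not part of the patch):

    // Assumes org.apache.lucene.analysis.core.WhitespaceAnalyzer plus the
    // automaton utilities this patch already imports (UTF32ToUTF8,
    // BasicOperations, SpecialOperations).
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_43);
    TokenStream ts = analyzer.tokenStream("", new StringReader("люси в небе"));
    Automaton unicode = new TokenStreamToUnicodeAutomaton().toAutomaton(ts);  // arcs are code points; POS_SEP between tokens
    ts.close();
    Automaton bytes = new UTF32ToUTF8().convert(unicode);  // re-label with UTF-8 bytes for FST intersection
    BasicOperations.determinize(bytes);
    Set<IntsRef> paths = SpecialOperations.getFiniteStrings(bytes, -1);  // one entry per analyzed form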
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (date 1372194687000)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (date 1372232814000)
@@ -32,6 +32,7 @@
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.TokenStreamToUnicodeAutomaton;
 import org.apache.lucene.search.spell.TermFreqIterator;
 import org.apache.lucene.search.spell.TermFreqPayloadIterator;
 import org.apache.lucene.search.suggest.Lookup;
@@ -53,6 +54,7 @@
 import org.apache.lucene.util.automaton.SpecialOperations;
 import org.apache.lucene.util.automaton.State;
 import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.automaton.UTF32ToUTF8;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.ByteSequenceOutputs;
 import org.apache.lucene.util.fst.FST.BytesReader;
@@ -146,6 +148,11 @@
    */
   private final boolean preserveSep;
 
+  /**
+   * True if the suggester should also match non-ASCII (Unicode) letters.
+   */
+  private final boolean unicodeAware;
+
   /** Include this flag in the options parameter to {@link
    *  #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)} to always
    *  return the exact match first, regardless of score.  This
@@ -158,9 +165,14 @@
    *  token separators when matching. */
   public static final int PRESERVE_SEP = 2;
 
+  /** Include this flag in the options parameter to {@link
+   *  #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)} if
+   *  you want your suggester to match non-ASCII letters. */
+  public static final int UNICODE_AWARE = 4;
+
   /** Represents the separation between tokens, if
    *  PRESERVE_SEP was specified */
-  private static final int SEP_LABEL = 0xff;
+  private static final int SEP_LABEL = '\u001F';
 
   /** Marks end of the analyzed input and start of dedup
    *  byte. */
@@ -225,11 +237,12 @@
   public AnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions) {
     this.indexAnalyzer = indexAnalyzer;
     this.queryAnalyzer = queryAnalyzer;
-    if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
-      throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
+    if ((options & ~(EXACT_FIRST | PRESERVE_SEP | UNICODE_AWARE)) != 0) {
+      throw new IllegalArgumentException("options should only contain EXACT_FIRST, PRESERVE_SEP and UNICODE_AWARE; got " + options);
     }
     this.exactFirst = (options & EXACT_FIRST) != 0;
     this.preserveSep = (options & PRESERVE_SEP) != 0;
+    this.unicodeAware = (options & UNICODE_AWARE) != 0;
 
     // NOTE: this is just an implementation limitation; if
     // somehow this is a problem we could fix it by using
@@ -307,41 +320,53 @@
     }
   }
 
-  /** Just escapes the 0xff byte (which we still use for SEP). */
-  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
-
-    final BytesRef spare = new BytesRef();
-
-    @Override
-    protected BytesRef changeToken(BytesRef in) {
+  protected static BytesRef escapeToken(BytesRef in, BytesRef spare) {
-      int upto = 0;
-      for(int i=0;i
 
   private static class AnalyzingComparator implements Comparator<BytesRef> {
 
     private final boolean hasPayloads;
 
@@ -413,7 +451,7 @@
       return 0;
     }
-  };
+  }
 
   @Override
   public void build(TermFreqIterator iterator) throws IOException {
@@ -434,7 +472,13 @@
 
     Sort.ByteSequencesReader reader = null;
     BytesRef scratch = new BytesRef();
 
-    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
+    TokenStreamToAutomaton ts2a = null;
+    TokenStreamToUnicodeAutomaton ts2ua = null;
+    if (this.unicodeAware) {
+      ts2ua = getTokenStreamToUnicodeAutomaton();
+    } else {
+      ts2a = getTokenStreamToAutomaton();
+    }
 
     boolean success = false;
     byte buffer[] = new byte[8];
@@ -443,7 +487,12 @@
 
       BytesRef surfaceForm;
       while ((surfaceForm = iterator.next()) != null) {
-        Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
+        Set<IntsRef> paths;
+        if (unicodeAware) {
+          paths = toFiniteStrings(surfaceForm, ts2ua);
+        } else {
+          paths = toFiniteStrings(surfaceForm, ts2a);
+        }
 
         maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
 
@@ -700,11 +749,21 @@
     }
 
     //System.out.println("lookup key=" + key + " num=" + num);
+    for (int i = 0; i < key.length(); i++) {
+      if (key.charAt(i) == 0x1E) {
+        throw new IllegalArgumentException("lookup key cannot contain the reserved character U+001E");
+      }
+    }
     final BytesRef utf8Key = new BytesRef(key);
     try {
 
       Automaton lookupAutomaton = toLookupAutomaton(key);
 
+      Automaton utf8lookupAutomaton = null;
+      if (this.unicodeAware) {
+        utf8lookupAutomaton = new UTF32ToUTF8().convert(lookupAutomaton);
+        BasicOperations.determinize(utf8lookupAutomaton);
+      }
-
+
       final CharsRef spare = new CharsRef();
 
       //System.out.println("  now intersect exactFirst=" + exactFirst);
@@ -721,7 +780,7 @@
 
       final List<LookupResult> results = new ArrayList<LookupResult>();
 
-      List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(lookupAutomaton, fst);
+      List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(this.unicodeAware ? utf8lookupAutomaton : lookupAutomaton, fst);
 
       if (exactFirst) {
 
@@ -878,11 +937,43 @@
     return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
   }
 
+  final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToUnicodeAutomaton ts2ua) throws IOException {
+    // Analyze surface form:
+    TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString()));
+
+    // Create corresponding automaton: labels are Unicode code points
+    // from each analyzed token, with POS_SEP (U+001F) used as the
+    // separator between tokens:
+    Automaton unicodeAutomaton = ts2ua.toAutomaton(ts);
+    ts.close();
+
+    replaceSep(unicodeAutomaton);
+
+    Automaton automaton = new UTF32ToUTF8().convert(unicodeAutomaton);
+    BasicOperations.determinize(automaton);
+
+    assert SpecialOperations.isFinite(automaton);
+
+    // Get all paths from the automaton (there can be
+    // more than one path, eg if the analyzer created a
+    // graph using SynFilter or WDF):
+
+    // TODO: we could walk & add simultaneously, so we
+    // don't have to alloc [possibly biggish]
+    // intermediate HashSet in RAM:
+    return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+  }
+
   final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
     // TODO: is there a Reader from a CharSequence?
     // Turn tokenstream into automaton:
     TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
-    Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
+    Automaton automaton;
+    if (this.unicodeAware) {
+      automaton = (getTokenStreamToUnicodeAutomaton()).toAutomaton(ts);
+    } else {
+      automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
+    }
     ts.close();
 
     // TODO: we could use the end offset to "guess"
@@ -899,7 +990,12 @@
 
     return automaton;
   }
-
+  /**
+   * @return the unicodeAware option
+   */
+  public boolean isUnicodeAware() {
+    return unicodeAware;
+  }
 
   /**
    * Returns the weight associated with an input string,
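End to end, the new flag is combined with the existing ones; a minimal sketch (StandardAnalyzer and the sample entry are assumptions for illustration, not from the patch):

    Analyzer a = new StandardAnalyzer(Version.LUCENE_43);
    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a,
        AnalyzingSuggester.PRESERVE_SEP | AnalyzingSuggester.UNICODE_AWARE, 256, -1);
    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
        new TermFreq("а где Люси?", 7),
    }));
    // With UNICODE_AWARE the lookup automaton is built over code points and
    // only converted to UTF-8 (the utf8lookupAutomaton above) when the FST
    // is intersected:
    List<LookupResult> results = suggester.lookup("а где", false, 5);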
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (date 1372194687000)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java (date 1372232814000)
@@ -15,10 +15,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-import java.io.FileOutputStream;
+
 import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Set;
@@ -33,6 +31,7 @@
 import org.apache.lucene.util.automaton.BasicOperations;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
 import org.apache.lucene.util.automaton.SpecialOperations;
+import org.apache.lucene.util.automaton.UTF32ToUTF8;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.PairOutputs.Pair;
@@ -54,6 +53,8 @@
  * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
  * edited.  We allow up to 1 (@link
  * #DEFAULT_MAX_EDITS} edit.
+ * If the UNICODE_AWARE option is set, maxEdits, minFuzzyLength and nonFuzzyPrefix
+ * are measured in Unicode code points (actual letters) instead of bytes.
 *
 * <p>
 * NOTE: This suggester does not boost suggestions that
@@ -177,6 +178,10 @@
     // to be log weights or something ...
 
     Automaton levA = toLevenshteinAutomata(lookupAutomaton);
+    if (isUnicodeAware()) {
+      levA = new UTF32ToUTF8().convert(levA);
+      BasicOperations.determinize(levA);
+    }
     /*
       Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
       w.write(levA.toDot());
@@ -203,7 +208,7 @@
       // to allow the trailing dedup bytes to be
       // edited... but then 0 byte is "in general" allowed
       // on input (but not in UTF8).
-      LevenshteinAutomata lev = new LevenshteinAutomata(ints, 255, transpositions);
+      LevenshteinAutomata lev = new LevenshteinAutomata(ints, isUnicodeAware() ? Character.MAX_CODE_POINT : 255, transpositions);
       Automaton levAutomaton = lev.toAutomaton(maxEdits);
       Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
       combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
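What UNICODE_AWARE buys FuzzySuggester in practice: a Cyrillic letter occupies two UTF-8 bytes, so a one-letter substitution costs two byte-level edits and falls outside DEFAULT_MAX_EDITS (1), while the code-point Levenshtein automaton counts it as a single edit. A sketch (KeywordAnalyzer and the sample terms are assumptions):

    Analyzer a = new KeywordAnalyzer();
    FuzzySuggester fuzzy = new FuzzySuggester(a, a,
        FuzzySuggester.PRESERVE_SEP | FuzzySuggester.UNICODE_AWARE, 256, -1,
        FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
        FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH);
    fuzzy.build(new TermFreqArrayIterator(new TermFreq[] { new TermFreq("москва", 10) }));
    // "мосхва" is one letter (two UTF-8 bytes) away from "москва", so it only
    // stays within maxEdits=1 when edits are counted in code points:
    List<LookupResult> results = fuzzy.lookup("мосхва", false, 1);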
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (date 1372194687000)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (date 1372232814000)
@@ -24,7 +24,6 @@
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.Reader;
-import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
@@ -48,8 +47,6 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.search.suggest.TermFreq;
 import org.apache.lucene.search.suggest.TermFreqArrayIterator;
@@ -594,7 +591,7 @@
     }
   }
 
-  private static char SEP = '\uFFFF';
+  private static char SEP = '\u001F';
 
  public void testRandom() throws Exception {
 
@@ -615,6 +612,7 @@
     }
 
     boolean preserveSep = random().nextBoolean();
+    boolean unicodeAware = random().nextBoolean();
 
     final int numStopChars = random().nextInt(10);
     final boolean preserveHoles = random().nextBoolean();
@@ -641,7 +639,7 @@
         if (token > 0) {
           key += " ";
         }
-        if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != SEP) {
+        if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointAt(analyzedKey.codePointCount(0, analyzedKey.length()-1)) != 0x1F : analyzedKey.charAt(analyzedKey.length()-1) != SEP)) {
           analyzedKey += SEP;
         }
         key += s;
@@ -702,8 +700,14 @@
     }
 
     Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
-    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a,
-        preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1);
+    int options = 0;
+    if (preserveSep) {
+      options |= AnalyzingSuggester.PRESERVE_SEP;
+    }
+    if (unicodeAware) {
+      options |= AnalyzingSuggester.UNICODE_AWARE;
+    }
+    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, options, 256, -1);
     if (doPayloads) {
       suggester.build(new TermFreqPayloadArrayIterator(payloadKeys));
     } else {
@@ -838,7 +842,7 @@
     int tokenStreamCounter = 0;
     final TokenStream[] tokenStreams = new TokenStream[] {
       new CannedBinaryTokenStream(new BinaryToken[] {
-          token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
+          token(new BytesRef(new byte[] {0x61, (byte) 0x1F, 0x61})),
        }),
       new CannedTokenStream(new Token[] {
        token("a",1,1),
@@ -849,7 +853,7 @@
        token("a",1,1)
      }),
      new CannedBinaryTokenStream(new BinaryToken[] {
-          token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
+          token(new BytesRef(new byte[] {0x61, (byte) 0x1F, 0x61})),
        })
    };
@@ -1192,5 +1196,19 @@
     AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, 1);
     suggester.build(new TermFreqArrayIterator(new TermFreq[] {new TermFreq("a", 1)}));
     assertEquals("[a/1]", suggester.lookup("a", false, 1).toString());
+  }
+
+  public void testIllegalLookupArgument() throws Exception {
+    Analyzer a = new MockAnalyzer(random());
+    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.UNICODE_AWARE, 256, -1);
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+        new TermFreq("а где Люси?", 7),
+    }));
+    try {
+      suggester.lookup("а\u001E", false, 3);
+      fail("should throw IllegalArgumentException");
+    } catch (IllegalArgumentException e) {
+      System.out.println(e.getMessage());
+    }
   }
 }
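Since lookup() now rejects U+001E up front (the contract the new test above exercises), callers that pass raw user input to the suggester may want to strip the reserved character first (illustrative only; suggester is an instance built as in the test):

    // The reserved HOLE character can simply be removed from user input
    // before lookup; everything else passes through unchanged.
    String safeKey = rawKey.replace("\u001E", "");
    List<LookupResult> results = suggester.lookup(safeKey, false, 5);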
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (date 1372194687000)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java (date 1372232814000)
@@ -37,6 +37,7 @@
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.TokenStreamToUnicodeAutomaton;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -48,7 +49,9 @@
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.BasicOperations;
 import org.apache.lucene.util.automaton.State;
+import org.apache.lucene.util.automaton.UTF32ToUTF8;
 import org.apache.lucene.util.fst.Util;
 
 public class FuzzySuggesterTest extends LuceneTestCase {
@@ -60,7 +63,9 @@
       keys.add(new TermFreq("boo" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
     }
     keys.add(new TermFreq("foo bar boo far", 12));
-    FuzzySuggester suggester = new FuzzySuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
+    MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
+    FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
+        0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH);
     suggester.build(new TermFreqArrayIterator(keys));
     int numIters = atLeast(10);
     for (int i = 0; i < numIters; i++) {
@@ -72,6 +77,27 @@
     }
   }
 
+  public void testNonLatinRandomEdits() throws IOException {
+    List<TermFreq> keys = new ArrayList<TermFreq>();
+    int numTerms = atLeast(100);
+    for (int i = 0; i < numTerms; i++) {
+      keys.add(new TermFreq("буу" + _TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
+    }
+    keys.add(new TermFreq("фуу бар буу \u001e фар\u001fфар", 12));
+    MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
+    FuzzySuggester suggester = new FuzzySuggester(analyzer, analyzer, FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP | FuzzySuggester.UNICODE_AWARE, 256, -1, FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
+        0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH);
+    suggester.build(new TermFreqArrayIterator(keys));
+    int numIters = atLeast(10);
+    for (int i = 0; i < numIters; i++) {
+      String addRandomEdit = addRandomEdit("фуу бар буу", 0);
+      List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
+      assertEquals(addRandomEdit, 1, results.size());
+      assertEquals("фуу бар буу \u001e фар\u001fфар", results.get(0).key.toString());
+      assertEquals(12, results.get(0).value, 0.01F);
+    }
+  }
+
   /** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */
   public void testKeyword() throws Exception {
     TermFreq keys[] = new TermFreq[] {
@@ -580,12 +606,13 @@
     TermFreq[] keys = new TermFreq[numQueries];
 
     boolean preserveSep = random().nextBoolean();
+    boolean unicodeAware = random().nextBoolean();
 
     final int numStopChars = random().nextInt(10);
     final boolean preserveHoles = random().nextBoolean();
 
     if (VERBOSE) {
-      System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
+      System.out.println("TEST: " + numQueries + " words; preserveSep=" + preserveSep + " ; unicodeAware=" + unicodeAware + " numStopChars=" + numStopChars + " preserveHoles=" + preserveHoles);
     }
 
     for (int i = 0; i < numQueries; i++) {
@@ -606,7 +633,7 @@
         if (token > 0) {
           key += " ";
         }
-        if (preserveSep && analyzedKey.length() > 0 && analyzedKey.charAt(analyzedKey.length()-1) != ' ') {
+        if (preserveSep && analyzedKey.length() > 0 && (unicodeAware ? analyzedKey.codePointAt(analyzedKey.codePointCount(0, analyzedKey.length())-1) != ' ' : analyzedKey.charAt(analyzedKey.length()-1) != ' ')) {
           analyzedKey += " ";
         }
         key += s;
@@ -658,8 +685,14 @@
     }
 
     Analyzer a = new MockTokenEatingAnalyzer(numStopChars, preserveHoles);
-    FuzzySuggester suggester = new FuzzySuggester(a, a,
-        preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1, 1, false, 1, 3);
+    int options = 0;
+    if (preserveSep) {
+      options |= AnalyzingSuggester.PRESERVE_SEP;
+    }
+    if (unicodeAware) {
+      options |= AnalyzingSuggester.UNICODE_AWARE;
+    }
+    FuzzySuggester suggester = new FuzzySuggester(a, a, options, 256, -1, 1, false, 1, 3);
     suggester.build(new TermFreqArrayIterator(keys));
 
     for (String prefix : allPrefixes) {
@@ -722,19 +755,34 @@
       if (VERBOSE) {
         System.out.println("  analyzed: " + analyzedKey);
       }
-      TokenStreamToAutomaton tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();
+      TokenStreamToAutomaton tokenStreamToAutomaton = null;
+      TokenStreamToUnicodeAutomaton tokenStreamToUnicodeAutomaton = null;
+      if (unicodeAware) {
+        tokenStreamToUnicodeAutomaton = suggester.getTokenStreamToUnicodeAutomaton();
+      } else {
+        tokenStreamToAutomaton = suggester.getTokenStreamToAutomaton();
+      }
 
       // NOTE: not great that we ask the suggester to give
       // us the "answer key" (ie maybe we have a bug in
       // suggester.toLevA ...) ... but testRandom2() fixes
       // this:
       Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey));
+      if (unicodeAware) {
+        automaton = new UTF32ToUTF8().convert(automaton);
+        BasicOperations.determinize(automaton);
+      }
       assertTrue(automaton.isDeterministic());
 
       // TODO: could be faster... but its slowCompletor for a reason
       BytesRef spare = new BytesRef();
       for (TermFreq2 e : slowCompletor) {
         spare.copyChars(e.analyzedForm);
-        Set<IntsRef> finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);
+        Set<IntsRef> finiteStrings;
+        if (unicodeAware) {
+          finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToUnicodeAutomaton);
+        } else {
+          finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);
+        }
         for (IntsRef intsRef : finiteStrings) {
           State p = automaton.getInitialState();
           BytesRef ref = Util.toBytesRef(intsRef, spare);
@@ -878,7 +926,8 @@
           // NOTE: can only use ascii here so that, in
           // UTF8 byte space it's still a single
           // insertion:
-          int x = random().nextInt(128);
+          // byte 0x1f is reserved
+          int x = random().nextBoolean() ? random().nextInt(31) : 32 + random().nextInt(128 - 32);
          builder.append((char) x);
          for (int j = i; j < input.length; j++) {
            builder.append(input[j]);
Index: solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java (date 1372194687000)
+++ solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java (date 1372232814000)
@@ -43,6 +43,12 @@
   public static final String PRESERVE_SEP = "preserveSep";
 
   /**
+   * If true, the suggester matches by Unicode code points (actual letters) instead
+   * of bytes; for the fuzzy lookup, maxEdits, minFuzzyLength and nonFuzzyPrefix
+   * are then measured in code points as well.
+   */
+  public static final String UNICODE_AWARE = "unicodeAware";
+
+  /**
    * When multiple suggestions collide to the same analyzed form, this is the limit of
    * how many unique surface forms we keep.
    */
@@ -91,12 +97,19 @@
         ? Boolean.valueOf(params.get(PRESERVE_SEP).toString())
         : true;
 
+    boolean unicodeAware = params.get(UNICODE_AWARE) != null
+        ? Boolean.valueOf(params.get(UNICODE_AWARE).toString())
+        : false;
+
     int flags = 0;
     if (exactMatchFirst) {
       flags |= AnalyzingSuggester.EXACT_FIRST;
     }
     if (preserveSep) {
       flags |= AnalyzingSuggester.PRESERVE_SEP;
+    }
+    if (unicodeAware) {
+      flags |= AnalyzingSuggester.UNICODE_AWARE;
     }
 
     int maxSurfaceFormsPerAnalyzedForm = params.get(MAX_SURFACE_FORMS) != null
Index: solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
--- solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java (date 1372194687000)
+++ solr/core/src/java/org/apache/solr/spelling/suggest/fst/FuzzyLookupFactory.java (date 1372232814000)
@@ -80,12 +80,19 @@
         ? Boolean.valueOf(params.get(AnalyzingLookupFactory.PRESERVE_SEP).toString())
         : true;
 
+    boolean unicodeAware = (params.get(AnalyzingLookupFactory.UNICODE_AWARE) != null)
+        ? Boolean.valueOf(params.get(AnalyzingLookupFactory.UNICODE_AWARE).toString())
+        : false;
+
     int options = 0;
     if (exactMatchFirst) {
       options |= FuzzySuggester.EXACT_FIRST;
     }
     if (preserveSep) {
       options |= FuzzySuggester.PRESERVE_SEP;
+    }
+    if (unicodeAware) {
+      options |= FuzzySuggester.UNICODE_AWARE;
     }
 
     int maxSurfaceFormsPerAnalyzedForm = (params.get(AnalyzingLookupFactory.MAX_SURFACE_FORMS) != null)
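On the Solr side the option would presumably be switched on per suggester in solrconfig.xml, next to preserveSep, via <str name="unicodeAware">true</str>. Programmatically the parameter flows through the factory like this (a sketch; the NamedList wiring is an assumption about how these factories are driven):

    NamedList<Object> params = new NamedList<Object>();
    params.add("unicodeAware", "true");
    // AnalyzingLookupFactory/FuzzyLookupFactory read the value with
    // params.get(UNICODE_AWARE) and OR the corresponding suggester flag
    // into the options they pass to the suggester constructor, as above.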