Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java	(revision 1387114)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java	(working copy)
@@ -167,38 +167,34 @@
         //  TokenStream stream = new SynonymFilter(tokenizer, map, true);
         //  return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
         return new TokenStreamComponents(tokenizer) {
-          int tokenStreamCounter = 0;
-          final TokenStream[] tokenStreams = new TokenStream[]{ new CannedTokenStream(
-              new Token[] {
-                token("ab",1,1),
-                token("ba",0,1),
-                token("xc",1,1)
-              }),
+          int tokenStreamCounter = 0;
+          final TokenStream[] tokenStreams = new TokenStream[] {
+            new CannedTokenStream(new Token[] {
+                token("ab",1,1),
+                token("ba",0,1),
+                token("xc",1,1)
+              }),
+            new CannedTokenStream(new Token[] {
+                token("ba",1,1),
+                token("xd",1,1)
+              }),
+            new CannedTokenStream(new Token[] {
+                token("ab",1,1),
+                token("ba",0,1),
+                token("x",1,1)
+              })
+          };
 
-          new CannedTokenStream(
-              new Token[] {
-                token("ba",1,1),
-                token("xd",1,1)
-              }),
-
-          new CannedTokenStream(
-              new Token[] {
-                token("ab",1,1),
-                token("ba",0,1),
-                token("x",1,1)
-              })
-          };
-
-          @Override
-          public TokenStream getTokenStream() {
-            TokenStream result = tokenStreams[tokenStreamCounter];
-            tokenStreamCounter++;
-            return result;
-          }
+          @Override
+          public TokenStream getTokenStream() {
+            TokenStream result = tokenStreams[tokenStreamCounter];
+            tokenStreamCounter++;
+            return result;
+          }
 
-          @Override
-          protected void setReader(final Reader reader) throws IOException {
-          }
+          @Override
+          protected void setReader(final Reader reader) throws IOException {
+          }
         };
       }
     };
@@ -216,7 +212,6 @@
     return t;
   }
 
-
   private void printTokens(final Analyzer analyzer, String input) throws IOException {
     System.out.println("Tokens for " + input);
     TokenStream ts = analyzer.tokenStream("", new StringReader(input));
@@ -233,6 +228,111 @@
     ts.close();
   }
 
+  private final Analyzer getUnusualAnalyzer() {
+    return new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+
+        return new TokenStreamComponents(tokenizer) {
+
+          int count;
+
+          @Override
+          public TokenStream getTokenStream() {
+            // 4th time we are called, return tokens a/b,
+            // else just a:
+            if (count++ != 3) {
+              return new CannedTokenStream(new Token[] {
+                  token("a", 1, 1),
+                });
+            } else {
+              // After that "a b":
+              return new CannedTokenStream(new Token[] {
+                  token("a", 1, 1),
+                  token("b", 1, 1),
+                });
+            }
+          }
+
+          @Override
+          protected void setReader(final Reader reader) throws IOException {
+          }
+        };
+      }
+    };
+  }
+
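+  // getUnusualAnalyzer ignores the text it is given: the first
+  // three token streams it returns are just "a", and the 4th is
+  // "a b".  That collapses several surface forms onto the same
+  // analyzed form, which exercises the dedup bytes added during
+  // build() and the EXACT_FIRST tie-breaking in the two tests
+  // below.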
assertEquals("x", results.get(1).key); + assertEquals(2, results.get(1).value); + + if (topN > 2) { + assertEquals("x y", results.get(2).key); + assertEquals(1, results.get(2).value); + + if (topN > 3) { + assertEquals("z z z", results.get(3).key); + assertEquals(20, results.get(3).value); + } + } + } + } + } + + public void testNonExactFirst() throws Exception { + + AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(getUnusualAnalyzer(), AnalyzingCompletionLookup.PRESERVE_SEP); + + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("x y", 1), + new TermFreq("x y z", 3), + new TermFreq("x", 2), + new TermFreq("z z z", 20), + })); + + for(int topN=1;topN<6;topN++) { + List results = suggester.lookup("p", false, topN); + + assertEquals(Math.min(topN, 4), results.size()); + + assertEquals("z z z", results.get(0).key); + assertEquals(20, results.get(0).value); + + if (topN > 1) { + assertEquals("x y z", results.get(1).key); + assertEquals(3, results.get(1).value); + + if (topN > 2) { + assertEquals("x", results.get(2).key); + assertEquals(2, results.get(2).value); + + if (topN > 3) { + assertEquals("x y", results.get(3).key); + assertEquals(1, results.get(3).value); + } + } + } + } + } public void testRandom() throws Exception { int numWords = atLeast(1000); Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java (revision 1387114) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java (working copy) @@ -29,6 +29,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStreamToAutomaton; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; @@ -119,22 +120,28 @@ this(analyzer, EXACT_FIRST | PRESERVE_SEP); } - /** Include this flag in the options parameter {@link - * #AnalyzingCompletionLookup(Analyzer,int)} to always - * return exact matches first regardless of score. This - * has no performance impact but could result in - * low-quality suggestions. */ + /** Include this flag in the options parameter to {@link + * #AnalyzingCompletionLookup(Analyzer,int)} to always + * return exact matches first regardless of score. This + * has no performance impact but could result in + * low-quality suggestions. Note that "exact" means the + * key provided to {@link #lookup} analyzed to same + * output as the inputs provided to {@link #build}. */ public static final int EXACT_FIRST = 1; - /** Include this flag in the options parameter {@link - * #AnalyzingCompletionLookup(Analyzer,int)} to preserve - * token separators when matching. */ + /** Include this flag in the options parameter to {@link + * #AnalyzingCompletionLookup(Analyzer,int)} to preserve + * token separators when matching. */ public static final int PRESERVE_SEP = 2; /** Represents the separation between tokens, if * PRESERVE_SEP was specified */ - private static final byte SEP_BYTE = 0; + private static final int SEP_LABEL = 0xff; + /** Marks end of the analyzed input and start of dedup + * byte. */ + private static final int END_BYTE = 0x0; + /** * Creates a new suggester. 
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java	(revision 1387114)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java	(working copy)
@@ -29,6 +29,7 @@
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenStreamToAutomaton;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -119,22 +120,28 @@
     this(analyzer, EXACT_FIRST | PRESERVE_SEP);
   }
 
-  /** Include this flag in the options parameter {@link
-   *  #AnalyzingCompletionLookup(Analyzer,int)} to always
-   *  return exact matches first regardless of score. This
-   *  has no performance impact but could result in
-   *  low-quality suggestions. */
+  /** Include this flag in the options parameter to {@link
+   *  #AnalyzingCompletionLookup(Analyzer,int)} to always
+   *  return exact matches first regardless of score. This
+   *  has no performance impact but could result in
+   *  low-quality suggestions. Note that "exact" means the
+   *  key provided to {@link #lookup} analyzes to the same
+   *  output as the inputs provided to {@link #build}. */
   public static final int EXACT_FIRST = 1;
 
-  /** Include this flag in the options parameter {@link
-   *  #AnalyzingCompletionLookup(Analyzer,int)} to preserve
-   *  token separators when matching. */
+  /** Include this flag in the options parameter to {@link
+   *  #AnalyzingCompletionLookup(Analyzer,int)} to preserve
+   *  token separators when matching. */
   public static final int PRESERVE_SEP = 2;
 
   /** Represents the separation between tokens, if
    *  PRESERVE_SEP was specified */
-  private static final byte SEP_BYTE = 0;
+  private static final int SEP_LABEL = 0xff;
 
+  /** Marks end of the analyzed input and start of dedup
+   *  byte. */
+  private static final int END_BYTE = 0x0;
+
   /**
    * Creates a new suggester.
    *
@@ -150,6 +157,50 @@
     this.preserveSep = (options & PRESERVE_SEP) != 0;
   }
 
+  // nocommit need to get this working, tack it onto the end
+  // of the TokenStream:
+
+  /** Escapes bytes 0xff, 0x0 if they appear in the token. */
+  private static class EscapingTokenFilter extends TokenFilter {
+
+    private final TermToBytesRefAttribute termBytesAtt = getAttribute(TermToBytesRefAttribute.class);
+    private final BytesRef spare = new BytesRef();
+
+    public EscapingTokenFilter(TokenStream in) {
+      super(in);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      boolean hasToken = input.incrementToken();
+      if (!hasToken) {
+        return false;
+      }
+      final BytesRef bytes = termBytesAtt.getBytesRef();
+      termBytesAtt.fillBytesRef();
+
+      int upto = 0;
+      for(int i=0;i<bytes.length;i++) {
+        // nocommit: escape 0xff / 0x0 into spare here
+      }
+      return true;
+    }
+  }
+
     PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
     Builder<Pair<Long,BytesRef>> builder = new Builder<Pair<Long,BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
@@ -263,6 +327,7 @@
       BytesRef surface = new BytesRef();
       IntsRef scratchInts = new IntsRef();
       ByteArrayDataInput input = new ByteArrayDataInput();
+      int dedup = 0;
       while (reader.read(scratch)) {
         input.reset(scratch.bytes, scratch.offset, scratch.length);
@@ -279,33 +344,35 @@
         surface.bytes = scratch.bytes;
         surface.offset = input.getPosition();
         surface.length = input.length() - input.getPosition() - 2;
-        
+
         if (previous == null) {
           previous = new BytesRef();
+          previous.copyBytes(analyzed);
         } else if (analyzed.equals(previous)) {
-          // nocommit: "extend" duplicates with useless
-          // increasing bytes (it wont matter) ... or we
-          // could use multiple outputs for a single input?
-          // this would be more efficient?
-          if (dedup < 256) {
-            analyzed.grow(analyzed.length+1);
-            analyzed.bytes[analyzed.length] = (byte) dedup;
+          if (dedup < 255) {
             dedup++;
-            analyzed.length++;
           } else {
-            // nocommit add param to limit "dups"???
+            // nocommit add param to limit "dups"??? and
+            // check that it's <= 256 up front
            // More than 256 dups: skip the rest:
            continue;
          }
        } else {
          dedup = 0;
+          previous.copyBytes(analyzed);
        }
+
+        analyzed.grow(analyzed.length+2);
+        // NOTE: must be byte 0 so we sort before whatever
+        // is next
+        analyzed.bytes[analyzed.length] = 0;
+        analyzed.bytes[analyzed.length+1] = (byte) dedup;
+        analyzed.length += 2;
 
        Util.toIntsRef(analyzed, scratchInts);
-        // nocommit (why?)
+        // System.out.println("ADD: " + analyzed);
        builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
-        previous.copyBytes(analyzed);
      }
      fst = builder.finish();
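
[Byte-level sketch of the FST inputs built above, not part of the
patch: with PRESERVE_SEP, token boundaries in the analyzed form are
encoded as SEP_LABEL (0xff), and the loop above appends END_BYTE
(0x0) plus one dedup byte so surface forms that analyze identically
still get distinct FST inputs.  Assuming two single-byte tokens "x"
and "y", the second entry analyzing to "x y" is keyed as:

    'x'  0xff  'y'  0x00  0x01
          |          |     +---- dedup byte (0 for the first entry,
          |          |           1 for the next duplicate, ...)
          |          +---------- END_BYTE ends the analyzed form
          +--------------------- SEP_LABEL token separator

Because END_BYTE is 0x0, an exact match's arc sorts before every
longer completion leaving the same node, which the EXACT_FIRST pass
in lookup() below relies on when it probes each prefix node for an
END_BYTE arc.]
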
@@ -349,88 +416,128 @@
     assert num > 0;
     Arc<Pair<Long,BytesRef>> arc = new Arc<Pair<Long,BytesRef>>();
-    //System.out.println("lookup");
+    // System.out.println("lookup num=" + num);
 
-    // TODO: is there a Reader from a CharSequence?
-    // Turn tokenstream into automaton:
-    Automaton automaton;
     try {
+
+      // TODO: is there a Reader from a CharSequence?
+      // Turn tokenstream into automaton:
       TokenStream ts = analyzer.tokenStream("", new StringReader(key.toString()));
-      automaton = TokenStreamToAutomaton.toAutomaton(ts);
+      Automaton automaton = TokenStreamToAutomaton.toAutomaton(ts);
       ts.end();
       ts.close();
-    } catch (IOException bogus) {
-      throw new RuntimeException(bogus);
-    }
 
-    replaceSep(automaton);
+      replaceSep(automaton);
 
-    // TODO: we can optimize this somewhat by determinizing
-    // while we convert
-    automaton = Automaton.minimize(automaton);
+      // TODO: we can optimize this somewhat by determinizing
+      // while we convert
+      automaton = Automaton.minimize(automaton);
 
-    List<LookupResult> results = new ArrayList<LookupResult>(num);
-    CharsRef spare = new CharsRef();
+      List<LookupResult> results = new ArrayList<LookupResult>(num);
+      CharsRef spare = new CharsRef();
 
-    //System.out.println("  now intersect exactFirst=" + exactFirst);
+      //System.out.println("  now intersect exactFirst=" + exactFirst);
 
-    // Intersect automaton w/ suggest wFST and get all
-    // prefix starting nodes & their outputs:
-    final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths;
-    try {
+      // Intersect automaton w/ suggest wFST and get all
+      // prefix starting nodes & their outputs:
+      final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths;
       prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
-    } catch (IOException bogus) {
-      throw new RuntimeException(bogus);
-    }
 
-    // nocommit maybe nuke exactFirst...?  but... it's
-    // useful?
-    // nocommit: exactFirst is not well defined here ... the
-    // "exactness" refers to the analyzed form not the
-    // surface form
-    if (exactFirst) {
+      //System.out.println("  prefixPaths: " +
+      //prefixPaths.size());
+
+      // nocommit need test w/ terms that have 0/1/0xff bytes...
+
+      BytesReader bytesReader = fst.getBytesReader(0);
+
+      FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<Pair<Long,BytesRef>>();
+
+      List<LookupResult> exactResults;
+
+      if (exactFirst) {
+        exactResults = new ArrayList<LookupResult>();
+
+        Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
+        searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, num, weightComparator);
+
+        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
+          if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
+            // This node has END_BYTE arc leaving, meaning it's an
+            // "exact" match:
+            searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), true, path.input);
+          }
+        }
+
+        MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();
+
+        for(MinResult<Pair<Long,BytesRef>> completion : completions) {
+          spare.grow(completion.output.output2.length);
+          UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
+          LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
+          //System.out.println("    result=" + result);
+          exactResults.add(result);
+        }
+
+        if (exactResults.size() == num) {
+          // That was quick: we found enough "exact" matches
+          return exactResults;
+        }
+
+        //System.out.println("  after exact: " + exactResults.size());
+
+      } else {
+        exactResults = null;
+      }
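+
+      // NOTE: when the EXACT_FIRST pass above already returned
+      // some results, the searcher below only needs to fill the
+      // remaining num - exactResults.size() slots; skipping
+      // END_BYTE arcs in the prefix-path loop keeps exact matches
+      // from being returned twice.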
+
+      Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
+      searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
+                                                            exactResults == null ? num : num - exactResults.size(),
+                                                            weightComparator);
       for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
-        if (path.fstNode.isFinal()) {
-          BytesRef prefix = BytesRef.deepCopyOf(path.output.output2);
-          prefix.append(path.fstNode.nextFinalOutput.output2);
-          spare.grow(prefix.length);
-          UnicodeUtil.UTF8toUTF16(prefix, spare);
-          results.add(new LookupResult(spare.toString(), decodeWeight(path.output.output1 + path.fstNode.nextFinalOutput.output1)));
-          if (--num == 0) {
-            // nocommit hmm should we order all "exact"
-            // matches by their .output1s, then return those
-            // top n...?
-            return results; // that was quick
+        fst.readFirstTargetArc(path.fstNode, scratchArc, bytesReader);
+        //System.out.println("path: " + path.input);
+        while(true) {
+          if (!exactFirst || scratchArc.label != END_BYTE) {
+            path.input.grow(path.input.length+1);
+            path.input.ints[path.input.length] = scratchArc.label;
+            path.input.length++;
+            searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), true, path.input);
           }
+
+          if (scratchArc.isLast()) {
+            break;
+          }
+
+          fst.readNextArc(scratchArc, bytesReader);
         }
       }
-    }
 
-    Util.TopNSearcher<Pair<Long,BytesRef>> searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, num, weightComparator);
-    for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
-      try {
-        searcher.addStartPaths(path.fstNode, path.output, !exactFirst, path.input);
-      } catch (IOException bogus) {
-        throw new RuntimeException(bogus);
+      MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();
+
+      List<LookupResult> normalResults = new ArrayList<LookupResult>();
+
+      for(MinResult<Pair<Long,BytesRef>> completion : completions) {
+        spare.grow(completion.output.output2.length);
+        UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
+        LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
+        //System.out.println("    result=" + result);
+        normalResults.add(result);
       }
-    }
 
-    MinResult<Pair<Long,BytesRef>> completions[] = null;
-    try {
-      completions = searcher.search();
+      if (exactResults == null) {
+        assert normalResults.size() <= num;
+        return normalResults;
+      } else {
+        List<LookupResult> allResults = exactResults;
+        allResults.addAll(normalResults);
+        assert allResults.size() <= num;
+        return allResults;
+      }
     } catch (IOException bogus) {
       throw new RuntimeException(bogus);
     }
+  }
 
-    for (MinResult<Pair<Long,BytesRef>> completion : completions) {
-      spare.grow(completion.output.output2.length);
-      UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
-      results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
-    }
-
-    return results;
-  }
-
   /**
    * Returns the weight associated with an input string,
    * or null if it does not exist.
@@ -455,6 +562,6 @@
   static final Comparator<Pair<Long,BytesRef>> weightComparator = new Comparator<Pair<Long,BytesRef>> () {
     public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) {
       return left.output1.compareTo(right.output1);
-    }  
+    }
   };
 }
Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/fst/Util.java	(revision 1387114)
+++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java	(working copy)
@@ -830,7 +830,8 @@
     scratch.grow(input.length);
     for(int i=0;i<input.length;i++) {
       int value = input.ints[i+input.offset];
-      assert value >= Byte.MIN_VALUE && value <= Byte.MAX_VALUE: "value " + value + " doesn't fit into byte";
+      // NOTE: we allow -128 to 255
+      assert value >= Byte.MIN_VALUE && value <= 255: "value " + value + " doesn't fit into byte";
       scratch.bytes[i] = (byte) value;
     }
     scratch.length = input.length;
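
[Side note on the Util.java change, with an illustrative snippet that
is not part of the patch: FST labels are ints, so SEP_LABEL (0xff)
reaches toBytesRef() as 255 rather than -1, and the assert must admit
the full -128..255 range.  The label still round-trips through a Java
byte as long as readers mask it back to unsigned:

    int label = 0xff;            // SEP_LABEL, stored as an int arc label
    byte b = (byte) label;       // narrows to -1 inside the BytesRef
    int roundTripped = b & 0xff; // 255 again
    assert roundTripped == label;

Comparing the raw (signed) byte directly against SEP_LABEL or
END_BYTE would silently fail for labels above 127.]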