Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java	(revision 1387530)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionTest.java	(working copy)
@@ -21,6 +21,7 @@
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
@@ -29,6 +30,8 @@
 import java.util.TreeSet;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CannedBinaryTokenStream.BinaryToken;
+import org.apache.lucene.analysis.CannedBinaryTokenStream;
 import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenFilter;
@@ -42,6 +45,7 @@
 import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.search.suggest.TermFreq;
 import org.apache.lucene.search.suggest.TermFreqArrayIterator;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
@@ -129,7 +133,7 @@
 
     int options = 0;
 
-    AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random()), options);
+    AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random()), options, 256, -1);
     suggester.build(new TermFreqArrayIterator(keys));
     // nocommit if i change this to "ab " ... the test fails
     // now but really it should pass???  problem is
@@ -146,10 +150,6 @@
   }
 
   public void testInputPathRequired() throws Exception {
-    TermFreq keys[] = new TermFreq[] {
-      new TermFreq("ab xc", 50),
-      new TermFreq("ba xd", 50),
-    };
 
     //  SynonymMap.Builder b = new SynonymMap.Builder(false);
     //  b.add(new CharsRef("ab"), new CharsRef("ba"), true);
@@ -167,42 +167,42 @@
         //  TokenStream stream = new SynonymFilter(tokenizer, map, true);
         //  return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
         return new TokenStreamComponents(tokenizer) {
-          int tokenStreamCounter = 0;
-          final TokenStream[] tokenStreams = new TokenStream[]{ new CannedTokenStream(
-              new Token[] {
-                token("ab",1,1),
-                token("ba",0,1),
-                token("xc",1,1)
-              }),
+          int tokenStreamCounter = 0;
+          final TokenStream[] tokenStreams = new TokenStream[] {
+            new CannedTokenStream(new Token[] {
+                token("ab",1,1),
+                token("ba",0,1),
+                token("xc",1,1)
+              }),
+            new CannedTokenStream(new Token[] {
+                token("ba",1,1),
+                token("xd",1,1)
+              }),
+            new CannedTokenStream(new Token[] {
+                token("ab",1,1),
+                token("ba",0,1),
+                token("x",1,1)
+              })
+          };
 
-            new CannedTokenStream(
-              new Token[] {
-                token("ba",1,1),
-                token("xd",1,1)
-              }),
-
-            new CannedTokenStream(
-              new Token[] {
-                token("ab",1,1),
-                token("ba",0,1),
-                token("x",1,1)
-              })
-          };
-
-          @Override
-          public TokenStream getTokenStream() {
-            TokenStream result = tokenStreams[tokenStreamCounter];
-            tokenStreamCounter++;
-            return result;
-          }
+          @Override
+          public TokenStream getTokenStream() {
+            TokenStream result = tokenStreams[tokenStreamCounter];
+            tokenStreamCounter++;
+            return result;
+          }
 
-          @Override
-          protected void setReader(final Reader reader) throws IOException {
-          }
+          @Override
+          protected void setReader(final Reader reader) throws IOException {
+          }
         };
       }
     };
+
+    TermFreq keys[] = new TermFreq[] {
+      new TermFreq("ab xc", 50),
+      new TermFreq("ba xd", 50),
+    };
 
     AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(analyzer);
     suggester.build(new TermFreqArrayIterator(keys));
     List<LookupResult> results = suggester.lookup("ab x", false, 1);
@@ -216,7 +216,10 @@
     return t;
   }
-  
+
+  private static BinaryToken token(BytesRef term) {
+    return new BinaryToken(term);
+  }
+
   private void printTokens(final Analyzer analyzer, String input) throws IOException {
     System.out.println("Tokens for " + input);
     TokenStream ts = analyzer.tokenStream("", new StringReader(input));
@@ -233,6 +236,114 @@
     ts.close();
   }
 
+  private final Analyzer getUnusualAnalyzer() {
+    return new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+
+        return new TokenStreamComponents(tokenizer) {
+
+          int count;
+
+          @Override
+          public TokenStream getTokenStream() {
+            // 4th time we are called, return tokens a b,
+            // else just a:
+            if (count++ != 3) {
+              return new CannedTokenStream(new Token[] {
+                  token("a", 1, 1),
+                });
+            } else {
+              // After that "a b":
+              return new CannedTokenStream(new Token[] {
+                  token("a", 1, 1),
+                  token("b", 1, 1),
+                });
+            }
+          }
+
+          @Override
+          protected void setReader(final Reader reader) throws IOException {
+          }
+        };
+      }
+    };
+  }
+
+  public void testExactFirst() throws Exception {
+
+    AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(getUnusualAnalyzer(), AnalyzingCompletionLookup.EXACT_FIRST | AnalyzingCompletionLookup.PRESERVE_SEP, 256, -1);
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+          new TermFreq("x y", 1),
+          new TermFreq("x y z", 3),
+          new TermFreq("x", 2),
+          new TermFreq("z z z", 20),
+        }));
+
+    //System.out.println("ALL: " + suggester.lookup("x y", false, 6));
+
+    for(int topN=1;topN<6;topN++) {
+      List<LookupResult> results = suggester.lookup("x y", false, topN);
+      //System.out.println("topN=" + topN + " " + results);
+
+      assertEquals(Math.min(topN, 4), results.size());
+
+      assertEquals("x y", results.get(0).key);
+      assertEquals(1, results.get(0).value);
+
+      if (topN > 1) {
+        assertEquals("z z z", results.get(1).key);
+        assertEquals(20, results.get(1).value);
+
+        if (topN > 2) {
+          assertEquals("x y z", results.get(2).key);
+          assertEquals(3, results.get(2).value);
+
+          if (topN > 3) {
+            assertEquals("x", results.get(3).key);
+            assertEquals(2, results.get(3).value);
+          }
+        }
+      }
+    }
+  }
+
+  public void testNonExactFirst() throws Exception {
+
+    AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(getUnusualAnalyzer(), AnalyzingCompletionLookup.PRESERVE_SEP, 256, -1);
+
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+          new TermFreq("x y", 1),
+          new TermFreq("x y z", 3),
+          new TermFreq("x", 2),
+          new TermFreq("z z z", 20),
+        }));
+
+    for(int topN=1;topN<6;topN++) {
+      List<LookupResult> results = suggester.lookup("p", false, topN);
+
+      assertEquals(Math.min(topN, 4), results.size());
+
+      assertEquals("z z z", results.get(0).key);
+      assertEquals(20, results.get(0).value);
+
+      if (topN > 1) {
+        assertEquals("x y z", results.get(1).key);
+        assertEquals(3, results.get(1).value);
+
+        if (topN > 2) {
+          assertEquals("x", results.get(2).key);
+          assertEquals(2, results.get(2).value);
+
+          if (topN > 3) {
+            assertEquals("x y", results.get(3).key);
+            assertEquals(1, results.get(3).value);
+          }
+        }
+      }
+    }
+  }
 
   public void testRandom() throws Exception {
 
     int numWords = atLeast(1000);
@@ -265,7 +376,7 @@
     // nocommit also test NOT preserving seps/holes
     // nocommit why no failure if we DON'T preserve seps/holes...?
     AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false),
                                                                         AnalyzingCompletionLookup.PRESERVE_SEP,
-                                                                        AnalyzingCompletionLookup.PRESERVE_SEP);
+                                                                        AnalyzingCompletionLookup.PRESERVE_SEP, 256, -1);
     suggester.build(new TermFreqArrayIterator(keys));
 
     for (String prefix : allPrefixes) {
@@ -307,4 +418,88 @@
       }
     }
   }
+
+  // nocommit need random full binary test
+
+  public void testStolenBytes() throws Exception {
+
+    final Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+
+        //  TokenStream stream = new SynonymFilter(tokenizer, map, true);
+        //  return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
+        return new TokenStreamComponents(tokenizer) {
+          int tokenStreamCounter = 0;
+          final TokenStream[] tokenStreams = new TokenStream[] {
+            new CannedBinaryTokenStream(new BinaryToken[] {
+                token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
+              }),
+            new CannedTokenStream(new Token[] {
+                token("a",1,1),
+                token("a",1,1)
+              }),
+            new CannedTokenStream(new Token[] {
+                token("a",1,1),
+                token("a",1,1)
+              }),
+            new CannedBinaryTokenStream(new BinaryToken[] {
+                token(new BytesRef(new byte[] {0x61, (byte) 0xff, 0x61})),
+              })
+          };
+
+          @Override
+          public TokenStream getTokenStream() {
+            TokenStream result = tokenStreams[tokenStreamCounter];
+            tokenStreamCounter++;
+            return result;
+          }
+
+          @Override
+          protected void setReader(final Reader reader) throws IOException {
+          }
+        };
+      }
+    };
+
+    TermFreq keys[] = new TermFreq[] {
+      new TermFreq("a a", 50),
+      new TermFreq("a b", 50),
+    };
+
+    AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(analyzer);
+    suggester.build(new TermFreqArrayIterator(keys));
+    List<LookupResult> results = suggester.lookup("a a", false, 5);
+    assertEquals(1, results.size());
+    assertEquals("a b", results.get(0).key);
+    assertEquals(50, results.get(0).value);
+
+    results = suggester.lookup("a a", false, 5);
+    assertEquals(1, results.size());
+    assertEquals("a a", results.get(0).key);
+    assertEquals(50, results.get(0).value);
+  }
+
+  // nocommit test that dups are preserved by weight:
+
+  public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
+
+    AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(new MockAnalyzer(random()), 0, 2, -1);
+
+    List<TermFreq> keys = Arrays.asList(new TermFreq[] {
+        new TermFreq("a", 40),
+        new TermFreq("a ", 50),
+        new TermFreq(" a", 60),
+      });
+
+    Collections.shuffle(keys, random());
+    suggester.build(new TermFreqArrayIterator(keys));
+
+    List<LookupResult> results = suggester.lookup("a", false, 5);
+    assertEquals(2, results.size());
+    assertEquals(" a", results.get(0).key);
+    assertEquals(60, results.get(0).value);
+    assertEquals("a ", results.get(1).key);
+    assertEquals(50, results.get(1).value);
+  }
 }
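The tests above exercise the two new knobs end to end. For orientation, here is a minimal usage sketch of the patched constructor; it is not part of the patch itself, and it assumes the same test utilities the test file imports (MockAnalyzer, TermFreq, TermFreqArrayIterator):

    // Keep at most 256 surface forms per analyzed form; -1 = no limit on
    // graph expansions:
    AnalyzingCompletionLookup suggester = new AnalyzingCompletionLookup(
        new MockAnalyzer(random()),
        AnalyzingCompletionLookup.EXACT_FIRST | AnalyzingCompletionLookup.PRESERVE_SEP,
        256, -1);
    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
        new TermFreq("foo bar", 10),
        new TermFreq("foo baz", 20),
      }));
    // Top-2 completions for the analyzed prefix, highest weight first:
    List<LookupResult> results = suggester.lookup("foo b", false, 2);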
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java	(revision 1387530)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java	(working copy)
@@ -57,7 +57,6 @@
  * Input weights will be cast to a java integer, and any
  * negative, infinite, or NaN values will be rejected.
  *
- * @see Util#shortestPaths(FST, FST.Arc, Object, Comparator, int)
  * @lucene.experimental
  */
 public class WFSTCompletionLookup extends Lookup {
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java	(revision 1387530)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingCompletionLookup.java	(working copy)
@@ -29,6 +29,7 @@
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenStreamToAutomaton;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -113,41 +114,78 @@
   private final boolean preserveSep;
 
   /**
-   * Calls {@link #AnalyzingCompletionLookup(Analyzer,int) AnalyzingCompletionLookup(analyzer, EXACT_FIRST | PRESERVE_SEP)}
+   * Calls {@link #AnalyzingCompletionLookup(Analyzer,int,int,int)
+   * AnalyzingCompletionLookup(analyzer, EXACT_FIRST |
+   * PRESERVE_SEP, 256, -1)}
    */
   public AnalyzingCompletionLookup(Analyzer analyzer) {
-    this(analyzer, EXACT_FIRST | PRESERVE_SEP);
+    this(analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1);
   }
 
-  /** Include this flag in the options parameter {@link
-   *  #AnalyzingCompletionLookup(Analyzer,int)} to always
-   *  return exact matches first regardless of score.  This
-   *  has no performance impact but could result in
-   *  low-quality suggestions. */
+  /** Include this flag in the options parameter to {@link
+   *  #AnalyzingCompletionLookup(Analyzer,int,int,int)} to always
+   *  return the exact match first, regardless of score.  This
+   *  has no performance impact but could result in
+   *  low-quality suggestions. */
   public static final int EXACT_FIRST = 1;
 
-  /** Include this flag in the options parameter {@link
-   *  #AnalyzingCompletionLookup(Analyzer,int)} to preserve
-   *  token separators when matching. */
+  /** Include this flag in the options parameter to {@link
+   *  #AnalyzingCompletionLookup(Analyzer,int,int,int)} to preserve
+   *  token separators when matching. */
   public static final int PRESERVE_SEP = 2;
 
   /** Represents the separation between tokens, if
    *  PRESERVE_SEP was specified */
-  private static final byte SEP_BYTE = 0;
+  private static final int SEP_LABEL = 0xff;
+
+  /** Marks end of the analyzed input and start of dedup
+   *  byte. */
+  private static final int END_BYTE = 0x0;
+
+  /** Maximum number of dup surface forms (different surface
+   *  forms for the same analyzed form). */
+  private final int maxSurfaceFormsPerAnalyzedForm;
+
+  /** Maximum graph paths to index for a single analyzed
+   *  surface form.  This only matters if your analyzer
+   *  makes lots of alternate paths (e.g. contains
+   *  SynonymFilter). */
+  private final int maxGraphExpansions;
 
   /**
    * Creates a new suggester.
    * 
    * @param analyzer Analyzer that will be used for analyzing suggestions.
    * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
+   * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
+   *   surface forms to keep for a single analyzed form.
+   *   When there are too many surface forms we discard the
+   *   lowest weighted ones.
+   * @param maxGraphExpansions Maximum number of graph paths
+   *   to expand from the analyzed form.  Set this to -1 for
+   *   no limit.
    */
-  public AnalyzingCompletionLookup(Analyzer analyzer, int options) {
+  public AnalyzingCompletionLookup(Analyzer analyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions) {
     this.analyzer = analyzer;
     if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
       throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
     }
     this.exactFirst = (options & EXACT_FIRST) != 0;
     this.preserveSep = (options & PRESERVE_SEP) != 0;
+
+    // NOTE: this is just an implementation limitation; if
+    // somehow this is a problem we could fix it by using
+    // more than one byte to disambiguate ... but 256 seems
+    // like it should be way more than enough.
+    if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
+      throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and <= 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")");
+    }
+    this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
+
+    if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
+      throw new IllegalArgumentException("maxGraphExpansions must be -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
+    }
+    this.maxGraphExpansions = maxGraphExpansions;
   }
 
   // Replaces SEP with epsilon or remaps them if
@@ -165,8 +203,8 @@
         assert t.getMin() == t.getMax();
         if (t.getMin() == TokenStreamToAutomaton.POS_SEP) {
           if (preserveSep) {
-            // Remap to SEP_BYTE:
-            t = new Transition(SEP_BYTE, t.getDest());
+            // Remap to SEP_LABEL:
+            t = new Transition(SEP_LABEL, t.getDest());
           } else {
             // NOTE: sort of weird because this will grow
             // the transition array we are iterating over,
@@ -175,6 +213,22 @@
             state.addEpsilon(t.getDest());
             t = null;
           }
+        } else if (t.getMin() == TokenStreamToAutomaton.HOLE) {
+
+          // Just remove the hole: there will then be two
+          // SEP tokens next to each other, which will only
+          // match another hole at search time.  Note that
+          // it will also match an empty-string token ... if
+          // that's somehow a problem we can always map HOLE
+          // to a dedicated byte (and escape it in the
+          // input).
+
+          // NOTE: sort of weird because this will grow
+          // the transition array we are iterating over,
+          // but because we are going in reverse topo sort
+          // it will not add any SEP/HOLE transitions:
+          state.addEpsilon(t.getDest());
+          t = null;
         }
         if (t != null) {
           newTransitions.add(t);
@@ -184,6 +238,34 @@
       state.setTransitions(newTransitions.toArray(new Transition[newTransitions.size()]));
     }
   }
+
+  /** Just escapes the bytes we steal (0xff, 0x0). */
+  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
+
+    final BytesRef spare = new BytesRef();
+
+    @Override
+    protected BytesRef changeToken(BytesRef in) {
+      int upto = 0;
+      for(int i=0;i<in.length;i++) {
[...]
@@
-    Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, -1);
+    Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
 
     for (IntsRef path : paths) {
 
       Util.toBytesRef(path, scratch);
@@ -242,7 +325,15 @@
       output.writeBytes(scratch.bytes, scratch.offset, scratch.length);
       output.writeByte((byte)0); // separator: not used, just for sort order
       output.writeByte((byte)0); // separator: not used, just for sort order
+
+      // NOTE: important that writeInt is big-endian,
+      // because this means we sort secondarily by
+      // cost ascending (= weight descending) so that
+      // when we discard too many surface forms for a
+      // single analyzed form we are discarding the
+      // lowest-weight ones:
       output.writeInt(encodeWeight(iterator.weight()));
+
       output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
       output.writeShort(analyzedLength);
       writer.write(buffer, 0, output.getPosition());
@@ -253,7 +344,7 @@
       // Sort all input/output pairs (required by FST.Builder):
       new Sort().sort(tempInput, tempSorted);
       reader = new Sort.ByteSequencesReader(tempSorted);
-      
+
       PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
       Builder<Pair<Long,BytesRef>> builder = new Builder<Pair<Long,BytesRef>>(FST.INPUT_TYPE.BYTE1, outputs);
@@ -263,6 +354,7 @@
       BytesRef surface = new BytesRef();
       IntsRef scratchInts = new IntsRef();
       ByteArrayDataInput input = new ByteArrayDataInput();
+      int dedup = 0;
       while (reader.read(scratch)) {
         input.reset(scratch.bytes, scratch.offset, scratch.length);
@@ -279,33 +371,32 @@
         surface.bytes = scratch.bytes;
         surface.offset = input.getPosition();
         surface.length = input.length() - input.getPosition() - 2;
-        
+
         if (previous == null) {
           previous = new BytesRef();
+          previous.copyBytes(analyzed);
         } else if (analyzed.equals(previous)) {
-          // nocommit: "extend" duplicates with useless
-          // increasing bytes (it wont matter) ... or we
-          // could use multiple outputs for a single input?
-          // this would be more efficient?
-          if (dedup < 256) {
-            analyzed.grow(analyzed.length+1);
-            analyzed.bytes[analyzed.length] = (byte) dedup;
-            dedup++;
-            analyzed.length++;
-          } else {
-            // nocommit add param to limit "dups"???
-
-            // More than 256 dups: skip the rest:
+          dedup++;
+          if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
+            // More than maxSurfaceFormsPerAnalyzedForm
+            // dups: skip the rest:
             continue;
           }
         } else {
           dedup = 0;
+          previous.copyBytes(analyzed);
         }
+
+        analyzed.grow(analyzed.length+2);
+        // NOTE: must be byte 0 so we sort before whatever
+        // is next:
+        analyzed.bytes[analyzed.length] = 0;
+        analyzed.bytes[analyzed.length+1] = (byte) dedup;
+        analyzed.length += 2;
 
         Util.toIntsRef(analyzed, scratchInts);
-        // nocommit (why?)
+        //System.out.println("ADD: " + analyzed);
         builder.add(scratchInts, outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
-        previous.copyBytes(analyzed);
       }
       fst = builder.finish();
@@ -345,92 +436,146 @@
   }
 
   @Override
-  public List<LookupResult> lookup(CharSequence key, boolean onlyMorePopular, int num) {
+  public List<LookupResult> lookup(final CharSequence key, boolean onlyMorePopular, int num) {
     assert num > 0;
     Arc<Pair<Long,BytesRef>> arc = new Arc<Pair<Long,BytesRef>>();
-    //System.out.println("lookup");
+    // System.out.println("lookup num=" + num);
 
-    // TODO: is there a Reader from a CharSequence?
-    // Turn tokenstream into automaton:
-    Automaton automaton;
     try {
+
+      // TODO: is there a Reader from a CharSequence?
+      // Turn tokenstream into automaton:
       TokenStream ts = analyzer.tokenStream("", new StringReader(key.toString()));
-      automaton = TokenStreamToAutomaton.toAutomaton(ts);
+      Automaton automaton = (new EscapingTokenStreamToAutomaton()).toAutomaton(ts);
       ts.end();
       ts.close();
-    } catch (IOException bogus) {
-      throw new RuntimeException(bogus);
-    }
 
-    replaceSep(automaton);
+      replaceSep(automaton);
 
-    // TODO: we can optimize this somewhat by determinizing
-    // while we convert
-    automaton = Automaton.minimize(automaton);
+      // TODO: we can optimize this somewhat by determinizing
+      // while we convert
+      automaton = Automaton.minimize(automaton);
 
-    List<LookupResult> results = new ArrayList<LookupResult>(num);
-    CharsRef spare = new CharsRef();
+      final CharsRef spare = new CharsRef();
 
-    //System.out.println("  now intersect exactFirst=" + exactFirst);
+      //System.out.println("  now intersect exactFirst=" + exactFirst);
 
-    // Intersect automaton w/ suggest wFST and get all
-    // prefix starting nodes & their outputs:
-    final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths;
-    try {
+      // Intersect automaton w/ suggest wFST and get all
+      // prefix starting nodes & their outputs:
+      final List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths;
       prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
-    } catch (IOException bogus) {
-      throw new RuntimeException(bogus);
-    }
 
-    // nocommit maybe nuke exactFirst...?  but... it's
-    // useful?
-    // nocommit: exactFirst is not well defined here ... the
-    // "exactness" refers to the analyzed form not the
-    // surface form
-    if (exactFirst) {
-      for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
-        if (path.fstNode.isFinal()) {
-          BytesRef prefix = BytesRef.deepCopyOf(path.output.output2);
-          prefix.append(path.fstNode.nextFinalOutput.output2);
-          spare.grow(prefix.length);
-          UnicodeUtil.UTF8toUTF16(prefix, spare);
-          results.add(new LookupResult(spare.toString(), decodeWeight(path.output.output1 + path.fstNode.nextFinalOutput.output1)));
-          if (--num == 0) {
-            // nocommit hmm should we order all "exact"
-            // matches by their .output1s, then return those
-            // top n...?
-            return results; // that was quick
-          }
-        }
-      }
-    }
+      //System.out.println("  prefixPaths: " +
+      //prefixPaths.size());
 
-    Util.TopNSearcher<Pair<Long,BytesRef>> searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, num, weightComparator);
-    for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
-      try {
-        searcher.addStartPaths(path.fstNode, path.output, !exactFirst, path.input);
-      } catch (IOException bogus) {
-        throw new RuntimeException(bogus);
-      }
-    }
+      BytesReader bytesReader = fst.getBytesReader(0);
 
-    MinResult<Pair<Long,BytesRef>> completions[] = null;
-    try {
-      completions = searcher.search();
+      FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<Pair<Long,BytesRef>>();
+
+      List<LookupResult> results = new ArrayList<LookupResult>();
+
+      if (exactFirst) {
+
+        Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
+        searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, num, weightComparator);
+
+        int count = 0;
+        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
+          if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
+            // This node has END_BYTE arc leaving, meaning it's an
+            // "exact" match:
+            count++;
+          }
+        }
+
+        searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
+
+        // NOTE: we could almost get away with only using
+        // the first start node.  The only catch is if
+        // maxSurfaceFormsPerAnalyzedForm had kicked in and
+        // pruned our exact match from one of these nodes
+        // ...:
+        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
+          if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
+            // This node has END_BYTE arc leaving, meaning it's an
+            // "exact" match:
+            searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), true, path.input);
+          }
+        }
+
+        MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();
+
+        // NOTE: this is rather inefficient: we enumerate
+        // every matching "exactly the same analyzed form"
+        // path, and then do a linear scan to see if one of
+        // these exactly matches the input.  It should be
+        // possible (though hairy) to do something similar
+        // to getByOutput, since the surface form is encoded
+        // into the FST output, so we could more efficiently
+        // hone in on the exact surface-form match.  Still, I
+        // suspect very little time is spent in this linear
+        // search: it's bounded by how many prefix start
+        // nodes we have and the
+        // maxSurfaceFormsPerAnalyzedForm:
+        for(MinResult<Pair<Long,BytesRef>> completion : completions) {
+          spare.grow(completion.output.output2.length);
+          UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
+          if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
+            results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
+            break;
+          }
+        }
+
+        if (results.size() == num) {
+          // That was quick:
+          return results;
+        }
+      }
+
+      // nocommit hmm w/ a graph ... aren't we going to
+      // produce dup surface form suggestions ...?  do we
+      // need to dedup by surface form?
+
+      Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
+      searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
                                                             num - results.size(),
                                                             weightComparator) {
+        @Override
+        protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {
+          if (!exactFirst) {
+            return true;
+          } else {
+            // In exactFirst mode, don't accept any paths
+            // matching the surface form since that will
+            // create duplicate results:
+            spare.grow(output.output2.length);
+            UnicodeUtil.UTF8toUTF16(output.output2, spare);
+            return CHARSEQUENCE_COMPARATOR.compare(spare, key) != 0;
+          }
+        }
+      };
+
+      for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
+        searcher.addStartPaths(path.fstNode, path.output, true, path.input);
      }
 
+      MinResult<Pair<Long,BytesRef>> completions[] = searcher.search();
+
+      for(MinResult<Pair<Long,BytesRef>> completion : completions) {
+        spare.grow(completion.output.output2.length);
+        UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
+        LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
+        //System.out.println("    result=" + result);
+        results.add(result);
+      }
+
+      return results;
     } catch (IOException bogus) {
       throw new RuntimeException(bogus);
     }
+  }
 
-    for (MinResult<Pair<Long,BytesRef>> completion : completions) {
-      spare.grow(completion.output.output2.length);
-      UnicodeUtil.UTF8toUTF16(completion.output.output2, spare);
-      results.add(new LookupResult(spare.toString(), decodeWeight(completion.output.output1)));
-    }
-
-    return results;
-  }
-
   /**
    * Returns the weight associated with an input string,
    * or null if it does not exist.
@@ -455,6 +600,6 @@
   static final Comparator<Pair<Long,BytesRef>> weightComparator = new Comparator<Pair<Long,BytesRef>> () {
     public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) {
       return left.output1.compareTo(right.output1);
-    }  
+    }
   };
 }
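One detail of the build() changes above is worth a concrete illustration: because writeInt is big-endian, unsigned byte-wise sorting of the assembled keys orders entries with the same analyzed form by ascending cost, so the surface forms that maxSurfaceFormsPerAnalyzedForm discards are always the lowest-weight ones. A self-contained sketch (not from the patch; the comparator mimics the unsigned lexicographic order the sorter uses):

    import java.util.Arrays;
    import java.util.Comparator;

    public class BigEndianSortDemo {
      // Key layout per the patch: analyzed bytes, two 0x0 separators,
      // then the cost as a big-endian int.
      static byte[] key(int cost) {
        return new byte[] {
          'a', 'b', 0, 0,
          (byte) (cost >>> 24), (byte) (cost >>> 16),
          (byte) (cost >>> 8), (byte) cost
        };
      }

      public static void main(String[] args) {
        byte[][] keys = { key(300), key(5), key(70000) };
        // Unsigned lexicographic byte order, like Lucene's Sort:
        Arrays.sort(keys, new Comparator<byte[]>() {
          public int compare(byte[] x, byte[] y) {
            for (int i = 0; i < x.length; i++) {
              int cmp = (x[i] & 0xff) - (y[i] & 0xff);
              if (cmp != 0) {
                return cmp;
              }
            }
            return 0;
          }
        });
        for (byte[] k : keys) {
          // Decode the big-endian cost; prints 5, 300, 70000 (ascending):
          int cost = ((k[4] & 0xff) << 24) | ((k[5] & 0xff) << 16)
                   | ((k[6] & 0xff) << 8) | (k[7] & 0xff);
          System.out.println(cost);
        }
      }
    }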
Index: lucene/test-framework/src/java/org/apache/lucene/analysis/CannedBinaryTokenStream.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/analysis/CannedBinaryTokenStream.java	(revision 0)
+++ lucene/test-framework/src/java/org/apache/lucene/analysis/CannedBinaryTokenStream.java	(working copy)
@@ -0,0 +1,130 @@
+package org.apache.lucene.analysis;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * TokenStream from a canned list of binary (BytesRef-based)
+ * tokens.
+ */
+public final class CannedBinaryTokenStream extends TokenStream {
+
+  /** Represents a binary token. */
+  public final static class BinaryToken {
+    BytesRef term;
+    int posInc;
+    int posLen;
+    int startOffset;
+    int endOffset;
+
+    public BinaryToken(BytesRef term) {
+      this.term = term;
+      this.posInc = 1;
+      this.posLen = 1;
+    }
+
+    public BinaryToken(BytesRef term, int posInc, int posLen) {
+      this.term = term;
+      this.posInc = posInc;
+      this.posLen = posLen;
+    }
+  }
+
+  private final BinaryToken[] tokens;
+  private int upto = 0;
+  private final BinaryTermAttribute termAtt = addAttribute(BinaryTermAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+  public interface BinaryTermAttribute extends TermToBytesRefAttribute {
+    public void setBytesRef(BytesRef bytes);
+  }
+
+  public final static class BinaryTermAttributeImpl extends AttributeImpl implements BinaryTermAttribute, TermToBytesRefAttribute {
+    private final BytesRef bytes = new BytesRef();
+
+    @Override
+    public int fillBytesRef() {
+      return bytes.hashCode();
+    }
+
+    @Override
+    public BytesRef getBytesRef() {
+      return bytes;
+    }
+
+    public void setBytesRef(BytesRef bytes) {
+      this.bytes.copyBytes(bytes);
+    }
+
+    @Override
+    public void clear() {
+    }
+
+    @Override
+    public boolean equals(Object other) {
+      return other == this;
+    }
+
+    @Override
+    public int hashCode() {
+      return System.identityHashCode(this);
+    }
+
+    @Override
+    public void copyTo(AttributeImpl target) {
+      BinaryTermAttributeImpl other = (BinaryTermAttributeImpl) target;
+      other.bytes.copyBytes(bytes);
+    }
+
+    @Override
+    public BinaryTermAttributeImpl clone() {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+  public CannedBinaryTokenStream(BinaryToken... tokens) {
+    super();
+    this.tokens = tokens;
+  }
+
+  @Override
+  public boolean incrementToken() {
+    if (upto < tokens.length) {
+      final BinaryToken token = tokens[upto++];
+      // TODO: can we just capture/restoreState so
+      // we get all attrs...?
+      clearAttributes();
+      termAtt.setBytesRef(token.term);
+      posIncrAtt.setPositionIncrement(token.posInc);
+      posLengthAtt.setPositionLength(token.posLen);
+      offsetAtt.setOffset(token.startOffset, token.endOffset);
+      return true;
+    } else {
+      return false;
+    }
+  }
+}

Property changes on: lucene/test-framework/src/java/org/apache/lucene/analysis/CannedBinaryTokenStream.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
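A quick consumption sketch for the new class (illustrative, not part of the patch; it assumes the standard reset/incrementToken/end/close consumer workflow):

    // Two canned binary tokens; the second stacks on the first via posInc=0.
    CannedBinaryTokenStream ts = new CannedBinaryTokenStream(
        new CannedBinaryTokenStream.BinaryToken(new BytesRef(new byte[] {0x61, (byte) 0xff})),
        new CannedBinaryTokenStream.BinaryToken(new BytesRef("abc"), 0, 1));
    TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      termAtt.fillBytesRef();                    // refresh the shared BytesRef
      System.out.println(termAtt.getBytesRef()); // prints the raw token bytes
    }
    ts.end();
    ts.close();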
Index: lucene/core/src/test/org/apache/lucene/index/BinaryTokenStream.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/index/BinaryTokenStream.java	(revision 1387530)
+++ lucene/core/src/test/org/apache/lucene/index/BinaryTokenStream.java	(working copy)
@@ -25,6 +25,7 @@
 /**
  * a binary tokenstream that lets you index a BytesRef
  */
+// nocommit absorb into CannedBinaryTS?
 public final class BinaryTokenStream extends TokenStream {
   private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
   private boolean available = true;
@@ -63,6 +64,8 @@
     }
 
     public void setBytesRef(BytesRef bytes) {
+      // nocommit isn't this dangerous?  the getBytesRef is
+      // pre-shared?  so you can't set this per token?
       this.bytes = bytes;
     }
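The nocommit above is about aliasing: consumers fetch the instance behind getBytesRef() once and reuse it for every token, so an attribute that repoints its field at the caller's BytesRef (instead of copying into one stable instance, as CannedBinaryTokenStream's BinaryTermAttributeImpl does) leaves that cached reference reading stale bytes. A minimal sketch of the difference (illustrative class names, not from the patch):

    import org.apache.lucene.util.BytesRef;

    class SharedBytesRefDemo {
      // Aliasing, as in BinaryTokenStream.ByteTermAttribute: the field is
      // repointed, so a consumer's cached reference misses later tokens.
      static class Aliasing {
        BytesRef bytes = new BytesRef();
        BytesRef getBytesRef() { return bytes; }
        void setBytesRef(BytesRef b) { bytes = b; }
      }

      // Copying, as in BinaryTermAttributeImpl: one stable instance whose
      // contents are refreshed per token, so cached references stay valid.
      static class Copying {
        final BytesRef bytes = new BytesRef();
        BytesRef getBytesRef() { return bytes; }
        void setBytesRef(BytesRef b) { bytes.copyBytes(b); }
      }

      public static void main(String[] args) {
        Aliasing a = new Aliasing();
        BytesRef cachedA = a.getBytesRef();
        a.setBytesRef(new BytesRef("next token"));
        System.out.println(cachedA.utf8ToString()); // "" - missed the update

        Copying c = new Copying();
        BytesRef cachedC = c.getBytesRef();
        c.setBytesRef(new BytesRef("next token"));
        System.out.println(cachedC.utf8ToString()); // "next token" - still valid
      }
    }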
BasicAutomata.makeString("X")); assertTrue(BasicOperations.sameLanguage(expected, actual)); Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/Util.java (revision 1387530) +++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java (working copy) @@ -472,7 +472,10 @@ if (path.arc.label == FST.END_LABEL) { // Add final output: //System.out.println(" done!: " + path); - results.add(new MinResult(path.input, fst.outputs.add(path.cost, path.arc.output), comparator)); + T finalOutput = fst.outputs.add(path.cost, path.arc.output); + if (acceptResult(path.input, finalOutput)) { + results.add(new MinResult(path.input, finalOutput, comparator)); + } break; } else { path.input.grow(1+path.input.length); @@ -487,6 +490,10 @@ (MinResult[]) new MinResult[results.size()]; return results.toArray(arr); } + + protected boolean acceptResult(IntsRef input, T output) { + return true; + } } /** Holds a single input (IntsRef) + output, returned by @@ -830,7 +837,8 @@ scratch.grow(input.length); for(int i=0;i= Byte.MIN_VALUE && value <= Byte.MAX_VALUE: "value " + value + " doesn't fit into byte"; + // NOTE: we allow -128 to 255 + assert value >= Byte.MIN_VALUE && value <= 255: "value " + value + " doesn't fit into byte"; scratch.bytes[i] = (byte) value; } scratch.length = input.length; Index: lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java =================================================================== --- lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (revision 1387530) +++ lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java (working copy) @@ -62,6 +62,13 @@ } } + /** Subclass & implement this if you need to change the + * token (such as escaping certain bytes) before it's + * turned into a graph. */ + protected BytesRef changeToken(BytesRef in) { + return in; + } + /** We create transition between two adjacent tokens. */ public static final int POS_SEP = 256; @@ -72,7 +79,7 @@ * PositionLengthAttribute}) from the provided {@link * TokenStream}, and creates the corresponding * automaton where arcs are bytes from each term. */ - public static Automaton toAutomaton(TokenStream in) throws IOException { + public Automaton toAutomaton(TokenStream in) throws IOException { final Automaton a = new Automaton(); final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); @@ -131,15 +138,16 @@ final int endPos = pos + posLengthAtt.getPositionLength(); termBytesAtt.fillBytesRef(); + final BytesRef term2 = changeToken(term); final Position endPosData = positions.get(endPos); if (endPosData.arriving == null) { endPosData.arriving = new State(); } State state = posData.leaving; - for(int byteIDX=0;byteIDX
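The changeToken hook added here is the seam the suggester's EscapingTokenStreamToAutomaton (earlier in this patch) plugs into. A minimal sketch of such a subclass, under the assumption that stolen bytes are escaped by prefixing a marker byte; the 0x1 marker below is illustrative, not the patch's actual encoding:

    import org.apache.lucene.analysis.TokenStreamToAutomaton;
    import org.apache.lucene.util.BytesRef;

    class EscapingDemo extends TokenStreamToAutomaton {
      private final BytesRef spare = new BytesRef();

      @Override
      protected BytesRef changeToken(BytesRef in) {
        spare.grow(2 * in.length); // worst case: every byte escaped
        int upto = 0;
        for (int i = 0; i < in.length; i++) {
          byte b = in.bytes[in.offset + i];
          if (b == (byte) 0xff || b == 0) {
            spare.bytes[upto++] = 0x1; // hypothetical escape marker
          }
          spare.bytes[upto++] = b;
        }
        spare.offset = 0;
        spare.length = upto;
        return spare;
      }
    }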