Index: lucene/common-build.xml =================================================================== --- lucene/common-build.xml (revision 1400186) +++ lucene/common-build.xml (working copy) @@ -841,10 +841,12 @@ + Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1400186) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy) @@ -161,6 +161,12 @@ * SynonymFilter). */ private final int maxGraphExpansions; + /** Highest number of analyzed paths we saw for any single + * input surface form. For analyzers that never create + * graphs this will always be 1. */ + // nocommit: not saved/loaded ... + private int maxAnalyzedPathsForOneInput; + /** * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int) * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST | @@ -354,6 +360,8 @@ // don't have to alloc [possibly biggish] // intermediate HashSet in RAM: Set paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); + maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size()); + for (IntsRef path : paths) { Util.toBytesRef(path, scratch); @@ -545,7 +553,7 @@ // Searcher just to find the single exact only // match, if present: Util.TopNSearcher> searcher; - searcher = new Util.TopNSearcher>(fst, count * maxSurfaceFormsPerAnalyzedForm, weightComparator); + searcher = new Util.TopNSearcher>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator); // NOTE: we could almost get away with only using // the first start node. The only catch is if @@ -591,16 +599,19 @@ Util.TopNSearcher> searcher; searcher = new Util.TopNSearcher>(fst, - num - results.size(), + num, + num * maxAnalyzedPathsForOneInput, weightComparator) { private final Set seen = new HashSet(); @Override protected boolean acceptResult(IntsRef input, Pair output) { - + + //System.out.println("ACCEPT? path=" + input); // Dedup: when the input analyzes to a graph we // can get duplicate surface forms: if (seen.contains(output.output2)) { + //System.out.println("SKIP: dup"); return false; } seen.add(output.output2); @@ -630,6 +641,12 @@ LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1)); //System.out.println(" result=" + result); results.add(result); + + if (results.size() == num) { + // In the exactFirst=true case the search may + // produce one extra path + break; + } } return results; Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (revision 1400186) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (working copy) @@ -47,7 +47,8 @@ /** * Benchmarks tests for implementations of {@link Lookup} interface. */ -@Ignore("COMMENT ME TO RUN BENCHMARKS!") +// nocommit +//@Ignore("COMMENT ME TO RUN BENCHMARKS!") public class LookupBenchmarkTest extends LuceneTestCase { @SuppressWarnings("unchecked") private final List> benchmarkClasses = Arrays.asList( Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (revision 1400186) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (working copy) @@ -803,4 +803,116 @@ List results = suggester.lookup("a", false, 4); } + + public void testExactFirstMissingResult() throws Exception { + + Analyzer a = new MockAnalyzer(random()); + + AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.EXACT_FIRST, 256, -1); + + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("a", 5), + new TermFreq("a b", 3), + new TermFreq("a c", 4), + })); + + // nocommit make this false and confirm test still + // failed before: + List results = suggester.lookup("a", true, 3); + assertEquals(3, results.size()); + assertEquals("a", results.get(0).key); + assertEquals(5, results.get(0).value); + assertEquals("a c", results.get(1).key); + assertEquals(4, results.get(1).value); + assertEquals("a b", results.get(2).key); + assertEquals(3, results.get(2).value); + } + + public void testDupSurfaceFormsMissingResults() throws Exception { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); + + return new TokenStreamComponents(tokenizer) { + + @Override + public TokenStream getTokenStream() { + return new CannedTokenStream(new Token[] { + token("hairy", 1, 1), + token("smelly", 0, 1), + token("dog", 1, 1), + }); + } + + @Override + protected void setReader(final Reader reader) throws IOException { + } + }; + } + }; + + AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1); + + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("hambone", 6), + new TermFreq("nellie", 5), + })); + + List results = suggester.lookup("nellie", false, 2); + assertEquals(2, results.size()); + assertEquals("hambone", results.get(0).key); + assertEquals(6, results.get(0).value); + assertEquals("nellie", results.get(1).key); + assertEquals(5, results.get(1).value); + } + + public void testDupSurfaceFormsMissingResults2() throws Exception { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true); + + return new TokenStreamComponents(tokenizer) { + + int count; + + @Override + public TokenStream getTokenStream() { + if (count == 0) { + count++; + return new CannedTokenStream(new Token[] { + token("p", 1, 1), + token("q", 1, 1), + token("r", 0, 1), + token("s", 0, 1), + }); + } else { + return new CannedTokenStream(new Token[] { + token("p", 1, 1), + }); + } + } + + @Override + protected void setReader(final Reader reader) throws IOException { + } + }; + } + }; + + AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1); + + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("a", 6), + new TermFreq("b", 5), + })); + + List results = suggester.lookup("a", false, 2); + assertEquals(2, results.size()); + assertEquals("a", results.get(0).key); + assertEquals(6, results.get(0).value); + assertEquals("b", results.get(1).key); + assertEquals(5, results.get(1).value); + } } Index: lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java (revision 1400186) +++ lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java (working copy) @@ -30,6 +30,8 @@ return new Statement() { @Override public void evaluate() throws Throwable { + // nocommit + /* try { assert false; String msg = "Test class requires enabled assertions, enable globally (-ea)" + @@ -39,6 +41,7 @@ } catch (AssertionError e) { // Ok, enabled. } + */ base.evaluate(); } Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/fst/Util.java (revision 1400186) +++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java (working copy) @@ -266,6 +266,7 @@ private final FST fst; private final FST.BytesReader bytesReader; private final int topN; + private final int maxQueueDepth; private final FST.Arc scratchArc = new FST.Arc(); @@ -273,10 +274,11 @@ TreeSet> queue = null; - public TopNSearcher(FST fst, int topN, Comparator comparator) { + public TopNSearcher(FST fst, int topN, int maxQueueDepth, Comparator comparator) { this.fst = fst; this.bytesReader = fst.getBytesReader(0); this.topN = topN; + this.maxQueueDepth = maxQueueDepth; this.comparator = comparator; queue = new TreeSet>(); @@ -290,7 +292,7 @@ T cost = fst.outputs.add(path.cost, path.arc.output); //System.out.println(" addIfCompetitive queue.size()=" + queue.size() + " path=" + path + " + label=" + path.arc.label); - if (queue.size() == topN) { + if (queue.size() == maxQueueDepth) { FSTPath bottom = queue.last(); int comp = comparator.compare(cost, bottom.cost); if (comp > 0) { @@ -323,9 +325,9 @@ queue.add(newPath); - if (queue.size() == topN+1) { + if (queue.size() == maxQueueDepth+1) { queue.pollLast(); - } + } } /** Adds all leaving arcs, including 'finished' arc, if @@ -390,8 +392,6 @@ break; } - //System.out.println(" remove init path=" + path); - if (path.arc.label == FST.END_LABEL) { //System.out.println(" empty string! cost=" + path.cost); // Empty string! @@ -400,10 +400,13 @@ continue; } + // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible: + /* if (results.size() == topN-1) { // Last path -- don't bother w/ queue anymore: queue = null; } + */ //System.out.println(" path: " + path); @@ -512,8 +515,11 @@ * PositiveIntOutputs#getSingleton}). */ public static MinResult[] shortestPaths(FST fst, FST.Arc fromNode, T startOutput, Comparator comparator, int topN, boolean allowEmptyString) throws IOException { - TopNSearcher searcher = new TopNSearcher(fst, topN, comparator); + // All paths are kept, so we can pass topN for + // maxQueueDepth and the pruning is admissible: + TopNSearcher searcher = new TopNSearcher(fst, topN, topN, comparator); + // since this search is initialized with a single start node // it is okay to start with an empty input path here searcher.addStartPaths(fromNode, startOutput, allowEmptyString, new IntsRef());