Index: lucene/common-build.xml
===================================================================
--- lucene/common-build.xml (revision 1400186)
+++ lucene/common-build.xml (working copy)
@@ -841,10 +841,12 @@
+
Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
===================================================================
--- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1400186)
+++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy)
@@ -161,6 +161,12 @@
* SynonymFilter). */
private final int maxGraphExpansions;
+ /** Highest number of analyzed paths we saw for any single
+ * input surface form. For analyzers that never create
+ * graphs this will always be 1. */
+ // nocommit: not saved/loaded ...
+ private int maxAnalyzedPathsForOneInput;
+
/**
* Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
@@ -354,6 +360,8 @@
// don't have to alloc [possibly biggish]
// intermediate HashSet in RAM:
Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+ maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
+
for (IntsRef path : paths) {
Util.toBytesRef(path, scratch);
@@ -545,7 +553,7 @@
// Searcher just to find the single exact only
// match, if present:
Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
- searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
+ searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
// NOTE: we could almost get away with only using
// the first start node. The only catch is if
@@ -591,16 +599,19 @@
Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
- num - results.size(),
+ num,
+ num * maxAnalyzedPathsForOneInput,
weightComparator) {
private final Set<BytesRef> seen = new HashSet<BytesRef>();
@Override
protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {
-
+
+ //System.out.println("ACCEPT? path=" + input);
// Dedup: when the input analyzes to a graph we
// can get duplicate surface forms:
if (seen.contains(output.output2)) {
+ //System.out.println("SKIP: dup");
return false;
}
seen.add(output.output2);
@@ -630,6 +641,12 @@
LookupResult result = new LookupResult(spare.toString(), decodeWeight(completion.output.output1));
//System.out.println(" result=" + result);
results.add(result);
+
+ if (results.size() == num) {
+ // In the exactFirst=true case the search may
+ // produce one extra path
+ break;
+ }
}
return results;
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (revision 1400186)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java (working copy)
@@ -47,7 +47,8 @@
/**
* Benchmarks tests for implementations of {@link Lookup} interface.
*/
-@Ignore("COMMENT ME TO RUN BENCHMARKS!")
+// nocommit
+//@Ignore("COMMENT ME TO RUN BENCHMARKS!")
public class LookupBenchmarkTest extends LuceneTestCase {
@SuppressWarnings("unchecked")
private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(
Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
===================================================================
--- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (revision 1400186)
+++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (working copy)
@@ -803,4 +803,116 @@
List<LookupResult> results = suggester.lookup("a", false, 4);
}
+
+ public void testExactFirstMissingResult() throws Exception {
+
+ Analyzer a = new MockAnalyzer(random());
+
+ AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.EXACT_FIRST, 256, -1);
+
+ suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+ new TermFreq("a", 5),
+ new TermFreq("a b", 3),
+ new TermFreq("a c", 4),
+ }));
+
+ // nocommit make this false and confirm test still
+ // failed before:
+ List<LookupResult> results = suggester.lookup("a", true, 3);
+ assertEquals(3, results.size());
+ assertEquals("a", results.get(0).key);
+ assertEquals(5, results.get(0).value);
+ assertEquals("a c", results.get(1).key);
+ assertEquals(4, results.get(1).value);
+ assertEquals("a b", results.get(2).key);
+ assertEquals(3, results.get(2).value);
+ }
+
+ public void testDupSurfaceFormsMissingResults() throws Exception {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+
+ return new TokenStreamComponents(tokenizer) {
+
+ @Override
+ public TokenStream getTokenStream() {
+ return new CannedTokenStream(new Token[] {
+ token("hairy", 1, 1),
+ token("smelly", 0, 1),
+ token("dog", 1, 1),
+ });
+ }
+
+ @Override
+ protected void setReader(final Reader reader) throws IOException {
+ }
+ };
+ }
+ };
+
+ AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);
+
+ suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+ new TermFreq("hambone", 6),
+ new TermFreq("nellie", 5),
+ }));
+
+ List<LookupResult> results = suggester.lookup("nellie", false, 2);
+ assertEquals(2, results.size());
+ assertEquals("hambone", results.get(0).key);
+ assertEquals(6, results.get(0).value);
+ assertEquals("nellie", results.get(1).key);
+ assertEquals(5, results.get(1).value);
+ }
+
+ public void testDupSurfaceFormsMissingResults2() throws Exception {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.SIMPLE, true);
+
+ return new TokenStreamComponents(tokenizer) {
+
+ int count;
+
+ @Override
+ public TokenStream getTokenStream() {
+ if (count == 0) {
+ count++;
+ return new CannedTokenStream(new Token[] {
+ token("p", 1, 1),
+ token("q", 1, 1),
+ token("r", 0, 1),
+ token("s", 0, 1),
+ });
+ } else {
+ return new CannedTokenStream(new Token[] {
+ token("p", 1, 1),
+ });
+ }
+ }
+
+ @Override
+ protected void setReader(final Reader reader) throws IOException {
+ }
+ };
+ }
+ };
+
+ AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);
+
+ suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+ new TermFreq("a", 6),
+ new TermFreq("b", 5),
+ }));
+
+ List<LookupResult> results = suggester.lookup("a", false, 2);
+ assertEquals(2, results.size());
+ assertEquals("a", results.get(0).key);
+ assertEquals(6, results.get(0).value);
+ assertEquals("b", results.get(1).key);
+ assertEquals(5, results.get(1).value);
+ }
}
Index: lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java (revision 1400186)
+++ lucene/test-framework/src/java/org/apache/lucene/util/TestRuleAssertionsRequired.java (working copy)
@@ -30,6 +30,8 @@
return new Statement() {
@Override
public void evaluate() throws Throwable {
+ // nocommit
+ /*
try {
assert false;
String msg = "Test class requires enabled assertions, enable globally (-ea)" +
@@ -39,6 +41,7 @@
} catch (AssertionError e) {
// Ok, enabled.
}
+ */
base.evaluate();
}
Index: lucene/core/src/java/org/apache/lucene/util/fst/Util.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/fst/Util.java (revision 1400186)
+++ lucene/core/src/java/org/apache/lucene/util/fst/Util.java (working copy)
@@ -266,6 +266,7 @@
private final FST<T> fst;
private final FST.BytesReader bytesReader;
private final int topN;
+ private final int maxQueueDepth;
private final FST.Arc<T> scratchArc = new FST.Arc<T>();
@@ -273,10 +274,11 @@
TreeSet<FSTPath<T>> queue = null;
- public TopNSearcher(FST<T> fst, int topN, Comparator<T> comparator) {
+ public TopNSearcher(FST<T> fst, int topN, int maxQueueDepth, Comparator<T> comparator) {
this.fst = fst;
this.bytesReader = fst.getBytesReader(0);
this.topN = topN;
+ this.maxQueueDepth = maxQueueDepth;
this.comparator = comparator;
queue = new TreeSet<FSTPath<T>>();
@@ -290,7 +292,7 @@
T cost = fst.outputs.add(path.cost, path.arc.output);
//System.out.println(" addIfCompetitive queue.size()=" + queue.size() + " path=" + path + " + label=" + path.arc.label);
- if (queue.size() == topN) {
+ if (queue.size() == maxQueueDepth) {
FSTPath<T> bottom = queue.last();
int comp = comparator.compare(cost, bottom.cost);
if (comp > 0) {
@@ -323,9 +325,9 @@
queue.add(newPath);
- if (queue.size() == topN+1) {
+ if (queue.size() == maxQueueDepth+1) {
queue.pollLast();
- }
+ }
}
/** Adds all leaving arcs, including 'finished' arc, if
@@ -390,8 +392,6 @@
break;
}
- //System.out.println(" remove init path=" + path);
-
if (path.arc.label == FST.END_LABEL) {
//System.out.println(" empty string! cost=" + path.cost);
// Empty string!
@@ -400,10 +400,13 @@
continue;
}
+ // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
+ /*
if (results.size() == topN-1) {
// Last path -- don't bother w/ queue anymore:
queue = null;
}
+ */
//System.out.println(" path: " + path);
@@ -512,8 +515,11 @@
* PositiveIntOutputs#getSingleton}). */
public static <T> MinResult<T>[] shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN,
boolean allowEmptyString) throws IOException {
- TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, comparator);
+ // All paths are kept, so we can pass topN for
+ // maxQueueDepth and the pruning is admissible:
+ TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, topN, comparator);
+
// since this search is initialized with a single start node
// it is okay to start with an empty input path here
searcher.addStartPaths(fromNode, startOutput, allowEmptyString, new IntsRef());