Index: lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java =================================================================== --- lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (revision 1405894) +++ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java (working copy) @@ -983,4 +983,17 @@ assertEquals("b", results.get(1).key); assertEquals(5, results.get(1).value); } + + public void testDupSurfaceFormsMissingResults3() throws Exception { + Analyzer a = new MockAnalyzer(random()); + AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1); + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("a a", 7), + new TermFreq("a a", 7), + new TermFreq("a c", 6), + new TermFreq("a c", 3), + new TermFreq("a b", 5), + })); + assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString()); + } } Index: lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java =================================================================== --- lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (revision 1405894) +++ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (working copy) @@ -340,6 +340,7 @@ try { ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); BytesRef surfaceForm; + while ((surfaceForm = iterator.next()) != null) { Set paths = toFiniteStrings(surfaceForm, ts2a); @@ -379,6 +380,10 @@ // Sort all input/output pairs (required by FST.Builder): new Sort().sort(tempInput, tempSorted); + + // Free disk space: + tempInput.delete(); + reader = new Sort.ByteSequencesReader(tempSorted); PairOutputs outputs = new PairOutputs(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()); @@ -391,6 +396,12 @@ IntsRef scratchInts = new 
IntsRef(); ByteArrayDataInput input = new ByteArrayDataInput(); + // Used to remove duplicate surface forms (but we + // still index the highest-weight one). We clear + // this when we see a new analyzed form, so it cannot + // grow unbounded (at most 256 entries): + Set seenSurfaceForms = new HashSet(); + int dedup = 0; while (reader.read(scratch)) { input.reset(scratch.bytes, scratch.offset, scratch.length); @@ -411,6 +422,7 @@ if (previous == null) { previous = new BytesRef(); previous.copyBytes(analyzed); + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); } else if (analyzed.equals(previous)) { dedup++; if (dedup >= maxSurfaceFormsPerAnalyzedForm) { @@ -418,9 +430,15 @@ // dups: skip the rest: continue; } + if (seenSurfaceForms.contains(surface)) { + continue; + } + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); } else { dedup = 0; previous.copyBytes(analyzed); + seenSurfaceForms.clear(); + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); } analyzed.grow(analyzed.length+2);