Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1398854) +++ lucene/CHANGES.txt (working copy) @@ -57,6 +57,9 @@ * LUCENE-4468: Fix rareish integer overflows in Block and Lucene40 postings formats (Robert Muir) +* LUCENE-4485: When CheckIndex counts terms, terms/docs pairs and tokens, + these counts now all exclude deleted documents. (Mike McCandless) + Optimizations * LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets Index: lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java (revision 1398854) +++ lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java (working copy) @@ -75,8 +75,8 @@ assertNotNull(seg.termIndexStatus); assertNull(seg.termIndexStatus.error); - assertEquals(19, seg.termIndexStatus.termCount); - assertEquals(19, seg.termIndexStatus.totFreq); + assertEquals(18, seg.termIndexStatus.termCount); + assertEquals(18, seg.termIndexStatus.totFreq); assertEquals(18, seg.termIndexStatus.totPos); assertNotNull(seg.storedFieldStatus); Index: lucene/core/src/java/org/apache/lucene/index/CheckIndex.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/CheckIndex.java (revision 1398854) +++ lucene/core/src/java/org/apache/lucene/index/CheckIndex.java (working copy) @@ -233,9 +233,12 @@ TermIndexStatus() { } - /** Total term count */ + /** Number of terms with at least one live doc. */ public long termCount = 0L; + /** Number of terms with zero live docs. */ + public long delTermCount = 0L; + /** Total frequency across all terms. 
*/ public long totFreq = 0L; @@ -750,7 +753,7 @@ final TermsEnum termsEnum = terms.iterator(null); boolean hasOrd = true; - final long termCountStart = status.termCount; + final long termCountStart = status.delTermCount + status.termCount; BytesRef lastTerm = null; @@ -781,7 +784,6 @@ if (docFreq <= 0) { throw new RuntimeException("docfreq: " + docFreq + " is out of bounds"); } - status.totFreq += docFreq; sumDocFreq += docFreq; docs = termsEnum.docs(liveDocs, docs); @@ -796,15 +798,13 @@ } if (hasOrd) { - final long ordExpected = status.termCount - termCountStart; + final long ordExpected = status.delTermCount + status.termCount - termCountStart; if (ord != ordExpected) { throw new RuntimeException("ord mismatch: TermsEnum has ord=" + ord + " vs actual=" + ordExpected); } } } - status.termCount++; - final DocsEnum docs2; if (postings != null) { docs2 = postings; @@ -820,6 +820,7 @@ if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } + status.totFreq++; visitedDocs.set(doc); int freq = -1; if (hasFreqs) { @@ -883,6 +884,12 @@ } } + if (docCount != 0) { + status.termCount++; + } else { + status.delTermCount++; + } + final long totalTermFreq2 = termsEnum.totalTermFreq(); final boolean hasTotalTermFreq = hasFreqs && totalTermFreq2 != -1; @@ -1063,11 +1070,11 @@ // check unique term count long termCount = -1; - if (status.termCount-termCountStart > 0) { + if ((status.delTermCount+status.termCount)-termCountStart > 0) { termCount = fields.terms(field).size(); - if (termCount != -1 && termCount != status.termCount - termCountStart) { - throw new RuntimeException("termCount mismatch " + termCount + " vs " + (status.termCount - termCountStart)); + if (termCount != -1 && termCount != status.delTermCount + status.termCount - termCountStart) { + throw new RuntimeException("termCount mismatch " + termCount + " vs " + (status.delTermCount + status.termCount - termCountStart)); } }