Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 651026) +++ CHANGES.txt (working copy) @@ -94,6 +94,9 @@ hitting an exception in readInternal, the buffer is incorrectly filled with stale bytes such that subsequent calls to readByte() return incorrect results. (Trejkaz via Mike McCandless) + + 8. LUCENE-1267: Added numDocs() and maxDoc() to IndexWriter; + deprecated docCount(). (Mike McCandless) New features Index: src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 651026) +++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -104,8 +104,11 @@ // optimize the index and check that the new doc count is correct writer = new IndexWriter(dir, true, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); + assertEquals(100, writer.maxDoc()); + assertEquals(60, writer.numDocs()); writer.optimize(); - assertEquals(60, writer.docCount()); + assertEquals(60, writer.maxDoc()); + assertEquals(60, writer.numDocs()); writer.close(); // check that the index reader gives the same numbers. 
@@ -117,7 +120,8 @@ // make sure opening a new index for create over // this existing one works correctly: writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED); - assertEquals(0, writer.docCount()); + assertEquals(0, writer.maxDoc()); + assertEquals(0, writer.numDocs()); writer.close(); } @@ -3030,7 +3034,10 @@ writer = new IndexWriter(dir, false, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); + assertEquals(8, writer.numDocs()); + assertEquals(10, writer.maxDoc()); writer.expungeDeletes(); + assertEquals(8, writer.numDocs()); writer.close(); ir = IndexReader.open(dir); assertEquals(8, ir.maxDoc()); @@ -3075,6 +3082,7 @@ false, new StandardAnalyzer(), IndexWriter.MaxFieldLength.LIMITED); writer.setMergeFactor(3); + assertEquals(49, writer.numDocs()); writer.expungeDeletes(); writer.close(); ir = IndexReader.open(dir); Index: src/test/org/apache/lucene/index/TestCheckIndex.java =================================================================== --- src/test/org/apache/lucene/index/TestCheckIndex.java (revision 651026) +++ src/test/org/apache/lucene/index/TestCheckIndex.java (working copy) @@ -49,7 +49,11 @@ ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); CheckIndex.out = new PrintStream(bos); - assertTrue(CheckIndex.check(dir, false, null)); + if (!CheckIndex.check(dir, false, null)) { + System.out.println("CheckIndex failed"); + System.out.println(bos.toString()); + fail(); + } final List onlySegments = new ArrayList(); onlySegments.add("_0"); assertTrue(CheckIndex.check(dir, false, onlySegments)); Index: src/java/org/apache/lucene/index/SegmentInfo.java =================================================================== --- src/java/org/apache/lucene/index/SegmentInfo.java (revision 651026) +++ src/java/org/apache/lucene/index/SegmentInfo.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; import 
org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BitVector; import java.io.IOException; import java.util.List; import java.util.ArrayList; @@ -73,6 +74,9 @@ // other segments private boolean docStoreIsCompoundFile; // whether doc store files are stored in compound file (*.cfx) + private int delCount; // How many deleted docs in this segment, or -1 if not yet known + // (if it's an older index) + public SegmentInfo(String name, int docCount, Directory dir) { this.name = name; this.docCount = docCount; @@ -84,6 +88,7 @@ docStoreOffset = -1; docStoreSegment = name; docStoreIsCompoundFile = false; + delCount = 0; } public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) { @@ -99,6 +104,7 @@ this.docStoreOffset = docStoreOffset; this.docStoreSegment = docStoreSegment; this.docStoreIsCompoundFile = docStoreIsCompoundFile; + delCount = 0; assert docStoreOffset == -1 || docStoreSegment != null; } @@ -122,6 +128,7 @@ } isCompoundFile = src.isCompoundFile; hasSingleNormFile = src.hasSingleNormFile; + delCount = src.delCount; } /** @@ -168,6 +175,11 @@ } isCompoundFile = input.readByte(); preLockless = (isCompoundFile == CHECK_DIR); + if (format <= SegmentInfos.FORMAT_DEL_COUNT) { + delCount = input.readInt(); + assert delCount <= docCount; + } else + delCount = -1; } else { delGen = CHECK_DIR; normGen = null; @@ -177,6 +189,7 @@ docStoreOffset = -1; docStoreIsCompoundFile = false; docStoreSegment = null; + delCount = -1; } } @@ -263,6 +276,7 @@ SegmentInfo si = new SegmentInfo(name, docCount, dir); si.isCompoundFile = isCompoundFile; si.delGen = delGen; + si.delCount = delCount; si.preLockless = preLockless; si.hasSingleNormFile = hasSingleNormFile; if (normGen != null) { @@ -429,6 +443,23 @@ } } + int getDelCount() throws IOException { + if (delCount == -1) { + if (hasDeletions()) { + final String delFileName = getDelFileName(); + delCount = new BitVector(dir, delFileName).count(); + } else + delCount 
= 0; + } + assert delCount <= docCount; + return delCount; + } + + void setDelCount(int delCount) { + this.delCount = delCount; + assert delCount <= docCount; + } + int getDocStoreOffset() { return docStoreOffset; } @@ -475,6 +506,7 @@ } } output.writeByte(isCompoundFile); + output.writeInt(delCount); } private void addIfExists(List files, String fileName) throws IOException { Index: src/java/org/apache/lucene/index/SegmentReader.java =================================================================== --- src/java/org/apache/lucene/index/SegmentReader.java (revision 651026) +++ src/java/org/apache/lucene/index/SegmentReader.java (working copy) @@ -55,10 +55,12 @@ private boolean deletedDocsDirty = false; private boolean normsDirty = false; private boolean undeleteAll = false; + private int pendingDeleteCount; private boolean rollbackDeletedDocsDirty = false; private boolean rollbackNormsDirty = false; private boolean rollbackUndeleteAll = false; + private int rollbackPendingDeleteCount; IndexInput freqStream; IndexInput proxStream; @@ -351,11 +353,16 @@ if (hasDeletions(si)) { deletedDocs = new BitVector(directory(), si.getDelFileName()); - // Verify # deletes does not exceed maxDoc for this segment: - if (deletedDocs.count() > maxDoc()) { - throw new CorruptIndexException("number of deletes (" + deletedDocs.count() + ") exceeds max doc (" + maxDoc() + ") for segment " + si.name); - } - } + assert si.getDelCount() == deletedDocs.count() : + "delete count mismatch: info=" + si.getDelCount() + " vs BitVector=" + deletedDocs.count(); + + // Verify # deletes does not exceed maxDoc for this + // segment: + assert si.getDelCount() <= maxDoc() : + "delete count mismatch: " + deletedDocs.count() + ") exceeds max doc (" + maxDoc() + ") for segment " + si.name; + + } else + assert si.getDelCount() == 0; } protected synchronized DirectoryIndexReader doReopen(SegmentInfos infos) throws CorruptIndexException, IOException { @@ -525,9 +532,12 @@ // .tmp & renaming it) because the 
file is not live // until segments file is written: deletedDocs.write(directory(), si.getDelFileName()); + + si.setDelCount(si.getDelCount()+pendingDeleteCount); } if (undeleteAll && si.hasDeletions()) { si.clearDelGen(); + si.setDelCount(0); } if (normsDirty) { // re-write norms si.setNumFields(fieldInfos.size()); @@ -620,7 +630,8 @@ deletedDocs = new BitVector(maxDoc()); deletedDocsDirty = true; undeleteAll = false; - deletedDocs.set(docNum); + if (!deletedDocs.getAndSet(docNum)) + pendingDeleteCount++; } protected void doUndeleteAll() { @@ -1009,6 +1020,7 @@ rollbackDeletedDocsDirty = deletedDocsDirty; rollbackNormsDirty = normsDirty; rollbackUndeleteAll = undeleteAll; + rollbackPendingDeleteCount = pendingDeleteCount; Iterator it = norms.values().iterator(); while (it.hasNext()) { Norm norm = (Norm) it.next(); @@ -1021,6 +1033,7 @@ deletedDocsDirty = rollbackDeletedDocsDirty; normsDirty = rollbackNormsDirty; undeleteAll = rollbackUndeleteAll; + pendingDeleteCount = rollbackPendingDeleteCount; Iterator it = norms.values().iterator(); while (it.hasNext()) { Norm norm = (Norm) it.next(); Index: src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 651026) +++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -1187,7 +1187,8 @@ int docStart = 0; boolean any = false; for (int i = 0; i < infosEnd; i++) { - IndexReader reader = SegmentReader.get(infos.info(i), false); + final SegmentInfo info = infos.info(i); + SegmentReader reader = SegmentReader.get(info, false); boolean success = false; try { any |= applyDeletes(reader, docStart); @@ -1212,7 +1213,7 @@ // Apply buffered delete terms, queries and docIDs to the // provided reader - private final synchronized boolean applyDeletes(IndexReader reader, int docIDStart) + private final synchronized boolean applyDeletes(SegmentReader reader, int docIDStart) throws 
CorruptIndexException, IOException { final int docEnd = docIDStart + reader.maxDoc(); Index: src/java/org/apache/lucene/index/SegmentInfos.java =================================================================== --- src/java/org/apache/lucene/index/SegmentInfos.java (revision 651026) +++ src/java/org/apache/lucene/index/SegmentInfos.java (working copy) @@ -61,8 +61,12 @@ * ensure all bytes were successfully written. */ public static final int FORMAT_CHECKSUM = -5; + /** This format adds the deletion count for each segment. + * This way IndexWriter can efficiently report numDocs(). */ + public static final int FORMAT_DEL_COUNT = -6; + /* This must always point to the most recent file format. */ - private static final int CURRENT_FORMAT = FORMAT_CHECKSUM; + static final int CURRENT_FORMAT = FORMAT_DEL_COUNT; public int counter = 0; // used to name new segments /** Index: src/java/org/apache/lucene/index/IndexWriter.java =================================================================== --- src/java/org/apache/lucene/index/IndexWriter.java (revision 651026) +++ src/java/org/apache/lucene/index/IndexWriter.java (working copy) @@ -1798,17 +1798,48 @@ return analyzer; } - /** Returns the number of documents currently in this index. */ + /** Returns the number of documents currently in this + * index, not counting deletions. + * @deprecated Please use {@link #maxDoc()} (same as this + * method) or {@link #numDocs()} (also takes deletions + * into account), instead. */ public synchronized int docCount() { ensureOpen(); + return maxDoc(); + } + + /** Returns total number of docs in this index, including + * docs not yet flushed (still in the RAM buffer), + * not counting deletions. 
+ * @see #numDocs */ + public synchronized int maxDoc() { int count; if (docWriter != null) count = docWriter.getNumDocsInRAM(); else count = 0; + + for (int i = 0; i < segmentInfos.size(); i++) + count += segmentInfos.info(i).docCount; + return count; + } + + /** Returns total number of docs in this index, including + * docs not yet flushed (still in the RAM buffer), and + * taking deletions into account. NOTE: buffered deletions + * are not counted. If you really need these to be + * counted you should call {@link #commit()} first. + * @see #maxDoc */ + public synchronized int numDocs() throws IOException { + int count; + if (docWriter != null) + count = docWriter.getNumDocsInRAM(); + else + count = 0; + for (int i = 0; i < segmentInfos.size(); i++) { - SegmentInfo si = segmentInfos.info(i); - count += si.docCount; + final SegmentInfo info = segmentInfos.info(i); + count += info.docCount - info.getDelCount(); } return count; } @@ -3354,6 +3385,7 @@ BitVector deletes = null; int docUpto = 0; + int delCount = 0; final int numSegmentsToMerge = sourceSegments.size(); for(int i=0;ibit to true, and + * returns true if bit was already set */ + public final boolean getAndSet(int bit) { + if (bit >= size) { + throw new ArrayIndexOutOfBoundsException(bit); + } + final int pos = bit >> 3; + final int v = bits[pos]; + final int flag = 1 << (bit & 7); + if ((flag & v) != 0) + return true; + else { + bits[pos] = (byte) ((v | flag)&0xff); + if (count != -1) + count++; + return false; + } + } + /** Sets the value of bit to zero. */ public final void clear(int bit) { if (bit >= size) {