Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 620053)
+++ CHANGES.txt (working copy)
@@ -36,6 +36,11 @@
     the Lucene code base will need to be adapted.  See also the javadocs
     of the Filter class.  (Paul Elschot, Michael Busch)
 
+ 4. LUCENE-325: Added IndexWriter.expungeDeletes methods, which
+    consult the MergePolicy to find merges necessary to merge away
+    all deletes from the index.  This should be a somewhat lower cost
+    operation than optimize.  (John Wang via Mike McCandless)
+
 Bug fixes
 
  1. LUCENE-1134: Fixed BooleanQuery.rewrite to only optimze a single
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 620053)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy)
@@ -2845,4 +2845,140 @@
     reader.close();
     dir.close();
   }
+
+  // LUCENE-325: test expungeDeletes, when 2 singular merges
+  // are required
+  public void testExpungeDeletes() throws IOException {
+    Directory dir = new MockRAMDirectory();
+    IndexWriter writer = new IndexWriter(dir,
+                                         false, new StandardAnalyzer(),
+                                         IndexWriter.MaxFieldLength.LIMITED);
+    writer.setMaxBufferedDocs(2);
+    writer.setRAMBufferSizeMB(IndexWriter.DISABLE_AUTO_FLUSH);
+
+    Document document = new Document();
+
+    document = new Document();
+    Field storedField = new Field("stored", "stored", Field.Store.YES,
+                                  Field.Index.NO);
+    document.add(storedField);
+    Field termVectorField = new Field("termVector", "termVector",
+                                      Field.Store.NO, Field.Index.UN_TOKENIZED,
+                                      Field.TermVector.WITH_POSITIONS_OFFSETS);
+    document.add(termVectorField);
+    for(int i=0;i<10;i++)
+      writer.addDocument(document);
+    writer.close();
+
+    IndexReader ir = IndexReader.open(dir);
+    assertEquals(10, ir.maxDoc());
+    assertEquals(10, ir.numDocs());
+    ir.deleteDocument(0);
+    ir.deleteDocument(7);
+    assertEquals(8, ir.numDocs());
+    ir.close();
+
+    writer = new IndexWriter(dir,
+                             false, new StandardAnalyzer(),
+                             IndexWriter.MaxFieldLength.LIMITED);
+    writer.expungeDeletes();
+    writer.close();
+    ir = IndexReader.open(dir);
+    assertEquals(8, ir.maxDoc());
+    assertEquals(8, ir.numDocs());
+    ir.close();
+    dir.close();
+  }
+
+  // LUCENE-325: test expungeDeletes, when many adjacent merges are required
+  public void testExpungeDeletes2() throws IOException {
+    Directory dir = new MockRAMDirectory();
+    IndexWriter writer = new IndexWriter(dir,
+                                         false, new StandardAnalyzer(),
+                                         IndexWriter.MaxFieldLength.LIMITED);
+    writer.setMaxBufferedDocs(2);
+    writer.setMergeFactor(50);
+    writer.setRAMBufferSizeMB(IndexWriter.DISABLE_AUTO_FLUSH);
+
+    Document document = new Document();
+
+    document = new Document();
+    Field storedField = new Field("stored", "stored", Field.Store.YES,
+                                  Field.Index.NO);
+    document.add(storedField);
+    Field termVectorField = new Field("termVector", "termVector",
+                                      Field.Store.NO, Field.Index.UN_TOKENIZED,
+                                      Field.TermVector.WITH_POSITIONS_OFFSETS);
+    document.add(termVectorField);
+    for(int i=0;i<98;i++)
+      writer.addDocument(document);
+    writer.close();
+
+    IndexReader ir = IndexReader.open(dir);
+    assertEquals(98, ir.maxDoc());
+    assertEquals(98, ir.numDocs());
+    for(int i=0;i<98;i+=2)
+      ir.deleteDocument(i);
+    assertEquals(49, ir.numDocs());
+    ir.close();
+
+    writer = new IndexWriter(dir,
+                             false, new StandardAnalyzer(),
+                             IndexWriter.MaxFieldLength.LIMITED);
+    writer.setMergeFactor(3);
+    writer.expungeDeletes();
+    writer.close();
+    ir = IndexReader.open(dir);
+    assertEquals(49, ir.maxDoc());
+    assertEquals(49, ir.numDocs());
+    ir.close();
+    dir.close();
+  }
+
+  // LUCENE-325: test expungeDeletes without waiting, when
+  // many adjacent merges are required
+  public void testExpungeDeletes3() throws IOException {
+    Directory dir = new MockRAMDirectory();
+    IndexWriter writer = new IndexWriter(dir,
+                                         false, new StandardAnalyzer(),
+                                         IndexWriter.MaxFieldLength.LIMITED);
+    writer.setMaxBufferedDocs(2);
+    writer.setMergeFactor(50);
+    writer.setRAMBufferSizeMB(IndexWriter.DISABLE_AUTO_FLUSH);
+
+    Document document = new Document();
+
+    document = new Document();
+    Field storedField = new Field("stored", "stored", Field.Store.YES,
+                                  Field.Index.NO);
+    document.add(storedField);
+    Field termVectorField = new Field("termVector", "termVector",
+                                      Field.Store.NO, Field.Index.UN_TOKENIZED,
+                                      Field.TermVector.WITH_POSITIONS_OFFSETS);
+    document.add(termVectorField);
+    for(int i=0;i<98;i++)
+      writer.addDocument(document);
+    writer.close();
+
+    IndexReader ir = IndexReader.open(dir);
+    assertEquals(98, ir.maxDoc());
+    assertEquals(98, ir.numDocs());
+    for(int i=0;i<98;i+=2)
+      ir.deleteDocument(i);
+    assertEquals(49, ir.numDocs());
+    ir.close();
+
+    writer = new IndexWriter(dir,
+                             false, new StandardAnalyzer(),
+                             IndexWriter.MaxFieldLength.LIMITED);
+    // Force many merges to happen
+    writer.setMergeFactor(3);
+    writer.expungeDeletes(false);
+    writer.close();
+    ir = IndexReader.open(dir);
+    assertEquals(49, ir.maxDoc());
+    assertEquals(49, ir.numDocs());
+    ir.close();
+    dir.close();
+  }
 }
Index: src/java/org/apache/lucene/index/MergePolicy.java
===================================================================
--- src/java/org/apache/lucene/index/MergePolicy.java (revision 620053)
+++ src/java/org/apache/lucene/index/MergePolicy.java (working copy)
@@ -209,7 +209,7 @@
     throws CorruptIndexException, IOException;
 
   /**
-   * Determine what set of merge operations are necessary in
+   * Determine what set of merge operations is necessary in
    * order to optimize the index.  The IndexWriter calls
    * this when its optimize() method is called.  This call
    * is always synchronized on the IndexWriter instance so
@@ -230,6 +230,19 @@
     throws CorruptIndexException, IOException;
 
   /**
+   * Determine what set of merge operations is necessary in
+   * order to expunge all deletes from the index.
+   * @param segmentInfos the total set of segments in the index
+   * @param writer IndexWriter instance
+   */
+  MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos,
+                                                IndexWriter writer)
+    throws CorruptIndexException, IOException
+  {
+    throw new RuntimeException("not implemented");
+  }
+
+  /**
    * Release all resources for the policy.
    */
   abstract void close();
Index: src/java/org/apache/lucene/index/LogMergePolicy.java
===================================================================
--- src/java/org/apache/lucene/index/LogMergePolicy.java (revision 620053)
+++ src/java/org/apache/lucene/index/LogMergePolicy.java (working copy)
@@ -245,6 +245,54 @@
     return spec;
   }
 
+  /**
+   * Finds merges necessary to expunge all deletes from the
+   * index.  We simply merge adjacent segments that have
+   * deletes, up to mergeFactor at a time.
+   */
+  public MergeSpecification findMergesToExpungeDeletes(SegmentInfos segmentInfos,
+                                                       IndexWriter writer)
+    throws CorruptIndexException, IOException
+  {
+    this.writer = writer;
+
+    final int numSegments = segmentInfos.size();
+
+    message("findMergesToExpungeDeletes: " + numSegments + " segments");
+
+    MergeSpecification spec = new MergeSpecification();
+    int firstSegmentWithDeletions = -1;
+    for(int i=0;i 0) {
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java (revision 620053)
+++ src/java/org/apache/lucene/index/IndexWriter.java (working copy)
@@ -2059,6 +2059,87 @@
     return false;
   }
 
+  /** Just like {@link #expungeDeletes()}, except you can
+   *  specify whether the call should block until the
+   *  operation completes.  This is only meaningful with a
+   *  {@link MergeScheduler} that is able to run merges in
+   *  background threads. */
+  public void expungeDeletes(boolean doWait)
+    throws CorruptIndexException, IOException {
+    ensureOpen();
+
+    if (infoStream != null)
+      message("expungeDeletes: index now " + segString());
+
+    MergePolicy.MergeSpecification spec;
+
+    synchronized(this) {
+      spec = mergePolicy.findMergesToExpungeDeletes(segmentInfos, this);
+      if (spec != null) {
+        final int numMerges = spec.merges.size();
+        for(int i=0;i
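
For reference, the sketch below shows how application code might use the new API once the patch is applied. It is not part of the patch itself; it simply mirrors the flow of the tests above. The class name ExpungeDeletesExample and the use of a plain RAMDirectory are illustrative assumptions, and the IndexWriter constructor is the same one the new tests use.

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

// Hypothetical usage sketch for IndexWriter.expungeDeletes (LUCENE-325).
public class ExpungeDeletesExample {
  public static void main(String[] args) throws IOException {
    Directory dir = new RAMDirectory();

    // Build a small index of 100 identical documents.
    IndexWriter writer = new IndexWriter(dir, false, new StandardAnalyzer(),
                                         IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    doc.add(new Field("stored", "stored", Field.Store.YES, Field.Index.NO));
    for (int i = 0; i < 100; i++)
      writer.addDocument(doc);
    writer.close();

    // Mark every other document as deleted; maxDoc() still reports 100
    // because the deleted docs keep occupying space in their segments.
    IndexReader reader = IndexReader.open(dir);
    for (int i = 0; i < 100; i += 2)
      reader.deleteDocument(i);
    reader.close();

    // Merge away only the segments that carry deletions; this should be
    // cheaper than optimize(), which rewrites the whole index.
    writer = new IndexWriter(dir, false, new StandardAnalyzer(),
                             IndexWriter.MaxFieldLength.LIMITED);
    writer.expungeDeletes();          // blocks until the merges complete
    // writer.expungeDeletes(false);  // or return immediately when a
    //                                // background MergeScheduler is used
    writer.close();

    // After expunging, maxDoc() drops to match numDocs() (50 here).
    reader = IndexReader.open(dir);
    System.out.println("maxDoc=" + reader.maxDoc()
                       + " numDocs=" + reader.numDocs());
    reader.close();
    dir.close();
  }
}

Because expungeDeletes only asks the MergePolicy for merges over segments that actually carry deletions, it should touch far fewer segments than optimize() on a large, mostly clean index.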