Index: test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
===================================================================
--- test/org/apache/lucene/index/TestIndexWriterMergePolicy.java	(revision 467489)
+++ test/org/apache/lucene/index/TestIndexWriterMergePolicy.java	(working copy)
@@ -181,14 +181,14 @@
     int ramSegmentCount = writer.getRAMSegmentCount();
     assertTrue(ramSegmentCount < maxBufferedDocs);
 
-    int lowerBound = 0;
+    int lowerBound = -1;
     int upperBound = maxBufferedDocs;
     int numSegments = 0;
 
     int segmentCount = writer.getSegmentCount();
     for (int i = segmentCount - 1; i >= 0; i--) {
       int docCount = writer.getDocCount(i);
-      assertTrue(docCount > lowerBound || docCount == 0);
+      assertTrue(docCount > lowerBound);
 
       if (docCount <= upperBound) {
         numSegments++;
@@ -197,8 +197,10 @@
         assertTrue(numSegments < mergeFactor);
       }
 
-      lowerBound = upperBound;
-      upperBound *= mergeFactor;
+      do {
+        lowerBound = upperBound;
+        upperBound *= mergeFactor;
+      } while (docCount > upperBound);
       numSegments = 1;
     }
   }
Index: test/org/apache/lucene/index/TestAddIndexesNoOptimize.java
===================================================================
--- test/org/apache/lucene/index/TestAddIndexesNoOptimize.java	(revision 0)
+++ test/org/apache/lucene/index/TestAddIndexesNoOptimize.java	(revision 0)
@@ -0,0 +1,381 @@
+package org.apache.lucene.index;
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+public class TestAddIndexesNoOptimize extends TestCase {
+  public void testSimpleCase() throws IOException {
+    // main directory
+    Directory dir = new RAMDirectory();
+    // two auxiliary directories
+    Directory aux = new RAMDirectory();
+    Directory aux2 = new RAMDirectory();
+
+    IndexWriter writer = null;
+
+    writer = newWriter(dir, true);
+    // add 100 documents
+    addDocs(writer, 100);
+    assertEquals(100, writer.docCount());
+    writer.close();
+
+    writer = newWriter(aux, true);
+    writer.setUseCompoundFile(false); // use one without a compound file
+    // add 40 documents in separate files
+    addDocs(writer, 40);
+    assertEquals(40, writer.docCount());
+    writer.close();
+
+    writer = newWriter(aux2, true);
+    // add 50 documents in compound files
+    addDocs2(writer, 50);
+    assertEquals(50, writer.docCount());
+    writer.close();
+
+    // test doc count before segments are merged
+    writer = newWriter(dir, false);
+    assertEquals(100, writer.docCount());
+    writer.addIndexesNoOptimize(new Directory[] { aux, aux2 });
+    assertEquals(190, writer.docCount());
+    writer.close();
+
+    // make sure the old index is correct
+    verifyNumDocs(aux, 40);
+
+    // make sure the new index is correct
+    verifyNumDocs(dir, 190);
+
+    // now add another set in.
+    Directory aux3 = new RAMDirectory();
+    writer = newWriter(aux3, true);
+    // add 40 documents
+    addDocs(writer, 40);
+    assertEquals(40, writer.docCount());
+    writer.close();
+
+    // test doc count before segments are merged/index is optimized
+    writer = newWriter(dir, false);
+    assertEquals(190, writer.docCount());
+    writer.addIndexesNoOptimize(new Directory[] { aux3 });
+    assertEquals(230, writer.docCount());
+    writer.close();
+
+    // make sure the new index is correct
+    verifyNumDocs(dir, 230);
+
+    verifyTermDocs(dir, new Term("content", "aaa"), 180);
+
+    verifyTermDocs(dir, new Term("content", "bbb"), 50);
+
+    // now optimize it.
+    writer = newWriter(dir, false);
+    writer.optimize();
+    writer.close();
+
+    // make sure the new index is correct
+    verifyNumDocs(dir, 230);
+
+    verifyTermDocs(dir, new Term("content", "aaa"), 180);
+
+    verifyTermDocs(dir, new Term("content", "bbb"), 50);
+
+    // now add a single document
+    Directory aux4 = new RAMDirectory();
+    writer = newWriter(aux4, true);
+    addDocs2(writer, 1);
+    writer.close();
+
+    writer = newWriter(dir, false);
+    assertEquals(230, writer.docCount());
+    writer.addIndexesNoOptimize(new Directory[] { aux4 });
+    assertEquals(231, writer.docCount());
+    writer.close();
+
+    verifyNumDocs(dir, 231);
+
+    verifyTermDocs(dir, new Term("content", "bbb"), 51);
+  }
+
+  // case 0: add self or exceed maxMergeDocs, expect exception
+  public void testAddSelf() throws IOException {
+    // main directory
+    Directory dir = new RAMDirectory();
+    // auxiliary directory
+    Directory aux = new RAMDirectory();
+
+    IndexWriter writer = null;
+
+    writer = newWriter(dir, true);
+    // add 100 documents
+    addDocs(writer, 100);
+    assertEquals(100, writer.docCount());
+    writer.close();
+
+    writer = newWriter(aux, true);
+    writer.setUseCompoundFile(false); // use one without a compound file
+    writer.setMaxBufferedDocs(1000);
+    // add 140 documents in separate files
+    addDocs(writer, 40);
+    writer.close();
+    writer = newWriter(aux, true);
+    writer.setUseCompoundFile(false); // use one without a compound file
+    writer.setMaxBufferedDocs(1000);
+    addDocs(writer, 100);
+    writer.close();
+
+    writer = newWriter(dir, false);
+    int maxMergeDocs = writer.getMaxMergeDocs();
+    writer.setMaxMergeDocs(99);
+
+    try {
+      // upper bound cannot exceed maxMergeDocs
+      writer.addIndexesNoOptimize(new Directory[] { aux });
+      assertTrue(false);
+    }
+    catch (IllegalArgumentException e) {
+      assertEquals(100, writer.docCount());
+    }
+
+    writer.setMaxMergeDocs(maxMergeDocs);
+    try {
+      // cannot add self
+      writer.addIndexesNoOptimize(new Directory[] { aux, dir });
+      assertTrue(false);
+    }
+    catch (IllegalArgumentException e) {
+      assertEquals(100, writer.docCount());
+    }
+    writer.close();
+
+    // make sure the index is correct
+    verifyNumDocs(dir, 100);
+  }
+
+  // in all the remaining tests, make the doc count of the oldest segment
+  // in dir large so that it is never merged in addIndexesNoOptimize()
+  // case 1: no tail segments
+  public void testNoTailSegments() throws IOException {
+    // main directory
+    Directory dir = new RAMDirectory();
+    // auxiliary directory
+    Directory aux = new RAMDirectory();
+
+    setUpDirs(dir, aux);
+
+    IndexWriter writer = newWriter(dir, false);
+    writer.setMaxBufferedDocs(10);
+    writer.setMergeFactor(4);
+    addDocs(writer, 10);
+
+    writer.addIndexesNoOptimize(new Directory[] { aux });
+    assertEquals(1040, writer.docCount());
+    assertEquals(2, writer.getSegmentCount());
+    assertEquals(1000, writer.getDocCount(0));
+    writer.close();
+
+    // make sure the index is correct
+    verifyNumDocs(dir, 1040);
+  }
+
+  // case 2: tail segments, invariants hold, no copy
+  public void testNoCopySegments() throws IOException {
+    // main directory
+    Directory dir = new RAMDirectory();
+    // auxiliary directory
+    Directory aux = new RAMDirectory();
+
+    setUpDirs(dir, aux);
+
+    IndexWriter writer = newWriter(dir, false);
+    writer.setMaxBufferedDocs(9);
+    writer.setMergeFactor(4);
+    addDocs(writer, 2);
+
+    writer.addIndexesNoOptimize(new Directory[] { aux });
+    assertEquals(1032, writer.docCount());
+    assertEquals(2, writer.getSegmentCount());
+    assertEquals(1000, writer.getDocCount(0));
+    writer.close();
+
+    // make sure the index is correct
+    verifyNumDocs(dir, 1032);
+  }
+
+  // case 3: tail segments, invariants hold, copy, invariants hold
+  public void testNoMergeAfterCopy() throws IOException {
+    // main directory
+    Directory dir = new RAMDirectory();
+    // auxiliary directory
+    Directory aux = new RAMDirectory();
+
+    setUpDirs(dir, aux);
+
+    IndexWriter writer = newWriter(dir, false);
+    writer.setMaxBufferedDocs(10);
+    writer.setMergeFactor(4);
+
+    writer.addIndexesNoOptimize(new Directory[] { aux, aux });
+    assertEquals(1060, writer.docCount());
+    assertEquals(1000, writer.getDocCount(0));
+    writer.close();
+
+    // make sure the index is correct
+    verifyNumDocs(dir, 1060);
+  }
+
+  // case 4: tail segments, invariants hold, copy, invariants do not hold
+  public void testMergeAfterCopy() throws IOException {
+    // main directory
+    Directory dir = new RAMDirectory();
+    // auxiliary directory
+    Directory aux = new RAMDirectory();
+
+    setUpDirs(dir, aux);
+
+    IndexReader reader = IndexReader.open(aux);
+    for (int i = 0; i < 20; i++) {
+      reader.deleteDocument(i);
+    }
+    assertEquals(10, reader.numDocs());
+    reader.close();
+
+    IndexWriter writer = newWriter(dir, false);
+    writer.setMaxBufferedDocs(4);
+    writer.setMergeFactor(4);
+
+    writer.addIndexesNoOptimize(new Directory[] { aux, aux });
+    assertEquals(1020, writer.docCount());
+    assertEquals(2, writer.getSegmentCount());
+    assertEquals(1000, writer.getDocCount(0));
+    writer.close();
+
+    // make sure the index is correct
+    verifyNumDocs(dir, 1020);
+  }
+
+  // case 5: tail segments, invariants do not hold
+  public void testMoreMerges() throws IOException {
+    // main directory
+    Directory dir = new RAMDirectory();
+    // auxiliary directory
+    Directory aux = new RAMDirectory();
+    Directory aux2 = new RAMDirectory();
+
+    setUpDirs(dir, aux);
+
+    IndexWriter writer = newWriter(aux2, true);
+    writer.setMaxBufferedDocs(100);
+    writer.setMergeFactor(10);
+    writer.addIndexesNoOptimize(new Directory[] { aux });
+    assertEquals(30, writer.docCount());
+    assertEquals(3, writer.getSegmentCount());
+    writer.close();
+
+    IndexReader reader = IndexReader.open(aux);
+    for (int i = 0; i < 27; i++) {
+      reader.deleteDocument(i);
+    }
+    assertEquals(3, reader.numDocs());
+    reader.close();
+
+    reader = IndexReader.open(aux2);
+    for (int i = 0; i < 8; i++) {
+      reader.deleteDocument(i);
+    }
+    assertEquals(22, reader.numDocs());
+    reader.close();
+
+    writer = newWriter(dir, false);
+    writer.setMaxBufferedDocs(6);
+    writer.setMergeFactor(4);
+
+    writer.addIndexesNoOptimize(new Directory[] { aux, aux2 });
+    assertEquals(1025, writer.docCount());
+    assertEquals(1000, writer.getDocCount(0));
+    writer.close();
+
+    // make sure the index is correct
+    verifyNumDocs(dir, 1025);
+  }
+
+  private IndexWriter newWriter(Directory dir, boolean create)
+      throws IOException {
+    return new IndexWriter(dir, new WhitespaceAnalyzer(), create);
+  }
+
+  private void addDocs(IndexWriter writer, int numDocs) throws IOException {
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+      doc.add(new Field("content", "aaa", Field.Store.NO,
+          Field.Index.TOKENIZED));
+      writer.addDocument(doc);
+    }
+  }
+
+  private void addDocs2(IndexWriter writer, int numDocs) throws IOException {
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+      doc.add(new Field("content", "bbb", Field.Store.NO,
+          Field.Index.TOKENIZED));
+      writer.addDocument(doc);
+    }
+  }
+
+  private void verifyNumDocs(Directory dir, int numDocs) throws IOException {
+    IndexReader reader = IndexReader.open(dir);
+    assertEquals(numDocs, reader.maxDoc());
+    assertEquals(numDocs, reader.numDocs());
+    reader.close();
+  }
+
+  private void verifyTermDocs(Directory dir, Term term, int numDocs)
+      throws IOException {
+    IndexReader reader = IndexReader.open(dir);
+    TermDocs termDocs = reader.termDocs(term);
+    int count = 0;
+    while (termDocs.next())
+      count++;
+    assertEquals(numDocs, count);
+    reader.close();
+  }
+
+  private void setUpDirs(Directory dir, Directory aux) throws IOException {
+    IndexWriter writer = null;
+
+    writer = newWriter(dir, true);
+    writer.setMaxBufferedDocs(1000);
+    // add 1000 documents
+    addDocs(writer, 1000);
+    assertEquals(1000, writer.docCount());
+    assertEquals(1, writer.getSegmentCount());
+    writer.close();
+
+    writer = newWriter(aux, true);
+    writer.setUseCompoundFile(false); // use one without a compound file
+    writer.setMaxBufferedDocs(100);
+    writer.setMergeFactor(10);
+    // add 30 documents in 3 segments
+    for (int i = 0; i < 3; i++) {
+      addDocs(writer, 10);
+      writer.close();
+      writer = newWriter(aux, false);
+      writer.setUseCompoundFile(false); // use one without a compound file
+      writer.setMaxBufferedDocs(100);
+      writer.setMergeFactor(10);
+    }
+    assertEquals(30, writer.docCount());
+    assertEquals(3, writer.getSegmentCount());
+    writer.close();
+  }
+}
Index: java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- java/org/apache/lucene/index/IndexWriter.java	(revision 467489)
+++ java/org/apache/lucene/index/IndexWriter.java	(working copy)
@@ -632,6 +632,130 @@
     optimize(); // final cleanup
   }
 
+  /**
+   * Merges all segments from an array of indexes into this index.
+   * <p>
+   * This is similar to addIndexes(Directory[]). However, no optimize()
+   * is called either at the beginning or at the end. Instead, merges
+   * are carried out as necessary.
+   * <p>
+   * This requires this index not be among those to be added, and the
+   * upper bound* of those segment doc counts not exceed maxMergeDocs.
+   */
+  public synchronized void addIndexesNoOptimize(Directory[] dirs)
+      throws IOException {
+    // Adding indexes can be viewed as adding a sequence of segments S to
+    // a sequence of segments T. Segments in T follow the invariants but
+    // segments in S may not since they could come from multiple indexes.
+    // Here is the merge algorithm for addIndexesNoOptimize():
+    //
+    // 1 Flush ram segments.
+    // 2 Consider a combined sequence with segments from T followed
+    //   by segments from S (same as current addIndexes(Directory[])).
+    // 3 Assume the highest level for segments in S is h. Call
+    //   maybeMergeSegments(), but instead of starting w/ lowerBound = -1
+    //   and upperBound = maxBufferedDocs, start w/ lowerBound = -1 and
+    //   upperBound = upperBound of level h. After this, the invariants
+    //   are guaranteed except for the last < M segments whose levels <= h.
+    // 4 If the invariants hold for the last < M segments whose levels <= h,
+    //   if some of those < M segments are from S (not merged in step 3),
+    //   properly copy them over*, otherwise done.
+    //   Otherwise, simply merge those segments. If the merge results in
+    //   a segment of level <= h, done. Otherwise, it's of level h+1 and call
+    //   maybeMergeSegments() starting w/ upperBound = upperBound of level h+1.
+    //
+    // * Ideally, we want to simply copy a segment. However, directory does
+    //   not support copy yet. In addition, source may use compound file or not
+    //   and target may use compound file or not. So we use mergeSegments() to
+    //   copy a segment, which may cause doc count to change because deleted
+    //   docs are garbage collected.
+    //
+    // In the current addIndexes(Directory[]), segment infos in S are added to
+    // T's "segmentInfos" upfront. Then segments in S are merged to T several
+    // at a time. Every merge is committed with T's "segmentInfos". So if
+    // a reader is opened on T while addIndexes() is going on, it could see
+    // an inconsistent index. addIndexesNoOptimize() has a similar behaviour.
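+    //
+    // (Illustrative aside, not part of the original patch: with the default
+    // maxBufferedDocs = 10 and mergeFactor = 10, the level upper bounds run
+    // 10, 100, 1000, ... If the segments in S have doc counts 25 and 3, the
+    // highest level h in S has upper bound 100, so step 3 would call
+    // maybeMergeSegments() with startUpperBound = 100.)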
+
+    // 1 flush ram segments
+    flushRamSegments();
+
+    // 2 copy segment infos and find the highest level from dirs
+    int start = segmentInfos.size();
+    int startUpperBound = minMergeDocs;
+
+    try {
+      for (int i = 0; i < dirs.length; i++) {
+        if (directory == dirs[i]) {
+          // cannot add this index: segments may be deleted in merge before added
+          throw new IllegalArgumentException("Cannot add this index to itself");
+        }
+
+        SegmentInfos sis = new SegmentInfos(); // read infos from dir
+        sis.read(dirs[i]);
+        for (int j = 0; j < sis.size(); j++) {
+          SegmentInfo info = sis.info(j);
+          segmentInfos.addElement(info); // add each info
+
+          while (startUpperBound < info.docCount) {
+            startUpperBound *= mergeFactor; // find the highest level from dirs
+            if (startUpperBound > maxMergeDocs) {
+              // upper bound cannot exceed maxMergeDocs
+              throw new IllegalArgumentException(
+                  "Upper bound cannot exceed maxMergeDocs");
+            }
+          }
+        }
+      }
+    } catch (IllegalArgumentException e) {
+      for (int i = segmentInfos.size() - 1; i >= start; i--) {
+        segmentInfos.remove(i);
+      }
+      throw e;
+    }
+
+    // 3 maybe merge segments starting from the highest level from dirs
+    maybeMergeSegments(startUpperBound);
+
+    // get the tail segments whose levels <= h
+    int segmentCount = segmentInfos.size();
+    int numTailSegments = 0;
+    while (numTailSegments < segmentCount
+        && startUpperBound >= segmentInfos.info(segmentCount - 1 - numTailSegments).docCount) {
+      numTailSegments++;
+    }
+    if (numTailSegments == 0) {
+      return;
+    }
+
+    // 4 make sure invariants hold for the tail segments whose levels <= h
+    if (checkNonDecreasingLevels(segmentCount - numTailSegments)) {
+      // identify the segments from S to be copied (not merged in 3)
+      int numSegmentsToCopy = 0;
+      while (numSegmentsToCopy < segmentCount
+          && directory != segmentInfos.info(segmentCount - 1 - numSegmentsToCopy).dir) {
+        numSegmentsToCopy++;
+      }
+      if (numSegmentsToCopy == 0) {
+        return;
+      }
+
+      // copy those segments from S
+      for (int i = segmentCount - numSegmentsToCopy; i < segmentCount; i++) {
+        mergeSegments(segmentInfos, i, i + 1);
+      }
+      if (checkNonDecreasingLevels(segmentCount - numSegmentsToCopy)) {
+        return;
+      }
+    }
+
+    // invariants do not hold, simply merge those segments
+    mergeSegments(segmentInfos, segmentCount - numTailSegments, segmentCount);
+
+    // maybe merge segments again if necessary
+    if (segmentInfos.info(segmentInfos.size() - 1).docCount > startUpperBound) {
+      maybeMergeSegments(startUpperBound * mergeFactor);
+    }
+  }
+
   /** Merges the provided indexes into this index.
    * <p>After this completes, the index is optimized.</p>
    * <p>The provided IndexReaders are not closed.</p>
@@ -735,16 +859,16 @@
 
   private final void flushRamSegments() throws IOException {
     if (ramSegmentInfos.size() > 0) {
       mergeSegments(ramSegmentInfos, 0, ramSegmentInfos.size());
-      maybeMergeSegments();
+      maybeMergeSegments(minMergeDocs);
     }
   }
 
   /** Incremental segment merger. */
-  private final void maybeMergeSegments() throws IOException {
+  private final void maybeMergeSegments(int startUpperBound) throws IOException {
     long lowerBound = -1;
-    long upperBound = minMergeDocs;
+    long upperBound = startUpperBound;
 
-    while (upperBound * mergeFactor <= maxMergeDocs) {
+    while (upperBound < maxMergeDocs) {
       int minSegment = segmentInfos.size();
       int maxSegment = -1;
@@ -949,4 +1073,22 @@
     }
     directory.renameFile("deleteable.new", IndexFileNames.DELETABLE);
   }
+
+  private final boolean checkNonDecreasingLevels(int start) {
+    int lowerBound = -1;
+    int upperBound = minMergeDocs;
+
+    for (int i = segmentInfos.size() - 1; i >= start; i--) {
+      int docCount = segmentInfos.info(i).docCount;
+      if (docCount <= lowerBound) {
+        return false;
+      }
+
+      while (docCount > upperBound) {
+        lowerBound = upperBound;
+        upperBound *= mergeFactor;
+      }
+    }
+    return true;
+  }
 }
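
Usage sketch (illustrative, not part of the patch): the snippet below shows how
the new addIndexesNoOptimize() would be called, using only APIs that the tests
above already exercise; the class name and document contents are made up for
the example.

import java.io.IOException;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class AddIndexesNoOptimizeExample {
  public static void main(String[] args) throws IOException {
    Directory dir = new RAMDirectory(); // main index
    Directory aux = new RAMDirectory(); // auxiliary index to merge in

    // build a one-document auxiliary index
    IndexWriter writer = new IndexWriter(aux, new WhitespaceAnalyzer(), true);
    Document doc = new Document();
    doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
    writer.addDocument(doc);
    writer.close();

    // build a one-document main index
    writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
    writer.addDocument(doc);

    // merge aux into dir; unlike addIndexes(Directory[]), no optimize()
    // happens before or after the merge
    writer.addIndexesNoOptimize(new Directory[] { aux });
    System.out.println("doc count: " + writer.docCount()); // expect 2
    writer.close();
  }
}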