Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 1021827)
+++ CHANGES.txt	(working copy)
@@ -100,6 +100,10 @@
   test lock just before the real lock is acquired.  (Surinder Pal
   Singh Bindra via Mike McCandless)
 
+* LUCENE-2701: maxMergeMB constraint set on LogMergePolicy now affects
+  optimize() as well (as opposed to only regular merges). This means that you
+  can run optimize() and too-large segments won't be merged. (Shai Erera)
+
 API Changes
 
 * LUCENE-2076: Rename FSDirectory.getFile -> getDirectory.  (George
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java	(revision 1021827)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java	(working copy)
@@ -5243,4 +5243,41 @@
     _TestUtil.checkIndex(dir);
     dir.close();
   }
+
+  public void testSizeBoundedOptimize() throws Exception {
+    // tests that the max merge size constraint is applied during optimize.
+    Directory dir = new RAMDirectory();
+
+    // Prepare an index w/ several small segments and a large one.
+    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null);
+    // prevent any merges from happening.
+    conf.setMergePolicy(NoMergePolicy.COMPOUND_FILES);
+    IndexWriter writer = new IndexWriter(dir, conf);
+    final int numSegments = 15;
+    for (int i = 0; i < numSegments; i++) {
+      int numDocs = i == 7 ? 10 : 1;
+      for (int j = 0; j < numDocs; j++) {
+        Document doc = new Document();
+        writer.addDocument(doc);
+      }
+      writer.commit();
+    }
+
+    writer.close();
+
+    conf = newIndexWriterConfig(TEST_VERSION_CURRENT, null);
+    LogByteSizeMergePolicy lmp = new LogByteSizeMergePolicy();
+    lmp.setMaxMergeMB(200.0 / (1 << 20)); // ~200 bytes tops (1 MB == 1 << 20 bytes)
+    conf.setMergePolicy(lmp);
+
+    writer = new IndexWriter(dir, conf);
+    writer.optimize();
+    writer.close();
+
+    // Should only be 3 segments in the index, because one of them exceeds the size limit
+    SegmentInfos sis = new SegmentInfos();
+    sis.read(dir);
+    assertEquals(3, sis.size());
+  }
+
 }
Index: src/java/org/apache/lucene/index/MergePolicy.java
===================================================================
--- src/java/org/apache/lucene/index/MergePolicy.java	(revision 1021827)
+++ src/java/org/apache/lucene/index/MergePolicy.java	(working copy)
@@ -77,8 +77,8 @@
   SegmentReader[] readers;        // used by IndexWriter
   SegmentReader[] readersClone;   // used by IndexWriter
   List<String> mergeFiles;        // used by IndexWriter
-  final SegmentInfos segments;
-  final boolean useCompoundFile;
+  public final SegmentInfos segments;
+  public final boolean useCompoundFile;
   boolean aborted;
   Throwable error;
   boolean paused;
@@ -146,7 +146,7 @@
     return paused;
   }
 
-  String segString(Directory dir) {
+  public String segString(Directory dir) {
     StringBuilder b = new StringBuilder();
     final int numSegments = segments.size();
     for(int i=0;i<numSegments;i++) {
@@ -169,7 +169,7 @@
      * The subset of segments to be included in the primitive merge.
      */
 
-    final List<OneMerge> merges = new ArrayList<OneMerge>();
+    public final List<OneMerge> merges = new ArrayList<OneMerge>();
 
     public void add(OneMerge merge) {
       merges.add(merge);
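
A note on the CHANGES.txt entry and the new test above, before the LogMergePolicy changes: the sketch below shows the user-facing effect of this patch. It is not part of the patch -- the directory, analyzer, field name, and the 100 MB cap are illustrative assumptions, and it presumes a 3.x build with the patch applied.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class SizeBoundedOptimizeDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();

    // Cap merged segments at 100 MB (an illustrative value). Before this
    // patch the cap applied only to regular background merges; with the
    // patch, optimize() also leaves any segment larger than the cap alone.
    LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
    mp.setMaxMergeMB(100.0);

    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_CURRENT,
        new StandardAnalyzer(Version.LUCENE_CURRENT));
    conf.setMergePolicy(mp);

    IndexWriter writer = new IndexWriter(dir, conf);
    for (int i = 0; i < 1000; i++) {
      Document doc = new Document();
      doc.add(new Field("id", Integer.toString(i), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
      writer.addDocument(doc);
    }

    // Merges the small segments down, but skips segments already over
    // maxMergeMB, so the index may legitimately end up with more than
    // one segment -- by design.
    writer.optimize();
    writer.close();
    dir.close();
  }
}
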
Index: src/java/org/apache/lucene/index/LogMergePolicy.java
===================================================================
--- src/java/org/apache/lucene/index/LogMergePolicy.java	(revision 1021827)
+++ src/java/org/apache/lucene/index/LogMergePolicy.java	(working copy)
@@ -54,16 +54,16 @@
    *  or larger will never be merged.  @see setMaxMergeDocs */
   public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;
 
-  private int mergeFactor = DEFAULT_MERGE_FACTOR;
+  protected int mergeFactor = DEFAULT_MERGE_FACTOR;
 
-  long minMergeSize;
-  long maxMergeSize;
-  int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
+  protected long minMergeSize;
+  protected long maxMergeSize;
+  protected int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
 
   protected boolean calibrateSizeByDeletes = true;
 
-  private boolean useCompoundFile = true;
-  private boolean useCompoundDocStore = true;
+  protected boolean useCompoundFile = true;
+  protected boolean useCompoundDocStore = true;
 
   public LogMergePolicy() {
     super();
@@ -74,7 +74,7 @@
     return w != null && w.verbose();
   }
 
-  private void message(String message) {
+  protected void message(String message) {
     if (verbose())
       writer.get().message("LMP: " + message);
   }
@@ -180,7 +180,7 @@
     }
   }
 
-  private boolean isOptimized(SegmentInfos infos, int maxNumSegments, Set<SegmentInfo> segmentsToOptimize) throws IOException {
+  protected boolean isOptimized(SegmentInfos infos, int maxNumSegments, Set<SegmentInfo> segmentsToOptimize) throws IOException {
     final int numSegments = infos.size();
     int numToOptimize = 0;
     SegmentInfo optimizeInfo = null;
@@ -199,7 +199,7 @@
   /** Returns true if this single info is optimized (has no
    *  pending norms or deletes, is in the same dir as the
    *  writer, and matches the current compound file setting */
-  private boolean isOptimized(SegmentInfo info)
+  protected boolean isOptimized(SegmentInfo info)
     throws IOException {
     IndexWriter w = writer.get();
     assert w != null;
@@ -210,6 +210,105 @@
       info.getUseCompoundFile() == useCompoundFile;
   }
 
+  /**
+   * Returns the merges necessary to optimize the index, taking the max merge
+   * size into consideration. This method attempts to respect the {@code
+   * maxNumSegments} parameter, however due to the size constraints it may be
+   * that more than that number of segments will remain in the index. Also, this
+   * method does not guarantee that exactly {@code maxNumSegments} will remain,
+   * but rather <= that number.
+   */
+  private MergeSpecification findMergesForOptimizeMaxMergeSize(
+      SegmentInfos infos, int maxNumSegments, int last) throws IOException {
+    MergeSpecification spec = new MergeSpecification();
+
+    int start = last - 1;
+    while (start >= 0) {
+      SegmentInfo info = infos.info(start);
+      if (info.sizeInBytes() > maxMergeSize) {
+        // need to skip that segment + add a merge for the 'right' segments,
+        // unless there is only 1 which is optimized.
+        if (last - start - 1 > 1) {
+          // there is more than 1 segment to the right of this one.
+          spec.add(new OneMerge(infos.range(start + 1, last), useCompoundFile));
+        } else if (start != last - 1 && !isOptimized(infos.info(start + 1))) {
+          spec.add(new OneMerge(infos.range(start + 1, last), useCompoundFile));
+        }
+        last = start;
+      } else if (last - start == mergeFactor) {
+        // mergeFactor eligible segments were found, add them as a merge.
+        spec.add(new OneMerge(infos.range(start, last), useCompoundFile));
+        last = start;
+      }
+      --start;
+    }
+
+    // Add any left-over segments, unless there is just 1 already optimized.
+    if (last > 0 && (++start + 1 < last || !isOptimized(infos.info(start)))) {
+      spec.add(new OneMerge(infos.range(start, last), useCompoundFile));
+    }
+
+    return spec.merges.size() == 0 ? null : spec;
+  }
+
+  /**
+   * Returns the merges necessary to optimize the index. This method constrains
+   * the returned merges only by the {@code maxNumSegments} parameter, and
+   * guarantees that exactly that number of segments will remain in the index.
+   */
+  private MergeSpecification findMergesForOptimizeMaxNumSegments(SegmentInfos infos, int maxNumSegments, int last) throws IOException {
+    MergeSpecification spec = new MergeSpecification();
+
+    // First, enroll all "full" merges (size
+    // mergeFactor) to potentially be run concurrently:
+    while (last - maxNumSegments + 1 >= mergeFactor) {
+      spec.add(new OneMerge(infos.range(last-mergeFactor, last), useCompoundFile));
+      last -= mergeFactor;
+    }
+
+    // Only if there are no full merges pending do we
+    // add a final partial (< mergeFactor segments) merge:
+    if (0 == spec.merges.size()) {
+      if (maxNumSegments == 1) {
+
+        // Since we must optimize down to 1 segment, the
+        // choice is simple:
+        if (last > 1 || !isOptimized(infos.info(0))) {
+          spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
+        }
+      } else if (last > maxNumSegments) {
+
+        // Take care to pick a partial merge that is
+        // least cost, but does not make the index too
+        // lopsided.  If we always just picked the
+        // partial tail then we could produce a highly
+        // lopsided index over time:
+
+        // We must merge this many segments to leave
+        // maxNumSegments in the index (from when
+        // optimize was first kicked off):
+        final int finalMergeSize = last - maxNumSegments + 1;
+
+        // Consider all possible starting points:
+        long bestSize = 0;
+        int bestStart = 0;
+
+        for(int i=0;i<last-finalMergeSize+1;i++) {
+          long sumSize = 0;
+          for(int j=0;j<finalMergeSize;j++)
+            sumSize += size(infos.info(j+i));
+          if (i == 0 || (sumSize < 2*size(infos.info(i-1)) && sumSize < bestSize)) {
+            bestSize = sumSize;
+            bestStart = i;
+          }
+        }
+
+        spec.add(new OneMerge(infos.range(bestStart, bestStart + finalMergeSize), useCompoundFile));
+      }
+    }
+
+    return spec.merges.size() == 0 ? null : spec;
+  }
 
   /** Returns the merges necessary to optimize the index.
    *  This merge policy defines "optimized" to mean only one
    *  segment in the index, where that segment has no
@@ -222,79 +321,44 @@
   @Override
   public MergeSpecification findMergesForOptimize(SegmentInfos infos,
       int maxNumSegments, Set<SegmentInfo> segmentsToOptimize) throws IOException {
-    MergeSpecification spec;
 
     assert maxNumSegments > 0;
 
-    if (!isOptimized(infos, maxNumSegments, segmentsToOptimize)) {
-
-      // Find the newest (rightmost) segment that needs to
-      // be optimized (other segments may have been flushed
-      // since optimize started):
-      int last = infos.size();
-      while(last > 0) {
-        final SegmentInfo info = infos.info(--last);
-        if (segmentsToOptimize.contains(info)) {
-          last++;
-          break;
-        }
+    // If the segments are already optimized (e.g. there's only 1 segment), or
+    // there are <maxNumSegments, all optimized, nothing to do.
+    if (isOptimized(infos, maxNumSegments, segmentsToOptimize)) return null;
+
+    // Find the newest (rightmost) segment that needs to
+    // be optimized (other segments may have been flushed
+    // since optimize started):
+    int last = infos.size();
+    while (last > 0) {
+      final SegmentInfo info = infos.info(--last);
+      if (segmentsToOptimize.contains(info)) {
+        last++;
+        break;
      }
+    }
 
-      if (last > 0) {
+    if (last == 0) return null;
+
+    // There is only one segment already, and it is optimized
+    if (maxNumSegments == 1 && last == 1 && isOptimized(infos.info(0))) return null;
 
-        spec = new MergeSpecification();
-
-        // First, enroll all "full" merges (size
-        // mergeFactor) to potentially be run concurrently:
-        while (last - maxNumSegments + 1 >= mergeFactor) {
-          spec.add(new OneMerge(infos.range(last-mergeFactor, last), useCompoundFile));
-          last -= mergeFactor;
-        }
-
-        // Only if there are no full merges pending do we
-        // add a final partial (< mergeFactor segments) merge:
-        if (0 == spec.merges.size()) {
-          if (maxNumSegments == 1) {
-
-            // Since we must optimize down to 1 segment, the
-            // choice is simple:
-            if (last > 1 || !isOptimized(infos.info(0)))
-              spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
-          } else if (last > maxNumSegments) {
-
-            // Take care to pick a partial merge that is
-            // least cost, but does not make the index too
-            // lopsided.  If we always just picked the
-            // partial tail then we could produce a highly
-            // lopsided index over time:
-
-            // We must merge this many segments to leave
-            // maxNumSegments in the index (from when
-            // optimize was first kicked off):
-            final int finalMergeSize = last - maxNumSegments + 1;
-
-            // Consider all possible starting points:
-            long bestSize = 0;
-            int bestStart = 0;
-
-            for(int i=0;i<last-finalMergeSize+1;i++) {
-              long sumSize = 0;
-              for(int j=0;j<finalMergeSize;j++)
-                sumSize += size(infos.info(j+i));
-              if (i == 0 || (sumSize < 2*size(infos.info(i-1)) && sumSize < bestSize)) {
-                bestSize = sumSize;
-                bestStart = i;
-              }
-            }
-
-            spec.add(new OneMerge(infos.range(bestStart, bestStart+finalMergeSize), useCompoundFile));
-          }
-        }
-      }
-    } else
-      spec = null;
-
-    return spec;
+    boolean anyTooLarge = false;
+    for (int i = 0; i < last; i++) {
+      SegmentInfo info = infos.info(i);
+      if (size(info) > maxMergeSize || sizeDocs(info) > maxMergeDocs) {
+        anyTooLarge = true;
+        break;
+      }
+    }
+
+    if (anyTooLarge) {
+      return findMergesForOptimizeMaxMergeSize(infos, maxNumSegments, last);
+    } else {
+      return findMergesForOptimizeMaxNumSegments(infos, maxNumSegments, last);
    }
   }
 
   /**
Index: contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java
===================================================================
--- contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java	(revision 1021827)
+++ contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java	(working copy)
@@ -103,31 +103,6 @@
     }
   }
 
-  private boolean isOptimized(SegmentInfos infos, IndexWriter writer, int maxNumSegments, Set<SegmentInfo> segmentsToOptimize) throws IOException {
-    final int numSegments = infos.size();
-    int numToOptimize = 0;
-    SegmentInfo optimizeInfo = null;
-    for(int i=0;i<numSegments;i++) {
-      final SegmentInfo info = infos.info(i);
-      if (segmentsToOptimize.contains(info)) {
-        numToOptimize++;
-        optimizeInfo = info;
-      }
-    }
-
-    return numToOptimize <= maxNumSegments &&
-      (numToOptimize != 1 || isOptimized(writer, optimizeInfo));
-  }
-
-  private boolean isOptimized(IndexWriter writer, SegmentInfo info)
-    throws IOException {
-    return !info.hasDeletions() &&
-      !info.hasSeparateNorms() &&
-      info.dir == writer.getDirectory() &&
-      info.getUseCompoundFile() == getUseCompoundFile();
-  }
-
   public MergeSpecification findMergesForOptimize(SegmentInfos infos, int maxNumSegments, Set<SegmentInfo> segmentsToOptimize) throws IOException {
@@ -135,7 +110,7 @@
 
     MergeSpecification spec = null;
 
-    if (!isOptimized(infos, writer.get(), maxNumSegments, segmentsToOptimize)) {
+    if (!isOptimized(infos, maxNumSegments, segmentsToOptimize)) {
 
       // Find the newest (rightmost) segment that needs to
       // be optimized (other segments may have been flushed
@@ -158,7 +133,7 @@
         // Since we must optimize down to 1 segment, the
         // choice is simple:
         boolean useCompoundFile = getUseCompoundFile();
-        if (last > 1 || !isOptimized(writer.get(), infos.info(0))) {
+        if (last > 1 || !isOptimized(infos.info(0))) {
          spec = new MergeSpecification();
          spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
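
To make the grouping performed by findMergesForOptimizeMaxMergeSize easier to review -- and to see why testSizeBoundedOptimize expects exactly 3 segments -- here is a standalone sketch of its selection loop, re-implemented over a plain array of segment sizes. It is a simplification (the isOptimized() checks of the real method are omitted), so treat it as an illustration of the selection logic rather than the method itself.

// Sketch: the segment-grouping loop of findMergesForOptimizeMaxMergeSize,
// run over plain byte sizes so it executes standalone.
public class SizeBoundedGroupingSketch {
  public static void main(String[] args) {
    // 15 segments like testSizeBoundedOptimize: 14 small ones and one
    // large segment at index 7; size cap 200 bytes, merge factor 10.
    long[] sizes = new long[15];
    java.util.Arrays.fill(sizes, 100);
    sizes[7] = 1000;
    long maxMergeSize = 200;
    int mergeFactor = 10;

    int last = sizes.length; // exclusive end of the not-yet-grouped tail
    int start = last - 1;
    while (start >= 0) {
      if (sizes[start] > maxMergeSize) {
        // Too-large segment: merge whatever sits to its right, then skip it.
        if (last - start - 1 >= 1) {
          System.out.println("merge segments [" + (start + 1) + ", " + last + ")");
        }
        last = start;
      } else if (last - start == mergeFactor) {
        // A full window of mergeFactor eligible segments.
        System.out.println("merge segments [" + start + ", " + last + ")");
        last = start;
      }
      --start;
    }
    // Left-over head segments (the real method also checks isOptimized here).
    if (last > 0) {
      System.out.println("merge segments [" + (start + 1) + ", " + last + ")");
    }
    // Output: merge segments [8, 15)
    //         merge segments [0, 7)
    // i.e. two merges of 7 small segments each, while the large segment at
    // index 7 is untouched -- 3 segments remain, matching the new test.
  }
}
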