Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1376521) +++ lucene/CHANGES.txt (working copy) @@ -20,6 +20,10 @@ search performance. This was from Han Jiang's 2012 Google Summer of Code project (Han Jiang, Adrien Grand, Robert Muir, Mike McCandless) +* LUCENE-4323: Added support for an absolute maximum CFS segment size + (in MiB) to LogMergePolicy and TieredMergePolicy. + (Alexey Lef via Uwe Schindler) + API Changes * LUCENE-4299: Added Terms.hasPositions() and Terms.hasOffsets(). Index: lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java (revision 1376521) +++ lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java (working copy) @@ -64,6 +64,13 @@ * @see #setNoCFSRatio */ public static final double DEFAULT_NO_CFS_RATIO = 0.1; + /** Default maxCFSSegmentSize value allows compound file + * for a segment of any size. The actual file format is + * still subject to noCFSRatio. 
+ * @see #setMaxCFSSegmentSizeMB(double) + */ + public static final long DEFAULT_MAX_CFS_SEGMENT_SIZE = Long.MAX_VALUE; + protected int mergeFactor = DEFAULT_MERGE_FACTOR; protected long minMergeSize; @@ -74,6 +81,7 @@ protected int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS; protected double noCFSRatio = DEFAULT_NO_CFS_RATIO; + protected long maxCFSSegmentSize = DEFAULT_MAX_CFS_SEGMENT_SIZE; protected boolean calibrateSizeByDeletes = true; @@ -136,21 +144,21 @@ // Javadoc inherited @Override public boolean useCompoundFile(SegmentInfos infos, SegmentInfoPerCommit mergedInfo) throws IOException { - final boolean doCFS; - - if (!useCompoundFile) { - doCFS = false; - } else if (noCFSRatio == 1.0) { - doCFS = true; - } else { - long totalSize = 0; - for (SegmentInfoPerCommit info : infos) { - totalSize += size(info); - } - - doCFS = size(mergedInfo) <= noCFSRatio * totalSize; + if (!getUseCompoundFile()) { + return false; } - return doCFS; + long mergedInfoSize = size(mergedInfo); + if (mergedInfoSize > maxCFSSegmentSize) { + return false; + } + if (getNoCFSRatio() >= 1.0) { + return true; + } + long totalSize = 0; + for (SegmentInfoPerCommit info : infos) { + totalSize += size(info); + } + return mergedInfoSize <= getNoCFSRatio() * totalSize; } /** Sets whether compound file format should be used for @@ -674,9 +682,28 @@ sb.append("calibrateSizeByDeletes=").append(calibrateSizeByDeletes).append(", "); sb.append("maxMergeDocs=").append(maxMergeDocs).append(", "); sb.append("useCompoundFile=").append(useCompoundFile).append(", "); + sb.append("maxCFSSegmentSizeMB=").append(getMaxCFSSegmentSizeMB()).append(", "); sb.append("noCFSRatio=").append(noCFSRatio); sb.append("]"); return sb.toString(); } - + + /** Returns the largest size (in MB) allowed for a compound file segment */ + public final double getMaxCFSSegmentSizeMB() { + return maxCFSSegmentSize/1024/1024.; + } + + /** If a merged segment will be larger than this size (in MB), + * leave the segment as + * non-compound file even if 
compound file is enabled. + * Set this to Double.POSITIVE_INFINITY (default) and noCFSRatio to 1.0 + * to always use CFS regardless of merge size. */ + public final void setMaxCFSSegmentSizeMB(double v) { + if (v < 0.0) { + throw new IllegalArgumentException("maxCFSSegmentSizeMB must be >=0 (got " + v + ")"); + } + v *= 1024 * 1024; + this.maxCFSSegmentSize = (v > (double) Long.MAX_VALUE) ? Long.MAX_VALUE : (long) v; + } + } Index: lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java =================================================================== --- lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java (revision 1376521) +++ lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java (working copy) @@ -84,6 +84,7 @@ private double forceMergeDeletesPctAllowed = 10.0; private boolean useCompoundFile = true; private double noCFSRatio = 0.1; + private long maxCFSSegmentSize = Long.MAX_VALUE; private double reclaimDeletesWeight = 2.0; /** Maximum number of segments to be merged at a time @@ -127,7 +128,11 @@ * sizes of to-be-merged segments (compensating for * percent deleted docs). Default is 5 GB. */ public TieredMergePolicy setMaxMergedSegmentMB(double v) { - maxMergedSegmentBytes = (long) (v*1024*1024); + if (v < 0.0) { + throw new IllegalArgumentException("maxMergedSegmentMB must be >=0 (got " + v + ")"); + } + v *= 1024 * 1024; + maxMergedSegmentBytes = (v > (double) Long.MAX_VALUE) ? Long.MAX_VALUE : (long) v; return this; } @@ -162,7 +167,8 @@ if (v <= 0.0) { throw new IllegalArgumentException("floorSegmentMB must be >= 0.0 (got " + v + ")"); } - floorSegmentBytes = (long) (v*1024*1024); + v *= 1024 * 1024; + floorSegmentBytes = (v > (double) Long.MAX_VALUE) ? 
Long.MAX_VALUE : (long) v; return this; } @@ -602,21 +608,21 @@ @Override public boolean useCompoundFile(SegmentInfos infos, SegmentInfoPerCommit mergedInfo) throws IOException { - final boolean doCFS; - - if (!useCompoundFile) { - doCFS = false; - } else if (noCFSRatio == 1.0) { - doCFS = true; - } else { - long totalSize = 0; - for (SegmentInfoPerCommit info : infos) { + if (!getUseCompoundFile()) { + return false; + } + long mergedInfoSize = size(mergedInfo); + if (mergedInfoSize > maxCFSSegmentSize) { + return false; + } + if (getNoCFSRatio() >= 1.0) { + return true; + } + long totalSize = 0; + for (SegmentInfoPerCommit info : infos) { totalSize += size(info); - } - - doCFS = size(mergedInfo) <= noCFSRatio * totalSize; } - return doCFS; + return mergedInfoSize <= getNoCFSRatio() * totalSize; } @Override @@ -629,7 +635,7 @@ boolean hasDeletions = w.numDeletedDocs(info) > 0; return !hasDeletions && info.info.dir == w.getDirectory() && - (info.info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0); + (info.info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0 || maxCFSSegmentSize < Long.MAX_VALUE); } // Segment size in bytes, pro-rated by % deleted @@ -664,7 +670,27 @@ sb.append("forceMergeDeletesPctAllowed=").append(forceMergeDeletesPctAllowed).append(", "); sb.append("segmentsPerTier=").append(segsPerTier).append(", "); sb.append("useCompoundFile=").append(useCompoundFile).append(", "); + sb.append("maxCFSSegmentSizeMB=").append(getMaxCFSSegmentSizeMB()).append(", "); sb.append("noCFSRatio=").append(noCFSRatio); return sb.toString(); } + + /** Returns the largest size (in MB) allowed for a compound file segment */ + public final double getMaxCFSSegmentSizeMB() { + return maxCFSSegmentSize/1024/1024.; + } + + /** If a merged segment will be larger than this size (in MB), + * leave the segment as + * non-compound file even if compound file is enabled. 
+ * Set this to Double.POSITIVE_INFINITY (default) and noCFSRatio to 1.0 + * to always use CFS regardless of merge size. */ + public final TieredMergePolicy setMaxCFSSegmentSizeMB(double v) { + if (v < 0.0) { + throw new IllegalArgumentException("maxCFSSegmentSizeMB must be >=0 (got " + v + ")"); + } + v *= 1024 * 1024; + this.maxCFSSegmentSize = (v > (double) Long.MAX_VALUE) ? Long.MAX_VALUE : (long) v; + return this; + } } Index: lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java (revision 1376521) +++ lucene/core/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java (working copy) @@ -263,4 +263,31 @@ assertTrue(numSegments < mergeFactor); } } + + private static final double EPSILON = 1E-14; + + public void testSetters() { + assertSetters(new LogByteSizeMergePolicy()); + assertSetters(new LogDocMergePolicy()); + } + + private void assertSetters(LogMergePolicy lmp) { + lmp.setMaxCFSSegmentSizeMB(2.0); + assertEquals(2.0, lmp.getMaxCFSSegmentSizeMB(), EPSILON); + + lmp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY); + assertEquals(Long.MAX_VALUE/1024/1024., lmp.getMaxCFSSegmentSizeMB(), EPSILON*Long.MAX_VALUE); + + lmp.setMaxCFSSegmentSizeMB(Long.MAX_VALUE/1024/1024.); + assertEquals(Long.MAX_VALUE/1024/1024., lmp.getMaxCFSSegmentSizeMB(), EPSILON*Long.MAX_VALUE); + + try { + lmp.setMaxCFSSegmentSizeMB(-2.0); + fail("Didn't throw IllegalArgumentException"); + } catch (IllegalArgumentException iae) { + // pass + } + + // TODO: Add more checks for other non-double setters! 
+ } } Index: lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java (revision 1376521) +++ lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java (working copy) @@ -153,4 +153,60 @@ dir.close(); } + + private static final double EPSILON = 1E-14; + + public void testSetters() { + final TieredMergePolicy tmp = new TieredMergePolicy(); + + tmp.setMaxMergedSegmentMB(0.5); + assertEquals(0.5, tmp.getMaxMergedSegmentMB(), EPSILON); + + tmp.setMaxMergedSegmentMB(Double.POSITIVE_INFINITY); + assertEquals(Long.MAX_VALUE/1024/1024., tmp.getMaxMergedSegmentMB(), EPSILON*Long.MAX_VALUE); + + tmp.setMaxMergedSegmentMB(Long.MAX_VALUE/1024/1024.); + assertEquals(Long.MAX_VALUE/1024/1024., tmp.getMaxMergedSegmentMB(), EPSILON*Long.MAX_VALUE); + + try { + tmp.setMaxMergedSegmentMB(-2.0); + fail("Didn't throw IllegalArgumentException"); + } catch (IllegalArgumentException iae) { + // pass + } + + tmp.setFloorSegmentMB(2.0); + assertEquals(2.0, tmp.getFloorSegmentMB(), EPSILON); + + tmp.setFloorSegmentMB(Double.POSITIVE_INFINITY); + assertEquals(Long.MAX_VALUE/1024/1024., tmp.getFloorSegmentMB(), EPSILON*Long.MAX_VALUE); + + tmp.setFloorSegmentMB(Long.MAX_VALUE/1024/1024.); + assertEquals(Long.MAX_VALUE/1024/1024., tmp.getFloorSegmentMB(), EPSILON*Long.MAX_VALUE); + + try { + tmp.setFloorSegmentMB(-2.0); + fail("Didn't throw IllegalArgumentException"); + } catch (IllegalArgumentException iae) { + // pass + } + + tmp.setMaxCFSSegmentSizeMB(2.0); + assertEquals(2.0, tmp.getMaxCFSSegmentSizeMB(), EPSILON); + + tmp.setMaxCFSSegmentSizeMB(Double.POSITIVE_INFINITY); + assertEquals(Long.MAX_VALUE/1024/1024., tmp.getMaxCFSSegmentSizeMB(), EPSILON*Long.MAX_VALUE); + + tmp.setMaxCFSSegmentSizeMB(Long.MAX_VALUE/1024/1024.); + assertEquals(Long.MAX_VALUE/1024/1024., tmp.getMaxCFSSegmentSizeMB(), EPSILON*Long.MAX_VALUE); 
+ + try { + tmp.setMaxCFSSegmentSizeMB(-2.0); + fail("Didn't throw IllegalArgumentException"); + } catch (IllegalArgumentException iae) { + // pass + } + + // TODO: Add more checks for other non-double setters! + } } Index: lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java (revision 1376521) +++ lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java (working copy) @@ -765,6 +765,11 @@ } else { logmp.setMergeFactor(_TestUtil.nextInt(r, 10, 50)); } + logmp.setUseCompoundFile(r.nextBoolean()); + logmp.setNoCFSRatio(0.1 + r.nextDouble()*0.8); + if (rarely()) { + logmp.setMaxCFSSegmentSizeMB(0.2 + r.nextDouble() * 2.0); + } return logmp; } @@ -791,6 +796,9 @@ } tmp.setUseCompoundFile(r.nextBoolean()); tmp.setNoCFSRatio(0.1 + r.nextDouble()*0.8); + if (rarely()) { + tmp.setMaxCFSSegmentSizeMB(0.2 + r.nextDouble() * 2.0); + } tmp.setReclaimDeletesWeight(r.nextDouble()*4); return tmp; }