Index: contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java
===================================================================
--- contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java	(revision 0)
+++ contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java	(revision 0)
@@ -0,0 +1,443 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Set;
+
+/**
+ * Merge policy that tries to balance not doing large
+ * segment merges with not accumulating too many segments in
+ * the index, to provide better performance in a near
+ * real-time setting.
+ *
+ * <p>This is based on code from zoie, described in more detail
+ * at http://code.google.com/p/zoie/wiki/ZoieMergePolicy.</p>
+ */
+public class BalancedSegmentMergePolicy extends LogByteSizeMergePolicy {
+  public static final int DEFAULT_NUM_LARGE_SEGMENTS = 10;
+
+  private boolean _partialExpunge = false;
+  private int _numLargeSegments = DEFAULT_NUM_LARGE_SEGMENTS;
+  private int _maxSmallSegments = 2 * LogMergePolicy.DEFAULT_MERGE_FACTOR;
+  private int _maxSegments = _numLargeSegments + _maxSmallSegments;
+
+  public BalancedSegmentMergePolicy(IndexWriter writer) {
+    super(writer);
+  }
+
+  public void setMergePolicyParams(MergePolicyParams params) {
+    if (params != null) {
+      setPartialExpunge(params._doPartialExpunge);
+      setNumLargeSegments(params._numLargeSegments);
+      setMaxSmallSegments(params._maxSmallSegments);
+      setMergeFactor(params._mergeFactor);
+      setUseCompoundFile(params._useCompoundFile);
+      setMaxMergeDocs(params._maxMergeDocs);
+    }
+  }
+
+  /** Size of a segment, discounted by the proportion of its deleted docs,
+   *  so segments carrying many deletes look smaller to the policy. */
+  protected long size(SegmentInfo info) throws IOException {
+    long byteSize = info.sizeInBytes();
+    float delRatio = (info.docCount <= 0 ? 0.0f : ((float) info.getDelCount() / (float) info.docCount));
+    return (info.docCount <= 0 ? byteSize : (long) ((float) byteSize * (1.0f - delRatio)));
+  }
+
+  public void setPartialExpunge(boolean doPartialExpunge) {
+    _partialExpunge = doPartialExpunge;
+  }
+
+  public boolean getPartialExpunge() {
+    return _partialExpunge;
+  }
+
+  public void setNumLargeSegments(int numLargeSegments) {
+    if (numLargeSegments < 2) {
+      throw new IllegalArgumentException("numLargeSegments cannot be less than 2");
+    }
+
+    _numLargeSegments = numLargeSegments;
+    _maxSegments = _numLargeSegments + 2 * getMergeFactor();
+  }
+
+  public int getNumLargeSegments() {
+    return _numLargeSegments;
+  }
+
+  public void setMaxSmallSegments(int maxSmallSegments) {
+    if (maxSmallSegments < getMergeFactor()) {
+      throw new IllegalArgumentException("maxSmallSegments cannot be less than mergeFactor");
+    }
+    _maxSmallSegments = maxSmallSegments;
+    _maxSegments = _numLargeSegments + _maxSmallSegments;
+  }
+
+  public int getMaxSmallSegments() {
+    return _maxSmallSegments;
+  }
+
+  @Override
+  public void setMergeFactor(int mergeFactor) {
+    super.setMergeFactor(mergeFactor);
+    if (_maxSmallSegments < getMergeFactor()) {
+      _maxSmallSegments = getMergeFactor();
+      _maxSegments = _numLargeSegments + _maxSmallSegments;
+    }
+  }
+
+  private boolean isOptimized(SegmentInfos infos, IndexWriter writer, int maxNumSegments, Set segmentsToOptimize) throws IOException {
+    final int numSegments = infos.size();
+    int numToOptimize = 0;
+    SegmentInfo optimizeInfo = null;
+    for (int i = 0; i < numSegments && numToOptimize <= maxNumSegments; i++) {
+      final SegmentInfo info = infos.info(i);
+      if (segmentsToOptimize.contains(info)) {
+        numToOptimize++;
+        optimizeInfo = info;
+      }
+    }
+
+    return numToOptimize <= maxNumSegments &&
+      (numToOptimize != 1 || isOptimized(writer, optimizeInfo));
+  }
+
+  /** Returns true if this single info is optimized (has no
+   *  pending norms or deletes, is in the same dir as the
+   *  writer, and matches the current compound file setting). */
+  private boolean isOptimized(IndexWriter writer, SegmentInfo info) throws IOException {
+    return !info.hasDeletions() &&
+      !info.hasSeparateNorms() &&
+      info.dir == writer.getDirectory() &&
+      info.getUseCompoundFile() == getUseCompoundFile();
+  }
+
+  @Override
+  public MergeSpecification findMergesForOptimize(SegmentInfos infos, int maxNumSegments, Set segmentsToOptimize) throws IOException {
+    assert maxNumSegments > 0;
+
+    MergeSpecification spec = null;
+
+    if (!isOptimized(infos, writer, maxNumSegments, segmentsToOptimize)) {
+
+      // Find the newest (rightmost) segment that needs to
+      // be optimized (other segments may have been flushed
+      // since optimize started):
+      int last = infos.size();
+      while (last > 0) {
+        final SegmentInfo info = infos.info(--last);
+        if (segmentsToOptimize.contains(info)) {
+          last++;
+          break;
+        }
+      }
+
+      if (last > 0) {
+
+        if (maxNumSegments == 1) {
+
+          // Since we must optimize down to 1 segment, the
+          // choice is simple:
+          boolean useCompoundFile = getUseCompoundFile();
+          if (last > 1 || !isOptimized(writer, infos.info(0))) {
+            spec = new MergeSpecification();
+            spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
+          }
+        } else if (last > maxNumSegments) {
+
+          // find the most balanced merges
+          spec = findBalancedMerges(infos, last, maxNumSegments, _partialExpunge);
+        }
+      }
+    }
+    return spec;
+  }
+
+  private MergeSpecification findBalancedMerges(SegmentInfos infos, int infoLen, int maxNumSegments, boolean partialExpunge)
+    throws IOException {
+    if (infoLen <= maxNumSegments) return null;
+
+    MergeSpecification spec = new MergeSpecification();
+    boolean useCompoundFile = getUseCompoundFile();
+
+    // Use the Viterbi algorithm to find the best segmentation:
+    // we try to minimize the size variance of the resulting segments.
+
+    double[][] variance = createVarianceTable(infos, infoLen, maxNumSegments);
+
+    final int maxMergeSegments = infoLen - maxNumSegments + 1;
+    double[] sumVariance = new double[maxMergeSegments];
+    int[][] backLink = new int[maxNumSegments][maxMergeSegments];
+
+    for (int i = (maxMergeSegments - 1); i >= 0; i--) {
+      sumVariance[i] = variance[0][i];
+      backLink[0][i] = 0;
+    }
+    for (int i = 1; i < maxNumSegments; i++) {
+      for (int j = (maxMergeSegments - 1); j >= 0; j--) {
+        double minV = Double.MAX_VALUE;
+        int minK = 0;
+        for (int k = j; k >= 0; k--) {
+          double v = sumVariance[k] + variance[i + k][j - k];
+          if (v < minV) {
+            minV = v;
+            minK = k;
+          }
+        }
+        sumVariance[j] = minV;
+        backLink[i][j] = minK;
+      }
+    }
+
+    // Now trace the back links to find all merges;
+    // also find a candidate for partial expunge if requested.
+    int mergeEnd = infoLen;
+    int prev = maxMergeSegments - 1;
+    int expungeCandidate = -1;
+    int maxDelCount = 0;
+    for (int i = maxNumSegments - 1; i >= 0; i--) {
+      prev = backLink[i][prev];
+      int mergeStart = i + prev;
+      if ((mergeEnd - mergeStart) > 1) {
+        spec.add(new OneMerge(infos.range(mergeStart, mergeEnd), useCompoundFile));
+      } else {
+        if (partialExpunge) {
+          SegmentInfo info = infos.info(mergeStart);
+          int delCount = info.getDelCount();
+          if (delCount > maxDelCount) {
+            expungeCandidate = mergeStart;
+            maxDelCount = delCount;
+          }
+        }
+      }
+      mergeEnd = mergeStart;
+    }
+
+    if (partialExpunge && maxDelCount > 0) {
+      // expunge deletes from the segment with the most deletions
+      spec.add(new OneMerge(infos.range(expungeCandidate, expungeCandidate + 1), useCompoundFile));
+    }
+
+    return spec;
+  }
+
+  private double[][] createVarianceTable(SegmentInfos infos, int last, int maxNumSegments) throws IOException {
+    int maxMergeSegments = last - maxNumSegments + 1;
+    double[][] variance = new double[last][maxMergeSegments];
+
+    // compute the optimal segment size
+    long optSize = 0;
+    long[] sizeArr = new long[last];
+    for (int i = 0; i < sizeArr.length; i++) {
+      sizeArr[i] = size(infos.info(i));
+      optSize += sizeArr[i];
+    }
+    optSize = (optSize / maxNumSegments);
+
+    for (int i = 0; i < last; i++) {
+      long size = 0;
+      for (int j = 0; j < maxMergeSegments; j++) {
+        if ((i + j) < last) {
+          size += sizeArr[i + j];
+          double residual = ((double) size / (double) optSize) - 1.0d;
+          variance[i][j] = residual * residual;
+        } else {
+          variance[i][j] = Double.NaN;
+        }
+      }
+    }
+    return variance;
+  }
+
+  @Override
+  public MergeSpecification findMergesToExpungeDeletes(SegmentInfos infos)
+    throws CorruptIndexException, IOException {
+    final int numSegs = infos.size();
+    final int numLargeSegs = (numSegs < _numLargeSegments ? numSegs : _numLargeSegments);
+    MergeSpecification spec = null;
+
+    if (numLargeSegs < numSegs) {
+      // let the underlying log merge policy handle the small-segment tier
+      SegmentInfos smallSegments = infos.range(numLargeSegs, numSegs);
+      spec = super.findMergesToExpungeDeletes(smallSegments);
+    }
+
+    if (spec == null) spec = new MergeSpecification();
+    // additionally re-merge any large segment carrying deletions
+    for (int i = 0; i < numLargeSegs; i++) {
+      SegmentInfo info = infos.info(i);
+      if (info.hasDeletions()) {
+        spec.add(new OneMerge(infos.range(i, i + 1), getUseCompoundFile()));
+      }
+    }
+    return spec;
+  }
+
+  @Override
+  public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
+    final int numSegs = infos.size();
+    final int numLargeSegs = _numLargeSegments;
+
+    if (numSegs <= numLargeSegs) {
+      return null;
+    }
+
+    long totalLargeSegSize = 0;
+    long totalSmallSegSize = 0;
+    SegmentInfo info;
+
+    // compute the total size of large segments
+    for (int i = 0; i < numLargeSegs; i++) {
+      info = infos.info(i);
+      totalLargeSegSize += size(info);
+    }
+    // compute the total size of small segments
+    for (int i = numLargeSegs; i < numSegs; i++) {
+      info = infos.info(i);
+      totalSmallSegSize += size(info);
+    }
+
+    long targetSegSize = (totalLargeSegSize / (numLargeSegs - 1));
+    if (targetSegSize <= totalSmallSegSize) {
+      // the total size of the small segments is big enough:
+      // promote the small segments to a large segment and do a balanced merge
+      if (totalSmallSegSize < targetSegSize * 2) {
+        MergeSpecification spec = findBalancedMerges(infos, numLargeSegs, (numLargeSegs - 1), _partialExpunge);
+        if (spec == null) spec = new MergeSpecification(); // should not happen
+        spec.add(new OneMerge(infos.range(numLargeSegs, numSegs), getUseCompoundFile()));
+        return spec;
+      } else {
+        return findBalancedMerges(infos, numSegs, numLargeSegs, _partialExpunge);
+      }
+    } else if (_maxSegments < numSegs) {
+      // we have more than _maxSegments: merge the small segments
+      // smaller than targetSegSize/4 into one
+      MergeSpecification spec = new MergeSpecification();
+      int startSeg = numLargeSegs;
+      long sizeThreshold = (targetSegSize / 4);
+      while (startSeg < numSegs) {
+        info = infos.info(startSeg);
+        if (size(info) < sizeThreshold) break;
+        startSeg++;
+      }
+      spec.add(new OneMerge(infos.range(startSeg, numSegs), getUseCompoundFile()));
+      return spec;
+    } else {
+      // apply the log merge policy to the small segments
+      SegmentInfos smallSegments = infos.range(numLargeSegs, numSegs);
+      MergeSpecification spec = super.findMerges(smallSegments);
+
+      if (_partialExpunge) {
+        OneMerge expunge = findOneSegmentToExpunge(infos, numLargeSegs);
+        if (expunge != null) {
+          if (spec == null) spec = new MergeSpecification();
+          spec.add(expunge);
+        }
+      }
+      return spec;
+    }
+  }
+
+  private OneMerge findOneSegmentToExpunge(SegmentInfos infos, int maxNumSegments) throws IOException {
+    int expungeCandidate = -1;
+    int maxDelCount = 0;
+
+    for (int i = maxNumSegments - 1; i >= 0; i--) {
+      SegmentInfo info = infos.info(i);
+      int delCount = info.getDelCount();
+      if (delCount > maxDelCount) {
+        expungeCandidate = i;
+        maxDelCount = delCount;
+      }
+    }
+    if (maxDelCount > 0) {
+      return new OneMerge(infos.range(expungeCandidate, expungeCandidate + 1), getUseCompoundFile());
+    }
+    return null;
+  }
+
+  public static class MergePolicyParams {
+    private int _numLargeSegments;
+    private int _maxSmallSegments;
+    private boolean _doPartialExpunge;
+    private int _mergeFactor;
+    private boolean _useCompoundFile;
+    private int _maxMergeDocs;
+
+    public MergePolicyParams() {
+      _useCompoundFile = true;
+      _doPartialExpunge = false;
+      _numLargeSegments = DEFAULT_NUM_LARGE_SEGMENTS;
+      _maxSmallSegments = 2 * LogMergePolicy.DEFAULT_MERGE_FACTOR;
+      _mergeFactor = LogMergePolicy.DEFAULT_MERGE_FACTOR;
+      _maxMergeDocs = LogMergePolicy.DEFAULT_MAX_MERGE_DOCS;
+    }
+
+    public void setNumLargeSegments(int numLargeSegments) {
+      _numLargeSegments = numLargeSegments;
+    }
+
+    public int getNumLargeSegments() {
+      return _numLargeSegments;
+    }
+
+    public void setMaxSmallSegments(int maxSmallSegments) {
+      _maxSmallSegments = maxSmallSegments;
+    }
+
+    public int getMaxSmallSegments() {
+      return _maxSmallSegments;
+    }
+
+    public void setPartialExpunge(boolean doPartialExpunge) {
+      _doPartialExpunge = doPartialExpunge;
+    }
+
+    public boolean getPartialExpunge() {
+      return _doPartialExpunge;
+    }
+
+    public void setMergeFactor(int mergeFactor) {
+      _mergeFactor = mergeFactor;
+    }
+
+    public int getMergeFactor() {
+      return _mergeFactor;
+    }
+
+    public void setMaxMergeDocs(int maxMergeDocs) {
+      _maxMergeDocs = maxMergeDocs;
+    }
+
+    public int getMaxMergeDocs() {
+      return _maxMergeDocs;
+    }
+
+    public void setUseCompoundFile(boolean useCompoundFile) {
+      _useCompoundFile = useCompoundFile;
+    }
+
+    public boolean isUseCompoundFile() {
+      return _useCompoundFile;
+    }
+  }
+}

Property changes on: contrib/misc/src/java/org/apache/lucene/index/BalancedSegmentMergePolicy.java
___________________________________________________________________
Added: svn:eol-style
   + native
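For context, a minimal usage sketch for the policy this patch adds. It assumes the Lucene 2.9/3.0-era API the patch is written against (a MergePolicy constructed over the IndexWriter and installed via IndexWriter.setMergePolicy); the RAMDirectory, analyzer, class name, and tuning values below are placeholders, not part of the patch.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.BalancedSegmentMergePolicy;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class BalancedMergePolicyUsage {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory(); // placeholder; use an FSDirectory in practice
    IndexWriter writer = new IndexWriter(dir,
        new StandardAnalyzer(Version.LUCENE_30),
        IndexWriter.MaxFieldLength.UNLIMITED);

    // Configure the balanced policy: cap the large-segment tier at 10
    // segments and fold deletes in eagerly via partial expunge.
    BalancedSegmentMergePolicy.MergePolicyParams params =
        new BalancedSegmentMergePolicy.MergePolicyParams();
    params.setNumLargeSegments(10);
    params.setPartialExpunge(true);

    BalancedSegmentMergePolicy mergePolicy = new BalancedSegmentMergePolicy(writer);
    mergePolicy.setMergePolicyParams(params);
    writer.setMergePolicy(mergePolicy);

    // ... add/update/delete documents; merges now follow the balanced policy ...
    writer.close();
  }
}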
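The heart of findBalancedMerges is a Viterbi-style dynamic program: variance[i][j] holds the squared relative deviation from the ideal size (total size / maxNumSegments) of a merge covering segments i through i+j, and the back-link table recovers the group boundaries that minimize the summed deviation. The toy program below illustrates the same objective on a plain array of sizes using a simpler O(n^2 * m) formulation; it is a sketch of the idea, not the patch's exact table layout, and the class name and sizes are made up.

import java.util.Arrays;

public class BalancedSplitDemo {
  public static void main(String[] args) {
    long[] sizes = {100, 90, 80, 10, 9, 8, 7, 2, 1, 1}; // hypothetical segment sizes
    int m = 3;                                          // target number of segments
    int n = sizes.length;

    long[] prefix = new long[n + 1];
    for (int i = 0; i < n; i++) prefix[i + 1] = prefix[i] + sizes[i];
    double opt = (double) prefix[n] / m;                // ideal merged-segment size

    double[][] dp = new double[m + 1][n + 1];           // dp[g][i]: best cost of first i segments in g groups
    int[][] cut = new int[m + 1][n + 1];                // start index of the last group
    for (double[] row : dp) Arrays.fill(row, Double.MAX_VALUE);
    dp[0][0] = 0.0;

    for (int g = 1; g <= m; g++) {
      for (int i = g; i <= n; i++) {
        for (int k = g - 1; k < i; k++) {               // last group covers segments k..i-1
          double r = (prefix[i] - prefix[k]) / opt - 1.0;
          double cost = dp[g - 1][k] + r * r;
          if (cost < dp[g][i]) { dp[g][i] = cost; cut[g][i] = k; }
        }
      }
    }

    // walk the cut points back to print the chosen grouping
    int end = n;
    for (int g = m; g >= 1; g--) {
      int start = cut[g][end];
      System.out.println("segment group [" + start + ", " + end + ")");
      end = start;
    }
  }
}

On the sizes above this keeps each resulting group close to one third of the total, which is exactly the behavior the patch's variance table encodes (groups of size one correspond to segments left unmerged).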
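To make the branching in findMerges concrete with hypothetical numbers: with _numLargeSegments = 10 and ten large segments totaling 1,800 MB, targetSegSize is 1800 / 9 = 200 MB. A small tier totaling 250 MB crosses one target but not two, so the ten large segments are rebalanced down to nine and the entire small tier is merged into a new tenth large segment; a small tier of 500 MB (two targets or more) instead triggers a balanced re-merge of the whole index back into ten segments. If the small tier stays under 200 MB, the policy steps in only once the segment count exceeds _maxSegments, merging the tail of the small tier, starting at its first segment under 50 MB (a quarter of the target), into one; otherwise it simply delegates the small tier to the inherited LogByteSizeMergePolicy.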