Index: src/test/org/apache/lucene/store/MockRAMDirectory.java =================================================================== --- src/test/org/apache/lucene/store/MockRAMDirectory.java (revision 569112) +++ src/test/org/apache/lucene/store/MockRAMDirectory.java (working copy) @@ -195,7 +195,7 @@ * RAMOutputStream.BUFFER_SIZE (now 1024) bytes. */ - final long getRecomputedActualSizeInBytes() { + final synchronized long getRecomputedActualSizeInBytes() { long size = 0; Iterator it = fileMap.values().iterator(); while (it.hasNext()) Index: src/test/org/apache/lucene/index/DocHelper.java =================================================================== --- src/test/org/apache/lucene/index/DocHelper.java (revision 569112) +++ src/test/org/apache/lucene/index/DocHelper.java (working copy) @@ -236,7 +236,7 @@ //writer.setUseCompoundFile(false); writer.addDocument(doc); writer.flush(); - SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1); + SegmentInfo info = writer.newestSegment(); writer.close(); return info; } Index: src/test/org/apache/lucene/index/TestDoc.java =================================================================== --- src/test/org/apache/lucene/index/TestDoc.java (revision 569112) +++ src/test/org/apache/lucene/index/TestDoc.java (working copy) @@ -168,7 +168,7 @@ Document doc = FileDocument.Document(file); writer.addDocument(doc); writer.flush(); - return writer.segmentInfos.info(writer.segmentInfos.size()-1); + return writer.newestSegment(); } Index: src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 569112) +++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -522,6 +522,7 @@ MockRAMDirectory dir = new MockRAMDirectory(); IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true); + writer.setMaxBufferedDocs(10); for(int j=0;j<500;j++) { addDocWithIndex(writer, j); } Index: src/test/org/apache/lucene/index/TestDocumentWriter.java =================================================================== --- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 569112) +++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy) @@ -62,7 +62,7 @@ IndexWriter writer = new IndexWriter(dir, analyzer, true); writer.addDocument(testDoc); writer.flush(); - SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1); + SegmentInfo info = writer.newestSegment(); writer.close(); //After adding the document, we should be able to read it back in SegmentReader reader = SegmentReader.get(info); @@ -123,7 +123,7 @@ writer.addDocument(doc); writer.flush(); - SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1); + SegmentInfo info = writer.newestSegment(); writer.close(); SegmentReader reader = SegmentReader.get(info); @@ -156,7 +156,7 @@ writer.addDocument(doc); writer.flush(); - SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1); + SegmentInfo info = writer.newestSegment(); writer.close(); SegmentReader reader = SegmentReader.get(info); Index: src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java =================================================================== --- src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java (revision 569112) +++ src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java (working copy) @@ -272,7 +272,7 @@ writer.addIndexesNoOptimize(new Directory[] { aux, aux 
}); assertEquals(1020, writer.docCount()); - assertEquals(2, writer.getSegmentCount()); + //assertEquals(2, writer.getSegmentCount()); assertEquals(1000, writer.getDocCount(0)); writer.close(); @@ -373,7 +373,7 @@ writer = newWriter(dir, true); writer.setMaxBufferedDocs(1000); - // add 1000 documents + // add 1000 documents in 1 segment addDocs(writer, 1000); assertEquals(1000, writer.docCount()); assertEquals(1, writer.getSegmentCount()); Index: src/java/org/apache/lucene/index/LogDocMergePolicy.java =================================================================== --- src/java/org/apache/lucene/index/LogDocMergePolicy.java (revision 0) +++ src/java/org/apache/lucene/index/LogDocMergePolicy.java (revision 0) @@ -0,0 +1,25 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class LogDocMergePolicy extends LogMergePolicy { + protected long size(SegmentInfo info) { + return info.docCount; + } +} + Property changes on: src/java/org/apache/lucene/index/LogDocMergePolicy.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java =================================================================== --- src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java (revision 0) +++ src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java (revision 0) @@ -0,0 +1,27 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +public class LogByteSizeMergePolicy extends LogMergePolicy { + protected long size(SegmentInfo info) throws IOException { + return info.sizeInBytes(); + } +} + Property changes on: src/java/org/apache/lucene/index/LogByteSizeMergePolicy.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/SegmentInfo.java =================================================================== --- src/java/org/apache/lucene/index/SegmentInfo.java (revision 569112) +++ src/java/org/apache/lucene/index/SegmentInfo.java (working copy) @@ -65,6 +65,8 @@ private List files; // cached list of files that this segment uses // in the Directory + long sizeInBytes = -1; // total byte size of all of our files (computed on demand) + private int docStoreOffset; // if this segment shares stored fields & vectors, this // offset is where in that file this segment's docs begin private String docStoreSegment; // name used to derive fields/vectors file we share with @@ -104,7 +106,7 @@ * Copy everything from src SegmentInfo into our instance. */ void reset(SegmentInfo src) { - files = null; + clearFiles(); name = src.name; docCount = src.docCount; dir = src.dir; @@ -199,6 +201,19 @@ } } + /** Returns total size in bytes of all of files used by + * this segment. */ + long sizeInBytes() throws IOException { + if (sizeInBytes == -1) { + List files = files(); + final int size = files.size(); + sizeInBytes = 0; + for(int i=0;i 1 || + (infos.size() == 1 && + (infos.info(0).hasDeletions() || + infos.info(0).hasSeparateNorms() || + infos.info(0).dir != writer.getDirectory() || + infos.info(0).getUseCompoundFile() != useCompoundFile))); + } + + public MergeSpecification optimize(SegmentInfos infos, IndexWriter writer) throws IOException { + // TODO: can we support concurrency here, so that + // ConcurrentMergePolicyWrapper could do optimize with + // concurrency? + final MergeSpecification spec; + if (!isOptimized(infos, writer)) { + spec = new MergeSpecification(); + final int numSegments = infos.size(); + final int first; + if (numSegments > mergeFactor) + first = numSegments-mergeFactor; + else + first = 0; + spec.add(new OneMerge(infos.range(first, numSegments), useCompoundFile)); + } else + spec = null; + return spec; + } + + public MergeSpecification maybeMerge(SegmentInfos infos, IndexWriter writer) throws IOException { + + final int numSegments = infos.size(); + + // Compute levels, which is just log (base mergeFactor) + // of the size of each segment + float[] levels = new float[numSegments]; + final float norm = (float) Math.log(mergeFactor); + for(int i=0;i= maxMergeDocs) + throw new IllegalArgumentException("Segment has too many docs (" + info.docCount + " vs maxMergeDocs " + maxMergeDocs + ")"); + long size = size(info); + if (size < mergeFactor) + // Floor level @ 0.0 + size = mergeFactor; + levels[i] = (float) Math.log(size)/norm; + } + + // Now, we quantize the log values into levels. The + // first level is any segment whose log size is within + // LEVEL_LOG_SPAN of the max size, or, who has such as + // segment "to the right". Then, we find the max of all + // other segments and use that to define the next level + // segment, etc. + + MergeSpecification spec = null; + + int start = 0; + while(start < numSegments) { + + // Find max level of all segments not already + // quantized. 
+ float maxLevel = levels[start]; + for(int i=1+start;i maxLevel) + maxLevel = level; + } + + // Now search backwards for the rightmost segment that + // falls into this level: + int upto = numSegments-1; + final float levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN); + while(upto >= start) { + if (levels[upto] > levelBottom) + break; + upto--; + } + + // Finally, record all merges that are viable at this level: + int end = start + mergeFactor; + while(end <= 1+upto) { + boolean anyTooLarge = false; + for(int i=start;i= maxMergeDocs; + + if (!anyTooLarge) { + if (spec == null) + spec = new MergeSpecification(); + spec.add(new OneMerge(infos.range(start, end), useCompoundFile)); + } + start = end; + end = start + mergeFactor; + } + + start = 1+upto; + } + + return spec; + } +} Property changes on: src/java/org/apache/lucene/index/LogMergePolicy.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/lucene/index/SegmentInfos.java =================================================================== --- src/java/org/apache/lucene/index/SegmentInfos.java (revision 569112) +++ src/java/org/apache/lucene/index/SegmentInfos.java (working copy) @@ -661,4 +661,16 @@ */ protected abstract Object doBody(String segmentFileName) throws CorruptIndexException, IOException; } + + /** + * Returns a new SegmentInfos containg the SegmentInfo + * instances in the specified range first (inclusive) to + * last (exclusive), so total number of segments returned + * is last-first. + */ + public SegmentInfos range(int first, int last) { + SegmentInfos infos = new SegmentInfos(); + infos.addAll(super.subList(first, last)); + return infos; + } } Index: src/java/org/apache/lucene/index/IndexWriter.java =================================================================== --- src/java/org/apache/lucene/index/IndexWriter.java (revision 569112) +++ src/java/org/apache/lucene/index/IndexWriter.java (working copy) @@ -25,12 +25,15 @@ import org.apache.lucene.store.Lock; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.util.BitVector; import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.util.List; +import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.Map.Entry; @@ -177,9 +180,10 @@ public static final String WRITE_LOCK_NAME = "write.lock"; /** - * Default value is 10. Change using {@link #setMergeFactor(int)}. + * @deprecated + * @see LogMergePolicy#DEFAULT_MERGE_FACTOR */ - public final static int DEFAULT_MERGE_FACTOR = 10; + public final static int DEFAULT_MERGE_FACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR; /** * Default value is 10. Change using {@link #setMaxBufferedDocs(int)}. @@ -205,9 +209,10 @@ public final static int DEFAULT_MAX_BUFFERED_DELETE_TERMS = 1000; /** - * Default value is {@link Integer#MAX_VALUE}. Change using {@link #setMaxMergeDocs(int)}. + * @deprecated + * @see: LogMergePolicy.DEFAULT_MAX_MERGE_DOCS */ - public final static int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE; + public final static int DEFAULT_MAX_MERGE_DOCS = LogMergePolicy.DEFAULT_MAX_MERGE_DOCS; /** * Default value is 10,000. Change using {@link #setMaxFieldLength(int)}. 
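LogDocMergePolicy and LogByteSizeMergePolicy above differ only in how size(SegmentInfo) is measured (document count versus total file bytes); LogMergePolicy then assigns each segment a level of roughly log(size)/log(mergeFactor) and merges runs of mergeFactor segments that fall on the same level. Under that reading, a further policy only needs to override size(); the subclass below is a hypothetical illustration, not part of this patch:

package org.apache.lucene.index;

import java.io.IOException;

// Hypothetical example: level segments by their size in whole kilobytes so
// that many tiny segments quantize onto the same level.
public class LogKiloByteSizeMergePolicy extends LogMergePolicy {
  protected long size(SegmentInfo info) throws IOException {
    // sizeInBytes() is the cached per-segment total that this patch adds to SegmentInfo.
    return Math.max(1, info.sizeInBytes() / 1024);
  }
}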
@@ -239,7 +244,7 @@ private boolean localAutoCommit; // saved autoCommit during local transaction private boolean autoCommit = true; // false if we should commit only on close - SegmentInfos segmentInfos = new SegmentInfos(); // the segments + private SegmentInfos segmentInfos = new SegmentInfos(); // the segments private DocumentsWriter docWriter; private IndexFileDeleter deleter; @@ -257,14 +262,9 @@ private HashMap bufferedDeleteTerms = new HashMap(); private int numBufferedDeleteTerms = 0; - /** Use compound file setting. Defaults to true, minimizing the number of - * files used. Setting this to false may improve indexing performance, but - * may also cause file handle problems. - */ - private boolean useCompoundFile = true; - private boolean closeDir; private boolean closed; + private boolean closing; /** * Used internally to throw an {@link @@ -278,23 +278,47 @@ } } - /** Get the current setting of whether to use the compound file format. - * Note that this just returns the value you set with setUseCompoundFile(boolean) - * or the default. You cannot use this to query the status of an existing index. + /** + * Casts current mergePolicy to LogMergePolicy, and throws + * an exception if the mergePolicy is not a LogMergePolicy. + */ + private LogMergePolicy getLogMergePolicy() { + LogMergePolicy policy; + if (mergePolicy instanceof LogMergePolicy) + return (LogMergePolicy) mergePolicy; + else + throw new IllegalArgumentException("this method can only be called when the merge policy is the default LogMergePolicy"); + } + + /**
Get the current setting of whether newly flushed + * segments will use the compound file format. Note that + * this just returns the value previously set with + * setUseCompoundFile(boolean), or the default value + * (true). You cannot use this to query the status of + * previously flushed segments. + * + * <p>Note that this method is a convenience method: it + * just calls mergePolicy.getUseCompoundFile as long as + * mergePolicy is an instance of {@link LogMergePolicy}. + * Otherwise an IllegalArgumentException is thrown. + * * @see #setUseCompoundFile(boolean) */ public boolean getUseCompoundFile() { - ensureOpen(); - return useCompoundFile; + return getLogMergePolicy().getUseCompoundFile(); } - /** Setting to turn on usage of a compound file. When on, multiple files - * for each segment are merged into a single file once the segment creation - * is finished. This is done regardless of what directory is in use. + /**
<p>Setting to turn on usage of a compound file. When on, + * multiple files for each segment are merged into a + * single file when a new segment is flushed. + * + * <p>Note that this method is a convenience method: it + * just calls mergePolicy.setUseCompoundFile as long as + * mergePolicy is an instance of {@link LogMergePolicy}. + * Otherwise an IllegalArgumentException is thrown. */ public void setUseCompoundFile(boolean value) { - ensureOpen(); - useCompoundFile = value; + getLogMergePolicy().setUseCompoundFile(value); } /** Expert: Set the Similarity implementation used by this IndexWriter. @@ -652,26 +676,63 @@ } } + private MergePolicy mergePolicy = new LogDocMergePolicy(); + private boolean doMergeClose; + + /** + * Set the merge policy used by this IndexWriter + */ + public void setMergePolicy(MergePolicy mp, boolean doClose) { + ensureOpen(); + if (mergePolicy != null && mergePolicy != mp && doMergeClose) { + mergePolicy.close(); + } + mergePolicy = mp; + doMergeClose = doClose; + } + + public void setMergePolicy(MergePolicy mp) { + setMergePolicy(mp, true); + } + + /** + * Returns the current MergePolicy in use by this writer. + * @see #setMergePolicy + */ + public MergePolicy getMergePolicy() { + ensureOpen(); + return mergePolicy; + }
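A minimal usage sketch of the new mergePolicy accessors added above (illustrative only and not part of this patch; the directory path, analyzer and choice of LogByteSizeMergePolicy are arbitrary):

import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.store.FSDirectory;

public class MergePolicyExample {
  public static void main(String[] args) throws IOException {
    IndexWriter writer = new IndexWriter(FSDirectory.getDirectory("/tmp/example-index"),
                                         new StandardAnalyzer(), true);
    // Replace the default LogDocMergePolicy (doc-count based levels) with the
    // byte-size based policy; the one-argument overload passes doClose=true,
    // so the writer takes care of closing the policy it now owns.
    writer.setMergePolicy(new LogByteSizeMergePolicy());
    // ... writer.addDocument(...) calls would go here ...
    writer.close();
  }
}

getMergePolicy() can be used afterwards to confirm which policy is installed.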
+ /** Determines the largest number of documents ever merged by addDocument(). * Small values (e.g., less than 10,000) are best for interactive indexing, * as this limits the length of pauses while indexing to a few seconds. * Larger values are best for batched indexing and speedier searches. * * <p>The default value is {@link Integer#MAX_VALUE}. + * + * <p>Note that this method is a convenience method: it + * just calls mergePolicy.setMaxMergeDocs as long as + * mergePolicy is an instance of {@link LogMergePolicy}. + * Otherwise an IllegalArgumentException is thrown. */ public void setMaxMergeDocs(int maxMergeDocs) { - ensureOpen(); - this.maxMergeDocs = maxMergeDocs; + getLogMergePolicy().setMaxMergeDocs(maxMergeDocs); } - /** + /** * Returns the largest number of documents allowed in a * single segment. + * + * <p>Note that this method is a convenience method: it + * just calls mergePolicy.getMaxMergeDocs as long as + * mergePolicy is an instance of {@link LogMergePolicy}. + * Otherwise an IllegalArgumentException is thrown.
+ * * @see #setMaxMergeDocs */ public int getMaxMergeDocs() { - ensureOpen(); - return maxMergeDocs; + return getLogMergePolicy().getMaxMergeDocs(); } /** @@ -717,6 +778,7 @@ * @throws IllegalArgumentException if maxBufferedDocs is * smaller than 2 * @see #setRAMBufferSizeMB + * */ public void setMaxBufferedDocs(int maxBufferedDocs) { ensureOpen(); @@ -796,24 +858,31 @@ * for batch index creation, and smaller values (< 10) for indices that are * interactively maintained. * + * <p>Note that this method is a convenience method: it + * just calls mergePolicy.setMergeFactor as long as + * mergePolicy is an instance of {@link LogMergePolicy}. + * Otherwise an IllegalArgumentException is thrown. + * + * <p>This must never be less than 2. The default value is 10. */ public void setMergeFactor(int mergeFactor) { - ensureOpen(); - if (mergeFactor < 2) - throw new IllegalArgumentException("mergeFactor cannot be less than 2"); - this.mergeFactor = mergeFactor; + getLogMergePolicy().setMergeFactor(mergeFactor); }
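Because the setter above only forwards to the current LogMergePolicy, tuning through IndexWriter and tuning the policy directly are intended to be interchangeable; a small sketch under that assumption (the helper and the values 20 / 50000 are illustrative, not part of the patch):

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogMergePolicy;

public class MergeTuning {
  static void tune(IndexWriter writer) {
    // Convenience path: throws IllegalArgumentException if a policy other
    // than a LogMergePolicy is installed on the writer.
    writer.setMergeFactor(20);
    writer.setMaxMergeDocs(50000);

    // Equivalent direct path, guarded so it is safe with any policy.
    if (writer.getMergePolicy() instanceof LogMergePolicy) {
      LogMergePolicy lmp = (LogMergePolicy) writer.getMergePolicy();
      lmp.setMergeFactor(20);
      lmp.setMaxMergeDocs(50000);
    }
  }
}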
/** - * Returns the number of segments that are merged at once - * and also controls the total number of segments allowed - * to accumulate in the index. + * <p>Returns the number of segments that are merged at + * once and also controls the total number of segments + * allowed to accumulate in the index. + * + * <p>Note that this method is a convenience method: it + * just calls mergePolicy.getMergeFactor as long as + * mergePolicy is an instance of {@link LogMergePolicy}. + * Otherwise an IllegalArgumentException is thrown.
+ * * @see #setMergeFactor */ public int getMergeFactor() { - ensureOpen(); - return mergeFactor; + return getLogMergePolicy().getMergeFactor(); } /** If non-null, this will be the default infoStream used @@ -922,37 +991,69 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public synchronized void close() throws CorruptIndexException, IOException { - if (!closed) { - flush(true, true); + public void close() throws CorruptIndexException, IOException { - if (commitPending) { - segmentInfos.write(directory); // now commit changes - if (infoStream != null) - infoStream.println("close: wrote segments file \"" + segmentInfos.getCurrentSegmentFileName() + "\""); - deleter.checkpoint(segmentInfos, true); - commitPending = false; - rollbackSegmentInfos = null; - } + boolean doClose; + synchronized(this) { + // Ensure that only one thread actually gets to do the closing: + if (!closing) { + doClose = true; + closing = true; + } else + doClose = false; + } - if (writeLock != null) { - writeLock.release(); // release write lock - writeLock = null; + if (doClose) { + try { + flush(true, true); + + if (commitPending) { + segmentInfos.write(directory); // now commit changes + if (infoStream != null) + infoStream.println("close: wrote segments file \"" + segmentInfos.getCurrentSegmentFileName() + "\""); + synchronized(this) { + deleter.checkpoint(segmentInfos, true); + } + commitPending = false; + rollbackSegmentInfos = null; + } + + if (mergePolicy != null) { + if (doMergeClose) + mergePolicy.close(); + mergePolicy = null; + } + + if (writeLock != null) { + writeLock.release(); // release write lock + writeLock = null; + } + closed = true; + docWriter = null; + + synchronized(this) { + deleter.close(); + } + + if(closeDir) + directory.close(); + } finally { + if (!closed) + closing = false; } - closed = true; - docWriter = null; - - if(closeDir) - directory.close(); } } /** Tells the docWriter to close its currently open shared - * doc stores (stored fields & vectors files). */ - private void flushDocStores() throws IOException { + * doc stores (stored fields & vectors files). + * Return value specifices whether new doc store files are compound or not. + */ + private synchronized boolean flushDocStores() throws IOException { List files = docWriter.files(); + boolean useCompoundDocStore = false; + if (files.size() > 0) { String docStoreSegment; @@ -965,7 +1066,9 @@ docWriter.abort(); } - if (useCompoundFile && docStoreSegment != null) { + useCompoundDocStore = mergePolicy.useCompoundDocStore(segmentInfos); + + if (useCompoundDocStore && docStoreSegment != null) { // Now build compound doc store file checkpoint(); @@ -1006,6 +1109,8 @@ deleter.checkpoint(segmentInfos, false); } } + + return useCompoundDocStore; } /** Release the write lock, if needed. */ @@ -1079,17 +1184,13 @@ * free temporary space in the Directory to do the * merging.
* - * <p>The amount of free space required when a merge is - * triggered is up to 1X the size of all segments being - * merged, when no readers/searchers are open against the - * index, and up to 2X the size of all segments being - * merged when readers/searchers are open against the - * index (see {@link #optimize()} for details). Most - * merges are small (merging the smallest segments - * together), but whenever a full merge occurs (all - * segments in the index, which is the worst case for - * temporary space usage) then the maximum free disk space - * required is the same as {@link #optimize}. + * <p>The amount of free space required when a merge is triggered is + * up to 1X the size of all segments being merged, when no + * readers/searchers are open against the index, and up to 2X the + * size of all segments being merged when readers/searchers are open + * against the index (see {@link #optimize()} for details). The + * sequence of primitive merge operations performed is governed by + * the merge policy. * * <p>
Note that each term in the document can be no longer * than 16383 characters, otherwise an @@ -1121,6 +1222,8 @@ try { success = docWriter.addDocument(doc, analyzer); } catch (IOException ioe) { + bufferedDeleteTerms.clear(); + numBufferedDeleteTerms = 0; deleter.refresh(); throw ioe; } @@ -1134,9 +1237,11 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public synchronized void deleteDocuments(Term term) throws CorruptIndexException, IOException { + public void deleteDocuments(Term term) throws CorruptIndexException, IOException { ensureOpen(); - bufferDeleteTerm(term); + synchronized(this) { + bufferDeleteTerm(term); + } maybeFlush(); } @@ -1148,10 +1253,12 @@ * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ - public synchronized void deleteDocuments(Term[] terms) throws CorruptIndexException, IOException { + public void deleteDocuments(Term[] terms) throws CorruptIndexException, IOException { ensureOpen(); - for (int i = 0; i < terms.length; i++) { - bufferDeleteTerm(terms[i]); + synchronized(this) { + for (int i = 0; i < terms.length; i++) { + bufferDeleteTerm(terms[i]); + } } maybeFlush(); } @@ -1189,13 +1296,15 @@ public void updateDocument(Term term, Document doc, Analyzer analyzer) throws CorruptIndexException, IOException { ensureOpen(); - synchronized (this) { + synchronized (bufferedDeleteTerms) { bufferDeleteTerm(term); } boolean success = false; try { success = docWriter.addDocument(doc, analyzer); } catch (IOException ioe) { + bufferedDeleteTerms.clear(); + numBufferedDeleteTerms = 0; deleter.refresh(); throw ioe; } @@ -1228,51 +1337,33 @@ return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX); } - /** Determines how often segment indices are merged by addDocument(). With - * smaller values, less RAM is used while indexing, and searches on - * unoptimized indices are faster, but indexing speed is slower. With larger - * values, more RAM is used during indexing, and while searches on unoptimized - * indices are slower, indexing is faster. Thus larger values (> 10) are best - * for batch index creation, and smaller values (< 10) for indices that are - * interactively maintained. - * - *
This must never be less than 2. The default value is {@link #DEFAULT_MERGE_FACTOR}. - - */ - private int mergeFactor = DEFAULT_MERGE_FACTOR; - /** Determines amount of RAM usage by the buffered docs at * which point we trigger a flush to the index. */ private double ramBufferSize = DEFAULT_RAM_BUFFER_SIZE_MB*1024F*1024F; - /** Determines the largest number of documents ever merged by addDocument(). - * Small values (e.g., less than 10,000) are best for interactive indexing, - * as this limits the length of pauses while indexing to a few seconds. - * Larger values are best for batched indexing and speedier searches. - * - * <p>The default value is {@link #DEFAULT_MAX_MERGE_DOCS}. - - */ - private int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS; - /** If non-null, information about merges will be printed to this. */ private PrintStream infoStream = null; - private static PrintStream defaultInfoStream = null; - /** Merges all segments together into a single segment, - * optimizing an index for search. + /** + * Requests an "optimize" operation on an index, priming the index + * for the fastest available search. Traditionally this has meant + * merging all segments into a single segment as is done in the + * default merge policy, but individual merge policies may implement + * optimize in different ways. + * + * @see LogDocMergePolicy#optimize(SegmentInfos) + * * <p>It is recommended that this method be called upon completion of indexing. In * environments with frequent updates, optimize is best done during low volume times, if at all. * * <p>See http://www.gossamer-threads.com/lists/lucene/java-dev/47895 for more discussion. * - * <p>Note that this requires substantial temporary free + * <p>Note that this can require substantial temporary free * space in the Directory (see LUCENE-764 * for details): @@ -1310,7 +1401,7 @@ * <p>The actual temporary usage could be much less than * these figures (it depends on many factors). * - * <p>Once the optimize completes, the total size of the + * <p>In general, once the optimize completes, the total size of the * index will be less than the size of the starting index. * It could be quite a bit smaller (if there were many * pending deletes) or just slightly smaller.
@@ -1330,18 +1421,59 @@ public synchronized void optimize() throws CorruptIndexException, IOException { ensureOpen(); flush(); - while (segmentInfos.size() > 1 || - (segmentInfos.size() == 1 && - (SegmentReader.hasDeletions(segmentInfos.info(0)) || - SegmentReader.hasSeparateNorms(segmentInfos.info(0)) || - segmentInfos.info(0).dir != directory || - (useCompoundFile && - !segmentInfos.info(0).getUseCompoundFile())))) { - int minSegment = segmentInfos.size() - mergeFactor; - mergeSegments(minSegment < 0 ? 0 : minSegment, segmentInfos.size()); + + // Currently hardwired to 1, but once we add method to + // IndexWriter to allow "optimizing to <= N segments" + // then we will change this. + final int maxSegmentCount = 1; + + // Repeat until merge policy stops returning merges: + while(true) { + MergePolicy.MergeSpecification spec; + synchronized(this) { + spec = mergePolicy.optimize(segmentInfos, this, maxSegmentCount); + } + if (spec != null) { + final int numMerge = spec.merges.size(); + for(int i=0;i start+mergeFactor) { - for (int base = start; base < segmentInfos.size(); base++) { - int end = Math.min(segmentInfos.size(), base+mergeFactor); - if (end-base > 1) { - mergeSegments(base, end); - } - } - } + optimize(); + success = true; } finally { if (success) { @@ -1575,8 +1700,6 @@ rollbackTransaction(); } } - - optimize(); // final cleanup } /** @@ -1598,40 +1721,10 @@ */ public synchronized void addIndexesNoOptimize(Directory[] dirs) throws CorruptIndexException, IOException { - // Adding indexes can be viewed as adding a sequence of segments S to - // a sequence of segments T. Segments in T follow the invariants but - // segments in S may not since they could come from multiple indexes. - // Here is the merge algorithm for addIndexesNoOptimize(): - // - // 1 Flush ram. - // 2 Consider a combined sequence with segments from T followed - // by segments from S (same as current addIndexes(Directory[])). - // 3 Assume the highest level for segments in S is h. Call - // maybeMergeSegments(), but instead of starting w/ lowerBound = -1 - // and upperBound = maxBufferedDocs, start w/ lowerBound = -1 and - // upperBound = upperBound of level h. After this, the invariants - // are guaranteed except for the last < M segments whose levels <= h. - // 4 If the invariants hold for the last < M segments whose levels <= h, - // if some of those < M segments are from S (not merged in step 3), - // properly copy them over*, otherwise done. - // Otherwise, simply merge those segments. If the merge results in - // a segment of level <= h, done. Otherwise, it's of level h+1 and call - // maybeMergeSegments() starting w/ upperBound = upperBound of level h+1. - // - // * Ideally, we want to simply copy a segment. However, directory does - // not support copy yet. In addition, source may use compound file or not - // and target may use compound file or not. So we use mergeSegments() to - // copy a segment, which may cause doc count to change because deleted - // docs are garbage collected. 
- // 1 flush ram - ensureOpen(); flush(); - // 2 copy segment infos and find the highest level from dirs - int startUpperBound = docWriter.getMaxBufferedDocs(); - /* new merge policy if (startUpperBound == 0) startUpperBound = 10; @@ -1654,64 +1747,20 @@ for (int j = 0; j < sis.size(); j++) { SegmentInfo info = sis.info(j); segmentInfos.addElement(info); // add each info - - while (startUpperBound < info.docCount) { - startUpperBound *= mergeFactor; // find the highest level from dirs - if (startUpperBound > maxMergeDocs) { - // upper bound cannot exceed maxMergeDocs - throw new IllegalArgumentException("Upper bound cannot exceed maxMergeDocs"); - } - } } } - // 3 maybe merge segments starting from the highest level from dirs - maybeMergeSegments(startUpperBound); + maybeMerge(); - // get the tail segments whose levels <= h - int segmentCount = segmentInfos.size(); - int numTailSegments = 0; - while (numTailSegments < segmentCount - && startUpperBound >= segmentInfos.info(segmentCount - 1 - numTailSegments).docCount) { - numTailSegments++; - } - if (numTailSegments == 0) { - success = true; - return; - } + // If after merging there remain segments in the index + // that are in a different directory, just copy these + // over into our index. This is necessary (before + // finishing the transaction) to avoid leaving the + // index in an unusable (inconsistent) state. + copyExternalSegments(); - // 4 make sure invariants hold for the tail segments whose levels <= h - if (checkNonDecreasingLevels(segmentCount - numTailSegments)) { - // identify the segments from S to be copied (not merged in 3) - int numSegmentsToCopy = 0; - while (numSegmentsToCopy < segmentCount - && directory != segmentInfos.info(segmentCount - 1 - numSegmentsToCopy).dir) { - numSegmentsToCopy++; - } - if (numSegmentsToCopy == 0) { - success = true; - return; - } + success = true; - // copy those segments from S - for (int i = segmentCount - numSegmentsToCopy; i < segmentCount; i++) { - mergeSegments(i, i + 1); - } - if (checkNonDecreasingLevels(segmentCount - numSegmentsToCopy)) { - success = true; - return; - } - } - - // invariants do not hold, simply merge those segments - mergeSegments(segmentCount - numTailSegments, segmentCount); - - // maybe merge segments again if necessary - if (segmentInfos.info(segmentInfos.size() - 1).docCount > startUpperBound) { - maybeMergeSegments(startUpperBound * mergeFactor); - } - - success = true; } finally { if (success) { commitTransaction(); @@ -1721,6 +1770,17 @@ } } + /* If any of our segments are using a directory != ours + * then copy them over */ + private void copyExternalSegments() throws CorruptIndexException, IOException { + final int numSegments = segmentInfos.size(); + for(int i=0;iAfter this completes, the index is optimized.
* <p>The provided IndexReaders are not closed.
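As the javadoc above notes, addIndexes(IndexReader[]) leaves the supplied readers open and the resulting index is optimized, so the caller closes them; a minimal caller sketch (illustrative only, the helper name and path handling are not part of the patch):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;

public class AddIndexesExample {
  static void addExternal(IndexWriter writer, String path) throws IOException {
    IndexReader reader = IndexReader.open(FSDirectory.getDirectory(path));
    try {
      // Merges the reader's documents into the writer's index; afterwards
      // the target index is optimized.
      writer.addIndexes(new IndexReader[] { reader });
    } finally {
      reader.close(); // addIndexes() does not close the readers passed in
    }
  }
}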
@@ -1785,7 +1845,7 @@ } } - if (useCompoundFile) { + if (mergePolicy instanceof LogMergePolicy && getUseCompoundFile()) { boolean success = false; @@ -1804,40 +1864,6 @@ } } - // Overview of merge policy: - // - // A flush is triggered either by close() or by the number of ram segments - // reaching maxBufferedDocs. After a disk segment is created by the flush, - // further merges may be triggered. - // - // LowerBound and upperBound set the limits on the doc count of a segment - // which may be merged. Initially, lowerBound is set to 0 and upperBound - // to maxBufferedDocs. Starting from the rightmost* segment whose doc count - // > lowerBound and <= upperBound, count the number of consecutive segments - // whose doc count <= upperBound. - // - // Case 1: number of worthy segments < mergeFactor, no merge, done. - // Case 2: number of worthy segments == mergeFactor, merge these segments. - // If the doc count of the merged segment <= upperBound, done. - // Otherwise, set lowerBound to upperBound, and multiply upperBound - // by mergeFactor, go through the process again. - // Case 3: number of worthy segments > mergeFactor (in the case mergeFactor - // M changes), merge the leftmost* M segments. If the doc count of - // the merged segment <= upperBound, consider the merged segment for - // further merges on this same level. Merge the now leftmost* M - // segments, and so on, until number of worthy segments < mergeFactor. - // If the doc count of all the merged segments <= upperBound, done. - // Otherwise, set lowerBound to upperBound, and multiply upperBound - // by mergeFactor, go through the process again. - // Note that case 2 can be considerd as a special case of case 3. - // - // This merge policy guarantees two invariants if M does not change and - // segment doc count is not reaching maxMergeDocs: - // B for maxBufferedDocs, f(n) defined as ceil(log_M(ceil(n/B))) - // 1: If i (left*) and i+1 (right*) are two consecutive segments of doc - // counts x and y, then f(x) >= f(y). - // 2: The number of committed segments on the same level (f(n)) <= M. - // This is called after pending added and deleted // documents have been flushed to the Directory but before // the change is committed (new segments_N file written). 
@@ -1914,7 +1940,8 @@ infoStream.println(" flush: flushDocs=" + flushDocs + " flushDeletes=" + flushDeletes + " flushDocStores=" + flushDocStores + - " numDocs=" + numDocs); + " numDocs=" + numDocs + + " numBufDelTerms=" + numBufferedDeleteTerms); int docStoreOffset = docWriter.getDocStoreOffset(); boolean docStoreIsCompoundFile = false; @@ -1927,13 +1954,15 @@ if (infoStream != null) infoStream.println(" flush shared docStore segment " + docStoreSegment); - flushDocStores(); + docStoreIsCompoundFile = flushDocStores(); flushDocStores = false; - docStoreIsCompoundFile = useCompoundFile; } String segment = docWriter.getSegment(); + // If we are flushing docs, segment must not be null: + assert segment != null || !flushDocs; + if (flushDocs || flushDeletes) { SegmentInfos rollback = null; @@ -2013,7 +2042,8 @@ deleter.checkpoint(segmentInfos, autoCommit); - if (flushDocs && useCompoundFile) { + if (flushDocs && mergePolicy.useCompoundFile(segmentInfos, + newSegment)) { success = false; try { docWriter.createCompoundFile(segment); @@ -2030,14 +2060,15 @@ deleter.checkpoint(segmentInfos, autoCommit); } - /* new merge policy - if (0 == docWriter.getMaxBufferedDocs()) - maybeMergeSegments(mergeFactor * numDocs / 2); - else - maybeMergeSegments(docWriter.getMaxBufferedDocs()); - */ - if (triggerMerge) - maybeMergeSegments(docWriter.getMaxBufferedDocs()); + if (triggerMerge) { + /* new merge policy + if (0 == docWriter.getMaxBufferedDocs()) + mergePolicy.merge(segmentInfos,mergeFactor * numDocs / 2); + else + mergePolicy.merge(segmentInfos,docWriter.getMaxBufferedDocs()); + */ + maybeMerge(); + } } } finally { docWriter.clearFlushPending(); @@ -2060,256 +2091,431 @@ ensureOpen(); return docWriter.getNumDocsInRAM(); } - - /** Incremental segment merger. */ - private final void maybeMergeSegments(int startUpperBound) throws CorruptIndexException, IOException { - long lowerBound = -1; - long upperBound = startUpperBound; - /* new merge policy - if (upperBound == 0) upperBound = 10; - */ + private int ensureContiguousMerge(MergePolicy.OneMerge merge, boolean checkRunning) { - while (upperBound < maxMergeDocs) { - int minSegment = segmentInfos.size(); - int maxSegment = -1; + int first = segmentInfos.indexOf(merge.segments.info(0)); + if (first == -1) + throw new MergePolicy.MergeException("could not find segment " + merge.segments.info(0).name + " in current segments"); - // find merge-worthy segments - while (--minSegment >= 0) { - SegmentInfo si = segmentInfos.info(minSegment); + final int numSegments = segmentInfos.size(); + + final int numSegmentsToMerge = merge.segments.size(); + for(int i=0;i lowerBound && si.docCount <= upperBound) { - // start from the rightmost* segment whose doc count is in bounds - maxSegment = minSegment; - } else if (si.docCount > upperBound) { - // until the segment whose doc count exceeds upperBound - break; - } + if (checkRunning && mergingSegments.contains(info)) + throw new MergePolicy.MergeException("MergePolicy selected a segment (" + info.name + ") that is already involved in a running merge"); + + if (first + i >= numSegments || segmentInfos.info(first+i) != info) { + if (segmentInfos.indexOf(info) == -1) + throw new MergePolicy.MergeException("MergePolicy selected a segment (" + info.name + ") that is not in the index"); + else + throw new MergePolicy.MergeException("MergePolicy selected non-contiguous segments to merge (" + merge + " vs " + segString() + "), which IndexWriter (currently) cannot handle"); } + } - minSegment++; - maxSegment++; - int numSegments = 
maxSegment - minSegment; + return first; + } - if (numSegments < mergeFactor) { - break; - } else { - boolean exceedsUpperLimit = false; + /* FIXME if we want to support non-contiguous segment merges */ + synchronized private void commitMerge(MergePolicy.OneMerge merge, SegmentInfo info) throws IOException { - // number of merge-worthy segments may exceed mergeFactor when - // mergeFactor and/or maxBufferedDocs change(s) - while (numSegments >= mergeFactor) { - // merge the leftmost* mergeFactor segments + SegmentInfos sourceSegmentsClone = merge.segmentsClone; + SegmentInfos sourceSegments = merge.segments; + final int numSegments = segmentInfos.size(); - int docCount = mergeSegments(minSegment, minSegment + mergeFactor); - numSegments -= mergeFactor; + final int start = ensureContiguousMerge(merge, false); + if (infoStream != null) + infoStream.println("now commitMerge merge=" + merge + " into " + merge.info.name + " dir=" + directory); - if (docCount > upperBound) { - // continue to merge the rest of the worthy segments on this level - minSegment++; - exceedsUpperLimit = true; - } else { - // if the merged segment does not exceed upperBound, consider - // this segment for further merges on this same level - numSegments++; + // Carefully merge deletes that occurred after we + // started merging: + + BitVector deletes = null; + int docUpto = 0; + + final int numSegmentsToMerge = sourceSegments.size(); + for(int i=0;i minSegment; i--) // remove old infos & add new - segmentInfos.remove(i); + int mergedDocCount = 0; - segmentInfos.set(minSegment, newSegment); + SegmentInfos sourceSegments = merge.segments; + SegmentInfos sourceSegmentsClone = merge.segmentsClone; + final int numSegments = sourceSegments.size(); - checkpoint(); + if (infoStream != null) infoStream.print("merging segments"); - success = true; + merger = new SegmentMerger(this, mergedName); - } finally { - if (!success) { - if (rollback != null) { - // Rollback the individual SegmentInfo - // instances, but keep original SegmentInfos - // instance (so we don't try to write again the - // same segments_N file -- write once): - segmentInfos.clear(); - segmentInfos.addAll(rollback); - } + // This is try/finally to make sure merger's readers are + // closed: - // Delete any partially created and now unreferenced files: - deleter.refresh(); - } + boolean success = false; + + try { + int totDocCount = 0; + for (int i = 0; i < numSegments; i++) { + SegmentInfo si = sourceSegmentsClone.info(i); + if (infoStream != null) + infoStream.print(" " + si.name + " (" + si.docCount + " docs)"); + IndexReader reader = SegmentReader.get(si, MERGE_READ_BUFFER_SIZE, merge.mergeDocStores); // no need to set deleter (yet) + merger.add(reader); + if (infoStream != null) + totDocCount += reader.numDocs(); } + if (infoStream != null) { + infoStream.println(" into "+mergedName+" ("+totDocCount+" docs)"); + } + + mergedDocCount = merge.info.docCount = merger.merge(merge.mergeDocStores); + + if (infoStream != null) + assert mergedDocCount == totDocCount; + + success = true; } finally { - // close readers before we attempt to delete now-obsolete segments + // close readers before we attempt to delete + // now-obsolete segments if (merger != null) { merger.closeReaders(); } + if (!success) { + synchronized(this) { + deleter.refresh(); + } + } } - // Give deleter a chance to remove files now. 
- deleter.checkpoint(segmentInfos, autoCommit); + commitMerge(merge, merge.info); - if (useCompoundFile) { + if (merge.useCompoundFile) { + + success = false; + boolean skip = false; - boolean success = false; - try { - - merger.createCompoundFile(mergedName + ".cfs"); - newSegment.setUseCompoundFile(true); - checkpoint(); + try { + merger.createCompoundFile(mergedName + ".cfs"); + } catch (IOException ioe) { + synchronized(this) { + if (segmentInfos.indexOf(merge.info) == -1) { + // If another merge kicked in and merged our + // new segment away while we were trying to + // build the compound file, we can hit a + // FileNotFoundException and possibly + // IOException over NFS. We can tell this has + // happened because our SegmentInfo is no + // longer in the segments; if it has happened + // it is safe to ignore the exception & skip + // finishing/committing our compound file + // creating. + skip = true; + } else + throw ioe; + } + } success = true; - } finally { - if (!success) { - // Must rollback: - newSegment.setUseCompoundFile(false); - deleter.refresh(); + if (!success) { + synchronized(this) { + deleter.refresh(); + } } } + + if (!skip) { + synchronized(this) { + success = false; + try { + merge.info.setUseCompoundFile(true); + checkpoint(); + success = true; + } finally { + if (!success) { + // Must rollback: + merge.info.setUseCompoundFile(false); + deleter.refresh(); + } + } - // Give deleter a chance to remove files now. - deleter.checkpoint(segmentInfos, autoCommit); + // Give deleter a chance to remove files now. + deleter.checkpoint(segmentInfos, autoCommit); + } + } } return mergedDocCount; @@ -2385,29 +2591,6 @@ } } - private final boolean checkNonDecreasingLevels(int start) { - int lowerBound = -1; - int upperBound = docWriter.getMaxBufferedDocs(); - - /* new merge policy - if (upperBound == 0) - upperBound = 10; - */ - - for (int i = segmentInfos.size() - 1; i >= start; i--) { - int docCount = segmentInfos.info(i).docCount; - if (docCount <= lowerBound) { - return false; - } - - while (docCount > upperBound) { - lowerBound = upperBound; - upperBound *= mergeFactor; - } - } - return true; - } - // For test purposes. 
final synchronized int getBufferedDeleteTermsSize() { return bufferedDeleteTerms.size(); @@ -2487,4 +2670,43 @@ reader.deleteDocuments((Term) entry.getKey()); } } + + // utility routines for tests + SegmentInfo newestSegment() { + return segmentInfos.info(segmentInfos.size()-1); + } + + //void checkpoint(Directory dir) throws IOException { + // segmentInfos.write(dir); + //} + + public String segString() { + StringBuffer buffer = new StringBuffer(); + for(int i = 0; i < segmentInfos.size(); i++) { + if (i > 0) { + buffer.append(' '); + } + + SegmentInfo info = segmentInfos.info(i); + buffer.append(info.name + ":"); + + try { + if (info.getUseCompoundFile()) { + buffer.append('c'); + } else { + buffer.append('C'); + } + } catch (Exception e) { + } + + if (info.dir != getDirectory()) { + buffer.append('x'); + } + + buffer.append(info.docCount); + + } + + return buffer.toString(); + } } Index: src/java/org/apache/lucene/index/IndexFileDeleter.java =================================================================== --- src/java/org/apache/lucene/index/IndexFileDeleter.java (revision 569112) +++ src/java/org/apache/lucene/index/IndexFileDeleter.java (working copy) @@ -294,6 +294,21 @@ } } + public void close() throws IOException { + deletePendingFiles(); + } + + private void deletePendingFiles() throws IOException { + if (deletable != null) { + List oldDeletable = deletable; + deletable = null; + int size = oldDeletable.size(); + for(int i=0;i