Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 576821)
+++ CHANGES.txt	(working copy)
@@ -6,6 +6,25 @@
 
 Changes in runtime behavior
 
+ 1. LUCENE-994: Defaults for IndexWriter have been changed to maximize
+    out-of-the-box indexing speed.  First, IndexWriter now flushes by
+    RAM usage (16 MB by default) instead of a fixed doc count (call
+    IndexWriter.setMaxBufferedDocs to get backwards compatible
+    behavior).  Second, ConcurrentMergeScheduler is used to run merges
+    using background threads (call IndexWriter.setMergeScheduler(new
+    SerialMergeScheduler()) to get backwards compatible behavior).
+    Third, merges are chosen based on size in bytes of each segment
+    rather than document count of each segment (call
+    IndexWriter.setMergePolicy(new LogDocMergePolicy()) to get
+    backwards compatible behavior).
+
+    NOTE: users of ParallelReader must change back all of these
+    defaults in order to ensure the docIDs "align" across all parallel
+    indices.
+
+    (Mike McCandless)
+
+
 API Changes
 
  1. LUCENE-843: Added IndexWriter.setRAMBufferSizeMB(...) to have
Index: src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java
===================================================================
--- src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java	(revision 576821)
+++ src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java	(working copy)
@@ -113,10 +113,13 @@
     ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
     writer.setMergeScheduler(cms);
 
+    LogDocMergePolicy mp = new LogDocMergePolicy();
+    writer.setMergePolicy(mp);
+
     // Force degenerate merging so we can get a mix of
     // merging of segments with and without deletes at the
     // start:
-    ((LogDocMergePolicy) writer.getMergePolicy()).setMinMergeDocs(1000);
+    mp.setMinMergeDocs(1000);
 
     Document doc = new Document();
     Field idField = new Field("id", "", Field.Store.YES, Field.Index.UN_TOKENIZED);
Index: src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java	(revision 576821)
+++ src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java	(working copy)
@@ -37,6 +37,7 @@
     IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
     writer.setMaxBufferedDocs(10);
     writer.setMergeFactor(10);
+    writer.setMergePolicy(new LogDocMergePolicy());
 
     for (int i = 0; i < 100; i++) {
       addDoc(writer);
@@ -53,6 +54,7 @@
     IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
     writer.setMaxBufferedDocs(10);
     writer.setMergeFactor(10);
+    writer.setMergePolicy(new LogDocMergePolicy());
 
     boolean noOverMerge = false;
     for (int i = 0; i < 100; i++) {
@@ -74,19 +76,18 @@
     IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
     writer.setMaxBufferedDocs(10);
     writer.setMergeFactor(10);
-    MergePolicy mp = writer.getMergePolicy();
-    if (mp instanceof LogDocMergePolicy)
-      ((LogDocMergePolicy) mp).setMinMergeDocs(100);
+    LogDocMergePolicy mp = new LogDocMergePolicy();
+    mp.setMinMergeDocs(100);
+    writer.setMergePolicy(mp);
 
     for (int i = 0; i < 100; i++) {
       addDoc(writer);
 
       writer.close();
       writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
-      mp = writer.getMergePolicy();
       writer.setMaxBufferedDocs(10);
-      if (mp instanceof LogDocMergePolicy)
-        ((LogDocMergePolicy) mp).setMinMergeDocs(100);
+      writer.setMergePolicy(mp);
+      mp.setMinMergeDocs(100);
       writer.setMergeFactor(10);
       checkInvariants(writer);
     }
@@ -101,6 +102,7 @@
     IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
     writer.setMaxBufferedDocs(10);
     writer.setMergeFactor(100);
+    writer.setMergePolicy(new LogDocMergePolicy());
 
     for (int i = 0; i < 250; i++) {
       addDoc(writer);
@@ -126,6 +128,7 @@
     IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
     writer.setMaxBufferedDocs(101);
     writer.setMergeFactor(101);
+    writer.setMergePolicy(new LogDocMergePolicy());
 
     // leftmost* segment has 1 doc
     // rightmost* segment has 100 docs
@@ -139,6 +142,7 @@
       writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
       writer.setMaxBufferedDocs(101);
       writer.setMergeFactor(101);
+      writer.setMergePolicy(new LogDocMergePolicy());
     }
 
     writer.setMaxBufferedDocs(10);
@@ -164,6 +168,7 @@
     Directory dir = new RAMDirectory();
 
     IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
+    writer.setMergePolicy(new LogDocMergePolicy());
     writer.setMaxBufferedDocs(10);
     writer.setMergeFactor(100);
 
@@ -178,6 +183,7 @@
     reader.close();
 
     writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
+    writer.setMergePolicy(new LogDocMergePolicy());
     writer.setMaxBufferedDocs(10);
     writer.setMergeFactor(5);
Index: src/test/org/apache/lucene/index/TestAtomicUpdate.java
===================================================================
--- src/test/org/apache/lucene/index/TestAtomicUpdate.java	(revision 576821)
+++ src/test/org/apache/lucene/index/TestAtomicUpdate.java	(working copy)
@@ -127,6 +127,7 @@
       d.add(new Field("contents", English.intToEnglish(i), Field.Store.NO, Field.Index.TOKENIZED));
       writer.addDocument(d);
     }
+    writer.flush();
 
     IndexerThread indexerThread = new IndexerThread(writer, threads);
     threads[0] = indexerThread;
Index: src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java
===================================================================
--- src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java	(revision 576821)
+++ src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java	(working copy)
@@ -326,7 +326,9 @@
 
   private IndexWriter newWriter(Directory dir, boolean create)
       throws IOException {
-    return new IndexWriter(dir, new WhitespaceAnalyzer(), create);
+    final IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), create);
+    writer.setMergePolicy(new LogDocMergePolicy());
+    return writer;
   }
 
   private void addDocs(IndexWriter writer, int numDocs) throws IOException {
Index: src/test/org/apache/lucene/index/TestIndexModifier.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexModifier.java	(revision 576821)
+++ src/test/org/apache/lucene/index/TestIndexModifier.java	(working copy)
@@ -75,10 +75,7 @@
     // Lucene defaults:
     assertNull(i.getInfoStream());
     assertTrue(i.getUseCompoundFile());
-    /* new merge policy
     assertEquals(0, i.getMaxBufferedDocs());
-    */
-    assertEquals(10, i.getMaxBufferedDocs());
     assertEquals(10000, i.getMaxFieldLength());
     assertEquals(10, i.getMergeFactor());
     // test setting properties:
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java	(revision 576821)
+++ src/java/org/apache/lucene/index/IndexWriter.java	(working copy)
@@ -71,16 +71,16 @@
   or enough added documents since the last flush, whichever
   is sooner.  For the added documents, flushing is triggered
   either by RAM usage of the documents (see {@link
-  #setRAMBufferSizeMB}) or the number of added documents
-  (this is the default; see {@link #setMaxBufferedDocs}).
-  For best indexing speed you should flush by RAM usage with
-  a large RAM buffer.  You can also force a flush by calling
+  #setRAMBufferSizeMB}) or the number of added documents.
+  The default is to flush when RAM usage hits 16 MB.  For
+  best indexing speed you should flush by RAM usage with a
+  large RAM buffer.  You can also force a flush by calling
   {@link #flush}.  When a flush occurs, both pending deletes
   and added documents are flushed to the index.  A flush may
   also trigger one or more segment merges which by default
-  run (blocking) with the current thread (see below for changing the {@link
-  MergeScheduler}).<p>
+  run with a background thread so as not to block the
+  addDocument calls (see below
+  for changing the {@link MergeScheduler}).<p>
 
   The optional autoCommit argument to the
@@ -153,10 +153,10 @@
   select which merges to do, if any, and return a {@link
   MergePolicy.MergeSpecification} describing the merges.  It
   also selects merges to do for optimize().  (The default is
-  {@link LogDocMergePolicy}.  Then, the {@link
+  {@link LogByteSizeMergePolicy}.  Then, the {@link
   MergeScheduler} is invoked with the requested merges and
   it decides when and how to run the merges.  The default is
-  {@link SerialMergeScheduler}.<p>
+  {@link ConcurrentMergeScheduler}.<p>
 */
 
 /*
@@ -205,22 +205,16 @@
   public final static int DEFAULT_MERGE_FACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR;
 
   /**
-   * Default value is 10. Change using {@link #setMaxBufferedDocs(int)}.
+   * Default value is 0 (because IndexWriter flushes by RAM
+   * usage by default). Change using {@link #setMaxBufferedDocs(int)}.
    */
-
-  public final static int DEFAULT_MAX_BUFFERED_DOCS = 10;
-  /* new merge policy
   public final static int DEFAULT_MAX_BUFFERED_DOCS = 0;
-  */
 
   /**
-   * Default value is 0 MB (which means flush only by doc
-   * count). Change using {@link #setRAMBufferSizeMB}.
+   * Default value is 16 MB (which means flush when buffered
+   * docs consume 16 MB RAM). Change using {@link #setRAMBufferSizeMB}.
    */
-  public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 0.0;
-  /* new merge policy
   public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0;
-  */
 
   /**
    * Default value is 1000. Change using {@link #setMaxBufferedDeleteTerms(int)}.
@@ -281,8 +275,8 @@
 
   // merges
   private HashSet mergingSegments = new HashSet();
-  private MergePolicy mergePolicy = new LogDocMergePolicy();
-  private MergeScheduler mergeScheduler = new SerialMergeScheduler();
+  private MergePolicy mergePolicy = new LogByteSizeMergePolicy();
+  private MergeScheduler mergeScheduler = new ConcurrentMergeScheduler();
   private LinkedList pendingMerges = new LinkedList();
   private Set runningMerges = new HashSet();
   private List mergeExceptions = new ArrayList();
@@ -1136,6 +1130,9 @@
       rollbackSegmentInfos = null;
     }
 
+    if (infoStream != null)
+      message("at close: " + segString());
+
     if (writeLock != null) {
       writeLock.release();                          // release write lock
       writeLock = null;
@@ -2245,7 +2242,7 @@
     // apply to more than just the last flushed segment
     boolean flushDeletes = docWriter.hasDeletes();
 
-    if (infoStream != null)
+    if (infoStream != null) {
       message("  flush: segment=" + docWriter.getSegment() +
               " docStoreSegment=" + docWriter.getDocStoreSegment() +
               " docStoreOffset=" + docWriter.getDocStoreOffset() +
@@ -2254,6 +2251,8 @@
               " flushDocStores=" + flushDocStores +
               " numDocs=" + numDocs +
               " numBufDelTerms=" + docWriter.getNumBufferedDeleteTerms());
+      message("  index before flush " + segString());
+    }
 
     int docStoreOffset = docWriter.getDocStoreOffset();
     boolean docStoreIsCompoundFile = false;
Index: src/java/org/apache/lucene/index/MergeScheduler.java
===================================================================
--- src/java/org/apache/lucene/index/MergeScheduler.java	(revision 576821)
+++ src/java/org/apache/lucene/index/MergeScheduler.java	(working copy)
@@ -22,7 +22,7 @@
 /** Expert: {@link IndexWriter} uses an instance
  *  implementing this interface to execute the merges
  *  selected by a {@link MergePolicy}.  The default
- *  MergeScheduler is {@link SerialMergeScheduler}. */
+ *  MergeScheduler is {@link ConcurrentMergeScheduler}. */
 
 public interface MergeScheduler {
Index: contrib/gdata-server/src/core/src/test/org/apache/lucene/gdata/search/index/TestGdataIndexWriter.java
===================================================================
--- contrib/gdata-server/src/core/src/test/org/apache/lucene/gdata/search/index/TestGdataIndexWriter.java	(revision 576821)
+++ contrib/gdata-server/src/core/src/test/org/apache/lucene/gdata/search/index/TestGdataIndexWriter.java	(working copy)
@@ -26,6 +26,7 @@
 import org.apache.lucene.gdata.search.config.IndexSchema;
 import org.apache.lucene.gdata.search.config.IndexSchemaField;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.index.LogDocMergePolicy;
 
 /**
  *
@@ -72,7 +73,8 @@
         // assertEquals(VALUE_GT_DEFAULT_LONG,writer.getCommitLockTimeout());
         assertEquals(VALUE_GT_DEFAULT_LONG,writer.getWriteLockTimeout());
         assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxBufferedDocs());
-        assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxMergeDocs());
+        if (writer.getMergePolicy() instanceof LogDocMergePolicy)
+          assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxMergeDocs());
         assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxFieldLength());
         assertEquals(VALUE_GT_DEFAULT_INT,writer.getMergeFactor());
         assertTrue(writer.getUseCompoundFile());
Index: contrib/gdata-server/src/core/src/java/org/apache/lucene/gdata/search/index/GDataIndexWriter.java
===================================================================
--- contrib/gdata-server/src/core/src/java/org/apache/lucene/gdata/search/index/GDataIndexWriter.java	(revision 576821)
+++ contrib/gdata-server/src/core/src/java/org/apache/lucene/gdata/search/index/GDataIndexWriter.java	(working copy)
@@ -24,6 +24,7 @@
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.gdata.search.config.IndexSchema;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LogDocMergePolicy;
 import org.apache.lucene.store.Directory;
 
 /**
@@ -42,7 +43,7 @@
         setUseCompoundFile(config.isUseCompoundFile());
         if (config.getMaxBufferedDocs() != IndexSchema.NOT_SET_VALUE)
             setMaxBufferedDocs(config.getMaxBufferedDocs());
-        if (config.getMaxMergeDocs() != IndexSchema.NOT_SET_VALUE)
+        if (config.getMaxMergeDocs() != IndexSchema.NOT_SET_VALUE && getMergePolicy() instanceof LogDocMergePolicy)
             setMaxMergeDocs(config.getMaxMergeDocs());
         if (config.getMergeFactor() != IndexSchema.NOT_SET_VALUE)
             setMergeFactor(config.getMergeFactor());
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java	(revision 576821)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java	(working copy)
@@ -176,6 +176,36 @@
   }
 
   /**
+   * Return a double property.
+   * If the property contains ":", e.g. "10:100:5", it is interpreted
+   * as an array of doubles. It is extracted once, on the first
+   * call to get it, and a by-round value is returned.
+   * @param name name of property
+   * @param dflt default value
+   * @return a double property.
+   */
+  public double get (String name, double dflt) {
+    // use value by round if already parsed
+    double vals[] = (double[]) valByRound.get(name);
+    if (vals != null) {
+      return vals[roundNumber % vals.length];
+    }
+    // done if not by round
+    String sval = props.getProperty(name,""+dflt);
+    if (sval.indexOf(":")<0) {
+      return Double.parseDouble(sval);
+    }
+    // first time this prop is extracted by round
+    int k = sval.indexOf(":");
+    String colName = sval.substring(0,k);
+    sval = sval.substring(k+1);
+    colForValByRound.put(name,colName);
+    vals = propToDoubleArray(sval);
+    valByRound.put(name,vals);
+    return vals[roundNumber % vals.length];
+  }
+
+  /**
    * Return a boolean property.
    * If the property contain ":", e.g. "true.true.false", it is interpreted
    * as array of boleans. It is extracted once, on first call
@@ -241,7 +271,7 @@
     return roundNumber;
   }
 
-  // extract properties to array, e.g. for "10.100.5" return int[]{10,100,5}.
+  // extract properties to array, e.g. for "10:100:5" return int[]{10,100,5}.
   private int[] propToIntArray (String s) {
     if (s.indexOf(":")<0) {
       return new int [] { Integer.parseInt(s) };
@@ -260,7 +290,26 @@
     return res;
   }
 
-  // extract properties to array, e.g. for "true.true.false" return booleab[]{true,false,false}.
+  // extract properties to array, e.g. for "10.7:100.4:-2.3" return double[]{10.7,100.4,-2.3}.
+  private double[] propToDoubleArray (String s) {
+    if (s.indexOf(":")<0) {
+      return new double [] { Double.parseDouble(s) };
+    }
+
+    ArrayList a = new ArrayList();
+    StringTokenizer st = new StringTokenizer(s,":");
+    while (st.hasMoreTokens()) {
+      String t = st.nextToken();
+      a.add(new Double(t));
+    }
+    double res[] = new double[a.size()];
+    for (int i=0; i<a.size(); i++) {
+      res[i] = ((Double) a.get(i)).doubleValue();
+    }
+    return res;
+  }
+
+  // extract properties to array, e.g. for "true:true:false" return boolean[]{true,true,false}.
   private boolean[] propToBooleanArray (String s) {
     if (s.indexOf(":")<0) {
       return new boolean [] { Boolean.valueOf(s).booleanValue() };
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java	(revision 576821)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java	(working copy)
@@ ... @@
     IndexWriter writer = new IndexWriter(dir, autoCommit, analyzer, false);
 
     // must update params for newly opened writer
-    writer.setMaxBufferedDocs(mxbf);
+    if (flushAtRAMUsage > 0)
+      writer.setRAMBufferSizeMB(flushAtRAMUsage);
+    else if (mxbf != 0)
+      writer.setMaxBufferedDocs(mxbf);
+    else
+      throw new RuntimeException("either max.buffered or ram.flush.mb must be non-zero");
     writer.setMaxFieldLength(mxfl);
     writer.setMergeFactor(mrgf);
     writer.setUseCompoundFile(cmpnd); // this one redundant?
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java	(revision 576821)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java	(working copy)
@@ -49,18 +49,20 @@
     int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
     int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
     int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);
-    double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
+    double flushAtRAMUsage = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
     boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
     IndexWriter iw = new IndexWriter(dir, autoCommit, analyzer, true);
     iw.setUseCompoundFile(cmpnd);
     iw.setMergeFactor(mrgf);
-    iw.setMaxBufferedDocs(mxbf);
     iw.setMaxFieldLength(mxfl);
 
     if (flushAtRAMUsage > 0)
       iw.setRAMBufferSizeMB(flushAtRAMUsage);
-
+    else if (mxbf != 0)
+      iw.setMaxBufferedDocs(mxbf);
+    else
+      throw new RuntimeException("either max.buffered or ram.flush.mb must be non-zero");
     getRunData().setIndexWriter(iw);
     return 1;
   }
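
Note for users of this patch: applications that depend on the old IndexWriter behavior (in particular ParallelReader users, per the NOTE in the CHANGES entry above) must revert all three new defaults on every writer. Below is a minimal sketch, assuming the patch is applied; the RAMDirectory, WhitespaceAnalyzer, and class name are illustrative only:

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.store.RAMDirectory;

// Illustrative sketch: restore the pre-LUCENE-994 IndexWriter defaults.
public class BackCompatDefaults {
  public static void main(String[] args) throws Exception {
    IndexWriter writer = new IndexWriter(new RAMDirectory(), new WhitespaceAnalyzer(), true);
    // Flush by buffered doc count (the old default of 10) instead of 16 MB of RAM:
    writer.setMaxBufferedDocs(10);
    // Run merges serially in the calling thread instead of in background threads:
    writer.setMergeScheduler(new SerialMergeScheduler());
    // Select merges by segment doc count instead of segment size in bytes:
    writer.setMergePolicy(new LogDocMergePolicy());
    writer.close();
  }
}

On the benchmark side, CreateIndexTask and OpenIndexTask now require that either max.buffered or ram.flush.mb be non-zero, and the new Config.get(String,double) supports by-round values: for example, ram.flush.mb=2:4:8 flushes at 2 MB in round 0, 4 MB in round 1, 8 MB in round 2, and then cycles.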