Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 576821)
+++ CHANGES.txt (working copy)
@@ -6,6 +6,25 @@
Changes in runtime behavior
+ 1. LUCENE-994: Defaults for IndexWriter have been changed to maximize
+ out-of-the-box indexing speed. First, IndexWriter now flushes by
+ RAM usage (16 MB by default) instead of a fixed doc count (call
+ IndexWriter.setMaxBufferedDocs to get backwards compatible
+ behavior). Second, ConcurrentMergeScheduler is used to run merges
+ using background threads (call IndexWriter.setMergeScheduler(new
+ SerialMergeScheduler()) to get backwards compatible behavior).
+ Third, merges are chosen based on size in bytes of each segment
+ rather than document count of each segment (call
+ IndexWriter.setMergePolicy(new LogDocMergePolicy()) to get
+ backwards compatible behavior).
+
+ NOTE: users of ParallelReader must change back all of these
+ defaults in order to ensure the docIDs "align" across all parallel
+ indices.
+
+ (Mike McCandless)
+
+
API Changes
1. LUCENE-843: Added IndexWriter.setRAMBufferSizeMB(...) to have
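
A minimal sketch of the three calls the CHANGES entry above names for restoring
the pre-change defaults (the class name, directory, and analyzer are
illustrative choices, not part of the patch):

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.LogDocMergePolicy;
    import org.apache.lucene.index.SerialMergeScheduler;
    import org.apache.lucene.store.RAMDirectory;

    // Sketch: restore the pre-LUCENE-994 defaults, e.g. so docIDs stay
    // aligned across the parallel indices of a ParallelReader.
    public class OldDefaultsDemo {
      public static void main(String[] args) throws Exception {
        IndexWriter writer =
            new IndexWriter(new RAMDirectory(), new WhitespaceAnalyzer(), true);
        writer.setMaxBufferedDocs(10);                        // flush by doc count again
        writer.setMergeScheduler(new SerialMergeScheduler()); // merge on the calling thread
        writer.setMergePolicy(new LogDocMergePolicy());       // pick merges by doc count
        writer.close();
      }
    }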
Index: src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java
===================================================================
--- src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java (revision 576821)
+++ src/test/org/apache/lucene/index/TestConcurrentMergeScheduler.java (working copy)
@@ -113,10 +113,13 @@
ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
writer.setMergeScheduler(cms);
+ LogDocMergePolicy mp = new LogDocMergePolicy();
+ writer.setMergePolicy(mp);
+
// Force degenerate merging so we can get a mix of
// merging of segments with and without deletes at the
// start:
- ((LogDocMergePolicy) writer.getMergePolicy()).setMinMergeDocs(1000);
+ mp.setMinMergeDocs(1000);
Document doc = new Document();
Field idField = new Field("id", "", Field.Store.YES, Field.Index.UN_TOKENIZED);
Index: src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java (revision 576821)
+++ src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java (working copy)
@@ -37,6 +37,7 @@
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(10);
+ writer.setMergePolicy(new LogDocMergePolicy());
for (int i = 0; i < 100; i++) {
addDoc(writer);
@@ -53,6 +54,7 @@
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(10);
+ writer.setMergePolicy(new LogDocMergePolicy());
boolean noOverMerge = false;
for (int i = 0; i < 100; i++) {
@@ -74,19 +76,18 @@
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(10);
- MergePolicy mp = writer.getMergePolicy();
- if (mp instanceof LogDocMergePolicy)
- ((LogDocMergePolicy) mp).setMinMergeDocs(100);
+ LogDocMergePolicy mp = new LogDocMergePolicy();
+ mp.setMinMergeDocs(100);
+ writer.setMergePolicy(mp);
for (int i = 0; i < 100; i++) {
addDoc(writer);
writer.close();
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
- mp = writer.getMergePolicy();
writer.setMaxBufferedDocs(10);
- if (mp instanceof LogDocMergePolicy)
- ((LogDocMergePolicy) mp).setMinMergeDocs(100);
+ writer.setMergePolicy(mp);
+ mp.setMinMergeDocs(100);
writer.setMergeFactor(10);
checkInvariants(writer);
}
@@ -101,6 +102,7 @@
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(100);
+ writer.setMergePolicy(new LogDocMergePolicy());
for (int i = 0; i < 250; i++) {
addDoc(writer);
@@ -126,6 +128,7 @@
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setMaxBufferedDocs(101);
writer.setMergeFactor(101);
+ writer.setMergePolicy(new LogDocMergePolicy());
// leftmost* segment has 1 doc
// rightmost* segment has 100 docs
@@ -139,6 +142,7 @@
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
writer.setMaxBufferedDocs(101);
writer.setMergeFactor(101);
+ writer.setMergePolicy(new LogDocMergePolicy());
}
writer.setMaxBufferedDocs(10);
@@ -164,6 +168,7 @@
Directory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
+ writer.setMergePolicy(new LogDocMergePolicy());
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(100);
@@ -178,6 +183,7 @@
reader.close();
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
+ writer.setMergePolicy(new LogDocMergePolicy());
writer.setMaxBufferedDocs(10);
writer.setMergeFactor(5);
Index: src/test/org/apache/lucene/index/TestAtomicUpdate.java
===================================================================
--- src/test/org/apache/lucene/index/TestAtomicUpdate.java (revision 576821)
+++ src/test/org/apache/lucene/index/TestAtomicUpdate.java (working copy)
@@ -127,6 +127,7 @@
d.add(new Field("contents", English.intToEnglish(i), Field.Store.NO, Field.Index.TOKENIZED));
writer.addDocument(d);
}
+ writer.flush();
IndexerThread indexerThread = new IndexerThread(writer, threads);
threads[0] = indexerThread;
Index: src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java
===================================================================
--- src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java (revision 576821)
+++ src/test/org/apache/lucene/index/TestAddIndexesNoOptimize.java (working copy)
@@ -326,7 +326,9 @@
private IndexWriter newWriter(Directory dir, boolean create)
throws IOException {
- return new IndexWriter(dir, new WhitespaceAnalyzer(), create);
+ final IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), create);
+ writer.setMergePolicy(new LogDocMergePolicy());
+ return writer;
}
private void addDocs(IndexWriter writer, int numDocs) throws IOException {
Index: src/test/org/apache/lucene/index/TestIndexModifier.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexModifier.java (revision 576821)
+++ src/test/org/apache/lucene/index/TestIndexModifier.java (working copy)
@@ -75,10 +75,7 @@
// Lucene defaults:
assertNull(i.getInfoStream());
assertTrue(i.getUseCompoundFile());
- /* new merge policy
assertEquals(0, i.getMaxBufferedDocs());
- */
- assertEquals(10, i.getMaxBufferedDocs());
assertEquals(10000, i.getMaxFieldLength());
assertEquals(10, i.getMergeFactor());
// test setting properties:
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java (revision 576821)
+++ src/java/org/apache/lucene/index/IndexWriter.java (working copy)
@@ -71,16 +71,16 @@
or enough added documents since the last flush, whichever
is sooner. For the added documents, flushing is triggered
either by RAM usage of the documents (see {@link
- #setRAMBufferSizeMB}) or the number of added documents
- (this is the default; see {@link #setMaxBufferedDocs}).
- For best indexing speed you should flush by RAM usage with
- a large RAM buffer. You can also force a flush by calling
+ #setRAMBufferSizeMB}) or the number of added documents.
+ The default is to flush when RAM usage hits 16 MB. For
+ best indexing speed you should flush by RAM usage with a
+ large RAM buffer. You can also force a flush by calling
{@link #flush}. When a flush occurs, both pending deletes
and added documents are flushed to the index. A flush may
also trigger one or more segment merges which by default
- run (blocking) with the current thread (see below for changing the {@link
- MergeScheduler}).
+ run with a background thread so as not to block the
+ addDocument calls (see below for changing the {@link
+ MergeScheduler}).
The optional autoCommit argument to the
@@ -153,10 +153,10 @@
select which merges to do, if any, and return a {@link
MergePolicy.MergeSpecification} describing the merges. It
also selects merges to do for optimize(). (The default is
- {@link LogDocMergePolicy}. Then, the {@link
+ {@link LogByteSizeMergePolicy}. Then, the {@link
MergeScheduler} is invoked with the requested merges and
it decides when and how to run the merges. The default is
- {@link SerialMergeScheduler}.
+ {@link ConcurrentMergeScheduler}.
*/
/*
@@ -205,22 +205,16 @@
public final static int DEFAULT_MERGE_FACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR;
/**
- * Default value is 10. Change using {@link #setMaxBufferedDocs(int)}.
+ * Default value is 0 (because IndexWriter flushes by RAM
+ * usage by default). Change using {@link #setMaxBufferedDocs(int)}.
*/
-
- public final static int DEFAULT_MAX_BUFFERED_DOCS = 10;
- /* new merge policy
public final static int DEFAULT_MAX_BUFFERED_DOCS = 0;
- */
/**
- * Default value is 0 MB (which means flush only by doc
- * count). Change using {@link #setRAMBufferSizeMB}.
+ * Default value is 16 MB (which means flush when buffered
+ * docs consume 16 MB RAM). Change using {@link #setRAMBufferSizeMB}.
*/
- public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 0.0;
- /* new merge policy
public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0;
- */
/**
* Default value is 1000. Change using {@link #setMaxBufferedDeleteTerms(int)}.
@@ -281,8 +275,8 @@
// merges
private HashSet mergingSegments = new HashSet();
- private MergePolicy mergePolicy = new LogDocMergePolicy();
- private MergeScheduler mergeScheduler = new SerialMergeScheduler();
+ private MergePolicy mergePolicy = new LogByteSizeMergePolicy();
+ private MergeScheduler mergeScheduler = new ConcurrentMergeScheduler();
private LinkedList pendingMerges = new LinkedList();
private Set runningMerges = new HashSet();
private List mergeExceptions = new ArrayList();
@@ -1136,6 +1130,9 @@
rollbackSegmentInfos = null;
}
+ if (infoStream != null)
+ message("at close: " + segString());
+
if (writeLock != null) {
writeLock.release(); // release write lock
writeLock = null;
@@ -2245,7 +2242,7 @@
// apply to more than just the last flushed segment
boolean flushDeletes = docWriter.hasDeletes();
- if (infoStream != null)
+ if (infoStream != null) {
message(" flush: segment=" + docWriter.getSegment() +
" docStoreSegment=" + docWriter.getDocStoreSegment() +
" docStoreOffset=" + docWriter.getDocStoreOffset() +
@@ -2254,6 +2251,8 @@
" flushDocStores=" + flushDocStores +
" numDocs=" + numDocs +
" numBufDelTerms=" + docWriter.getNumBufferedDeleteTerms());
+ message(" index before flush " + segString());
+ }
int docStoreOffset = docWriter.getDocStoreOffset();
boolean docStoreIsCompoundFile = false;
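
Under the new defaults described in the IndexWriter javadoc above, no setters
are needed to get RAM-triggered flushing and background merges. A sketch (the
path and the 48 MB figure are arbitrary values for illustration):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.FSDirectory;

    public class NewDefaultsDemo {
      public static void main(String[] args) throws Exception {
        // Out of the box: flush at 16 MB of buffered docs (DEFAULT_RAM_BUFFER_SIZE_MB)
        // and run merges on background threads (ConcurrentMergeScheduler).
        IndexWriter writer = new IndexWriter(FSDirectory.getDirectory("/tmp/idx"),
                                             new StandardAnalyzer(), true);
        writer.setRAMBufferSizeMB(48.0); // optional: a larger buffer speeds bulk indexing
        // ... addDocument calls here; merges no longer block them ...
        writer.close();
      }
    }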
Index: src/java/org/apache/lucene/index/MergeScheduler.java
===================================================================
--- src/java/org/apache/lucene/index/MergeScheduler.java (revision 576821)
+++ src/java/org/apache/lucene/index/MergeScheduler.java (working copy)
@@ -22,7 +22,7 @@
/** Expert: {@link IndexWriter} uses an instance
* implementing this interface to execute the merges
* selected by a {@link MergePolicy}. The default
- * MergeScheduler is {@link SerialMergeScheduler}. */
+ * MergeScheduler is {@link ConcurrentMergeScheduler}. */
public interface MergeScheduler {
Index: contrib/gdata-server/src/core/src/test/org/apache/lucene/gdata/search/index/TestGdataIndexWriter.java
===================================================================
--- contrib/gdata-server/src/core/src/test/org/apache/lucene/gdata/search/index/TestGdataIndexWriter.java (revision 576821)
+++ contrib/gdata-server/src/core/src/test/org/apache/lucene/gdata/search/index/TestGdataIndexWriter.java (working copy)
@@ -26,6 +26,7 @@
import org.apache.lucene.gdata.search.config.IndexSchema;
import org.apache.lucene.gdata.search.config.IndexSchemaField;
import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.index.LogDocMergePolicy;
/**
*
@@ -72,7 +73,8 @@
// assertEquals(VALUE_GT_DEFAULT_LONG,writer.getCommitLockTimeout());
assertEquals(VALUE_GT_DEFAULT_LONG,writer.getWriteLockTimeout());
assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxBufferedDocs());
- assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxMergeDocs());
+ if (writer.getMergePolicy() instanceof LogDocMergePolicy)
+ assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxMergeDocs());
assertEquals(VALUE_GT_DEFAULT_INT,writer.getMaxFieldLength());
assertEquals(VALUE_GT_DEFAULT_INT,writer.getMergeFactor());
assertTrue(writer.getUseCompoundFile());
Index: contrib/gdata-server/src/core/src/java/org/apache/lucene/gdata/search/index/GDataIndexWriter.java
===================================================================
--- contrib/gdata-server/src/core/src/java/org/apache/lucene/gdata/search/index/GDataIndexWriter.java (revision 576821)
+++ contrib/gdata-server/src/core/src/java/org/apache/lucene/gdata/search/index/GDataIndexWriter.java (working copy)
@@ -24,6 +24,7 @@
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.gdata.search.config.IndexSchema;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.store.Directory;
/**
@@ -42,7 +43,7 @@
setUseCompoundFile(config.isUseCompoundFile());
if (config.getMaxBufferedDocs() != IndexSchema.NOT_SET_VALUE)
setMaxBufferedDocs(config.getMaxBufferedDocs());
- if (config.getMaxMergeDocs() != IndexSchema.NOT_SET_VALUE)
+ if (config.getMaxMergeDocs() != IndexSchema.NOT_SET_VALUE && getMergePolicy() instanceof LogDocMergePolicy)
setMaxMergeDocs(config.getMaxMergeDocs());
if (config.getMergeFactor() != IndexSchema.NOT_SET_VALUE)
setMergeFactor(config.getMergeFactor());
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java (revision 576821)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java (working copy)
@@ -176,6 +176,36 @@
}
/**
+ * Return a double property.
+ * If the property contains ":", e.g. "10:100:5", it is interpreted
+ * as an array of doubles. It is extracted once, on the first call
+ * to get it, and a by-round value is returned.
+ * @param name name of property
+ * @param dflt default value
+ * @return a double property.
+ */
+ public double get (String name, double dflt) {
+ // use value by round if already parsed
+ double vals[] = (double[]) valByRound.get(name);
+ if (vals != null) {
+ return vals[roundNumber % vals.length];
+ }
+ // done if not by round
+ String sval = props.getProperty(name,""+dflt);
+ if (sval.indexOf(":")<0) {
+ return Double.parseDouble(sval);
+ }
+ // first time this prop is extracted by round
+ int k = sval.indexOf(":");
+ String colName = sval.substring(0,k);
+ sval = sval.substring(k+1);
+ colForValByRound.put(name,colName);
+ vals = propToDoubleArray(sval);
+ valByRound.put(name,vals);
+ return vals[roundNumber % vals.length];
+ }
+
+ /**
* Return a boolean property.
* If the property contain ":", e.g. "true.true.false", it is interpreted
* as array of boleans. It is extracted once, on first call
@@ -241,7 +271,7 @@
return roundNumber;
}
- // extract properties to array, e.g. for "10.100.5" return int[]{10,100,5}.
+ // extract properties to array, e.g. for "10:100:5" return int[]{10,100,5}.
private int[] propToIntArray (String s) {
if (s.indexOf(":")<0) {
return new int [] { Integer.parseInt(s) };
@@ -260,7 +290,26 @@
return res;
}
- // extract properties to array, e.g. for "true.true.false" return booleab[]{true,false,false}.
+ // extract properties to array, e.g. for "10.7:100.4:-2.3" return int[]{10.7,100.4,-2.3}.
+ private double[] propToDoubleArray (String s) {
+ if (s.indexOf(":")<0) {
+ return new double [] { Double.parseDouble(s) };
+ }
+
+ ArrayList a = new ArrayList();
+ StringTokenizer st = new StringTokenizer(s,":");
+ while (st.hasMoreTokens()) {
+ String t = st.nextToken();
+ a.add(new Double(t));
+ }
+ double res[] = new double[a.size()];
+ for (int i=0; i<a.size(); i++) {
+ res[i] = ((Double) a.get(i)).doubleValue();
+ }
+ return res;
+ }
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java (revision 576821)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java (working copy)
@@ -62,4 +62,9 @@
- writer.setMaxBufferedDocs(mxbf);
+ if (flushAtRAMUsage > 0)
+ writer.setRAMBufferSizeMB(flushAtRAMUsage);
+ else if (mxbf != 0)
+ writer.setMaxBufferedDocs(mxbf);
+ else
+ throw new RuntimeException("either max.buffered or ram.flush.mb must be non-zero");
writer.setMaxFieldLength(mxfl);
writer.setMergeFactor(mrgf);
writer.setUseCompoundFile(cmpnd); // this one redundant?
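
A self-contained sketch of what the new Config.get(String,double) enables:
per-round values in the conf file, e.g. "ram.flush.mb=mb:16:32:64", where the
leading token names the report column as with the existing int and boolean
properties (the class name and values here are illustrative):

    import java.util.StringTokenizer;

    public class ByRoundDoubleDemo {

      // Mirrors propToDoubleArray above: split a colon-separated list into doubles.
      static double[] propToDoubleArray(String s) {
        if (s.indexOf(":") < 0)
          return new double[] { Double.parseDouble(s) };
        StringTokenizer st = new StringTokenizer(s, ":");
        double[] res = new double[st.countTokens()];
        for (int i = 0; st.hasMoreTokens(); i++)
          res[i] = Double.parseDouble(st.nextToken());
        return res;
      }

      public static void main(String[] args) {
        String sval = "mb:16:32:64";                            // raw property value
        String colName = sval.substring(0, sval.indexOf(":"));  // report column: "mb"
        double[] vals = propToDoubleArray(sval.substring(sval.indexOf(":") + 1));
        for (int round = 0; round < 5; round++)
          // rounds cycle through the values: 16, 32, 64, 16, 32
          System.out.println(colName + " round " + round + ": " + vals[round % vals.length]);
      }
    }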
Index: contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
===================================================================
--- contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (revision 576821)
+++ contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (working copy)
@@ -49,18 +49,20 @@
int mrgf = config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR);
int mxbf = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED);
int mxfl = config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH);
- double flushAtRAMUsage = config.get("ram.flush.mb", OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
+ double flushAtRAMUsage = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB);
boolean autoCommit = config.get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT);
IndexWriter iw = new IndexWriter(dir, autoCommit, analyzer, true);
iw.setUseCompoundFile(cmpnd);
iw.setMergeFactor(mrgf);
- iw.setMaxBufferedDocs(mxbf);
iw.setMaxFieldLength(mxfl);
if (flushAtRAMUsage > 0)
iw.setRAMBufferSizeMB(flushAtRAMUsage);
-
+ else if (mxbf != 0)
+ iw.setMaxBufferedDocs(mxbf);
+ else
+ throw new RuntimeException("either max.buffered or ram.flush.mb must be non-zero");
getRunData().setIndexWriter(iw);
return 1;
}
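
Both benchmark tasks now enforce the same flush-trigger rule; a sketch of it
in isolation (the helper class and its names are not part of the patch):

    import org.apache.lucene.index.IndexWriter;

    public class FlushTriggerDemo {
      // Prefer the RAM trigger when ram.flush.mb > 0, fall back to the
      // doc-count trigger, and reject a config that disables both.
      static void configureFlush(IndexWriter writer, double ramFlushMB, int maxBuffered) {
        if (ramFlushMB > 0)
          writer.setRAMBufferSizeMB(ramFlushMB);
        else if (maxBuffered != 0)
          writer.setMaxBufferedDocs(maxBuffered);
        else
          throw new RuntimeException("either max.buffered or ram.flush.mb must be non-zero");
      }
    }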