Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 791657)
+++ CHANGES.txt (working copy)
@@ -127,6 +127,9 @@
is failing to close reader/writers. (Brian Groose via Mike
McCandless)
+ 9. LUCENE-1717: Fixed IndexWriter to account for RAM usage of
+ buffered deletions. (Mike McCandless)
+
API Changes
1. LUCENE-1419: Add expert API to set custom indexing chain. This API is
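To illustrate the user-visible effect of the CHANGES entry above: with this fix, a delete-only workload is bounded by the RAM buffer. The sketch below is illustrative rather than part of the patch; the index path and the "id" field are placeholders, and it assumes the 2.9-era IndexWriter API.

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class DeleteOnlyWorkload {
      public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.getDirectory("/tmp/test-index");  // placeholder path
        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(),
                                             IndexWriter.MaxFieldLength.LIMITED);
        // Flush whenever buffered state reaches ~16 MB.  Before
        // LUCENE-1717, buffered delete Terms did not count toward
        // this budget, so this loop could grow RAM without bound.
        writer.setRAMBufferSizeMB(16.0);
        for (int i = 0; i < 1000000; i++)
          writer.deleteDocuments(new Term("id", Integer.toString(i)));
        writer.close();
      }
    }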
Index: src/java/org/apache/lucene/index/BufferedDeletes.java
===================================================================
--- src/java/org/apache/lucene/index/BufferedDeletes.java (revision 791657)
+++ src/java/org/apache/lucene/index/BufferedDeletes.java (working copy)
@@ -35,6 +35,7 @@
HashMap terms = new HashMap();
HashMap queries = new HashMap();
List docIDs = new ArrayList();
+ long bytesUsed;
// Number of documents a delete term applies to.
final static class Num {
@@ -60,17 +61,21 @@
}
}
+ int size() {
+ // We use numTerms not terms.size() intentionally, so
+ // that deletes by the same term multiple times "count",
+ // ie if you ask to flush every 1000 deletes then even
+ // dup'd terms are counted towards that 1000
+ return numTerms + queries.size() + docIDs.size();
+ }
-
void update(BufferedDeletes in) {
numTerms += in.numTerms;
+ bytesUsed += in.bytesUsed;
terms.putAll(in.terms);
queries.putAll(in.queries);
docIDs.addAll(in.docIDs);
- in.terms.clear();
- in.numTerms = 0;
- in.queries.clear();
- in.docIDs.clear();
+ in.clear();
}
void clear() {
@@ -78,8 +83,13 @@
queries.clear();
docIDs.clear();
numTerms = 0;
+ bytesUsed = 0;
}
+ void addBytesUsed(long b) {
+ bytesUsed += b;
+ }
+
boolean any() {
return terms.size() > 0 || docIDs.size() > 0 || queries.size() > 0;
}
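The accounting pattern above: every container mutation is paired with a bytesUsed adjustment, update() transfers the byte count along with the entries, and clear() resets both together; size() intentionally uses numTerms so duplicate delete Terms still advance the maxBufferedDeleteTerms trigger. A reduced standalone sketch of that invariant (PendingDeletesSketch is a hypothetical class, not in the patch):

    import java.util.ArrayList;
    import java.util.List;

    class PendingDeletesSketch {
      final List<String> terms = new ArrayList<String>();
      long bytesUsed;              // approximate RAM held by the entries

      void add(String term, long approxBytes) {
        terms.add(term);
        bytesUsed += approxBytes;  // every mutation also charges bytes
      }

      // Merge another buffer into this one; the byte count must
      // travel with the entries or the global accounting drifts.
      void update(PendingDeletesSketch in) {
        terms.addAll(in.terms);
        bytesUsed += in.bytesUsed;
        in.clear();
      }

      void clear() {
        terms.clear();
        bytesUsed = 0;             // entries and bytes reset together
      }
    }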
Index: src/java/org/apache/lucene/index/DocumentsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 791657)
+++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy)
@@ -38,6 +38,7 @@
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Constants;
/**
* This class accepts multiple added documents and directly
@@ -887,10 +888,27 @@
}
synchronized boolean deletesFull() {
- return maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH
- && ((deletesInRAM.numTerms + deletesInRAM.queries.size() + deletesInRAM.docIDs.size()) >= maxBufferedDeleteTerms);
+ return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+ (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + numBytesUsed) >= ramBufferSize) ||
+ (maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+ ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
}
+ synchronized boolean doApplyDeletes() {
+ // Very similar to deletesFull(), except we don't count
+ // numBytesUsed, because we are checking whether
+ // deletes (alone) are consuming too many resources now
+ // and thus should be applied. We apply deletes if RAM
+ // usage is > 1/2 of our allowed RAM buffer, to prevent
+ // too-frequent flushing of a long tail of tiny segments
+ // when merges (which always apply deletes) are
+ // infrequent.
+ return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+ (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed) >= ramBufferSize/2) ||
+ (maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+ ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
+ }
+
synchronized private boolean timeToFlushDeletes() {
return (bufferIsFull || deletesFull()) && setFlushPending();
}
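A reduced sketch of the two predicates above (the class name and the 16 MB budget are illustrative): deletesFull() measures deletes plus buffered documents against the whole budget to decide whether to flush, while doApplyDeletes() measures deletes alone against half the budget to decide whether that flush should also resolve them to on-disk docIDs.

    class FlushPolicySketch {
      static final long RAM_BUFFER = 16L * 1024 * 1024;  // illustrative 16 MB

      // Flush: documents plus buffered deletes exhaust the budget.
      static boolean deletesFull(long deleteBytes, long docBytes) {
        return deleteBytes + docBytes >= RAM_BUFFER;
      }

      // Apply deletes only once they alone hold more than half the
      // budget; merges apply them anyway, so small flushes need not.
      static boolean doApplyDeletes(long deleteBytes) {
        return deleteBytes >= RAM_BUFFER / 2;
      }
    }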
@@ -1015,20 +1033,24 @@
else
num.setNum(docIDUpto);
deletesInRAM.numTerms++;
+
+ deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.text.length()*CHAR_NUM_BYTE);
}
// Buffer a specific docID for deletion. Currently only
// used when we hit an exception when adding a document
synchronized private void addDeleteDocID(int docID) {
deletesInRAM.docIDs.add(new Integer(flushedDocCount+docID));
+ deletesInRAM.addBytesUsed(BYTES_PER_DEL_DOCID);
}
synchronized private void addDeleteQuery(Query query, int docID) {
deletesInRAM.queries.put(query, new Integer(flushedDocCount + docID));
+ deletesInRAM.addBytesUsed(BYTES_PER_DEL_QUERY);
}
synchronized boolean doBalanceRAM() {
- return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger);
+ return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed+deletesInRAM.bytesUsed+deletesFlushed.bytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger);
}
/** Does the synchronized work to finish/flush the
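Plugging in the constants defined later in this patch, the charge taken by addDeleteTerm() works out to 8*4 + 5*8 + 6*4 = 96 bytes of fixed overhead per buffered Term on a 32-bit JRE (128 bytes on 64-bit), plus 2 bytes per character of term text. A back-of-envelope check (hypothetical helper, not code from the patch; the boolean stands in for Constants.JRE_IS_64BIT):

    class DeleteTermCost {
      static long bytesPerDelTerm(boolean is64bit, int termTextLength) {
        final int pointer = is64bit ? 8 : 4;
        final int objHeader = 8, intBytes = 4, charBytes = 2;
        return 8 * pointer + 5 * objHeader + 6 * intBytes   // fixed overhead
             + termTextLength * charBytes;                  // term text
      }

      public static void main(String[] args) {
        System.out.println(bytesPerDelTerm(false, 20));  // 96 + 40 = 136
        System.out.println(bytesPerDelTerm(true, 20));   // 128 + 40 = 168
      }
    }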
@@ -1044,7 +1066,6 @@
assert docWriter == null || docWriter.docID == perThread.docState.docID;
-
if (aborting) {
// We are currently aborting, and another thread is
@@ -1109,7 +1130,7 @@
final SkipDocWriter skipDocWriter = new SkipDocWriter();
long getRAMUsed() {
- return numBytesUsed;
+ return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed;
}
long numBytesAlloc;
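IndexWriter.ramSizeInBytes() returns this figure, so applications that manage their own flushing can now observe delete RAM too. A minimal sketch (the RamWatcher helper and the caller-chosen ceiling are assumptions, not part of the patch):

    import java.io.IOException;
    import org.apache.lucene.index.IndexWriter;

    class RamWatcher {
      // Commit once combined document + delete RAM passes a ceiling.
      static void maybeCommit(IndexWriter writer, long ceilingBytes)
          throws IOException {
        if (writer.ramSizeInBytes() > ceilingBytes)
          writer.commit();  // applies buffered deletes, freeing their RAM
      }
    }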
@@ -1137,10 +1158,34 @@
// Coarse estimates used to measure RAM usage of buffered deletes
final static int OBJECT_HEADER_BYTES = 8;
- final static int POINTER_NUM_BYTE = 4;
+ final static int POINTER_NUM_BYTE = Constants.JRE_IS_64BIT ? 8 : 4;
final static int INT_NUM_BYTE = 4;
final static int CHAR_NUM_BYTE = 2;
+ /* Rough logic: HashMap has an array[Entry] w/ varying
+ load factor (say 2 * POINTER). Entry is object w/ Term
+ key, BufferedDeletes.Num val, int hash, Entry next
+ (OBJ_HEADER + 3*POINTER + INT). Term is object w/
+ String field and String text (OBJ_HEADER + 2*POINTER).
+ We don't count Term's field since it's interned.
+ Term's text is String (OBJ_HEADER + 4*INT + POINTER +
+ OBJ_HEADER + string.length*CHAR). BufferedDeletes.num is
+ OBJ_HEADER + INT. */
+
+ final static int BYTES_PER_DEL_TERM = 8*POINTER_NUM_BYTE + 5*OBJECT_HEADER_BYTES + 6*INT_NUM_BYTE;
+
+ /* Rough logic: del docIDs are List<Integer>.  Say list
+    allocates ~2X size (2*POINTER).  Integer is
+    OBJ_HEADER + int */
+ final static int BYTES_PER_DEL_DOCID = 2*POINTER_NUM_BYTE + OBJECT_HEADER_BYTES + INT_NUM_BYTE;
+
+ /* Rough logic: HashMap has an array[Entry] w/ varying
+    load factor (say 2 * POINTER).  Entry is object w/
+    Query key, Integer val, int hash, Entry next
+    (OBJ_HEADER + 3*POINTER + INT).  Query we often
+    undercount (say 24 bytes).  Integer is OBJ_HEADER + INT. */
+ final static int BYTES_PER_DEL_QUERY = 5*POINTER_NUM_BYTE + 2*OBJECT_HEADER_BYTES + 2*INT_NUM_BYTE + 24;
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java (revision 791657)
+++ src/java/org/apache/lucene/index/IndexWriter.java (working copy)
@@ ... @@
  * <p> When this is set, the writer will flush whenever
- * buffered documents use this much RAM.  Pass in {@link
- * #DISABLE_AUTO_FLUSH} to prevent triggering a flush due
- * to RAM usage.  Note that if flushing by document count
- * is also enabled, then the flush will be triggered by
- * whichever comes first.
+ * buffered documents and deletions use this much RAM.
+ * Pass in {@link #DISABLE_AUTO_FLUSH} to prevent
+ * triggering a flush due to RAM usage.  Note that if
+ * flushing by document count is also enabled, then the
+ * flush will be triggered by whichever comes first.
+ *
+ * <p><b>NOTE</b>: the account of RAM usage for pending
+ * deletions is only approximate.  Specifically, if you
+ * delete by Query, Lucene currently has no way to measure
+ * the RAM usage of individual Queries so the accounting
+ * will under-estimate and you should compensate by either
+ * calling commit() periodically yourself, or by using
+ * {@link #setMaxBufferedDeleteTerms} to flush by count
+ * instead of RAM usage (each buffered delete Query counts
+ * as one).
  *
  * <p> The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}.
  *
  * @throws IllegalArgumentException if ramBufferSize is
@@ -4089,7 +4100,10 @@
 flushCount++;
- flushDeletes |= docWriter.deletesFull();
+ // If we are flushing because too many deletes
+ // accumulated, then we should apply the deletes to free
+ // RAM:
+ flushDeletes |= docWriter.doApplyDeletes();