Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 791657)
+++ CHANGES.txt (working copy)
@@ -127,6 +127,9 @@
     is failing to close reader/writers.  (Brian Groose via
     Mike McCandless)
 
+ 9. LUCENE-1717: Fixed IndexWriter to account for RAM usage of
+    buffered deletions.  (Mike McCandless)
+
 API Changes
 
  1. LUCENE-1419: Add expert API to set custom indexing chain. This API is
Index: src/java/org/apache/lucene/index/BufferedDeletes.java
===================================================================
--- src/java/org/apache/lucene/index/BufferedDeletes.java (revision 791657)
+++ src/java/org/apache/lucene/index/BufferedDeletes.java (working copy)
@@ -35,6 +35,7 @@
   HashMap terms = new HashMap();
   HashMap queries = new HashMap();
   List docIDs = new ArrayList();
+  long bytesUsed;
 
   // Number of documents a delete term applies to.
   final static class Num {
@@ -60,17 +61,21 @@
     }
   }
 
+  int size() {
+    // We use numTerms not terms.size() intentionally, so
+    // that deletes by the same term multiple times "count",
+    // ie if you ask to flush every 1000 deletes then even
+    // dup'd terms are counted towards that 1000
+    return numTerms + queries.size() + docIDs.size();
+  }
-
   void update(BufferedDeletes in) {
     numTerms += in.numTerms;
+    bytesUsed += in.bytesUsed;
     terms.putAll(in.terms);
     queries.putAll(in.queries);
     docIDs.addAll(in.docIDs);
-    in.terms.clear();
-    in.numTerms = 0;
-    in.queries.clear();
-    in.docIDs.clear();
+    in.clear();
   }
 
   void clear() {
@@ -78,8 +83,13 @@
     queries.clear();
     docIDs.clear();
     numTerms = 0;
+    bytesUsed = 0;
   }
 
+  void addBytesUsed(long b) {
+    bytesUsed += b;
+  }
+
   boolean any() {
     return terms.size() > 0 || docIDs.size() > 0 || queries.size() > 0;
   }
Index: src/java/org/apache/lucene/index/DocumentsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 791657)
+++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy)
@@ -38,6 +38,7 @@
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Constants;
 
 /**
  * This class accepts multiple added documents and directly
@@ -887,10 +888,27 @@
   }
 
   synchronized boolean deletesFull() {
-    return maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH
-      && ((deletesInRAM.numTerms + deletesInRAM.queries.size() + deletesInRAM.docIDs.size()) >= maxBufferedDeleteTerms);
+    return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+            (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + numBytesUsed) >= ramBufferSize) ||
+      (maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+       ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
   }
 
+  synchronized boolean doApplyDeletes() {
+    // Very similar to deletesFull(), except we don't count
+    // numBytesAlloc, because we are checking whether
+    // deletes (alone) are consuming too many resources now
+    // and thus should be applied.  We apply deletes if RAM
+    // usage is > 1/2 of our allowed RAM buffer, to prevent
+    // too-frequent flushing of a long tail of tiny segments
+    // when merges (which always apply deletes) are
+    // infrequent.
+    return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH &&
+            (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed) >= ramBufferSize/2) ||
+      (maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH &&
+       ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
+  }
+
   synchronized private boolean timeToFlushDeletes() {
     return (bufferIsFull || deletesFull()) && setFlushPending();
   }
@@ -1015,20 +1033,24 @@
     else
       num.setNum(docIDUpto);
     deletesInRAM.numTerms++;
+
+    deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.text.length()*CHAR_NUM_BYTE);
   }
 
   // Buffer a specific docID for deletion.  Currently only
   // used when we hit a exception when adding a document
   synchronized private void addDeleteDocID(int docID) {
     deletesInRAM.docIDs.add(new Integer(flushedDocCount+docID));
+    deletesInRAM.addBytesUsed(BYTES_PER_DEL_DOCID);
   }
 
   synchronized private void addDeleteQuery(Query query, int docID) {
     deletesInRAM.queries.put(query, new Integer(flushedDocCount + docID));
+    deletesInRAM.addBytesUsed(BYTES_PER_DEL_QUERY);
   }
 
   synchronized boolean doBalanceRAM() {
-    return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger);
+    return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed+deletesInRAM.bytesUsed+deletesFlushed.bytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger);
   }
 
   /** Does the synchronized work to finish/flush the
@@ -1044,7 +1066,6 @@
 
     assert docWriter == null || docWriter.docID == perThread.docState.docID;
 
-
     if (aborting) {
 
       // We are currently aborting, and another thread is
@@ -1109,7 +1130,7 @@
   final SkipDocWriter skipDocWriter = new SkipDocWriter();
 
   long getRAMUsed() {
-    return numBytesUsed;
+    return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed;
   }
 
   long numBytesAlloc;
@@ -1137,10 +1158,34 @@
 
   // Coarse estimates used to measure RAM usage of buffered deletes
   final static int OBJECT_HEADER_BYTES = 8;
-  final static int POINTER_NUM_BYTE = 4;
+  final static int POINTER_NUM_BYTE = Constants.JRE_IS_64BIT ? 8 : 4;
   final static int INT_NUM_BYTE = 4;
   final static int CHAR_NUM_BYTE = 2;
 
+  /* Rough logic: HashMap has an array[Entry] w/ varying
+     load factor (say 2 * POINTER).  Entry is object w/ Term
+     key, BufferedDeletes.Num val, int hash, Entry next
+     (OBJ_HEADER + 3*POINTER + INT).  Term is object w/
+     String field and String text (OBJ_HEADER + 2*POINTER).
+     We don't count Term's field since it's interned.
+     Term's text is String (OBJ_HEADER + 4*INT + POINTER +
+     OBJ_HEADER + string.length*CHAR).  BufferedDeletes.num is
+     OBJ_HEADER + INT. */
+
+  final static int BYTES_PER_DEL_TERM = 8*POINTER_NUM_BYTE + 5*OBJECT_HEADER_BYTES + 6*INT_NUM_BYTE;
+
+  /* Rough logic: del docIDs are List<Integer>.  Say list
+     allocates ~2X size (2*POINTER).  Integer is OBJ_HEADER
+     + int */
+  final static int BYTES_PER_DEL_DOCID = 2*POINTER_NUM_BYTE + OBJECT_HEADER_BYTES + INT_NUM_BYTE;
+
+  /* Rough logic: HashMap has an array[Entry] w/ varying
+     load factor (say 2 * POINTER).  Entry is object w/
+     Query key, Integer val, int hash, Entry next
+     (OBJ_HEADER + 3*POINTER + INT).  Query we often
+     undercount (say 24 bytes).  Integer is OBJ_HEADER + INT. */
+
+  final static int BYTES_PER_DEL_QUERY = 5*POINTER_NUM_BYTE + 2*OBJECT_HEADER_BYTES + 2*INT_NUM_BYTE + 24;
+
   /* Initial chunks size of the shared byte[] blocks used to
      store postings data */
   final static int BYTE_BLOCK_SHIFT = 15;
@@ -1285,17 +1330,20 @@
       // We flush when we've used our target usage
       final long flushTrigger = ramBufferSize;
 
-      if (numBytesAlloc > freeTrigger) {
+      final long deletesRAMUsed = deletesInRAM.bytesUsed+deletesFlushed.bytesUsed;
+
+      if (numBytesAlloc+deletesRAMUsed > freeTrigger) {
+
         if (infoStream != null)
           message("  RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) +
                   " vs trigger=" + toMB(flushTrigger) +
                   " allocMB=" + toMB(numBytesAlloc) +
+                  " deletesMB=" + toMB(deletesRAMUsed) +
                   " vs trigger=" + toMB(freeTrigger) +
                   " byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) +
                   " charBlockFree=" + toMB(freeCharBlocks.size()*CHAR_BLOCK_SIZE*CHAR_NUM_BYTE));
 
-        final long startBytesAlloc = numBytesAlloc;
+        final long startBytesAlloc = numBytesAlloc + deletesRAMUsed;
 
         int iter = 0;
@@ -1305,12 +1353,12 @@
 
         boolean any = true;
 
-        while(numBytesAlloc > freeLevel) {
+        while(numBytesAlloc+deletesRAMUsed > freeLevel) {
 
           synchronized(this) {
 
             if (0 == byteBlockAllocator.freeByteBlocks.size() && 0 == freeCharBlocks.size() && 0 == freeIntBlocks.size() && !any) {
 
               // Nothing else to free -- must flush now.
-              bufferIsFull = numBytesUsed > flushTrigger;
+              bufferIsFull = numBytesUsed+deletesRAMUsed > flushTrigger;
               if (infoStream != null) {
                 if (numBytesUsed > flushTrigger)
                   message("  nothing to free; now set bufferIsFull");
@@ -1345,7 +1393,7 @@
         }
 
         if (infoStream != null)
-          message("    after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc)/1024./1024.) + " usedMB=" + nf.format(numBytesUsed/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.));
+          message("    after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.));
 
       } else {
 
         // If we have not crossed the 100% mark, but have
@@ -1355,10 +1403,11 @@
         // flush.
 
         synchronized(this) {
-          if (numBytesUsed > flushTrigger) {
+          if (numBytesUsed+deletesRAMUsed > flushTrigger) {
             if (infoStream != null)
               message("  RAM: now flush @ usedMB=" + nf.format(numBytesUsed/1024./1024.) +
                       " allocMB=" + nf.format(numBytesAlloc/1024./1024.) +
+                      " deletesMB=" + nf.format(deletesRAMUsed/1024./1024.) +
                       " triggerMB=" + nf.format(flushTrigger/1024./1024.));
 
             bufferIsFull = true;
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java (revision 791657)
+++ src/java/org/apache/lucene/index/IndexWriter.java (working copy)
@@ -1729,18 +1729,29 @@
   }
 
   /** Determines the amount of RAM that may be used for
-   * buffering added documents before they are flushed as a
-   * new Segment.  Generally for faster indexing performance
-   * it's best to flush by RAM usage instead of document
-   * count and use as large a RAM buffer as you can.
+   * buffering added documents and deletions before they are
+   * flushed to the Directory.  Generally for faster
+   * indexing performance it's best to flush by RAM usage
+   * instead of document count and use as large a RAM buffer
+   * as you can.
    *
    * <p>When this is set, the writer will flush whenever
-   * buffered documents use this much RAM.  Pass in {@link
-   * #DISABLE_AUTO_FLUSH} to prevent triggering a flush due
-   * to RAM usage.  Note that if flushing by document count
-   * is also enabled, then the flush will be triggered by
-   * whichever comes first.
+   * buffered documents and deletions use this much RAM.
+   * Pass in {@link #DISABLE_AUTO_FLUSH} to prevent
+   * triggering a flush due to RAM usage.  Note that if
+   * flushing by document count is also enabled, then the
+   * flush will be triggered by whichever comes first.
    *
+   * <p><b>NOTE</b>: the accounting of RAM usage for pending
+   * deletions is only approximate.  Specifically, if you
+   * delete by Query, Lucene currently has no way to measure
+   * the RAM usage of individual Queries so the accounting
+   * will under-estimate and you should compensate by either
+   * calling commit() periodically yourself, or by using
+   * {@link #setMaxBufferedDeleteTerms} to flush by count
+   * instead of RAM usage (each buffered delete Query counts
+   * as one).
+   *
    * <p>The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}.
    *
    * @throws IllegalArgumentException if ramBufferSize is
@@ -4089,7 +4100,10 @@
 
     flushCount++;
 
-    flushDeletes |= docWriter.deletesFull();
+    // If we are flushing because too many deletes
+    // accumulated, then we should apply the deletes to free
+    // RAM:
+    flushDeletes |= docWriter.doApplyDeletes();
 
     // When autoCommit=true we must always flush deletes
     // when flushing a segment; otherwise deletes may become
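
For a rough sense of the numbers the new constants produce (this note and the snippet below are not part of the patch; the class name DeleteRAMEstimates is made up for illustration), the formulas above work out to 96 bytes per buffered delete Term on a 32-bit JRE and 128 bytes on a 64-bit JRE (plus 2 bytes per character of the term text), 20/28 bytes per buffered docID, and 68/88 bytes per buffered delete Query:

public class DeleteRAMEstimates {
  public static void main(String[] args) {
    final int objHeader = 8;                        // OBJECT_HEADER_BYTES
    final int intBytes = 4;                         // INT_NUM_BYTE
    for (int pointer : new int[] {4, 8}) {          // POINTER_NUM_BYTE: 32-bit vs 64-bit JRE
      int perTerm  = 8*pointer + 5*objHeader + 6*intBytes;       // BYTES_PER_DEL_TERM
      int perDocID = 2*pointer + objHeader + intBytes;           // BYTES_PER_DEL_DOCID
      int perQuery = 5*pointer + 2*objHeader + 2*intBytes + 24;  // BYTES_PER_DEL_QUERY
      System.out.println((pointer == 4 ? "32-bit" : "64-bit")
          + ": term=" + perTerm + " docID=" + perDocID + " query=" + perQuery);
    }
  }
}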
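
Before this change, deletesFull() only counted the number of buffered delete terms, queries and docIDs, so an application flushing by RAM usage could buffer deletions without bound. The following is a minimal sketch (not part of the patch) of how the new accounting surfaces through the public API, assuming the 2.9-era IndexWriter methods touched by this diff plus ramSizeInBytes(); the class name BufferedDeletesRAMDemo, the "id" field, the 1 MB buffer and the document counts are made up for illustration:

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;

public class BufferedDeletesRAMDemo {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(),
                                         true, IndexWriter.MaxFieldLength.UNLIMITED);
    writer.setInfoStream(System.out);   // optional and very verbose; shows the new deletesMB= figures
    writer.setRAMBufferSizeMB(1.0);     // flush by RAM usage only

    // Index some documents keyed by an "id" field.
    for (int i = 0; i < 100000; i++) {
      Document doc = new Document();
      doc.add(new Field("id", Integer.toString(i),
                        Field.Store.NO, Field.Index.NOT_ANALYZED));
      writer.addDocument(doc);
    }
    writer.commit();

    // Buffer many deletes without adding any documents.  With this patch each
    // buffered Term is charged against the RAM buffer (roughly
    // BYTES_PER_DEL_TERM plus 2 bytes per character of the term text), so
    // ramSizeInBytes() grows and flushes are triggered instead of the
    // buffered deletions accumulating indefinitely.
    for (int i = 0; i < 100000; i++) {
      writer.deleteDocuments(new Term("id", Integer.toString(i)));
      if (i % 20000 == 0)
        System.out.println("i=" + i + " ramSizeInBytes=" + writer.ramSizeInBytes());
    }
    writer.close();
  }
}

With the patch applied, the delete-only loop should hit deletesFull() once the buffered Terms pass the 1 MB budget and flush, which is the behavior the new doApplyDeletes()/deletesFull() checks are meant to provide.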