Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 591357)
+++ CHANGES.txt (working copy)
@@ -205,6 +205,10 @@
     first), by adding symbolic constant DISABLE_AUTO_FLUSH to disable
     one of the flush triggers.  (Ning Li via Mike McCandless)
 
+12. LUCENE-1043: Speed up merging of stored fields by bulk-copying the
+    raw bytes for each non-deleted document.  (Robert Engels via Mike
+    McCandless)
+
 Documentation
 
 Build
Index: src/java/org/apache/lucene/index/FieldsReader.java
===================================================================
--- src/java/org/apache/lucene/index/FieldsReader.java (revision 591357)
+++ src/java/org/apache/lucene/index/FieldsReader.java (working copy)
@@ -48,6 +48,7 @@
   private final IndexInput fieldsStream;
 
   private final IndexInput indexStream;
+  private int numTotalDocs;
   private int size;
   private boolean closed;
@@ -88,6 +89,7 @@
         this.size = (int) (indexStream.length() >> 3);
       }
 
+      numTotalDocs = (int) (indexStream.length() >> 3);
       success = true;
     } finally {
       // With lock-less commits, it's entirely possible (and
@@ -186,6 +188,32 @@
     return doc;
   }
 
+  /** Returns the length in bytes of each raw document in a
+   *  contiguous range of length numDocs starting with
+   *  startDocID.  Returns the IndexInput (the fieldStream),
+   *  already seeked to the starting point for startDocID.*/
+  final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException {
+    indexStream.seek(startDocID * 8L);
+    long startOffset = indexStream.readLong();
+    long lastOffset = startOffset;
+    int count = 0;
+    while (count < numDocs) {
+      final long offset;
+      final int docID = startDocID + count + 1;
+      assert docID <= numTotalDocs;
+      if (docID < numTotalDocs)
+        offset = indexStream.readLong();
+      else
+        offset = fieldsStream.length();
+      lengths[count++] = (int) (offset-lastOffset);
+      lastOffset = offset;
+    }
+
+    fieldsStream.seek(startOffset);
+
+    return fieldsStream;
+  }
+
   /**
    * Skip the field.  We still have to read some of the information about the field, but can skip past the actual content.
    * This will have the most payoff on large fields.
Index: src/java/org/apache/lucene/index/FieldsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/FieldsWriter.java (revision 591357)
+++ src/java/org/apache/lucene/index/FieldsWriter.java (working copy)
@@ -26,6 +26,7 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.IndexInput;
 
 final class FieldsWriter {
@@ -127,6 +128,21 @@
     }
   }
 
+  /** Bulk write a contiguous series of documents.  The
+   *  lengths array is the length (in bytes) of each raw
+   *  document.  The stream IndexInput is the
+   *  fieldsStream from which we should bulk-copy all
+   *  bytes. */
+  final void addRawDocuments(IndexInput stream, int[] lengths, int numDocs) throws IOException {
+    long position = fieldsStream.getFilePointer();
+    long start = position;
+    for(int i=0;i<numDocs;i++) {
+      indexStream.writeLong(position);
+      position += lengths[i];
+    }
+    fieldsStream.copyBytes(stream, position-start);
+    assert fieldsStream.getFilePointer() == position;
+  }
Index: src/java/org/apache/lucene/index/SegmentMerger.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentMerger.java (revision 591357)
+++ src/java/org/apache/lucene/index/SegmentMerger.java (working copy)
+    // If the i'th reader is a SegmentReader and has
+    // identical fieldName -> number mapping, then this
+    // array will be non-null at position i:
+    SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.size()];
+
+    for (int i = 0; i < readers.size(); i++) {
+      IndexReader reader = (IndexReader) readers.elementAt(i);
+      boolean same = reader.getFieldNames(IndexReader.FieldOption.ALL).size() == fieldInfos.size() && reader instanceof SegmentReader;
+      if (same) {
+        SegmentReader segmentReader = (SegmentReader) reader;
+        for (int j = 0; same && j < fieldInfos.size(); j++)
+          same = fieldInfos.fieldName(j).equals(segmentReader.getFieldInfos().fieldName(j));
+        if (same)
+          matchingSegmentReaders[i] = segmentReader;
+      }
+    }
+
+    // Used for bulk-reading raw bytes for stored fields
+    final int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
+
+    // merge field values
+    final FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
+
     // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
     // in merge mode, we use this FieldSelector
     FieldSelector fieldSelectorMerge = new FieldSelector() {
@@ -238,13 +272,38 @@
 
     try {
       for (int i = 0; i < readers.size(); i++) {
-        IndexReader reader = (IndexReader) readers.elementAt(i);
-        int maxDoc = reader.maxDoc();
-        for (int j = 0; j < maxDoc; j++)
-          if (!reader.isDeleted(j)) { // skip deleted docs
-            fieldsWriter.addDocument(reader.document(j, fieldSelectorMerge));
-            docCount++;
-          }
+        final IndexReader reader = (IndexReader) readers.elementAt(i);
+        final SegmentReader matchingSegmentReader = matchingSegmentReaders[i];
+        final FieldsReader matchingFieldsReader;
+        if (matchingSegmentReader != null)
+          matchingFieldsReader = matchingSegmentReader.getFieldsReader();
+        else
+          matchingFieldsReader = null;
+        final int maxDoc = reader.maxDoc();
+        for (int j = 0; j < maxDoc;) {
+          if (!reader.isDeleted(j)) { // skip deleted docs
+            if (matchingSegmentReader != null) {
+              // We can optimize this case (doing a bulk
+              // byte copy) since the field numbers are
+              // identical
+              int start = j;
+              int numDocs = 0;
+              do {
+                j++;
+                numDocs++;
+              } while(j < maxDoc && !matchingSegmentReader.isDeleted(j) && numDocs < MAX_RAW_MERGE_DOCS);
+
+              IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, start, numDocs);
+              fieldsWriter.addRawDocuments(stream, rawDocLengths, numDocs);
+              docCount += numDocs;
+            } else {
+              fieldsWriter.addDocument(reader.document(j, fieldSelectorMerge));
+              j++;
+              docCount++;
+            }
+          } else
+            j++;
+        }
       }
     } finally {
       fieldsWriter.close();
Index: src/java/org/apache/lucene/index/SegmentReader.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentReader.java (revision 591357)
+++ src/java/org/apache/lucene/index/SegmentReader.java (working copy)
@@ -314,6 +314,10 @@
     undeleteAll = false;
   }
 
+  FieldsReader getFieldsReader() {
+    return fieldsReader;
+  }
+
   protected void doClose() throws IOException {
     if (fieldsReader != null) {
       fieldsReader.close();
@@ -388,6 +392,10 @@
     return tis.terms(t);
   }
 
+  FieldInfos getFieldInfos() {
+    return fieldInfos;
+  }
+
   /**
    * @throws CorruptIndexException if the index is corrupt
   * @throws IOException if there is a low-level IO error
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java (revision 591357)
+++ src/java/org/apache/lucene/index/IndexWriter.java (working copy)
@@ -2135,11 +2135,6 @@
     message("flush at addIndexesNoOptimize");
     flush();
 
-    /* new merge policy
-    if (startUpperBound == 0)
-      startUpperBound = 10;
-    */
-
     boolean success = false;
 
     startTransaction();
Index: src/java/org/apache/lucene/store/IndexOutput.java
===================================================================
--- src/java/org/apache/lucene/store/IndexOutput.java (revision 591357)
+++ src/java/org/apache/lucene/store/IndexOutput.java (working copy)
@@ -149,7 +149,26 @@
     }
   }
 
+  private static int COPY_BUFFER_SIZE = 16384;
+  private byte[] copyBuffer;
+  /** Copy numBytes bytes from input to ourself. */
+  public void copyBytes(IndexInput input, long numBytes) throws IOException {
+    long left = numBytes;
+    if (copyBuffer == null)
+      copyBuffer = new byte[COPY_BUFFER_SIZE];
+    while(left > 0) {
+      final int toCopy;
+      if (left > COPY_BUFFER_SIZE)
+        toCopy = COPY_BUFFER_SIZE;
+      else
+        toCopy = (int) left;
+      input.readBytes(copyBuffer, 0, toCopy);
+      writeBytes(copyBuffer, 0, toCopy);
+      left -= toCopy;
+    }
+  }
+
   /** Forces any buffered output to be written. */
   public abstract void flush() throws IOException;
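
Note on the approach, for readers skimming the patch: in mergeFields, whenever the incoming reader is a SegmentReader whose fieldName -> number mapping matches the merged FieldInfos, runs of contiguous non-deleted documents are copied as raw bytes (FieldsReader.rawDocs -> FieldsWriter.addRawDocuments -> IndexOutput.copyBytes) instead of being materialized as Document objects and re-encoded field by field. Below is a standalone sketch of that pattern, not Lucene code: it uses plain java.io streams, a boolean[] for deletions, and a made-up MAX_RUN in place of MAX_RAW_MERGE_DOCS, just to show the run detection plus chunked buffer copy.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

// Standalone illustration only -- the stream types, sizes and names here are
// hypothetical stand-ins, not the Lucene classes touched by the patch.
public class BulkCopySketch {

  static final int MAX_RUN = 4;                 // stands in for MAX_RAW_MERGE_DOCS
  static final int COPY_BUFFER_SIZE = 16384;    // same buffer size the patch uses in IndexOutput
  static final byte[] copyBuffer = new byte[COPY_BUFFER_SIZE];

  // Chunked copy: same shape as the new IndexOutput.copyBytes.
  static void copyBytes(InputStream in, OutputStream out, long numBytes) throws IOException {
    long left = numBytes;
    while (left > 0) {
      int toCopy = left > COPY_BUFFER_SIZE ? COPY_BUFFER_SIZE : (int) left;
      int read = in.read(copyBuffer, 0, toCopy);
      if (read == -1)
        throw new IOException("unexpected end of stream");
      out.write(copyBuffer, 0, read);
      left -= read;
    }
  }

  public static void main(String[] args) throws IOException {
    // One "stored fields" file: document i occupies docLengths[i] bytes,
    // and offsets[i] is its start position (what the fields index records).
    int[] docLengths = {3, 5, 2, 4, 6};
    boolean[] deleted = {false, false, true, false, false};
    long[] offsets = new long[docLengths.length + 1];
    for (int i = 0; i < docLengths.length; i++)
      offsets[i + 1] = offsets[i] + docLengths[i];

    byte[] fieldsFile = new byte[(int) offsets[docLengths.length]];
    for (int i = 0; i < fieldsFile.length; i++)
      fieldsFile[i] = (byte) i;

    InputStream in = new ByteArrayInputStream(fieldsFile);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    long inPos = 0;                              // current read position in fieldsFile

    int j = 0;
    while (j < docLengths.length) {
      if (deleted[j]) {                          // skip deleted docs
        j++;
        continue;
      }
      // Grow the run across contiguous non-deleted docs, capped at MAX_RUN,
      // like the do/while in the patched SegmentMerger loop.
      int start = j;
      int numDocs = 0;
      do {
        j++;
        numDocs++;
      } while (j < docLengths.length && !deleted[j] && numDocs < MAX_RUN);

      in.skip(offsets[start] - inPos);           // seek past any deleted gap
      long numBytes = offsets[start + numDocs] - offsets[start];
      copyBytes(in, out, numBytes);              // one bulk copy for the whole run
      inPos = offsets[start + numDocs];
    }
    System.out.println("copied " + out.size() + " of " + fieldsFile.length + " bytes");
  }
}

The savings on the merge path come from skipping the per-field decode/encode entirely; the run-length cap keeps the per-run lengths array (rawDocLengths in the patch) small and fixed-size.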