Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 541583)
+++ CHANGES.txt	(working copy)
@@ -185,6 +185,14 @@
     Together with LUCENE-888 this will allow to adjust the buffer size
     dynamically. (Paul Elschot, Michael Busch)
 
+ 6. LUCENE-888: Increase buffer sizes inside CompoundFileWriter and
+    BufferedIndexOutput.  Also increase buffer size in
+    BufferedIndexInput, but only when used during merging.  Together,
+    these increases yield 10-18% overall performance gain vs the
+    previous 1K defaults.  Added optional readBufferSize to various
+    methods, and added BufferedIndexInput.setBufferSize to change the
+    buffer size.  (Mike McCandless)
+
 Documentation
 
  1. LUCENE 791 && INFRA-1173: Infrastructure moved the Wiki to
Index: src/test/org/apache/lucene/store/TestBufferedIndexInput.java
===================================================================
--- src/test/org/apache/lucene/store/TestBufferedIndexInput.java	(revision 541583)
+++ src/test/org/apache/lucene/store/TestBufferedIndexInput.java	(working copy)
@@ -1,6 +1,21 @@
 package org.apache.lucene.store;
 
 import java.io.IOException;
+import java.io.File;
+import java.util.List;
+import java.util.Random;
+import java.util.ArrayList;
+import java.util.Iterator;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.util._TestUtil;
 
 import junit.framework.TestCase;
 
@@ -122,4 +137,133 @@
       return len;
     }
   }
+
+  public void testSetBufferSize() throws IOException {
+    File indexDir = new File(System.getProperty("tempDir"), "testSetBufferSize");
+    MockFSDirectory dir = new MockFSDirectory(indexDir);
+    try {
+      IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
+      writer.setUseCompoundFile(false);
+      for(int i=0;i<37;i++) {
+        Document doc = new Document();
+        doc.add(new Field("content", "aaa bbb ccc ddd" + i, Field.Store.YES, Field.Index.TOKENIZED));
+        doc.add(new Field("id", "" + i, Field.Store.YES, Field.Index.TOKENIZED));
+        writer.addDocument(doc);
+      }
+      writer.close();
+
+      dir.allIndexInputs.clear();
+
+      IndexReader reader = IndexReader.open(dir);
+      Term aaa = new Term("content", "aaa");
+      Term bbb = new Term("content", "bbb");
+      Term ccc = new Term("content", "ccc");
+      assertEquals(reader.docFreq(ccc), 37);
+      reader.deleteDocument(0);
+      assertEquals(reader.docFreq(aaa), 37);
+      dir.tweakBufferSizes();
+      reader.deleteDocument(4);
+      assertEquals(reader.docFreq(bbb), 37);
+      dir.tweakBufferSizes();
+
+      IndexSearcher searcher = new IndexSearcher(reader);
+      Hits hits = searcher.search(new TermQuery(bbb));
+      dir.tweakBufferSizes();
+      assertEquals(35, hits.length());
+      dir.tweakBufferSizes();
+      hits = searcher.search(new TermQuery(new Term("id", "33")));
+      dir.tweakBufferSizes();
+      assertEquals(1, hits.length());
+      hits = searcher.search(new TermQuery(aaa));
+      dir.tweakBufferSizes();
+      assertEquals(35, hits.length());
+      searcher.close();
+      reader.close();
+    } finally {
+      _TestUtil.rmDir(indexDir);
+    }
+  }
+
+  private static class MockFSDirectory extends Directory {
+
+    List allIndexInputs = new ArrayList();
+
+    Random rand = new Random();
+
+    private Directory dir;
+
+    public MockFSDirectory(File path) throws IOException {
+      lockFactory = new NoLockFactory();
+      dir = FSDirectory.getDirectory(path);
+    }
+
+    public IndexInput openInput(String name) throws IOException {
+      return openInput(name, BufferedIndexInput.BUFFER_SIZE);
+    }
+
+    public void tweakBufferSizes() {
+      Iterator it = allIndexInputs.iterator();
+      int count = 0;
+      while(it.hasNext()) {
+        BufferedIndexInput bii = (BufferedIndexInput) it.next();
+        int bufferSize = 1024+(int) Math.abs(rand.nextInt() % 32768);
+        bii.setBufferSize(bufferSize);
+        count++;
+      }
+      //System.out.println("tweak'd " + count + " buffer sizes");
+    }
+
+    public IndexInput openInput(String name, int bufferSize) throws IOException {
+      // Make random changes to buffer size
+      bufferSize = 1+(int) Math.abs(rand.nextInt() % 10);
+      IndexInput f = dir.openInput(name, bufferSize);
+      allIndexInputs.add(f);
+      return f;
+    }
+
+    public IndexOutput createOutput(String name) throws IOException {
+      return dir.createOutput(name);
+    }
+
+    public void close() throws IOException {
+      dir.close();
+    }
+
+    public void deleteFile(String name)
+      throws IOException
+    {
+      dir.deleteFile(name);
+    }
+    public void touchFile(String name)
+      throws IOException
+    {
+      dir.touchFile(name);
+    }
+    public long fileModified(String name)
+      throws IOException
+    {
+      return dir.fileModified(name);
+    }
+    public boolean fileExists(String name)
+      throws IOException
+    {
+      return dir.fileExists(name);
+    }
+    public String[] list()
+      throws IOException
+    {
+      return dir.list();
+    }
+
+    public long fileLength(String name) throws IOException {
+      return dir.fileLength(name);
+    }
+    public void renameFile(String from, String to)
+      throws IOException
+    {
+      dir.renameFile(from, to);
+    }
+
+
+  }
 }
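
From the caller's side, the two new public hooks are the openInput(name, bufferSize) overload and BufferedIndexInput.setBufferSize, which the test above exercises through MockFSDirectory. A minimal standalone sketch of how they combine; the index path and the 4096/8192 values are illustrative assumptions, not part of the patch:

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.store.BufferedIndexInput;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.IndexInput;

    public class BufferSizeDemo {
      public static void main(String[] args) throws IOException {
        // Assumes an index already exists at this (hypothetical) path.
        FSDirectory dir = FSDirectory.getDirectory(new File("/tmp/demo-index"));
        // New overload: ask for a 4K read buffer instead of the 1K default.
        IndexInput in = dir.openInput("segments", 4096);
        try {
          if (in instanceof BufferedIndexInput) {
            // New method: resize mid-stream; bytes already buffered at the
            // current position are carried over into the new buffer.
            ((BufferedIndexInput) in).setBufferSize(8192);
          }
          System.out.println("first byte: " + in.readByte());
        } finally {
          in.close();
          dir.close();
        }
      }
    }
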
Index: src/java/org/apache/lucene/index/CompoundFileReader.java
===================================================================
--- src/java/org/apache/lucene/index/CompoundFileReader.java	(revision 541583)
+++ src/java/org/apache/lucene/index/CompoundFileReader.java	(working copy)
@@ -37,6 +37,8 @@
  */
 class CompoundFileReader extends Directory {
 
+    private int readBufferSize;
+
     private static final class FileEntry {
         long offset;
         long length;
@@ -51,16 +53,21 @@
 
     private HashMap entries = new HashMap();
 
-    public CompoundFileReader(Directory dir, String name)
+    public CompoundFileReader(Directory dir, String name) throws IOException {
+        this(dir, name, BufferedIndexInput.BUFFER_SIZE);
+    }
+
+    public CompoundFileReader(Directory dir, String name, int readBufferSize)
         throws IOException
     {
         directory = dir;
         fileName = name;
+        this.readBufferSize = readBufferSize;
 
         boolean success = false;
 
         try {
-            stream = dir.openInput(name);
+            stream = dir.openInput(name, readBufferSize);
 
             // read the directory and init files
             int count = stream.readVInt();
@@ -115,6 +122,13 @@
     public synchronized IndexInput openInput(String id)
         throws IOException
     {
+        // Default to readBufferSize passed in when we were opened
+        return openInput(id, readBufferSize);
+    }
+
+    public synchronized IndexInput openInput(String id, int readBufferSize)
+        throws IOException
+    {
         if (stream == null)
             throw new IOException("Stream closed");
 
@@ -122,7 +136,7 @@
         if (entry == null)
             throw new IOException("No sub-file with id " + id + " found");
 
-        return new CSIndexInput(stream, entry.offset, entry.length);
+        return new CSIndexInput(stream, entry.offset, entry.length, readBufferSize);
     }
 
     /** Returns an array of strings, one for each file in the directory. */
@@ -198,6 +212,12 @@
         CSIndexInput(final IndexInput base, final long fileOffset, final long length)
         {
+            this(base, fileOffset, length, BufferedIndexInput.BUFFER_SIZE);
+        }
+
+        CSIndexInput(final IndexInput base, final long fileOffset, final long length, int readBufferSize)
+        {
+            super(readBufferSize);
             this.base = base;
             this.fileOffset = fileOffset;
             this.length = length;
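
CSIndexInput shows the plumbing pattern this patch repeats throughout: keep the old constructor, have it forward the 1024-byte default, and let the new one hand readBufferSize to the BufferedIndexInput(int) super-constructor. A self-contained sketch of the same pattern, using a hypothetical input whose bytes all read as zero:

    import java.io.IOException;

    import org.apache.lucene.store.BufferedIndexInput;

    /** Hypothetical input, for illustration; only the buffer-size
     *  plumbing mirrors CSIndexInput and FSIndexInput. */
    class ConstantIndexInput extends BufferedIndexInput {
      private final long length;

      ConstantIndexInput(long length) {
        this(length, BufferedIndexInput.BUFFER_SIZE);   // old-style default
      }

      ConstantIndexInput(long length, int readBufferSize) {
        super(readBufferSize);   // sizes the lazily allocated buffer
        this.length = length;
      }

      protected void readInternal(byte[] b, int offset, int len) {
        for (int i = 0; i < len; i++)
          b[offset + i] = 0;     // every byte reads as zero
      }

      protected void seekInternal(long pos) {}

      public long length() { return length; }

      public void close() {}
    }
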
Index: src/java/org/apache/lucene/index/CompoundFileWriter.java
===================================================================
--- src/java/org/apache/lucene/index/CompoundFileWriter.java	(revision 541583)
+++ src/java/org/apache/lucene/index/CompoundFileWriter.java	(working copy)
@@ -161,7 +161,7 @@
 
         // Open the files and copy their data into the stream.
         // Remember the locations of each file's data section.
-        byte buffer[] = new byte[1024];
+        byte buffer[] = new byte[16384];
         it = entries.iterator();
         while(it.hasNext()) {
             FileEntry fe = (FileEntry) it.next();
Index: src/java/org/apache/lucene/index/FieldsReader.java
===================================================================
--- src/java/org/apache/lucene/index/FieldsReader.java	(revision 541583)
+++ src/java/org/apache/lucene/index/FieldsReader.java	(working copy)
@@ -22,6 +22,7 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.store.BufferedIndexInput;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
@@ -53,11 +54,15 @@
   private ThreadLocal fieldsStreamTL = new ThreadLocal();
 
   FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
+    this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE);
+  }
+
+  FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException {
     fieldInfos = fn;
 
-    cloneableFieldsStream = d.openInput(segment + ".fdt");
+    cloneableFieldsStream = d.openInput(segment + ".fdt", readBufferSize);
     fieldsStream = (IndexInput)cloneableFieldsStream.clone();
-    indexStream = d.openInput(segment + ".fdx");
+    indexStream = d.openInput(segment + ".fdx", readBufferSize);
 
     size = (int) (indexStream.length() / 8);
   }
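
The CompoundFileWriter hunk only widens the copy buffer from 1K to 16K; the surrounding loop is the usual chunked read/write copy. Sketched for context with a hypothetical helper, not the patch's exact code:

    import java.io.IOException;

    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.store.IndexOutput;

    class CopyHelper {
      /** Copy 'in' to 'out' 16K at a time, the same chunked pattern
       *  CompoundFileWriter's data-section copy uses after this patch. */
      static void copy(IndexInput in, IndexOutput out) throws IOException {
        byte[] buffer = new byte[16384];
        long remaining = in.length();
        while (remaining > 0) {
          int chunk = (int) Math.min(buffer.length, remaining);
          in.readBytes(buffer, 0, chunk);    // fill up to one buffer's worth
          out.writeBytes(buffer, chunk);     // append it to the output
          remaining -= chunk;
        }
      }
    }
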
Index: src/java/org/apache/lucene/index/TermInfosReader.java
===================================================================
--- src/java/org/apache/lucene/index/TermInfosReader.java	(revision 541583)
+++ src/java/org/apache/lucene/index/TermInfosReader.java	(working copy)
@@ -20,6 +20,7 @@
 import java.io.IOException;
 
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.BufferedIndexInput;
 
 /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
  * Directory.  Pairs are accessed either by Term or by ordinal position the
@@ -42,16 +43,21 @@
 
   TermInfosReader(Directory dir, String seg, FieldInfos fis)
        throws CorruptIndexException, IOException {
+    this(dir, seg, fis, BufferedIndexInput.BUFFER_SIZE);
+  }
+
+  TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize)
+       throws CorruptIndexException, IOException {
     directory = dir;
     segment = seg;
     fieldInfos = fis;
 
-    origEnum = new SegmentTermEnum(directory.openInput(segment + ".tis"),
+    origEnum = new SegmentTermEnum(directory.openInput(segment + ".tis", readBufferSize),
                                    fieldInfos, false);
     size = origEnum.size;
 
     indexEnum =
-      new SegmentTermEnum(directory.openInput(segment + ".tii"),
+      new SegmentTermEnum(directory.openInput(segment + ".tii", readBufferSize),
                           fieldInfos, true);
   }
Index: src/java/org/apache/lucene/index/TermVectorsReader.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsReader.java	(revision 541583)
+++ src/java/org/apache/lucene/index/TermVectorsReader.java	(working copy)
@@ -19,6 +19,7 @@
 
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.BufferedIndexInput;
 
 import java.io.IOException;
 
@@ -38,12 +39,17 @@
 
   TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
     throws CorruptIndexException, IOException {
+    this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
+  }
+
+  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
+    throws CorruptIndexException, IOException {
     if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {
-      tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION);
+      tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION, readBufferSize);
      checkValidFormat(tvx);
-      tvd = d.openInput(segment + TermVectorsWriter.TVD_EXTENSION);
+      tvd = d.openInput(segment + TermVectorsWriter.TVD_EXTENSION, readBufferSize);
       tvdFormat = checkValidFormat(tvd);
-      tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION);
+      tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION, readBufferSize);
       tvfFormat = checkValidFormat(tvf);
       size = (int) tvx.length() / 8;
     }
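
FieldsReader, TermInfosReader, and TermVectorsReader all take the same shape: the pre-patch constructor survives unchanged and forwards BufferedIndexInput.BUFFER_SIZE, so only SegmentReader's merge path ever passes anything else. Schematically, with a hypothetical class since the real readers are package-private:

    import org.apache.lucene.store.BufferedIndexInput;

    /** Hypothetical stand-in for the three readers above. */
    class SomeReader {
      private final int readBufferSize;

      SomeReader(String segment) {
        this(segment, BufferedIndexInput.BUFFER_SIZE);  // pre-patch signature, unchanged behavior
      }

      SomeReader(String segment, int readBufferSize) {
        this.readBufferSize = readBufferSize;           // new hook used during merging
      }
    }
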
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java	(revision 541583)
+++ src/java/org/apache/lucene/index/IndexWriter.java	(working copy)
@@ -1785,6 +1785,14 @@
     }
   }
 
+  // The normal read buffer size defaults to 1024, but
+  // increasing this during merging seems to yield
+  // performance gains.  However we don't want to increase
+  // it too much because there are quite a few
+  // BufferedIndexInputs created during merging.  See
+  // LUCENE-888 for details.
+  private final static int MERGE_READ_BUFFER_SIZE = 4096;
+
   /**
    * Merges the named range of segments, replacing them in the stack with a
    * single segment.
@@ -1816,7 +1824,7 @@
       SegmentInfo si = sourceSegments.info(i);
       if (infoStream != null)
         infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
-      IndexReader reader = SegmentReader.get(si); // no need to set deleter (yet)
+      IndexReader reader = SegmentReader.get(si, MERGE_READ_BUFFER_SIZE); // no need to set deleter (yet)
       merger.add(reader);
       if (reader.directory() == this.ramDirectory)
         ramSegmentsToDelete.add(si);
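
Why 4096 here rather than the new 16384 output default: a merge holds many BufferedIndexInputs open at once, so the buffer size multiplies out across segments and files. A rough cost model; the per-segment input count is an assumption for illustration, not taken from the patch:

    public class MergeBufferMath {
      public static void main(String[] args) {
        int mergeFactor = 10;       // segments merged at once (IndexWriter default)
        int inputsPerSegment = 7;   // ~.fdt/.fdx/.tis/.tii/.frq/.prx/norms, a rough guess
        int[] sizes = {1024, 4096, 16384};
        for (int i = 0; i < sizes.length; i++)
          System.out.println(sizes[i] + " B buffers -> "
              + (mergeFactor * inputsPerSegment * sizes[i] / 1024)
              + " KB held during one merge");
        // prints 70 KB, 280 KB, 1120 KB respectively
      }
    }
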
Index: src/java/org/apache/lucene/index/SegmentReader.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentReader.java	(revision 541583)
+++ src/java/org/apache/lucene/index/SegmentReader.java	(working copy)
@@ -23,6 +23,7 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.BufferedIndexInput;
 import org.apache.lucene.util.BitVector;
 
 import java.io.IOException;
@@ -127,16 +128,24 @@
    * @throws IOException if there is a low-level IO error
    */
   public static SegmentReader get(SegmentInfo si) throws CorruptIndexException, IOException {
-    return get(si.dir, si, null, false, false);
+    return get(si.dir, si, null, false, false, BufferedIndexInput.BUFFER_SIZE);
   }
 
   /**
    * @throws CorruptIndexException if the index is corrupt
    * @throws IOException if there is a low-level IO error
    */
+  public static SegmentReader get(SegmentInfo si, int readBufferSize) throws CorruptIndexException, IOException {
+    return get(si.dir, si, null, false, false, readBufferSize);
+  }
+
+  /**
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   */
   public static SegmentReader get(SegmentInfos sis, SegmentInfo si, boolean closeDir) throws CorruptIndexException, IOException {
-    return get(si.dir, si, sis, closeDir, true);
+    return get(si.dir, si, sis, closeDir, true, BufferedIndexInput.BUFFER_SIZE);
   }
 
   /**
@@ -145,7 +154,8 @@
    */
   public static SegmentReader get(Directory dir, SegmentInfo si,
                                   SegmentInfos sis,
-                                  boolean closeDir, boolean ownDir)
+                                  boolean closeDir, boolean ownDir,
+                                  int readBufferSize)
     throws CorruptIndexException, IOException {
     SegmentReader instance;
     try {
@@ -154,11 +164,11 @@
       throw new RuntimeException("cannot load SegmentReader class: " + e, e);
     }
     instance.init(dir, sis, closeDir, ownDir);
-    instance.initialize(si);
+    instance.initialize(si, readBufferSize);
     return instance;
   }
 
-  private void initialize(SegmentInfo si) throws CorruptIndexException, IOException {
+  private void initialize(SegmentInfo si, int readBufferSize) throws CorruptIndexException, IOException {
     segment = si.name;
     this.si = si;
 
@@ -168,20 +178,20 @@
       // Use compound file directory for some files, if it exists
       Directory cfsDir = directory();
       if (si.getUseCompoundFile()) {
-        cfsReader = new CompoundFileReader(directory(), segment + ".cfs");
+        cfsReader = new CompoundFileReader(directory(), segment + ".cfs", readBufferSize);
         cfsDir = cfsReader;
       }
 
       // No compound file exists - use the multi-file format
       fieldInfos = new FieldInfos(cfsDir, segment + ".fnm");
-      fieldsReader = new FieldsReader(cfsDir, segment, fieldInfos);
+      fieldsReader = new FieldsReader(cfsDir, segment, fieldInfos, readBufferSize);
 
       // Verify two sources of "maxDoc" agree:
       if (fieldsReader.size() != si.docCount) {
         throw new CorruptIndexException("doc counts differ for segment " + si.name + ": fieldsReader shows " + fieldsReader.size() + " but segmentInfo shows " + si.docCount);
       }
 
-      tis = new TermInfosReader(cfsDir, segment, fieldInfos);
+      tis = new TermInfosReader(cfsDir, segment, fieldInfos, readBufferSize);
 
       // NOTE: the bitvector is stored using the regular directory, not cfs
       if (hasDeletions(si)) {
@@ -195,12 +205,12 @@
 
       // make sure that all index files have been read or are kept open
       // so that if an index update removes them we'll still have them
-      freqStream = cfsDir.openInput(segment + ".frq");
-      proxStream = cfsDir.openInput(segment + ".prx");
-      openNorms(cfsDir);
+      freqStream = cfsDir.openInput(segment + ".frq", readBufferSize);
+      proxStream = cfsDir.openInput(segment + ".prx", readBufferSize);
+      openNorms(cfsDir, readBufferSize);
 
       if (fieldInfos.hasVectors()) { // open term vector files only as needed
-        termVectorsReaderOrig = new TermVectorsReader(cfsDir, segment, fieldInfos);
+        termVectorsReaderOrig = new TermVectorsReader(cfsDir, segment, fieldInfos, readBufferSize);
       }
       success = true;
     } finally {
@@ -482,7 +492,7 @@
 
   }
 
-  private void openNorms(Directory cfsDir) throws IOException {
+  private void openNorms(Directory cfsDir, int readBufferSize) throws IOException {
     long nextNormSeek = SegmentMerger.NORMS_HEADER.length; //skip header (header unused for now)
     int maxDoc = maxDoc();
     for (int i = 0; i < fieldInfos.size(); i++) {
@@ -502,7 +512,7 @@
         if (singleNormFile) {
           normSeek = nextNormSeek;
           if (singleNormStream==null) {
-            singleNormStream = d.openInput(fileName);
+            singleNormStream = d.openInput(fileName, readBufferSize);
           }
           // All norms in the .nrm file can share a single IndexInput since
           // they are only used in a synchronized context.
Index: src/java/org/apache/lucene/store/Directory.java
===================================================================
--- src/java/org/apache/lucene/store/Directory.java	(revision 541583)
+++ src/java/org/apache/lucene/store/Directory.java	(working copy)
@@ -88,6 +88,17 @@
   public abstract IndexInput openInput(String name)
        throws IOException;
 
+  /** Returns a stream reading an existing file, with the
+   * specified read buffer size.  The particular Directory
+   * implementation may ignore the buffer size.  Currently
+   * the only Directory implementations that respect this
+   * parameter are {@link FSDirectory} and {@link
+   * org.apache.lucene.index.CompoundFileReader}.
+   */
+  public IndexInput openInput(String name, int bufferSize) throws IOException {
+    return openInput(name);
+  }
+
   /** Construct a {@link Lock}.
    * @param name the name of the lock file
    */
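
Because the new Directory.openInput(name, bufferSize) has a concrete default body that forwards to openInput(name), existing Directory subclasses stay source- and behavior-compatible; they simply ignore the hint unless they opt in. A sketch of a pre-patch-style subclass, hypothetical and left abstract to skip the unrelated Directory methods:

    import java.io.IOException;

    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.IndexInput;

    abstract class LegacyDirectory extends Directory {
      public IndexInput openInput(String name) throws IOException {
        throw new IOException("not a real implementation: " + name);
      }
      // No openInput(String, int) override: callers passing a buffer
      // size transparently fall back to the method above, dropping the hint.
    }
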
Index: src/java/org/apache/lucene/store/RAMInputStream.java
===================================================================
--- src/java/org/apache/lucene/store/RAMInputStream.java	(revision 541583)
+++ src/java/org/apache/lucene/store/RAMInputStream.java	(working copy)
@@ -26,7 +26,7 @@
  */
 
 class RAMInputStream extends IndexInput implements Cloneable {
-  static final int BUFFER_SIZE = BufferedIndexOutput.BUFFER_SIZE;
+  static final int BUFFER_SIZE = RAMOutputStream.BUFFER_SIZE;
 
   private RAMFile file;
   private long length;
Index: src/java/org/apache/lucene/store/RAMOutputStream.java
===================================================================
--- src/java/org/apache/lucene/store/RAMOutputStream.java	(revision 541583)
+++ src/java/org/apache/lucene/store/RAMOutputStream.java	(working copy)
@@ -26,7 +26,7 @@
  */
 
 public class RAMOutputStream extends IndexOutput {
-  static final int BUFFER_SIZE = BufferedIndexOutput.BUFFER_SIZE;
+  static final int BUFFER_SIZE = 1024;
 
   private RAMFile file;
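
The RAM streams deliberately decouple from BufferedIndexOutput.BUFFER_SIZE: with the file-output buffer now 16K, pinning RAMOutputStream (and, through it, RAMInputStream) at its own 1024-byte constant avoids inflating per-stream overhead for the many tiny files a RAMDirectory typically holds. For instance, files like this one-byte example are common in RAM-buffered indexing:

    import org.apache.lucene.store.IndexOutput;
    import org.apache.lucene.store.RAMDirectory;

    public class RamBufferDemo {
      public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexOutput out = dir.createOutput("tiny");
        out.writeByte((byte) 42);   // in-memory files are often this small
        out.close();
        System.out.println("length = " + dir.fileLength("tiny"));  // prints 1
        dir.close();
      }
    }
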
Index: src/java/org/apache/lucene/store/BufferedIndexInput.java
===================================================================
--- src/java/org/apache/lucene/store/BufferedIndexInput.java	(revision 541583)
+++ src/java/org/apache/lucene/store/BufferedIndexInput.java	(working copy)
@@ -21,8 +21,12 @@
 
 /** Base implementation class for buffered {@link IndexInput}. */
 public abstract class BufferedIndexInput extends IndexInput {
-  static final int BUFFER_SIZE = BufferedIndexOutput.BUFFER_SIZE;
 
+  /** Default buffer size */
+  public static final int BUFFER_SIZE = 1024;
+
+  private int bufferSize = BUFFER_SIZE;
+
   private byte[] buffer;
 
   private long bufferStart = 0;			  // position in file of buffer
@@ -35,6 +39,50 @@
     return buffer[bufferPosition++];
  }
 
+  public BufferedIndexInput() {}
+
+  /** Inits BufferedIndexInput with a specific bufferSize */
+  public BufferedIndexInput(int bufferSize) {
+    checkBufferSize(bufferSize);
+    this.bufferSize = bufferSize;
+  }
+
+  /** Change the buffer size used by this IndexInput */
+  public void setBufferSize(int newSize) {
+    assert buffer == null || bufferSize == buffer.length;
+    if (newSize != bufferSize) {
+      checkBufferSize(newSize);
+      bufferSize = newSize;
+      if (buffer != null) {
+        // Resize the existing buffer and carefully save as
+        // many bytes as possible starting from the current
+        // bufferPosition
+        byte[] newBuffer = new byte[newSize];
+        final int leftInBuffer = bufferLength-bufferPosition;
+        final int numToCopy;
+        if (leftInBuffer > newSize)
+          numToCopy = newSize;
+        else
+          numToCopy = leftInBuffer;
+        System.arraycopy(buffer, bufferPosition, newBuffer, 0, numToCopy);
+        bufferStart += bufferPosition;
+        bufferPosition = 0;
+        bufferLength = numToCopy;
+        buffer = newBuffer;
+      }
+    }
+  }
+
+  /** Returns buffer size.  @see #setBufferSize */
+  public int getBufferSize() {
+    return bufferSize;
+  }
+
+  private void checkBufferSize(int bufferSize) {
+    if (bufferSize <= 0)
+      throw new IllegalArgumentException("bufferSize must be greater than 0 (got " + bufferSize + ")");
+  }
+
   public void readBytes(byte[] b, int offset, int len) throws IOException {
     if(len <= (bufferLength-bufferPosition)){
       // the buffer contains enough data to satistfy this request
@@ -51,7 +99,7 @@
         bufferPosition += available;
       }
       // and now, read the remaining 'len' bytes:
-      if(len<BUFFER_SIZE){
+      if(len<bufferSize){
         // If the amount left to read is small enough, do it in the usual
         // buffered way: fill the buffer and copy from it:
         refill();
@@ -81,7 +129,7 @@
 
   private void refill() throws IOException {
     long start = bufferStart + bufferPosition;
-    long end = start + BUFFER_SIZE;
+    long end = start + bufferSize;
     if (end > length())				  // don't read past EOF
       end = length();
     bufferLength = (int)(end - start);
@@ -89,7 +137,7 @@
       throw new IOException("read past EOF");
 
     if (buffer == null) {
-      buffer = new byte[bufferSize];		  // allocate buffer lazily
+      buffer = new byte[bufferSize];		  // allocate buffer lazily
       seekInternal(bufferStart);
     }
     readInternal(buffer, 0, bufferLength);
Index: src/java/org/apache/lucene/store/BufferedIndexOutput.java
===================================================================
--- src/java/org/apache/lucene/store/BufferedIndexOutput.java	(revision 541583)
+++ src/java/org/apache/lucene/store/BufferedIndexOutput.java	(working copy)
@@ -21,7 +21,7 @@
 
 /** Base implementation class for buffered {@link IndexOutput}. */
 public abstract class BufferedIndexOutput extends IndexOutput {
-  static final int BUFFER_SIZE = 1024;
+  static final int BUFFER_SIZE = 16384;
 
   private final byte[] buffer = new byte[BUFFER_SIZE];
   private long bufferStart = 0;			  // position in file of buffer
Index: src/java/org/apache/lucene/store/FSDirectory.java
===================================================================
--- src/java/org/apache/lucene/store/FSDirectory.java	(revision 541583)
+++ src/java/org/apache/lucene/store/FSDirectory.java	(working copy)
@@ -435,11 +435,16 @@
     return new FSIndexOutput(file);
   }
 
-  /** Returns a stream reading an existing file. */
+  // Inherit javadoc
   public IndexInput openInput(String name) throws IOException {
     return new FSIndexInput(new File(directory, name));
   }
 
+  // Inherit javadoc
+  public IndexInput openInput(String name, int bufferSize) throws IOException {
+    return new FSIndexInput(new File(directory, name), bufferSize);
+  }
+
   /**
    * So we can do some byte-to-hexchar conversion below
    */
@@ -523,6 +528,11 @@
     boolean isClone;
 
     public FSIndexInput(File path) throws IOException {
+      this(path, BufferedIndexInput.BUFFER_SIZE);
+    }
+
+    public FSIndexInput(File path, int bufferSize) throws IOException {
+      super(bufferSize);
       file = new Descriptor(path, "r");
     }
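
A quick end-to-end check of setBufferSize's carry-over semantics, reusing the hypothetical ConstantIndexInput sketched earlier; assumes both classes sit in the same package:

    public class ResizeDemo {
      public static void main(String[] args) throws Exception {
        ConstantIndexInput in = new ConstantIndexInput(100, 10);
        in.readByte();            // refill() fills the 10-byte buffer; position = 1
        in.setBufferSize(4);      // keeps 4 of the 9 remaining buffered bytes
        System.out.println(in.getBufferSize());  // 4
        System.out.println(in.readByte());       // 0, served from the new buffer
        in.close();
      }
    }

Shrinking mid-stream never loses data: bytes past the current position that do not fit the smaller buffer are simply re-read from the underlying file on the next refill, which is why the test's tweakBufferSizes can resize open inputs at arbitrary points.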