Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 966616) +++ lucene/CHANGES.txt (working copy) @@ -241,6 +241,9 @@ explicitly enable the old behavior with setAutoGeneratePhraseQueries(true) (Robert Muir) +* LUCENE-2537: FSDirectory.copy() implementation was unsafe and could result in + OOM if a large file was copied. (Shai Erera) + New features * LUCENE-2128: Parallelized fetching document frequencies during weight Index: lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java (revision 966616) +++ lucene/src/java/org/apache/lucene/index/CompoundFileWriter.java (working copy) @@ -20,12 +20,13 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IOUtils; + import java.util.LinkedList; import java.util.HashSet; import java.io.IOException; - /** * Combines multiple files into a single compound file. * The file format:
@@ -142,20 +143,17 @@ */ public void close() throws IOException { if (merged) - throw new IllegalStateException( - "Merge already performed"); + throw new IllegalStateException("Merge already performed"); if (entries.isEmpty()) - throw new IllegalStateException( - "No entries to merge have been defined"); + throw new IllegalStateException("No entries to merge have been defined"); merged = true; // open the compound stream - IndexOutput os = null; + IndexOutput os = directory.createOutput(fileName); + IOException priorException = null; try { - os = directory.createOutput(fileName); - // Write the Version info - must be a VInt because CFR reads a VInt // in older versions! os.writeVInt(FORMAT_CURRENT); @@ -185,10 +183,9 @@ // Open the files and copy their data into the stream. // Remember the locations of each file's data section. - byte buffer[] = new byte[16384]; for (FileEntry fe : entries) { fe.dataOffset = os.getFilePointer(); - copyFile(fe, os, buffer); + copyFile(fe, os); } // Write the data offsets into the directory of the compound stream @@ -206,56 +203,37 @@ IndexOutput tmp = os; os = null; tmp.close(); - + } catch (IOException e) { + priorException = e; } finally { - if (os != null) try { os.close(); } catch (IOException e) { } + IOUtils.closeSafely(priorException, os); } } - /** Copy the contents of the file with specified extension into the - * provided output stream. Use the provided buffer for moving data - * to reduce memory allocation. - */ - private void copyFile(FileEntry source, IndexOutput os, byte buffer[]) - throws IOException - { - IndexInput is = null; - try { - long startPtr = os.getFilePointer(); + /** + * Copy the contents of the file with specified extension into the provided + * output stream. + */ + private void copyFile(FileEntry source, IndexOutput os) throws IOException { + IndexInput is = directory.openInput(source.file); + try { + long startPtr = os.getFilePointer(); + long length = is.length(); + os.copyBytes(is, length); - is = directory.openInput(source.file); - long length = is.length(); - long remainder = length; - int chunk = buffer.length; + if (checkAbort != null) { + checkAbort.work(length); + } - while(remainder > 0) { - int len = (int) Math.min(chunk, remainder); - is.readBytes(buffer, 0, len, false); - os.writeBytes(buffer, len); - remainder -= len; - if (checkAbort != null) - // Roughly every 2 MB we will check if - // it's time to abort - checkAbort.work(80); - } + // Verify that the output length diff is equal to original file + long endPtr = os.getFilePointer(); + long diff = endPtr - startPtr; + if (diff != length) + throw new IOException("Difference in the output file offsets " + diff + + " does not match the original file length " + length); - // Verify that remainder is 0 - if (remainder != 0) - throw new IOException( - "Non-zero remainder length after copying: " + remainder - + " (id: " + source.file + ", length: " + length - + ", buffer size: " + chunk + ")"); - - // Verify that the output length diff is equal to original file - long endPtr = os.getFilePointer(); - long diff = endPtr - startPtr; - if (diff != length) - throw new IOException( - "Difference in the output file offsets " + diff - + " does not match the original file length " + length); - - } finally { - if (is != null) is.close(); - } + } finally { + is.close(); } + } } Index: lucene/src/java/org/apache/lucene/store/FSDirectory.java =================================================================== --- lucene/src/java/org/apache/lucene/store/FSDirectory.java (revision 966616) +++ lucene/src/java/org/apache/lucene/store/FSDirectory.java (working copy) @@ -33,6 +33,8 @@ import static java.util.Collections.synchronizedSet; import java.util.HashSet; import java.util.Set; + +import org.apache.lucene.store.SimpleFSDirectory.SimpleFSIndexInput; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util.Constants; @@ -126,6 +128,12 @@ protected final Set staleFiles = synchronizedSet(new HashSet()); // Files written, but not yet sync'ed private int chunkSize = DEFAULT_READ_CHUNK_SIZE; // LUCENE-1566 + /** + * Chunk size used to read when using FileChannel API. If an attempt to read a + * large file is made without limiting the chunk size, an OOM may occur. + */ + private static final long CHANNEL_CHUNK_SIZE = 1 << 21; // Use 2MB chunk size - LUCENE-2537 + // returns the canonical version of the directory, creating it if it doesn't exist. private static File getCanonicalPath(File file) throws IOException { return new File(file.getCanonicalPath()); @@ -441,7 +449,7 @@ try { input = new FileInputStream(new File(directory, src)).getChannel(); output = new FileOutputStream(new File(target.directory, dest)).getChannel(); - output.transferFrom(input, 0, input.size()); + copy(input, output, input.size()); } catch (IOException ioe) { priorException = ioe; } finally { @@ -451,6 +459,25 @@ super.copy(to, src, dest); } } + + /** + * Copies the content of a given {@link FileChannel} to a destination one. The + * copy is done in chunks of 2MB because if transferFrom is used without a + * limit when copying a very large file, then an OOM may be thrown (depends on + * the state of the RAM in the machine, as well as the OS used). Performance + * measurements showed that chunk sizes larger than 2MB do not result in much + * faster file copy, therefore we limit the size to be safe with different + * file sizes and systems. + */ + static void copy(FileChannel input, FileChannel output, long numBytes) throws IOException { + long pos = output.position(); + long writeTo = numBytes + pos; + while (pos < writeTo) { + pos += output.transferFrom(input, pos, Math.min(CHANNEL_CHUNK_SIZE, writeTo - pos)); + } + // transferFrom does not change the position of the channel. Need to change it manually + output.position(pos); + } protected static class FSIndexOutput extends BufferedIndexOutput { private final FSDirectory parent; @@ -472,6 +499,28 @@ } @Override + public void copyBytes(IndexInput input, long numBytes) throws IOException { + // Optimized copy only if the number of bytes to copy is larger than the + // buffer size, and the given IndexInput supports FileChannel copying .. + // NOTE: the below check relies on NIOIndexInput extending Simple. If that + // changes in the future, we should change the check as well. + if (numBytes > BUFFER_SIZE && input instanceof SimpleFSIndexInput) { + // flush any bytes in the buffer + flush(); + // do the optimized copy + FileChannel in = ((SimpleFSIndexInput) input).file.getChannel(); + FileChannel out = file.getChannel(); + copy(in, out, numBytes); + // corrects the position in super (BufferedIndexOutput), so that calls + // to getFilePointer will return the correct pointer. + // Perhaps a specific method is better? + super.seek(out.position()); + } else { + super.copyBytes(input, numBytes); + } + } + + @Override public void close() throws IOException { // only close the file if it has not been closed yet if (isOpen) { Index: lucene/src/java/org/apache/lucene/store/SimpleFSDirectory.java =================================================================== --- lucene/src/java/org/apache/lucene/store/SimpleFSDirectory.java (revision 966616) +++ lucene/src/java/org/apache/lucene/store/SimpleFSDirectory.java (working copy) @@ -125,7 +125,7 @@ final OutOfMemoryError outOfMemoryError = new OutOfMemoryError( "OutOfMemoryError likely caused by the Sun VM Bug described in " + "https://issues.apache.org/jira/browse/LUCENE-1566; try calling FSDirectory.setReadChunkSize " - + "with a a value smaller than the current chunks size (" + chunkSize + ")"); + + "with a value smaller than the current chunks size (" + chunkSize + ")"); outOfMemoryError.initCause(e); throw outOfMemoryError; }