Index: lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java	(revision 1482310)
+++ lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java	(working copy)
@@ -36,15 +36,17 @@
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.MathUtil;
 import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FST.INPUT_TYPE;
 import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.FST.INPUT_TYPE;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
 import org.apache.lucene.util.fst.Util;
 import org.apache.lucene.util.packed.BlockPackedWriter;
 import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
+import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
 import org.apache.lucene.util.packed.PackedInts;
-import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
 
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
+
 /**
  * Writer for {@link Lucene42DocValuesFormat}
  */
@@ -207,6 +209,9 @@
     }
   }
 
+  // nocommit should we fix this to allow bigger byte[]?  or
+  // make diskDV the default dv impl?
+
   @Override
   public void addBinaryField(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
     // write the byte[] data
@@ -216,6 +221,9 @@
     int maxLength = Integer.MIN_VALUE;
     final long startFP = data.getFilePointer();
     for(BytesRef v : values) {
+      if (v.length > (BYTE_BLOCK_SIZE - 2)) {
+        throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + (BYTE_BLOCK_SIZE - 2));
+      }
       minLength = Math.min(minLength, v.length);
       maxLength = Math.max(maxLength, v.length);
       data.writeBytes(v.bytes, v.offset, v.length);
Index: lucene/core/src/java/org/apache/lucene/index/FieldInfo.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/index/FieldInfo.java	(revision 1482310)
+++ lucene/core/src/java/org/apache/lucene/index/FieldInfo.java	(working copy)
@@ -92,21 +92,22 @@
      */
     NUMERIC,
     /**
-     * A per-document byte[].
+     * A per-document byte[].  Values may be larger than
+     * 32766 bytes, but different codecs may enforce their own limits.
      */
     BINARY,
     /**
      * A pre-sorted byte[]. Fields with this type only store distinct byte values
      * and store an additional offset pointer per document to dereference the shared
      * byte[]. The stored byte[] is presorted and allows access via document id,
-     * ordinal and by-value.
+     * ordinal and by-value.  Values must be <= 32766 bytes.
      */
     SORTED,
     /**
      * A pre-sorted Set<byte[]>. Fields with this type only store distinct byte values
      * and store additional offset pointers per document to dereference the shared
      * byte[]s. The stored byte[] is presorted and allows access via document id,
-     * ordinal and by-value.
+     * ordinal and by-value.  Values must be <= 32766 bytes.
      */
     SORTED_SET
   };
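Note: to make the documented per-type limits above concrete, here is a minimal illustrative
sketch (not part of the patch; the class and field names are hypothetical) of how the limits
apply when building a document. BINARY values may exceed 32766 bytes if the codec allows it,
while SORTED and SORTED_SET values must always stay at or below 32766 bytes:

  import org.apache.lucene.document.BinaryDocValuesField;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.SortedDocValuesField;
  import org.apache.lucene.util.BytesRef;

  public class DocValuesTypeLimitsSketch {
    public static void main(String[] args) {
      Document doc = new Document();

      // BINARY: the value may be larger than 32766 bytes; whether it is
      // actually accepted depends on the codec (the default Lucene42
      // format still rejects values above BYTE_BLOCK_SIZE - 2 = 32766).
      doc.add(new BinaryDocValuesField("blob", new BytesRef(new byte[100000])));

      // SORTED (and SORTED_SET): each value must be <= 32766 bytes.
      doc.add(new SortedDocValuesField("sorted", new BytesRef("a small value")));
    }
  }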
Index: lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java	(revision 1482310)
+++ lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java	(working copy)
@@ -22,28 +22,42 @@
 import java.util.NoSuchElementException;
 
 import org.apache.lucene.codecs.DocValuesConsumer;
-import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
-import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.packed.AppendingLongBuffer;
+import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.packed.MonotonicAppendingLongBuffer;
 
-import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
-
-
 /** Buffers up pending byte[] per doc, then flushes when
  *  segment flushes. */
 class BinaryDocValuesWriter extends DocValuesWriter {
 
-  private final ByteBlockPool pool;
-  private final AppendingLongBuffer lengths;
+  /** Maximum length for a binary field; we set this to "a
+   *  bit" below Integer.MAX_VALUE because the exact max
+   *  allowed byte[] is JVM dependent, so we want to avoid
+   *  a case where a large value worked in one JVM but
+   *  failed later at search time with a different JVM. */
+  private static final int MAX_LENGTH = Integer.MAX_VALUE-256;
+
+  // 32 KB block sizes for PagedBytes storage:
+  private final static int BLOCK_BITS = 15;
+
+  private final PagedBytes bytes;
+  private final DataOutput bytesOut;
+
+  private final Counter iwBytesUsed;
+  private final MonotonicAppendingLongBuffer lengths;
   private final FieldInfo fieldInfo;
-  private int addedValues = 0;
+  private int addedValues;
+  private long bytesUsed;
 
   public BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
     this.fieldInfo = fieldInfo;
-    this.pool = new ByteBlockPool(new DirectTrackingAllocator(iwBytesUsed));
-    this.lengths = new AppendingLongBuffer();
+    this.bytes = new PagedBytes(BLOCK_BITS);
+    this.bytesOut = bytes.getDataOutput();
+    this.lengths = new MonotonicAppendingLongBuffer();
+    this.iwBytesUsed = iwBytesUsed;
   }
 
   public void addValue(int docID, BytesRef value) {
@@ -53,10 +67,10 @@
     if (value == null) {
       throw new IllegalArgumentException("field=\"" + fieldInfo.name + "\": null value not allowed");
     }
-    if (value.length > (BYTE_BLOCK_SIZE - 2)) {
-      throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" is too large, must be <= " + (BYTE_BLOCK_SIZE - 2));
+    if (value.length > MAX_LENGTH) {
+      throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" is too large, must be <= " + MAX_LENGTH);
     }
-    
+
     // Fill in any holes:
     while(addedValues < docID) {
       addedValues++;
@@ -64,9 +78,21 @@
     }
     addedValues++;
     lengths.add(value.length);
-    pool.append(value);
+    try {
+      bytesOut.writeBytes(value.bytes, value.offset, value.length);
+    } catch (IOException ioe) {
+      // Should never happen!
+      throw new RuntimeException(ioe);
+    }
+    updateBytesUsed();
   }
 
+  private void updateBytesUsed() {
+    final long newBytesUsed = lengths.ramBytesUsed() + bytes.ramBytesUsed();
+    iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
+    bytesUsed = newBytesUsed;
+  }
+
   @Override
   public void finish(int maxDoc) {
   }
@@ -74,6 +100,7 @@
   @Override
   public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
     final int maxDoc = state.segmentInfo.getDocCount();
+    bytes.freeze(false);
     dvConsumer.addBinaryField(fieldInfo,
                               new Iterable<BytesRef>() {
                                 @Override
@@ -90,11 +117,11 @@
   // iterates over the values we have in ram
   private class BytesIterator implements Iterator<BytesRef> {
     final BytesRef value = new BytesRef();
-    final AppendingLongBuffer.Iterator lengthsIterator = lengths.iterator();
+    final MonotonicAppendingLongBuffer.Iterator lengthsIterator = lengths.iterator();
+    final DataInput bytesIterator = bytes.getDataInput();
     final int size = (int) lengths.size();
     final int maxDoc;
     int upto;
-    long byteOffset;
 
     BytesIterator(int maxDoc) {
       this.maxDoc = maxDoc;
@@ -114,8 +141,12 @@
         int length = (int) lengthsIterator.next();
         value.grow(length);
         value.length = length;
-        pool.readBytes(byteOffset, value.bytes, value.offset, value.length);
-        byteOffset += length;
+        try {
+          bytesIterator.readBytes(value.bytes, value.offset, value.length);
+        } catch (IOException ioe) {
+          // Should never happen!
+          throw new RuntimeException(ioe);
+        }
       } else {
         // This is to handle last N documents not having
         // this DV field in the end of the segment:
Index: lucene/core/src/java/org/apache/lucene/util/PagedBytes.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/PagedBytes.java	(revision 1482310)
+++ lucene/core/src/java/org/apache/lucene/util/PagedBytes.java	(working copy)
@@ -21,6 +21,8 @@
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.IndexInput;
 
 /** Represents a logical byte[] as a series of pages.  You
@@ -34,6 +36,7 @@
 // other "shift/mask big arrays". there are too many of these classes!
 public final class PagedBytes {
   private final List<byte[]> blocks = new ArrayList<byte[]>();
+  // nocommit these are unused?
   private final List<Integer> blockEnd = new ArrayList<Integer>();
   private final int blockSize;
   private final int blockBits;
@@ -42,6 +45,7 @@
   private boolean frozen;
   private int upto;
   private byte[] currentBlock;
+  private final long bytesUsedPerBlock;
 
   private static final byte[] EMPTY_BYTES = new byte[0];
@@ -132,6 +136,7 @@
     this.blockBits = blockBits;
     blockMask = blockSize-1;
     upto = blockSize;
+    bytesUsedPerBlock = blockSize + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + RamUsageEstimator.NUM_BYTES_OBJECT_REF;
   }
 
   /** Read this many bytes from in */
@@ -216,6 +221,11 @@
     }
   }
 
+  /** Return approx RAM usage in bytes. */
+  public long ramBytesUsed() {
+    return (blocks.size() + (currentBlock != null ? 1 : 0)) * bytesUsedPerBlock;
+  }
+
   /** Copy bytes in, writing the length as a 1 or 2 byte
    *  vInt prefix. */
   // TODO: this really needs to be refactored into fieldcacheimpl!
@@ -249,4 +259,148 @@
     return pointer;
   }
 
+
+  public final class PagedBytesDataInput extends DataInput {
+    private int currentBlockIndex;
+    private int currentBlockUpto;
+    private byte[] currentBlock;
+
+    PagedBytesDataInput() {
+      currentBlock = blocks.get(0);
+    }
+
+    @Override
+    public PagedBytesDataInput clone() {
+      PagedBytesDataInput clone = getDataInput();
+      clone.setPosition(getPosition());
+      return clone;
+    }
+
+    /** Returns the current byte position. */
+    public long getPosition() {
+      return (long) currentBlockIndex * blockSize + currentBlockUpto;
+    }
+
+    /** Seek to a position previously obtained from
+     *  {@link #getPosition}. */
+    public void setPosition(long pos) {
+      currentBlockIndex = (int) (pos >> blockBits);
+      currentBlock = blocks.get(currentBlockIndex);
+      currentBlockUpto = (int) (pos & blockMask);
+    }
+
+    @Override
+    public byte readByte() {
+      if (currentBlockUpto == blockSize) {
+        nextBlock();
+      }
+      return currentBlock[currentBlockUpto++];
+    }
+
+    @Override
+    public void readBytes(byte[] b, int offset, int len) {
+      assert b.length >= offset + len;
+      final int offsetEnd = offset + len;
+      while (true) {
+        final int blockLeft = blockSize - currentBlockUpto;
+        final int left = offsetEnd - offset;
+        if (blockLeft < left) {
+          System.arraycopy(currentBlock, currentBlockUpto,
+                           b, offset,
+                           blockLeft);
+          nextBlock();
+          offset += blockLeft;
+        } else {
+          // Last block
+          System.arraycopy(currentBlock, currentBlockUpto,
+                           b, offset,
+                           left);
+          currentBlockUpto += left;
+          break;
+        }
+      }
+    }
+
+    private void nextBlock() {
+      currentBlockIndex++;
+      currentBlockUpto = 0;
+      currentBlock = blocks.get(currentBlockIndex);
+    }
+  }
+
+  public final class PagedBytesDataOutput extends DataOutput {
+    @Override
+    public void writeByte(byte b) {
+      if (upto == blockSize) {
+        if (currentBlock != null) {
+          blocks.add(currentBlock);
+          blockEnd.add(upto);
+        }
+        currentBlock = new byte[blockSize];
+        upto = 0;
+      }
+      currentBlock[upto++] = b;
+    }
+
+    @Override
+    public void writeBytes(byte[] b, int offset, int length) {
+      assert b.length >= offset + length;
+      if (length == 0) {
+        return;
+      }
+
+      if (upto == blockSize) {
+        if (currentBlock != null) {
+          blocks.add(currentBlock);
+          blockEnd.add(upto);
+        }
+        currentBlock = new byte[blockSize];
+        upto = 0;
+      }
+
+      final int offsetEnd = offset + length;
+      while(true) {
+        final int left = offsetEnd - offset;
+        final int blockLeft = blockSize - upto;
+        if (blockLeft < left) {
+          System.arraycopy(b, offset, currentBlock, upto, blockLeft);
+          blocks.add(currentBlock);
+          blockEnd.add(blockSize);
+          currentBlock = new byte[blockSize];
+          upto = 0;
+          offset += blockLeft;
+        } else {
+          // Last block
+          System.arraycopy(b, offset, currentBlock, upto, left);
+          upto += left;
+          break;
+        }
+      }
+    }
+
+    /** Return the current byte position. */
+    public long getPosition() {
+      return getPointer();
+    }
+  }
+
+  /** Returns a DataInput to read values from this
+   *  PagedBytes instance. */
+  public PagedBytesDataInput getDataInput() {
+    if (!frozen) {
+      throw new IllegalStateException("must call freeze() before getDataInput");
+    }
+    return new PagedBytesDataInput();
+  }
+
+  /** Returns a DataOutput that you may use to write into
+   *  this PagedBytes instance.  If you do this, you should
+   *  not call the other writing methods (eg, copy);
+   *  results are undefined.
+   */
+  public PagedBytesDataOutput getDataOutput() {
+    if (frozen) {
+      throw new IllegalStateException("cannot get DataOutput after freeze()");
+    }
+    return new PagedBytesDataOutput();
+  }
 }
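Note: a minimal sketch (not part of the patch; class name and payload sizes are arbitrary)
of how the PagedBytesDataOutput/PagedBytesDataInput pair added above is meant to be used.
It mirrors the write-then-freeze-then-read pattern that the new BinaryDocValuesWriter follows:

  import org.apache.lucene.store.DataInput;
  import org.apache.lucene.store.DataOutput;
  import org.apache.lucene.util.PagedBytes;

  public class PagedBytesDataOutputSketch {
    public static void main(String[] args) throws Exception {
      // Buffer two byte[] values, then read them back in write order:
      PagedBytes pages = new PagedBytes(15);     // 15 bits => 32 KB blocks
      DataOutput out = pages.getDataOutput();

      byte[] big = new byte[70000];              // spans more than two blocks
      byte[] small = new byte[] {1, 2, 3};
      out.writeBytes(big, 0, big.length);
      out.writeBytes(small, 0, small.length);

      pages.freeze(false);                       // no more writes allowed after freeze()

      DataInput in = pages.getDataInput();       // only legal once frozen
      byte[] scratch = new byte[big.length];
      in.readBytes(scratch, 0, big.length);      // first value
      in.readBytes(scratch, 0, small.length);    // second value follows contiguously
    }
  }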
Index: lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java	(revision 1482310)
+++ lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java	(working copy)
@@ -18,6 +18,8 @@
  */
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.atomic.AtomicBoolean;
 
@@ -29,12 +31,14 @@
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.search.FieldCache;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
 
 /**
  *
@@ -326,29 +330,96 @@
     iwriter.close();
     directory.close();
   }
+
+  // nocommit also add explicit test for Facet42DVConsumer (it's
+  // not in the random LTC rotation)
 
   public void testTooLargeBytes() throws IOException {
     Analyzer analyzer = new MockAnalyzer(random());
+    Directory d = newDirectory();
+    boolean doFixed = random().nextBoolean();
+    int numDocs;
+    int fixedLength = 0;
+    if (doFixed) {
+      // Sometimes make all values fixed length since some
+      // codecs have different code paths for this:
+      numDocs = _TestUtil.nextInt(random(), 10, 20);
+      fixedLength = _TestUtil.nextInt(random(), 65537, 256*1024);
+    } else {
+      numDocs = _TestUtil.nextInt(random(), 100, 200);
+    }
+    IndexWriter w = new IndexWriter(d, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+    List<byte[]> docBytes = new ArrayList<byte[]>();
+    long totalBytes = 0;
+    for(int docID=0;docID<numDocs;docID++) {
+      // Must be > 64KB in size to ensure more than 2 pages in
+      // PagedBytes would be needed:
+      int numBytes;
+      if (doFixed) {
+        numBytes = fixedLength;
+      } else if (docID == 0 || random().nextInt(5) == 3) {
+        numBytes = _TestUtil.nextInt(random(), 65537, 3*1024*1024);
+      } else {
+        numBytes = _TestUtil.nextInt(random(), 1, 1024*1024);
+      }
+      totalBytes += numBytes;
+      if (totalBytes > 5 * 1024*1024) {
+        break;
+      }
+      byte[] bytes = new byte[numBytes];
+      random().nextBytes(bytes);
+      docBytes.add(bytes);
+      Document doc = new Document();
+      BytesRef b = new BytesRef(bytes);
+      b.length = bytes.length;
+      doc.add(new BinaryDocValuesField("field", b));
+      doc.add(new StringField("id", ""+docID, Field.Store.YES));
+      try {
+        w.addDocument(doc);
+      } catch (IllegalArgumentException iae) {
+        if (iae.getMessage().indexOf("is too large") == -1) {
+          throw iae;
+        } else {
+          // OK: some codecs can't handle binary DV > 32K
+          w.rollback();
+          d.close();
+          return;
+        }
+      }
+    }
+
+    DirectoryReader r;
     try {
-      iwriter.addDocument(doc);
-      fail("did not get expected exception");
-    } catch (IllegalArgumentException expected) {
-      // expected
+      r = w.getReader();
+    } catch (IllegalArgumentException iae) {
+      if (iae.getMessage().indexOf("is too large") == -1) {
+        throw iae;
+      } else {
+        // OK: some codecs can't handle binary DV > 32K
+        w.rollback();
+        d.close();
+        return;
+      }
     }
-    iwriter.close();
+    w.close();
 
-    directory.close();
+    AtomicReader ar = SlowCompositeReaderWrapper.wrap(r);
SlowCompositeReaderWrapper.wrap(r); + + BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field"); + for(int docID=0;docID (BYTE_BLOCK_SIZE - 2)) { + throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + (BYTE_BLOCK_SIZE - 2)); + } minLength = Math.min(minLength, b.length); maxLength = Math.max(maxLength, b.length); if (uniqueValues != null) {