Index: java/org/apache/lucene/document/AbstractField.java =================================================================== --- java/org/apache/lucene/document/AbstractField.java (revision 683964) +++ java/org/apache/lucene/document/AbstractField.java (working copy) @@ -37,10 +37,12 @@ protected float boost = 1.0f; // the one and only data object for all different kind of field values protected Object fieldsData = null; + //length/offset for all primitive types + protected int binaryLength; + protected int binaryOffset; protected AbstractField() { - } protected AbstractField(String name, Field.Store store, Field.Index index, Field.TermVector termVector) { @@ -199,8 +201,44 @@ } /** True iff the value of the filed is stored as binary */ - public final boolean isBinary() { return isBinary; } + public final boolean isBinary() { + return isBinary; + } + + /** + * Return the raw byte[] for the binary field. Note that + * you must also call {@link #getBinaryLength} and {@link + * #getBinaryOffset} to know which range of bytes in this + * returned array belong to the field. + * @return reference to the Field value as byte[]. + */ + public byte[] getBinaryValue() { + return isBinary ? (byte[]) fieldsData : null; + } + + public Fieldable getBinaryField(byte[] scratch){ + return isBinary ? 
this : null; + } + + /** + * Returns length of byte[] segment that is used as value, if Field is not binary + * returned value is undefined + * @return length of byte[] segment that represents this Field value + */ + public int getBinaryLength() { + return binaryLength; + } + + /** + * Returns offset into byte[] segment that is used as value, if Field is not binary + * returned value is undefined + * @return index of the first character in byte[] segment that represents this Field value + */ + public int getBinaryOffset() { + return binaryOffset; + } + /** True if norms are omitted for this indexed field */ public boolean getOmitNorms() { return omitNorms; } Index: java/org/apache/lucene/document/Document.java =================================================================== --- java/org/apache/lucene/document/Document.java (revision 683964) +++ java/org/apache/lucene/document/Document.java (working copy) @@ -314,6 +314,28 @@ return null; } + /** + * Returns Fieldable for the first (or only) field that has the name + * specified as the method parameter. This method will return null + * if no binary fields with the specified name are available. + * There may be non-binary fields with the same name. + * byte[] scratch may be null or shorter than needed, in that case new byte[] will be allocated. + * If this scratch buffer is big enough, you should find exactly this object in returned Fieldable. + * + * @param name the name of the field. + * @param scratch buffer where the content will be stored, may be null + * @return a Fieldable containing the binary field value or null + */ + public final Fieldable getStoredBinaryField(String name, byte[] scratch) { + for (int i=0; i < fields.size(); i++) { + Fieldable field = (Fieldable)fields.get(i); + if (field.name().equals(name) && (field.isBinary())){ + return field.getBinaryField(scratch); + } + } + return null; + } + /** Prints the fields of a document for human consumption.
*/ public final String toString() { StringBuffer buffer = new StringBuffer(); Index: java/org/apache/lucene/document/Field.java =================================================================== --- java/org/apache/lucene/document/Field.java (revision 683964) +++ java/org/apache/lucene/document/Field.java (working copy) @@ -137,22 +137,39 @@ /** The value of the field as a String, or null. If null, the Reader value, * binary value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ + * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */ public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; } /** The value of the field as a Reader, or null. If null, the String value, * binary value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ + * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */ public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; } /** The value of the field in Binary, or null. If null, the Reader value, * String value, or TokenStream value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ - public byte[] binaryValue() { return isBinary ? (byte[])fieldsData : null; } + * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. + * @deprecated This method must allocate a new byte[] if + * the {@link AbstractField#getBinaryOffset()} is non-zero + * or {@link AbstractField#getBinaryLength()} is not the + * full length of the byte[]. Please use {@link + * AbstractField#getBinaryValue()} instead, which simply + * returns the byte[]. 
+ */ + public byte[] binaryValue() { + if (!isBinary) + return null; + final byte[] data = (byte[]) fieldsData; + if (binaryOffset == 0 && data.length == binaryLength) + return data; //Optimization + + final byte[] ret = new byte[binaryLength]; + System.arraycopy(data, binaryOffset, ret, 0, binaryLength); + return ret; + } /** The value of the field as a TokesStream, or null. If null, the Reader value, * String value, or binary value is used. Exactly one of stringValue(), - * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ + * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */ public TokenStream tokenStreamValue() { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; } @@ -182,9 +199,19 @@ /** Expert: change the value of this field. See setValue(String). */ public void setValue(byte[] value) { fieldsData = value; + binaryLength = value.length; + binaryOffset = 0; } /** Expert: change the value of this field. See setValue(String). */ + public void setValue(byte[] value, int offset, int length) { + fieldsData = value; + binaryLength = length; + binaryOffset = offset; + } + + + /** Expert: change the value of this field. See setValue(String). */ public void setValue(TokenStream value) { fieldsData = value; } @@ -378,34 +405,49 @@ * @throws IllegalArgumentException if store is Store.NO */ public Field(String name, byte[] value, Store store) { + this(name, value, 0, value == null ? 0 : value.length, store); + } + + /** + * Create a stored field with binary value. Optionally the value may be compressed.
+ * + * @param name The name of the field + * @param value The binary value + * @param offset Starting offset in value where this Field's bytes are + * @param length Number of bytes to use for this Field, starting at offset + * @param store How value should be stored (compressed or not) + * @throws IllegalArgumentException if store is Store.NO + */ + public Field(String name, byte[] value, int offset, int length, Store store) { + if (name == null) throw new IllegalArgumentException("name cannot be null"); if (value == null) throw new IllegalArgumentException("value cannot be null"); this.name = name.intern(); - this.fieldsData = value; + fieldsData = value; - if (store == Store.YES){ - this.isStored = true; - this.isCompressed = false; + if (store == Store.YES) { + isStored = true; + isCompressed = false; } else if (store == Store.COMPRESS) { - this.isStored = true; - this.isCompressed = true; + isStored = true; + isCompressed = true; } else if (store == Store.NO) throw new IllegalArgumentException("binary values can't be unstored"); else throw new IllegalArgumentException("unknown store parameter " + store); - this.isIndexed = false; - this.isTokenized = false; + isIndexed = false; + isTokenized = false; - this.isBinary = true; + isBinary = true; + binaryLength = length; + binaryOffset = offset; setStoreTermVector(TermVector.NO); } - - } Index: java/org/apache/lucene/document/Fieldable.java =================================================================== --- java/org/apache/lucene/document/Fieldable.java (revision 683964) +++ java/org/apache/lucene/document/Fieldable.java (working copy) @@ -156,4 +156,29 @@ * @return true if this field can be loaded lazily */ boolean isLazy(); + + /** + * Returns offset into byte[] segment that is used as value, if Field is not binary + * returned value is undefined + * @return index of the first character in byte[] segment that represents this Field value + */ + abstract int getBinaryOffset(); + + /** + * Returns length of 
byte[] segment that is used as value, if Field is not binary + * returned value is undefined + * @return length of byte[] segment that represents this Field value + */ + abstract int getBinaryLength(); + + /** + * Return the raw byte[] for the binary field. Note that + * you must also call {@link #getBinaryLength} and {@link + * #getBinaryOffset} to know which range of bytes in this + * returned array belong to the field. + * @return reference to the Field value as byte[]. + */ + abstract byte[] getBinaryValue(); + + abstract Fieldable getBinaryField(byte[] scratch); } Index: java/org/apache/lucene/index/FieldsReader.java =================================================================== --- java/org/apache/lucene/index/FieldsReader.java (revision 683964) +++ java/org/apache/lucene/index/FieldsReader.java (working copy) @@ -451,27 +451,28 @@ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ public byte[] binaryValue() { ensureOpen(); - if (isBinary) { - if (fieldsData == null) { - final byte[] b = new byte[toRead]; - IndexInput localFieldsStream = getFieldStream(); - //Throw this IO Exception since IndexReader.document does so anyway, so probably not that big of a change for people - //since they are already handling this exception when getting the document - try { - localFieldsStream.seek(pointer); - localFieldsStream.readBytes(b, 0, b.length); - if (isCompressed == true) { - fieldsData = uncompress(b); - } else { - fieldsData = b; - } - } catch (IOException e) { - throw new FieldReaderException(e); + if (isBinary && fieldsData == null) { + final byte[] b = new byte[toRead]; + IndexInput localFieldsStream = getFieldStream(); + //Throw this IO Exception since IndexReader.document does so anyway, so probably not that big of a change for people + //since they are already handling this exception when getting the document + try { + localFieldsStream.seek(pointer); + localFieldsStream.readBytes(b, 0, b.length); + if (isCompressed == true) { + 
fieldsData = uncompress(b); + } else { + fieldsData = b; } + } catch (IOException e) { + throw new FieldReaderException(e); } - return (byte[]) fieldsData; - } else - return null; + isBinary = true; + binaryOffset = 0; + binaryLength = ((byte[]) fieldsData).length; + } + + return isBinary ? (byte[]) fieldsData : null; } /** The value of the field as a Reader, or null. If null, the String value, @@ -545,6 +546,39 @@ ensureOpen(); this.toRead = toRead; } + + public Fieldable getBinaryField(byte[] scratch) { + ensureOpen(); + + if (!isBinary) return null; + if (fieldsData != null) return this; + + final byte[] b; + + //reallocate scratch if null or too small + if(scratch == null || scratch.length < toRead){ + b = new byte[toRead]; + } else b = scratch; + + IndexInput localFieldsStream = getFieldStream(); + //Throw this IO Exception since IndexReader.document does so anyway, so probably not that big of a change for people + //since they are already handling this exception when getting the document + try { + localFieldsStream.seek(pointer); + localFieldsStream.readBytes(b, 0, toRead); + if (isCompressed == true) { + fieldsData = uncompress(b);//TODO: this could be optimized as well (reuse existing buffer) + } else { + fieldsData = b; + } + } catch (IOException e) { + throw new FieldReaderException(e); + } + isBinary = true; + binaryOffset = 0; + binaryLength = isCompressed ? ((byte[]) fieldsData).length : toRead; + return this; + } } private final byte[] uncompress(final byte[] input) Index: java/org/apache/lucene/index/FieldsWriter.java =================================================================== --- java/org/apache/lucene/index/FieldsWriter.java (revision 683964) +++ java/org/apache/lucene/index/FieldsWriter.java (working copy) @@ -21,6 +21,7 @@ import java.util.Iterator; import java.util.zip.Deflater; +import org.apache.lucene.document.AbstractField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; import org.apache.lucene.store.Directory; @@ -105,7 +106,7 @@ doClose = true; } -
FieldsWriter(IndexOutput fdx, IndexOutput fdt, FieldInfos fn) throws IOException { + FieldsWriter(IndexOutput fdx, IndexOutput fdt, FieldInfos fn) { fieldInfos = fn; fieldsStream = fdt; indexStream = fdx; @@ -190,32 +191,43 @@ if (field.isCompressed()) { // compression is enabled for the current field - byte[] data = null; - + final byte[] data; + final int len; + final int offset; if (disableCompression) { // optimized case for merging, the data // is already compressed - data = field.binaryValue(); + final AbstractField f = (AbstractField) field;// FieldsReader.FieldForMerge is AbstracField + data = f.getBinaryValue(); + len = f.getBinaryLength(); + offset = f.getBinaryOffset(); } else { // check if it is a binary field if (field.isBinary()) { - data = compress(field.binaryValue()); + data = compress(field.getBinaryValue(), field.getBinaryOffset(), field.getBinaryLength()); + } else { + byte x[] = field.stringValue().getBytes("UTF-8"); + data = compress(x, 0, x.length); } - else { - data = compress(field.stringValue().getBytes("UTF-8")); - } + len = data.length; + offset = 0; } - final int len = data.length; + fieldsStream.writeVInt(len); - fieldsStream.writeBytes(data, len); + fieldsStream.writeBytes(data, offset, len); } else { // compression is disabled for the current field if (field.isBinary()) { - byte[] data = field.binaryValue(); - final int len = data.length; + final byte[] data; + final int len; + final int offset; + data = field.getBinaryValue(); + len = field.getBinaryLength(); + offset = field.getBinaryOffset(); + fieldsStream.writeVInt(len); - fieldsStream.writeBytes(data, len); + fieldsStream.writeBytes(data, offset, len); } else { fieldsStream.writeString(field.stringValue()); @@ -259,19 +271,23 @@ } } - private final byte[] compress (byte[] input) { + private final byte[] compress (byte[] input, int offset, int length) { + // Create the compressor with highest level of compression + Deflater compressor = new Deflater(); + 
compressor.setLevel(Deflater.BEST_COMPRESSION); + // Give the compressor the data to compress + compressor.setInput(input, offset, length); + compressor.finish(); + /* * Create an expandable byte array to hold the compressed data. * You cannot use an array that's the same size as the orginal because * there is no guarantee that the compressed data will be smaller than * the uncompressed data. */ - ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length); + ByteArrayOutputStream bos = new ByteArrayOutputStream(length); - // Create the compressor with highest level of compression - Deflater compressor = new Deflater(); - try { compressor.setLevel(Deflater.BEST_COMPRESSION); Index: test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- test/org/apache/lucene/index/TestIndexWriter.java (revision 683964) +++ test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -3765,4 +3765,36 @@ w.doFail = false; w.rollback(); } + + + // LUCENE-1219 + public void testBinaryFieldOffsetLength() throws IOException { + MockRAMDirectory dir = new MockRAMDirectory(); + IndexWriter w = new IndexWriter(dir, false, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); + byte[] b = new byte[50]; + for(int i=0;i<50;i++) + b[i] = (byte) (i+77); + + Document doc = new Document(); + Field f = new Field("binary", b, 10, 17, Field.Store.YES); + byte[] bx = f.getBinaryValue(); + assertTrue(bx != null); + assertEquals(50, bx.length); + assertEquals(10, f.getBinaryOffset()); + assertEquals(17, f.getBinaryLength()); + doc.add(f); + w.addDocument(doc); + w.close(); + + IndexReader ir = IndexReader.open(dir); + doc = ir.document(0); + f = doc.getField("binary"); + b = f.getBinaryValue(); + assertTrue(b != null); + assertEquals(17, b.length); + assertEquals(87, b[0]); + ir.close(); + dir.close(); + } + }