Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1099371) +++ lucene/CHANGES.txt (working copy) @@ -43,6 +43,14 @@ PhraseQuery as term with lower doc freq will also have less positions. (Uwe Schindler, Robert Muir, Otis Gospodnetic) +* LUCENE-3065: When a NumericField is retrieved from a Document loaded + from IndexReader (or IndexSearcher), it will now come back as + NumericField not as a Field with a string-ified version of the + numeric value you had indexed. Note that this only applies for + newly-indexed Documents; older indices will still return Field + with the string-ified numeric value (Uwe Schindler, Ryan McKinley, + Mike McCandless) + Test Cases * LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to Index: lucene/src/java/org/apache/lucene/index/FieldsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldsReader.java (revision 1099371) +++ lucene/src/java/org/apache/lucene/index/FieldsReader.java (working copy) @@ -17,6 +17,10 @@ * limitations under the License. */ +import java.io.IOException; +import java.io.Reader; +import java.util.zip.DataFormatException; + import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.AbstractField; import org.apache.lucene.document.CompressionTools; @@ -25,16 +29,14 @@ import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.AlreadyClosedException; -import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.util.CloseableThreadLocal; +import org.apache.lucene.util.NumericUtils; -import java.io.IOException; -import java.io.Reader; -import java.util.zip.DataFormatException; - /** * Class responsible for access to stored document fields. *
@@ -240,43 +242,44 @@ Document doc = new Document(); int numFields = fieldsStream.readVInt(); + out: for (int i = 0; i < numFields; i++) { int fieldNumber = fieldsStream.readVInt(); FieldInfo fi = fieldInfos.fieldInfo(fieldNumber); FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name); - byte bits = fieldsStream.readByte(); - assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY; + int bits = fieldsStream.readByte() & 0xFF; + assert bits <= (FieldsWriter.FIELD_IS_NUMERIC_MASK | FieldsWriter.FIELD_IS_COMPRESSED | FieldsWriter.FIELD_IS_TOKENIZED | FieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits); boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0; assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true) : "compressed fields are only allowed in indexes of version <= 2.9"; boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0; boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0; - //TODO: Find an alternative approach here if this list continues to grow beyond the - //list of 5 or 6 currently here. See Lucene 762 for discussion - if (acceptField.equals(FieldSelectorResult.LOAD)) { - addField(doc, fi, binary, compressed, tokenize); + final int numeric = bits & FieldsWriter.FIELD_IS_NUMERIC_MASK; + + switch (acceptField) { + case LOAD: + addField(doc, fi, binary, compressed, tokenize, numeric); + break; + case LOAD_AND_BREAK: + addField(doc, fi, binary, compressed, tokenize, numeric); + break out;//Get out of this loop + case LAZY_LOAD: + addFieldLazy(doc, fi, binary, compressed, tokenize, true, numeric); + break; + case LATENT: + addFieldLazy(doc, fi, binary, compressed, tokenize, false, numeric); + break; + case SIZE: + skipFieldBytes(binary, compressed, addFieldSize(doc, fi, binary, compressed, numeric)); + break; + case SIZE_AND_BREAK: + addFieldSize(doc, fi, binary, compressed, numeric); + break out;//Get out of this loop + default: + skipField(binary, compressed, numeric); } - else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){ - addField(doc, fi, binary, compressed, tokenize); - break;//Get out of this loop - } - else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) { - addFieldLazy(doc, fi, binary, compressed, tokenize, true); - } else if (acceptField.equals(FieldSelectorResult.LATENT)) { - addFieldLazy(doc, fi, binary, compressed, tokenize, false); - } - else if (acceptField.equals(FieldSelectorResult.SIZE)){ - skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed)); - } - else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){ - addFieldSize(doc, fi, binary, compressed); - break; - } - else { - skipField(binary, compressed); - } } return doc; @@ -312,41 +315,85 @@ * Skip the field. We still have to read some of the information about the field, but can skip past the actual content. * This will have the most payoff on large fields. */ - private void skipField(boolean binary, boolean compressed) throws IOException { - skipField(binary, compressed, fieldsStream.readVInt()); + private void skipField(boolean binary, boolean compressed, int numeric) throws IOException { + final int numBytes; + switch(numeric) { + case 0: + numBytes = fieldsStream.readVInt(); + break; + case FieldsWriter.FIELD_IS_NUMERIC_INT: + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: + numBytes = 4; + break; + case FieldsWriter.FIELD_IS_NUMERIC_LONG: + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + numBytes = 8; + break; + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } + + skipFieldBytes(binary, compressed, numBytes); } - private void skipField(boolean binary, boolean compressed, int toRead) throws IOException { - if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) { - fieldsStream.seek(fieldsStream.getFilePointer() + toRead); - } else { - // We need to skip chars. This will slow us down, but still better - fieldsStream.skipChars(toRead); - } + private void skipFieldBytes(boolean binary, boolean compressed, int toRead) throws IOException { + if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) { + fieldsStream.seek(fieldsStream.getFilePointer() + toRead); + } else { + // We need to skip chars. This will slow us down, but still better + fieldsStream.skipChars(toRead); + } } - private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, boolean cacheResult) throws IOException { + private NumericField loadNumericField(FieldInfo fi, int numeric) throws IOException { + assert numeric != 0; + switch(numeric) { + case FieldsWriter.FIELD_IS_NUMERIC_INT: { + final byte[] b = new byte[4]; + fieldsStream.readBytes(b, 0, b.length); + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setIntValue(NumericUtils.bytesToInt(b)); + } + case FieldsWriter.FIELD_IS_NUMERIC_LONG: { + final byte[] b = new byte[8]; + fieldsStream.readBytes(b, 0, b.length); + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setLongValue(NumericUtils.bytesToLong(b)); + } + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: { + final byte[] b = new byte[4]; + fieldsStream.readBytes(b, 0, b.length); + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setFloatValue(NumericUtils.bytesToFloat(b)); + } + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: { + final byte[] b = new byte[8]; + fieldsStream.readBytes(b, 0, b.length); + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setDoubleValue(NumericUtils.bytesToDouble(b)); + } + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } + } + + private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, boolean cacheResult, int numeric) throws IOException { + final AbstractField f; if (binary) { int toRead = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); - //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES)); - doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed, cacheResult)); + f = new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed, cacheResult); //Need to move the pointer ahead by toRead positions fieldsStream.seek(pointer + toRead); + } else if (numeric != 0) { + f = loadNumericField(fi, numeric); } else { Field.Store store = Field.Store.YES; Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize); Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector); - AbstractField f; if (compressed) { int toRead = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); f = new LazyField(fi.name, store, toRead, pointer, binary, compressed, cacheResult); //skip over the part that we aren't loading fieldsStream.seek(pointer + toRead); - f.setOmitNorms(fi.omitNorms); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); } else { int length = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); @@ -357,16 +404,16 @@ fieldsStream.skipChars(length); } f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, compressed, cacheResult); - f.setOmitNorms(fi.omitNorms); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - } - - doc.add(f); + } } - + + f.setOmitNorms(fi.omitNorms); + f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); + doc.add(f); } - private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws CorruptIndexException, IOException { + private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, int numeric) throws CorruptIndexException, IOException { + final AbstractField f; //we have a binary stored field, and it may be compressed if (binary) { @@ -374,19 +421,18 @@ final byte[] b = new byte[toRead]; fieldsStream.readBytes(b, 0, b.length); if (compressed) { - doc.add(new Field(fi.name, uncompress(b))); + f = new Field(fi.name, uncompress(b)); } else { - doc.add(new Field(fi.name, b)); + f = new Field(fi.name, b); } + } else if (numeric != 0) { + f = loadNumericField(fi, numeric); } else { Field.Store store = Field.Store.YES; Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize); Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector); - - AbstractField f; if (compressed) { int toRead = fieldsStream.readVInt(); - final byte[] b = new byte[toRead]; fieldsStream.readBytes(b, 0, b.length); f = new Field(fi.name, // field name @@ -395,8 +441,6 @@ store, index, termVector); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - f.setOmitNorms(fi.omitNorms); } else { f = new Field(fi.name, // name false, @@ -404,19 +448,35 @@ store, index, termVector); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - f.setOmitNorms(fi.omitNorms); } - - doc.add(f); } + + f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); + f.setOmitNorms(fi.omitNorms); + doc.add(f); } // Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes) // Read just the size -- caller must skip the field content to continue reading fields // Return the size in bytes or chars, depending on field type - private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed) throws IOException { - int size = fieldsStream.readVInt(), bytesize = binary || compressed ? size : 2*size; + private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed, int numeric) throws IOException { + final int bytesize, size; + switch(numeric) { + case 0: + size = fieldsStream.readVInt(); + bytesize = (binary || compressed) ? size : 2*size; + break; + case FieldsWriter.FIELD_IS_NUMERIC_INT: + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: + size = bytesize = 4; + break; + case FieldsWriter.FIELD_IS_NUMERIC_LONG: + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + size = bytesize = 8; + break; + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } byte[] sizebytes = new byte[4]; sizebytes[0] = (byte) (bytesize>>>24); sizebytes[1] = (byte) (bytesize>>>16); @@ -427,7 +487,7 @@ } /** - * A Lazy implementation of Fieldable that differs loading of fields until asked for, instead of when the Document is + * A Lazy implementation of Fieldable that defers loading of fields until asked for, instead of when the Document is * loaded. */ private class LazyField extends AbstractField implements Fieldable { @@ -519,13 +579,13 @@ } catch (IOException e) { throw new FieldReaderException(e); } - if (cacheResult == true){ - fieldsData = value; - } - return value; + if (cacheResult == true){ + fieldsData = value; + } + return value; } else{ - return (String) fieldsData; - } + return (String) fieldsData; + } } } @@ -574,25 +634,24 @@ if (isCompressed == true) { value = uncompress(b); } else { - value = b; - } + value = b; + } } catch (IOException e) { throw new FieldReaderException(e); } binaryOffset = 0; binaryLength = toRead; - if (cacheResult == true){ - fieldsData = value; - } - return value; + if (cacheResult == true){ + fieldsData = value; + } + return value; } else{ - return (byte[]) fieldsData; - } - - - } else - return null; + return (byte[]) fieldsData; + } + } else { + return null; + } } } Index: lucene/src/java/org/apache/lucene/index/FieldsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldsWriter.java (revision 1099371) +++ lucene/src/java/org/apache/lucene/index/FieldsWriter.java (working copy) @@ -21,20 +21,34 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.NumericField; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.NumericUtils; final class FieldsWriter { - static final byte FIELD_IS_TOKENIZED = 0x1; - static final byte FIELD_IS_BINARY = 0x2; - + static final int FIELD_IS_TOKENIZED = 1 << 0; + static final int FIELD_IS_BINARY = 1 << 1; + /** @deprecated Kept for backwards-compatibility with <3.0 indexes; will be removed in 4.0 */ @Deprecated - static final byte FIELD_IS_COMPRESSED = 0x4; + static final int FIELD_IS_COMPRESSED = 1 << 2; + private static final int _NUMERIC_BIT_SHIFT = 3; + static final int FIELD_IS_NUMERIC_MASK = 0x07 << _NUMERIC_BIT_SHIFT; + + static final int FIELD_IS_NUMERIC_INT = 1 << _NUMERIC_BIT_SHIFT; + static final int FIELD_IS_NUMERIC_LONG = 2 << _NUMERIC_BIT_SHIFT; + static final int FIELD_IS_NUMERIC_FLOAT = 3 << _NUMERIC_BIT_SHIFT; + static final int FIELD_IS_NUMERIC_DOUBLE = 4 << _NUMERIC_BIT_SHIFT; + // currently unused: static final int FIELD_IS_NUMERIC_SHORT = 5 << _NUMERIC_BIT_SHIFT; + // currently unused: static final int FIELD_IS_NUMERIC_BYTE = 6 << _NUMERIC_BIT_SHIFT; + + // the next possible bits are: 1 << 6; 1 << 7 + // Original format static final int FORMAT = 0; @@ -44,10 +58,13 @@ // Lucene 3.0: Removal of compressed fields static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2; + // Lucene 3.2: NumericFields are stored in binary format + static final int FORMAT_LUCENE_3_2_NUMERIC_FIELDS = 3; + // NOTE: if you introduce a new format, make it 1 higher // than the current one, and always change this if you // switch to a new format! - static final int FORMAT_CURRENT = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS; + static final int FORMAT_CURRENT = FORMAT_LUCENE_3_2_NUMERIC_FIELDS; private FieldInfos fieldInfos; @@ -134,14 +151,36 @@ final void writeField(FieldInfo fi, Fieldable field) throws IOException { fieldsStream.writeVInt(fi.number); - byte bits = 0; + int bits = 0; if (field.isTokenized()) - bits |= FieldsWriter.FIELD_IS_TOKENIZED; + bits |= FIELD_IS_TOKENIZED; if (field.isBinary()) - bits |= FieldsWriter.FIELD_IS_BINARY; + bits |= FIELD_IS_BINARY; - fieldsStream.writeByte(bits); + final byte[] numBytes; + if (field instanceof NumericField) { + final Number n = ((NumericField) field).getNumericValue(); + if (n instanceof Integer) { + bits |= FIELD_IS_NUMERIC_INT; + numBytes = NumericUtils.intToBytes(((Integer) n).intValue()); + } else if (n instanceof Long) { + bits |= FIELD_IS_NUMERIC_LONG; + numBytes = NumericUtils.longToBytes(((Long) n).longValue()); + } else if (n instanceof Float) { + bits |= FIELD_IS_NUMERIC_FLOAT; + numBytes = NumericUtils.floatToBytes(((Float) n).floatValue()); + } else { + assert n instanceof Double; + bits |= FIELD_IS_NUMERIC_DOUBLE; + numBytes = NumericUtils.doubleToBytes(((Double) n).doubleValue()); + } + } else { + numBytes = null; + } + + fieldsStream.writeByte((byte) bits); + if (field.isBinary()) { final byte[] data; final int len; @@ -152,8 +191,9 @@ fieldsStream.writeVInt(len); fieldsStream.writeBytes(data, offset, len); - } - else { + } else if (numBytes != null) { + fieldsStream.writeBytes(numBytes, 0, numBytes.length); + } else { fieldsStream.writeString(field.stringValue()); } } Index: lucene/src/java/org/apache/lucene/util/NumericUtils.java =================================================================== --- lucene/src/java/org/apache/lucene/util/NumericUtils.java (revision 1099371) +++ lucene/src/java/org/apache/lucene/util/NumericUtils.java (working copy) @@ -465,5 +465,55 @@ } } - + + /** Below methods used for encoding NumericField into + * index's stored fields: */ + + public static byte[] intToBytes(int val) { + byte[] arr = new byte[4]; + arr[0] = (byte)(val>>>24); + arr[1] = (byte)(val>>>16); + arr[2] = (byte)(val>>>8); + arr[3] = (byte)(val); + return arr; + } + + public static int bytesToInt(byte[] arr) { + return (arr[0]<<24) | ((arr[1]&0xff)<<16) | ((arr[2]&0xff)<<8) | (arr[3]&0xff); + } + + public static byte[] longToBytes(long val) { + byte[] arr = new byte[8]; + arr[0] = (byte)(val>>>56); + arr[1] = (byte)(val>>>48); + arr[2] = (byte)(val>>>40); + arr[3] = (byte)(val>>>32); + arr[4] = (byte)(val>>>24); + arr[5] = (byte)(val>>>16); + arr[6] = (byte)(val>>>8); + arr[7] = (byte)(val); + return arr; + } + + public static long bytesToLong(byte[] arr) { + int high = (arr[0]<<24) | ((arr[1]&0xff)<<16) | ((arr[2]&0xff)<<8) | (arr[3]&0xff); + int low = (arr[4]<<24) | ((arr[5]&0xff)<<16) | ((arr[6]&0xff)<<8) | (arr[7]&0xff); + return (((long)high)<<32) | (low&0x0ffffffffL); + } + + public static byte[] floatToBytes(float val) { + return intToBytes(Float.floatToRawIntBits(val)); + } + + public static float bytesToFloat(byte[] arr) { + return Float.intBitsToFloat(bytesToInt(arr)); + } + + public static byte[] doubleToBytes(double val) { + return longToBytes(Double.doubleToRawLongBits(val)); + } + + public static double bytesToDouble(byte[] arr) { + return Double.longBitsToDouble(bytesToLong(arr)); + } } Index: lucene/src/test/org/apache/lucene/document/TestDocument.java =================================================================== --- lucene/src/test/org/apache/lucene/document/TestDocument.java (revision 1099371) +++ lucene/src/test/org/apache/lucene/document/TestDocument.java (working copy) @@ -3,13 +3,14 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Searcher; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -279,4 +280,60 @@ // expected } } + + public void testNumericField() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random, dir); + final int numDocs = _TestUtil.nextInt(random, 500, 1000) * RANDOM_MULTIPLIER; + final Number[] answers = new Number[numDocs]; + for(int id=0;id