Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1099371) +++ lucene/CHANGES.txt (working copy) @@ -43,6 +43,14 @@ PhraseQuery as term with lower doc freq will also have less positions. (Uwe Schindler, Robert Muir, Otis Gospodnetic) +* LUCENE-3065: When a NumericField is retrieved from a Document loaded + from IndexReader (or IndexSearcher), it will now come back as a + NumericField, not as a Field with a string-ified version of the + numeric value you had indexed. Note that this only applies to + newly-indexed Documents; older indices will still return a Field + with the string-ified numeric value. (Uwe Schindler, Ryan McKinley, + Mike McCandless) + Test Cases * LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to Index: lucene/src/java/org/apache/lucene/index/FieldsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldsReader.java (revision 1099371) +++ lucene/src/java/org/apache/lucene/index/FieldsReader.java (working copy) @@ -17,6 +17,10 @@ * limitations under the License. 
*/ +import java.io.IOException; +import java.io.Reader; +import java.util.zip.DataFormatException; + import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.AbstractField; import org.apache.lucene.document.CompressionTools; @@ -25,16 +29,14 @@ import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.AlreadyClosedException; -import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.util.CloseableThreadLocal; +import org.apache.lucene.util.NumericUtils; -import java.io.IOException; -import java.io.Reader; -import java.util.zip.DataFormatException; - /** * Class responsible for access to stored document fields. *
@@ -246,37 +248,34 @@ FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name); byte bits = fieldsStream.readByte(); - assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY; + assert bits <= FieldsWriter.FIELD_IS_NUMERIC_MASK + FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY: "bits=" + Integer.toHexString(bits); boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0; assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true) : "compressed fields are only allowed in indexes of version <= 2.9"; boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0; boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0; + final byte numeric = (byte) (bits & FieldsWriter.FIELD_IS_NUMERIC_MASK); + //TODO: Find an alternative approach here if this list continues to grow beyond the //list of 5 or 6 currently here. 
See Lucene 762 for discussion if (acceptField.equals(FieldSelectorResult.LOAD)) { - addField(doc, fi, binary, compressed, tokenize); - } - else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){ - addField(doc, fi, binary, compressed, tokenize); + addField(doc, fi, binary, compressed, tokenize, numeric); + } else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){ + addField(doc, fi, binary, compressed, tokenize, numeric); break;//Get out of this loop - } - else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) { - addFieldLazy(doc, fi, binary, compressed, tokenize, true); + } else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) { + addFieldLazy(doc, fi, binary, compressed, tokenize, true, numeric); } else if (acceptField.equals(FieldSelectorResult.LATENT)) { - addFieldLazy(doc, fi, binary, compressed, tokenize, false); - } - else if (acceptField.equals(FieldSelectorResult.SIZE)){ - skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed)); - } - else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){ - addFieldSize(doc, fi, binary, compressed); + addFieldLazy(doc, fi, binary, compressed, tokenize, false, numeric); + } else if (acceptField.equals(FieldSelectorResult.SIZE)){ + skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed, numeric)); + } else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){ + addFieldSize(doc, fi, binary, compressed, numeric); break; + } else { + skipField(binary, compressed, numeric); } - else { - skipField(binary, compressed); - } } return doc; @@ -312,20 +311,37 @@ * Skip the field. We still have to read some of the information about the field, but can skip past the actual content. * This will have the most payoff on large fields. 
*/ - private void skipField(boolean binary, boolean compressed) throws IOException { - skipField(binary, compressed, fieldsStream.readVInt()); + private void skipField(boolean binary, boolean compressed, byte numeric) throws IOException { + final int numBytes; + switch(numeric) { + case 0: + numBytes = fieldsStream.readVInt(); + break; + case FieldsWriter.FIELD_IS_NUMERIC_INT: + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: + numBytes = 4; + break; + case FieldsWriter.FIELD_IS_NUMERIC_LONG: + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + numBytes = 8; + break; + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } + + skipField(binary, compressed, numBytes); } private void skipField(boolean binary, boolean compressed, int toRead) throws IOException { - if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) { - fieldsStream.seek(fieldsStream.getFilePointer() + toRead); - } else { - // We need to skip chars. This will slow us down, but still better - fieldsStream.skipChars(toRead); - } + if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) { + fieldsStream.seek(fieldsStream.getFilePointer() + toRead); + } else { + // We need to skip chars. 
This will slow us down, but still better + fieldsStream.skipChars(toRead); + } } - private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, boolean cacheResult) throws IOException { + private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, boolean cacheResult, byte numeric) throws IOException { if (binary) { int toRead = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); @@ -333,6 +349,8 @@ doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed, cacheResult)); //Need to move the pointer ahead by toRead positions fieldsStream.seek(pointer + toRead); + } else if (numeric != 0) { + doc.add(loadNumericField(fi.name, numeric)); } else { Field.Store store = Field.Store.YES; Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize); @@ -366,8 +384,36 @@ } - private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws CorruptIndexException, IOException { + private NumericField loadNumericField(String fieldName, byte numeric) throws IOException { + assert numeric != 0; + switch(numeric) { + case FieldsWriter.FIELD_IS_NUMERIC_INT: { + final byte[] b = new byte[4]; + fieldsStream.readBytes(b, 0, b.length); + return new NumericField(fieldName).setIntValue(NumericUtils.bytesToInt(b)); + } + case FieldsWriter.FIELD_IS_NUMERIC_LONG: { + final byte[] b = new byte[8]; + fieldsStream.readBytes(b, 0, b.length); + return new NumericField(fieldName).setLongValue(NumericUtils.bytesToLong(b)); + } + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: { + final byte[] b = new byte[4]; + fieldsStream.readBytes(b, 0, b.length); + return new NumericField(fieldName).setFloatValue(NumericUtils.bytesToFloat(b)); + } + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: { + final byte[] b = new byte[8]; + fieldsStream.readBytes(b, 0, b.length); + return new 
NumericField(fieldName).setDoubleValue(NumericUtils.bytesToDouble(b)); + } + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } + } + private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, byte numeric) throws CorruptIndexException, IOException { + //we have a binary stored field, and it may be compressed if (binary) { int toRead = fieldsStream.readVInt(); @@ -378,6 +424,8 @@ } else { doc.add(new Field(fi.name, b)); } + } else if (numeric != 0) { + doc.add(loadNumericField(fi.name, numeric)); } else { Field.Store store = Field.Store.YES; Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize); @@ -415,8 +463,24 @@ // Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes) // Read just the size -- caller must skip the field content to continue reading fields // Return the size in bytes or chars, depending on field type - private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed) throws IOException { - int size = fieldsStream.readVInt(), bytesize = binary || compressed ? size : 2*size; + private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed, byte numeric) throws IOException { + final int bytesize, size; + switch(numeric) { + case 0: + size = fieldsStream.readVInt(); + bytesize = (binary || compressed) ? 
size : 2*size; + break; + case FieldsWriter.FIELD_IS_NUMERIC_INT: + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: + size = bytesize = 4; + break; + case FieldsWriter.FIELD_IS_NUMERIC_LONG: + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + size = bytesize = 8; + break; + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } byte[] sizebytes = new byte[4]; sizebytes[0] = (byte) (bytesize>>>24); sizebytes[1] = (byte) (bytesize>>>16); @@ -427,7 +491,7 @@ } /** - * A Lazy implementation of Fieldable that differs loading of fields until asked for, instead of when the Document is + * A Lazy implementation of Fieldable that defers loading of fields until asked for, instead of when the Document is * loaded. */ private class LazyField extends AbstractField implements Fieldable { @@ -519,13 +583,13 @@ } catch (IOException e) { throw new FieldReaderException(e); } - if (cacheResult == true){ - fieldsData = value; - } - return value; + if (cacheResult == true){ + fieldsData = value; + } + return value; } else{ - return (String) fieldsData; - } + return (String) fieldsData; + } } } @@ -574,25 +638,24 @@ if (isCompressed == true) { value = uncompress(b); } else { - value = b; - } + value = b; + } } catch (IOException e) { throw new FieldReaderException(e); } binaryOffset = 0; binaryLength = toRead; - if (cacheResult == true){ - fieldsData = value; - } - return value; + if (cacheResult == true){ + fieldsData = value; + } + return value; } else{ - return (byte[]) fieldsData; - } - - - } else - return null; + return (byte[]) fieldsData; + } + } else { + return null; + } } } Index: lucene/src/java/org/apache/lucene/index/FieldsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldsWriter.java (revision 1099371) +++ lucene/src/java/org/apache/lucene/index/FieldsWriter.java (working copy) @@ -21,20 +21,33 @@ import org.apache.lucene.document.Document; import 
org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.NumericField; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.NumericUtils; final class FieldsWriter { static final byte FIELD_IS_TOKENIZED = 0x1; static final byte FIELD_IS_BINARY = 0x2; - + static final byte FIELD_IS_NUMERIC = 0x8; + /** @deprecated Kept for backwards-compatibility with <3.0 indexes; will be removed in 4.0 */ @Deprecated static final byte FIELD_IS_COMPRESSED = 0x4; + private static final int _NUMERIC_BIT_SHIFT = 3; + static final byte FIELD_IS_NUMERIC_MASK = (byte) (0x07 << _NUMERIC_BIT_SHIFT); + + static final byte FIELD_IS_NUMERIC_INT = (byte) (1 << _NUMERIC_BIT_SHIFT); + static final byte FIELD_IS_NUMERIC_LONG = (byte) (2 << _NUMERIC_BIT_SHIFT); + static final byte FIELD_IS_NUMERIC_FLOAT = (byte) (3 << _NUMERIC_BIT_SHIFT); + static final byte FIELD_IS_NUMERIC_DOUBLE = (byte) (4 << _NUMERIC_BIT_SHIFT); + // currently unused: static final byte FIELD_IS_NUMERIC_SHORT = (byte) (5 << _NUMERIC_BIT_SHIFT); + // currently unused: static final byte FIELD_IS_NUMERIC_BYTE = (byte) (6 << _NUMERIC_BIT_SHIFT); + // Original format static final int FORMAT = 0; @@ -44,10 +57,13 @@ // Lucene 3.0: Removal of compressed fields static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2; + // Lucene 3.2: NumericFields are stored in binary format + static final int FORMAT_LUCENE_3_2_NUMERIC_FIELDS = 3; + // NOTE: if you introduce a new format, make it 1 higher // than the current one, and always change this if you // switch to a new format! 
- static final int FORMAT_CURRENT = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS; + static final int FORMAT_CURRENT = FORMAT_LUCENE_3_2_NUMERIC_FIELDS; private FieldInfos fieldInfos; @@ -136,10 +152,32 @@ fieldsStream.writeVInt(fi.number); byte bits = 0; if (field.isTokenized()) - bits |= FieldsWriter.FIELD_IS_TOKENIZED; + bits |= FIELD_IS_TOKENIZED; if (field.isBinary()) - bits |= FieldsWriter.FIELD_IS_BINARY; + bits |= FIELD_IS_BINARY; + final byte[] numBytes; + + if (field instanceof NumericField) { + final Number n = ((NumericField) field).getNumericValue(); + if (n instanceof Integer) { + bits |= FIELD_IS_NUMERIC_INT; + numBytes = NumericUtils.intToBytes(((Integer) n).intValue()); + } else if (n instanceof Long) { + bits |= FIELD_IS_NUMERIC_LONG; + numBytes = NumericUtils.longToBytes(((Long) n).longValue()); + } else if (n instanceof Float) { + bits |= FIELD_IS_NUMERIC_FLOAT; + numBytes = NumericUtils.floatToBytes(((Float) n).floatValue()); + } else { + assert n instanceof Double; + bits |= FIELD_IS_NUMERIC_DOUBLE; + numBytes = NumericUtils.doubleToBytes(((Double) n).doubleValue()); + } + } else { + numBytes = null; + } + fieldsStream.writeByte(bits); if (field.isBinary()) { @@ -152,8 +190,9 @@ fieldsStream.writeVInt(len); fieldsStream.writeBytes(data, offset, len); - } - else { + } else if (numBytes != null) { + fieldsStream.writeBytes(numBytes, 0, numBytes.length); + } else { fieldsStream.writeString(field.stringValue()); } } Index: lucene/src/java/org/apache/lucene/util/NumericUtils.java =================================================================== --- lucene/src/java/org/apache/lucene/util/NumericUtils.java (revision 1099371) +++ lucene/src/java/org/apache/lucene/util/NumericUtils.java (working copy) @@ -465,5 +465,55 @@ } } - + + /** The methods below are used to encode NumericField values into + * the index's stored fields. */ + + public static byte[] intToBytes(int val) { + byte[] arr = new byte[4]; + arr[0] = (byte)(val>>>24); + arr[1] = (byte)(val>>>16); + arr[2] = 
(byte)(val>>>8); + arr[3] = (byte)(val); + return arr; + } + + public static int bytesToInt(byte[] arr) { + return (arr[0]<<24) | ((arr[1]&0xff)<<16) | ((arr[2]&0xff)<<8) | (arr[3]&0xff); + } + + public static byte[] longToBytes(long val) { + byte[] arr = new byte[8]; + arr[0] = (byte)(val>>>56); + arr[1] = (byte)(val>>>48); + arr[2] = (byte)(val>>>40); + arr[3] = (byte)(val>>>32); + arr[4] = (byte)(val>>>24); + arr[5] = (byte)(val>>>16); + arr[6] = (byte)(val>>>8); + arr[7] = (byte)(val); + return arr; + } + + public static long bytesToLong(byte[] arr) { + int high = (arr[0]<<24) | ((arr[1]&0xff)<<16) | ((arr[2]&0xff)<<8) | (arr[3]&0xff); + int low = (arr[4]<<24) | ((arr[5]&0xff)<<16) | ((arr[6]&0xff)<<8) | (arr[7]&0xff); + return (((long)high)<<32) | (low&0x0ffffffffL); + } + + public static byte[] floatToBytes(float val) { + return intToBytes(Float.floatToRawIntBits(val)); + } + + public static float bytesToFloat(byte[] arr) { + return Float.intBitsToFloat(bytesToInt(arr)); + } + + public static byte[] doubleToBytes(double val) { + return longToBytes(Double.doubleToRawLongBits(val)); + } + + public static double bytesToDouble(byte[] arr) { + return Double.longBitsToDouble(bytesToLong(arr)); + } } Index: lucene/src/test/org/apache/lucene/document/TestDocument.java =================================================================== --- lucene/src/test/org/apache/lucene/document/TestDocument.java (revision 1099371) +++ lucene/src/test/org/apache/lucene/document/TestDocument.java (working copy) @@ -3,13 +3,14 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Searcher; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import 
org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -279,4 +280,60 @@ // expected } } + + public void testNumericField() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random, dir); + final int numDocs = _TestUtil.nextInt(random, 500, 1000) * RANDOM_MULTIPLIER; + final Number[] answers = new Number[numDocs]; + for(int id=0;id