Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1099371) +++ lucene/CHANGES.txt (working copy) @@ -43,6 +43,14 @@ PhraseQuery as term with lower doc freq will also have less positions. (Uwe Schindler, Robert Muir, Otis Gospodnetic) +* LUCENE-3065: When a NumericField is retrieved from a Document loaded + from IndexReader (or IndexSearcher), it will now come back as + NumericField, not as a Field with a string-ified version of the + numeric value you had indexed. Note that this only applies for + newly-indexed Documents; older indices will still return Field + with the string-ified numeric value. (Uwe Schindler, Ryan McKinley, + Mike McCandless) + Test Cases * LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to Index: lucene/src/java/org/apache/lucene/index/FieldsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldsReader.java (revision 1099371) +++ lucene/src/java/org/apache/lucene/index/FieldsReader.java (working copy) @@ -17,6 +17,10 @@ * limitations under the License. 
*/ +import java.io.IOException; +import java.io.Reader; +import java.util.zip.DataFormatException; + import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.AbstractField; import org.apache.lucene.document.CompressionTools; @@ -25,16 +29,13 @@ import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.AlreadyClosedException; -import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.util.CloseableThreadLocal; -import java.io.IOException; -import java.io.Reader; -import java.util.zip.DataFormatException; - /** * Class responsible for access to stored document fields. *

@@ -240,43 +241,43 @@ Document doc = new Document(); int numFields = fieldsStream.readVInt(); - for (int i = 0; i < numFields; i++) { + out: for (int i = 0; i < numFields; i++) { int fieldNumber = fieldsStream.readVInt(); FieldInfo fi = fieldInfos.fieldInfo(fieldNumber); FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name); - byte bits = fieldsStream.readByte(); - assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY; + int bits = fieldsStream.readByte() & 0xFF; + assert bits <= (FieldsWriter.FIELD_IS_NUMERIC_MASK | FieldsWriter.FIELD_IS_COMPRESSED | FieldsWriter.FIELD_IS_TOKENIZED | FieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits); boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0; assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true) : "compressed fields are only allowed in indexes of version <= 2.9"; boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0; boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0; - //TODO: Find an alternative approach here if this list continues to grow beyond the - //list of 5 or 6 currently here. 
See Lucene 762 for discussion - if (acceptField.equals(FieldSelectorResult.LOAD)) { - addField(doc, fi, binary, compressed, tokenize); + final int numeric = bits & FieldsWriter.FIELD_IS_NUMERIC_MASK; + + switch (acceptField) { + case LOAD: + addField(doc, fi, binary, compressed, tokenize, numeric); + break; + case LOAD_AND_BREAK: + addField(doc, fi, binary, compressed, tokenize, numeric); + break out; //Get out of this loop + case LAZY_LOAD: + addFieldLazy(doc, fi, binary, compressed, tokenize, true, numeric); + break; + case LATENT: + addFieldLazy(doc, fi, binary, compressed, tokenize, false, numeric); + break; + case SIZE: + skipFieldBytes(binary, compressed, addFieldSize(doc, fi, binary, compressed, numeric)); + break; + case SIZE_AND_BREAK: + addFieldSize(doc, fi, binary, compressed, numeric); + break out; //Get out of this loop + default: + skipField(binary, compressed, numeric); } - else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){ - addField(doc, fi, binary, compressed, tokenize); - break;//Get out of this loop - } - else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) { - addFieldLazy(doc, fi, binary, compressed, tokenize, true); - } else if (acceptField.equals(FieldSelectorResult.LATENT)) { - addFieldLazy(doc, fi, binary, compressed, tokenize, false); - } - else if (acceptField.equals(FieldSelectorResult.SIZE)){ - skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed)); - } - else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){ - addFieldSize(doc, fi, binary, compressed); - break; - } - else { - skipField(binary, compressed); - } } return doc; @@ -312,41 +313,73 @@ * Skip the field. We still have to read some of the information about the field, but can skip past the actual content. * This will have the most payoff on large fields. 
*/ - private void skipField(boolean binary, boolean compressed) throws IOException { - skipField(binary, compressed, fieldsStream.readVInt()); + private void skipField(boolean binary, boolean compressed, int numeric) throws IOException { + final int numBytes; + switch(numeric) { + case 0: + numBytes = fieldsStream.readVInt(); + break; + case FieldsWriter.FIELD_IS_NUMERIC_INT: + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: + numBytes = 4; + break; + case FieldsWriter.FIELD_IS_NUMERIC_LONG: + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + numBytes = 8; + break; + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } + + skipFieldBytes(binary, compressed, numBytes); } - private void skipField(boolean binary, boolean compressed, int toRead) throws IOException { - if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) { - fieldsStream.seek(fieldsStream.getFilePointer() + toRead); - } else { - // We need to skip chars. This will slow us down, but still better - fieldsStream.skipChars(toRead); - } + private void skipFieldBytes(boolean binary, boolean compressed, int toRead) throws IOException { + if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) { + fieldsStream.seek(fieldsStream.getFilePointer() + toRead); + } else { + // We need to skip chars. 
This will slow us down, but still better + fieldsStream.skipChars(toRead); + } } - private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, boolean cacheResult) throws IOException { + private NumericField loadNumericField(FieldInfo fi, int numeric) throws IOException { + assert numeric != 0; + switch(numeric) { + case FieldsWriter.FIELD_IS_NUMERIC_INT: + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setIntValue(fieldsStream.readInt()); + case FieldsWriter.FIELD_IS_NUMERIC_LONG: + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setLongValue(fieldsStream.readLong()); + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setFloatValue(Float.intBitsToFloat(fieldsStream.readInt())); + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + return new NumericField(fi.name, Field.Store.YES, fi.isIndexed).setDoubleValue(Double.longBitsToDouble(fieldsStream.readLong())); + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } + } + + private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, boolean cacheResult, int numeric) throws IOException { + final AbstractField f; if (binary) { int toRead = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); - //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES)); - doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed, cacheResult)); + f = new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed, cacheResult); //Need to move the pointer ahead by toRead positions fieldsStream.seek(pointer + toRead); + } else if (numeric != 0) { + f = loadNumericField(fi, numeric); } else { Field.Store store = Field.Store.YES; Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize); Field.TermVector termVector = 
Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector); - AbstractField f; if (compressed) { int toRead = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); f = new LazyField(fi.name, store, toRead, pointer, binary, compressed, cacheResult); //skip over the part that we aren't loading fieldsStream.seek(pointer + toRead); - f.setOmitNorms(fi.omitNorms); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); } else { int length = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); @@ -357,16 +390,16 @@ fieldsStream.skipChars(length); } f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, compressed, cacheResult); - f.setOmitNorms(fi.omitNorms); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - } - - doc.add(f); + } } - + + f.setOmitNorms(fi.omitNorms); + f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); + doc.add(f); } - private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws CorruptIndexException, IOException { + private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize, int numeric) throws CorruptIndexException, IOException { + final AbstractField f; //we have a binary stored field, and it may be compressed if (binary) { @@ -374,19 +407,18 @@ final byte[] b = new byte[toRead]; fieldsStream.readBytes(b, 0, b.length); if (compressed) { - doc.add(new Field(fi.name, uncompress(b))); + f = new Field(fi.name, uncompress(b)); } else { - doc.add(new Field(fi.name, b)); + f = new Field(fi.name, b); } + } else if (numeric != 0) { + f = loadNumericField(fi, numeric); } else { Field.Store store = Field.Store.YES; Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize); Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector); - - 
AbstractField f; if (compressed) { int toRead = fieldsStream.readVInt(); - final byte[] b = new byte[toRead]; fieldsStream.readBytes(b, 0, b.length); f = new Field(fi.name, // field name @@ -395,8 +427,6 @@ store, index, termVector); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - f.setOmitNorms(fi.omitNorms); } else { f = new Field(fi.name, // name false, @@ -404,19 +434,35 @@ store, index, termVector); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - f.setOmitNorms(fi.omitNorms); } - - doc.add(f); } + + f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); + f.setOmitNorms(fi.omitNorms); + doc.add(f); } // Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes) // Read just the size -- caller must skip the field content to continue reading fields // Return the size in bytes or chars, depending on field type - private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed) throws IOException { - int size = fieldsStream.readVInt(), bytesize = binary || compressed ? size : 2*size; + private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed, int numeric) throws IOException { + final int bytesize, size; + switch(numeric) { + case 0: + size = fieldsStream.readVInt(); + bytesize = (binary || compressed) ? 
size : 2*size; + break; + case FieldsWriter.FIELD_IS_NUMERIC_INT: + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: + size = bytesize = 4; + break; + case FieldsWriter.FIELD_IS_NUMERIC_LONG: + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + size = bytesize = 8; + break; + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } byte[] sizebytes = new byte[4]; sizebytes[0] = (byte) (bytesize>>>24); sizebytes[1] = (byte) (bytesize>>>16); @@ -427,7 +473,7 @@ } /** - * A Lazy implementation of Fieldable that differs loading of fields until asked for, instead of when the Document is + * A Lazy implementation of Fieldable that defers loading of fields until asked for, instead of when the Document is * loaded. */ private class LazyField extends AbstractField implements Fieldable { @@ -519,13 +565,13 @@ } catch (IOException e) { throw new FieldReaderException(e); } - if (cacheResult == true){ - fieldsData = value; - } - return value; + if (cacheResult == true){ + fieldsData = value; + } + return value; } else{ - return (String) fieldsData; - } + return (String) fieldsData; + } } } @@ -574,25 +620,24 @@ if (isCompressed == true) { value = uncompress(b); } else { - value = b; - } + value = b; + } } catch (IOException e) { throw new FieldReaderException(e); } binaryOffset = 0; binaryLength = toRead; - if (cacheResult == true){ - fieldsData = value; - } - return value; + if (cacheResult == true){ + fieldsData = value; + } + return value; } else{ - return (byte[]) fieldsData; - } - - - } else - return null; + return (byte[]) fieldsData; + } + } else { + return null; + } } } Index: lucene/src/java/org/apache/lucene/index/FieldsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldsWriter.java (revision 1099371) +++ lucene/src/java/org/apache/lucene/index/FieldsWriter.java (working copy) @@ -21,20 +21,33 @@ import org.apache.lucene.document.Document; import 
org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.NumericField; import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.IOUtils; final class FieldsWriter { - static final byte FIELD_IS_TOKENIZED = 0x1; - static final byte FIELD_IS_BINARY = 0x2; - + static final int FIELD_IS_TOKENIZED = 1 << 0; + static final int FIELD_IS_BINARY = 1 << 1; + /** @deprecated Kept for backwards-compatibility with <3.0 indexes; will be removed in 4.0 */ @Deprecated - static final byte FIELD_IS_COMPRESSED = 0x4; + static final int FIELD_IS_COMPRESSED = 1 << 2; + private static final int _NUMERIC_BIT_SHIFT = 3; + static final int FIELD_IS_NUMERIC_MASK = 0x07 << _NUMERIC_BIT_SHIFT; + + static final int FIELD_IS_NUMERIC_INT = 1 << _NUMERIC_BIT_SHIFT; + static final int FIELD_IS_NUMERIC_LONG = 2 << _NUMERIC_BIT_SHIFT; + static final int FIELD_IS_NUMERIC_FLOAT = 3 << _NUMERIC_BIT_SHIFT; + static final int FIELD_IS_NUMERIC_DOUBLE = 4 << _NUMERIC_BIT_SHIFT; + // currently unused: static final int FIELD_IS_NUMERIC_SHORT = 5 << _NUMERIC_BIT_SHIFT; + // currently unused: static final int FIELD_IS_NUMERIC_BYTE = 6 << _NUMERIC_BIT_SHIFT; + + // the next possible bits are: 1 << 6; 1 << 7 + // Original format static final int FORMAT = 0; @@ -44,10 +57,13 @@ // Lucene 3.0: Removal of compressed fields static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2; + // Lucene 3.2: NumericFields are stored in binary format + static final int FORMAT_LUCENE_3_2_NUMERIC_FIELDS = 3; + // NOTE: if you introduce a new format, make it 1 higher // than the current one, and always change this if you // switch to a new format! 
- static final int FORMAT_CURRENT = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS; + static final int FORMAT_CURRENT = FORMAT_LUCENE_3_2_NUMERIC_FIELDS; private FieldInfos fieldInfos; @@ -134,14 +150,26 @@ final void writeField(FieldInfo fi, Fieldable field) throws IOException { fieldsStream.writeVInt(fi.number); - byte bits = 0; + int bits = 0; if (field.isTokenized()) - bits |= FieldsWriter.FIELD_IS_TOKENIZED; + bits |= FIELD_IS_TOKENIZED; if (field.isBinary()) - bits |= FieldsWriter.FIELD_IS_BINARY; + bits |= FIELD_IS_BINARY; + if (field instanceof NumericField) { + final Number n = ((NumericField) field).getNumericValue(); + if (n instanceof Integer) { + bits |= FIELD_IS_NUMERIC_INT; + } else if (n instanceof Long) { + bits |= FIELD_IS_NUMERIC_LONG; + } else if (n instanceof Float) { + bits |= FIELD_IS_NUMERIC_FLOAT; + } else { + assert n instanceof Double; + bits |= FIELD_IS_NUMERIC_DOUBLE; + } + } + fieldsStream.writeByte((byte) bits); - fieldsStream.writeByte(bits); - if (field.isBinary()) { final byte[] data; final int len; @@ -152,8 +180,25 @@ fieldsStream.writeVInt(len); fieldsStream.writeBytes(data, offset, len); - } - else { + } else if (field instanceof NumericField) { + final Number n = ((NumericField) field).getNumericValue(); + switch (bits & FIELD_IS_NUMERIC_MASK) { + case FIELD_IS_NUMERIC_INT: + fieldsStream.writeInt(n.intValue()); + break; + case FIELD_IS_NUMERIC_LONG: + fieldsStream.writeLong(n.longValue()); + break; + case FIELD_IS_NUMERIC_FLOAT: + fieldsStream.writeInt(Float.floatToIntBits(n.floatValue())); + break; + case FIELD_IS_NUMERIC_DOUBLE: + fieldsStream.writeLong(Double.doubleToLongBits(n.doubleValue())); + break; + default: + assert false : "Should never get here"; + } + } else { fieldsStream.writeString(field.stringValue()); } } Index: lucene/src/test/org/apache/lucene/index/TestFieldsReader.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestFieldsReader.java (revision 
1099371) +++ lucene/src/test/org/apache/lucene/index/TestFieldsReader.java (working copy) @@ -27,12 +27,14 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.LoadFirstFieldSelector; import org.apache.lucene.document.SetBasedFieldSelector; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.FieldCache; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; @@ -508,4 +510,61 @@ } } + + public void testNumericField() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random, dir); + final int numDocs = _TestUtil.nextInt(random, 500, 1000) * RANDOM_MULTIPLIER; + final Number[] answers = new Number[numDocs]; + for(int id=0;id