Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1143927) +++ lucene/CHANGES.txt (working copy) @@ -238,6 +238,11 @@ * LUCENE-3146: IndexReader.setNorm throws IllegalStateException if the field does not store norms. (Shai Erera, Mike McCandless) + +* LUCENE-3309: Stored fields no longer record whether they were + tokenized or not. In general you should not rely on stored fields + to record any "metadata" from indexing (tokenized, omitNorms, + IndexOptions, boost, etc.) (Mike McCandless) API Changes Index: lucene/src/java/org/apache/lucene/index/FieldsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldsReader.java (revision 1143927) +++ lucene/src/java/org/apache/lucene/index/FieldsReader.java (working copy) @@ -206,6 +206,54 @@ indexStream.seek(FORMAT_SIZE + (docID + docStoreOffset) * 8L); } + public final void visitDocument(int n, IndexReader.FieldVisitor visitor) throws CorruptIndexException, IOException { + seekIndex(n); + fieldsStream.seek(indexStream.readLong()); + + final int numFields = fieldsStream.readVInt(); + for (int fieldIDX = 0; fieldIDX < numFields; fieldIDX++) { + int fieldNumber = fieldsStream.readVInt(); + FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); + + int bits = fieldsStream.readByte() & 0xFF; + assert bits <= (FieldsWriter.FIELD_IS_NUMERIC_MASK | FieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits); + + final boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0; + final int numeric = bits & FieldsWriter.FIELD_IS_NUMERIC_MASK; + + final boolean doStop; + if (binary) { + final int numBytes = fieldsStream.readVInt(); + doStop = visitor.binaryField(fieldInfo, fieldsStream, numBytes); + } else if (numeric != 0) { + switch(numeric) { + case FieldsWriter.FIELD_IS_NUMERIC_INT: + doStop = visitor.intField(fieldInfo, fieldsStream.readInt()); + break; + case 
FieldsWriter.FIELD_IS_NUMERIC_LONG: + doStop = visitor.longField(fieldInfo, fieldsStream.readLong()); + break; + case FieldsWriter.FIELD_IS_NUMERIC_FLOAT: + doStop = visitor.floatField(fieldInfo, Float.intBitsToFloat(fieldsStream.readInt())); + break; + case FieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + doStop = visitor.doubleField(fieldInfo, Double.longBitsToDouble(fieldsStream.readLong())); + break; + default: + throw new FieldReaderException("Invalid numeric type: " + Integer.toHexString(numeric)); + } + } else { + // Text: + final int numUTF8Bytes = fieldsStream.readVInt(); + doStop = visitor.stringField(fieldInfo, fieldsStream, numUTF8Bytes); + } + + if (doStop) { + return; + } + } + } + public final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { seekIndex(n); long position = indexStream.readLong(); @@ -219,9 +267,9 @@ FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name); int bits = fieldsStream.readByte() & 0xFF; - assert bits <= (FieldsWriter.FIELD_IS_NUMERIC_MASK | FieldsWriter.FIELD_IS_TOKENIZED | FieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits); + assert bits <= (FieldsWriter.FIELD_IS_NUMERIC_MASK | FieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits); - boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0; + boolean tokenize = true; // made up! boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0; final int numeric = bits & FieldsWriter.FIELD_IS_NUMERIC_MASK; Index: lucene/src/java/org/apache/lucene/index/FieldsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldsWriter.java (revision 1143927) +++ lucene/src/java/org/apache/lucene/index/FieldsWriter.java (working copy) @@ -25,7 +25,7 @@ import org.apache.lucene.util.IOUtils; final class FieldsWriter { - static final int FIELD_IS_TOKENIZED = 1 << 0; + // NOTE: bit 0 is free here! 
static final int FIELD_IS_BINARY = 1 << 1; // the old bit 1 << 2 was compressed, is now left out @@ -137,11 +137,6 @@ final void writeField(int fieldNumber, IndexableField field) throws IOException { fieldsStream.writeVInt(fieldNumber); int bits = 0; - // nocommit -- when we decouple analysis we should stop - // recording this: - if (field.indexed() && field.tokenized()) { - bits |= FIELD_IS_TOKENIZED; - } final BytesRef bytes; final String string; // nocommit -- maybe a field should serialize itself? Index: lucene/src/java/org/apache/lucene/index/ParallelReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/ParallelReader.java (revision 1143927) +++ lucene/src/java/org/apache/lucene/index/ParallelReader.java (working copy) @@ -345,6 +345,14 @@ hasDeletions = false; } + @Override + public void document(int docID, FieldVisitor visitor) throws CorruptIndexException, IOException { + ensureOpen(); + for (final IndexReader reader: storedFieldReaders) { + reader.document(docID, visitor); + } + } + // append fields from storedFieldReaders @Override public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { Index: lucene/src/java/org/apache/lucene/index/SegmentReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/SegmentReader.java (revision 1143927) +++ lucene/src/java/org/apache/lucene/index/SegmentReader.java (working copy) @@ -457,6 +457,11 @@ return core.fieldInfos; } + public void document(int docID, FieldVisitor visitor) throws CorruptIndexException, IOException { + ensureOpen(); + getFieldsReader().visitDocument(docID, visitor); + } + @Override public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { ensureOpen(); Index: lucene/src/java/org/apache/lucene/index/MultiReader.java 
=================================================================== --- lucene/src/java/org/apache/lucene/index/MultiReader.java (revision 1143927) +++ lucene/src/java/org/apache/lucene/index/MultiReader.java (working copy) @@ -257,6 +257,13 @@ return maxDoc; } + @Override + public void document(int docID, FieldVisitor visitor) throws CorruptIndexException, IOException { + ensureOpen(); + int i = readerIndex(docID); // find segment num + subReaders[i].document(docID - starts[i], visitor); // dispatch to segment reader + } + // inherit javadoc @Override public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { Index: lucene/src/java/org/apache/lucene/index/DirectoryReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DirectoryReader.java (revision 1143927) +++ lucene/src/java/org/apache/lucene/index/DirectoryReader.java (working copy) @@ -557,6 +557,13 @@ return maxDoc; } + @Override + public void document(int docID, FieldVisitor visitor) throws CorruptIndexException, IOException { + ensureOpen(); + int i = readerIndex(docID); // find segment num + subReaders[i].document(docID - starts[i], visitor); // dispatch to segment reader + } + // inherit javadoc @Override public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { Index: lucene/src/java/org/apache/lucene/index/FilterIndexReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FilterIndexReader.java (revision 1143927) +++ lucene/src/java/org/apache/lucene/index/FilterIndexReader.java (working copy) @@ -341,6 +341,12 @@ } @Override + public void document(int docID, FieldVisitor visitor) throws CorruptIndexException, IOException { + ensureOpen(); + in.document(docID, visitor); + } + + @Override public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, 
IOException { ensureOpen(); return in.document(n, fieldSelector); Index: lucene/src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- lucene/src/java/org/apache/lucene/index/SegmentMerger.java (revision 1143927) +++ lucene/src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -310,6 +310,10 @@ // skip deleted docs continue; } + // TODO: this could be more efficient using + // FieldVisitor instead of loading/writing entire + // doc; ie we just have to renumber the field number + // on the fly? // NOTE: it's very important to first assign to doc then pass it to // termVectorsWriter.addAllDocVectors; see LUCENE-1282 Document doc = reader.document(j); Index: lucene/src/java/org/apache/lucene/index/IndexReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexReader.java (revision 1143927) +++ lucene/src/java/org/apache/lucene/index/IndexReader.java (working copy) @@ -17,27 +17,29 @@ * limitations under the License. 
*/ +import java.io.Closeable; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.search.FieldCache; // javadocs import org.apache.lucene.search.Similarity; -import org.apache.lucene.index.codecs.Codec; -import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.store.*; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.ReaderUtil; // for javadocs -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.Closeable; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; - /** IndexReader is an abstract class, providing an interface for accessing an index. Search of an index is done entirely through this abstract interface, so that any subclass which implements it is searchable. @@ -949,6 +951,54 @@ return maxDoc() - numDocs(); } + /** @lucene.experimental */ + public interface FieldVisitor { + /** Process a binary field. Note that if you want to + * skip the field you must seek the IndexInput + * (e.g., call in.seek(numBytes + in.getFilePointer())). + * + *

Return true to stop loading fields. */ + public boolean binaryField(FieldInfo fieldInfo, IndexInput in, int numBytes) throws IOException; + + /** Process a string field by reading numUTF8Bytes. + * Note that if you want to skip the field you must + * seek the IndexInput as if you had read numUTF8Bytes + * (e.g., call in.seek(numUTF8Bytes + in.getFilePointer())). + * + *

Return true to stop loading fields. */ + public boolean stringField(FieldInfo fieldInfo, IndexInput in, int numUTF8Bytes) throws IOException; + + /** Process an int numeric field. + * + *

Return true to stop loading fields. */ + public boolean intField(FieldInfo fieldInfo, int value) throws IOException; + + /** Process a long numeric field. + * + *

Return true to stop loading fields. */ + public boolean longField(FieldInfo fieldInfo, long value) throws IOException; + + /** Process a float numeric field. + * + *

Return true to stop loading fields. */ + public boolean floatField(FieldInfo fieldInfo, float value) throws IOException; + + /** Process a double numeric field. + * + *

Return true to stop loading fields. */ + public boolean doubleField(FieldInfo fieldInfo, double value) throws IOException; + } + + /** Expert: visits the fields of a stored document, for + * custom processing/loading of each field. If you + * simply want to load all fields, use {@link #document(int)}. */ + public abstract void document(int docID, FieldVisitor visitor) throws CorruptIndexException, IOException; + + // nocommit -- the new document(int docID) API should + // clearly advertise that only field types/values are + // preserved -- index time metadata like boost, omitNorm, + // IndexOptions, tokenized are not preserved + /** * Returns the stored fields of the nth * Document in this index. @@ -964,7 +1014,61 @@ */ public Document document(int n) throws CorruptIndexException, IOException { ensureOpen(); - return document(n, null); + + // return document(n, (FieldSelector) null); + + final Document doc = new Document(); + document(n, new FieldVisitor() { + + @Override + public boolean binaryField(FieldInfo fieldInfo, IndexInput in, int numBytes) throws IOException { + final byte[] b = new byte[numBytes]; + in.readBytes(b, 0, b.length); + doc.add(new Field(fieldInfo.name, b)); + return false; + } + + @Override + public boolean stringField(FieldInfo fieldInfo, IndexInput in, int numUTF8Bytes) throws IOException { + final byte[] b = new byte[numUTF8Bytes]; + in.readBytes(b, 0, b.length); + doc.add(new Field(fieldInfo.name, + false, + new String(b, "UTF-8"), + Field.Store.YES, + Field.Index.ANALYZED, // made up! 
+ Field.TermVector.toTermVector(fieldInfo.storeTermVector, + fieldInfo.storeOffsetWithTermVector, + fieldInfo.storePositionWithTermVector))); + return false; + } + + @Override + public boolean intField(FieldInfo fieldInfo, int value) { + doc.add(new NumericField(fieldInfo.name, Field.Store.YES, fieldInfo.isIndexed).setIntValue(value)); + return false; + } + + @Override + public boolean longField(FieldInfo fieldInfo, long value) { + doc.add(new NumericField(fieldInfo.name, Field.Store.YES, fieldInfo.isIndexed).setLongValue(value)); + return false; + } + + @Override + public boolean floatField(FieldInfo fieldInfo, float value) { + doc.add(new NumericField(fieldInfo.name, Field.Store.YES, fieldInfo.isIndexed).setFloatValue(value)); + return false; + } + + @Override + public boolean doubleField(FieldInfo fieldInfo, double value) { + doc.add(new NumericField(fieldInfo.name, Field.Store.YES, fieldInfo.isIndexed).setDoubleValue(value)); + return false; + } + }); + + return doc; } /** Index: lucene/contrib/CHANGES.txt =================================================================== --- lucene/contrib/CHANGES.txt (revision 1143927) +++ lucene/contrib/CHANGES.txt (working copy) @@ -5,6 +5,14 @@ ======================= Trunk (not yet released) ======================= +Changes in Runtime Behavior + + * LUCENE-3309: Fast vector highlighter now inserts the + MultiValuedSeparator for NOT_ANALYZED fields (in addition to + ANALYZED fields). To ensure your offsets are correct you should + provide an analyzer that returns 1 from the offsetGap method. + (Mike McCandless) + Build * LUCENE-2845: Moved contrib/benchmark to modules. 
Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java =================================================================== --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (revision 1143927) +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (working copy) @@ -313,6 +313,11 @@ return getIndex().getDocumentsByNumber()[n].getDocument(); } + @Override + public void document(int docID, FieldVisitor visitor) throws IOException { + throw new UnsupportedOperationException(); + } + /** * never ever touch these values. it is the true values, unless norms have * been touched. Index: lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java =================================================================== --- lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 1143927) +++ lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (working copy) @@ -1230,6 +1230,12 @@ return new Document(); // there are no stored fields } + @Override + public void document(int docID, FieldVisitor visitor) { + if (DEBUG) System.err.println("MemoryIndexReader.document"); + // no-op: there are no stored fields + } + //When we convert to JDK 1.5 make this Set @Override public Document document(int n, FieldSelector fieldSelector) throws IOException { Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (revision 1145297) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.java (working copy) @@ -34,9 +34,10 @@ import org.apache.lucene.document2.TextField; import 
org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; -import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.PhraseQuery; @@ -88,7 +89,26 @@ super.setUp(); analyzerW = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false); analyzerB = new BigramAnalyzer(); - analyzerK = new MockAnalyzer(random, MockTokenizer.KEYWORD, false); + final Analyzer k = new MockAnalyzer(random, MockTokenizer.KEYWORD, false); + analyzerK = new Analyzer() { + @Override + public TokenStream tokenStream(String fieldName, Reader reader) { + return k.tokenStream(fieldName, reader); + } + + @Override + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + return k.reusableTokenStream(fieldName, reader); + } + + @Override + public int getOffsetGap(IndexableField field) { + // Because we add single-char separator for all + // (even not-tokenized) fields: + return 1; + } + }; + paW = new QueryParser(TEST_VERSION_CURRENT, F, analyzerW ); paB = new QueryParser(TEST_VERSION_CURRENT, F, analyzerB ); dir = newDirectory(); Index: lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java =================================================================== --- lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java (revision 1145297) +++ lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java (working copy) @@ -152,9 +152,8 @@ SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); 
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); - // '/' separator doesn't effect the snippet because of NOT_ANALYZED field sfb.setMultiValuedSeparator( '/' ); - assertEquals( "abcdefghijkl", sfb.createFragment( reader, 0, F, ffl ) ); + assertEquals( "abc/defg/hijkl/", sfb.createFragment( reader, 0, F, ffl ) ); } public void testMVSeparator() throws Exception { Index: lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java =================================================================== --- lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (revision 1143927) +++ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java (working copy) @@ -142,8 +142,7 @@ int startOffset, int endOffset ){ while( buffer.length() < endOffset && index[0] < values.length ){ buffer.append( values[index[0]].stringValue() ); - if( values[index[0]].isTokenized() ) - buffer.append( multiValuedSeparator ); + buffer.append( multiValuedSeparator ); index[0]++; } int eo = buffer.length() < endOffset ? buffer.length() : endOffset;