diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java index ab89821..c838a55 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java @@ -30,11 +30,14 @@ import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; import java.io.Closeable; import java.util.Set; +import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.*; + /** * Class responsible for access to stored document fields. *

@@ -44,14 +47,13 @@ import java.util.Set; * @lucene.internal */ public final class Lucene40StoredFieldsReader extends StoredFieldsReader implements Cloneable, Closeable { - private final static int FORMAT_SIZE = 4; - private final FieldInfos fieldInfos; private final IndexInput fieldsStream; private final IndexInput indexStream; private int numTotalDocs; private int size; private boolean closed; + private final long headerLength; /** Returns a cloned FieldsReader that shares open * IndexInputs with the original one. It is the caller's @@ -61,16 +63,17 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme @Override public Lucene40StoredFieldsReader clone() { ensureOpen(); - return new Lucene40StoredFieldsReader(fieldInfos, numTotalDocs, size, (IndexInput)fieldsStream.clone(), (IndexInput)indexStream.clone()); + return new Lucene40StoredFieldsReader(fieldInfos, numTotalDocs, size, (IndexInput)fieldsStream.clone(), (IndexInput)indexStream.clone(), headerLength); } // Used only by clone - private Lucene40StoredFieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, IndexInput fieldsStream, IndexInput indexStream) { + private Lucene40StoredFieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, IndexInput fieldsStream, IndexInput indexStream, long headerLength) { this.fieldInfos = fieldInfos; this.numTotalDocs = numTotalDocs; this.size = size; this.fieldsStream = fieldsStream; this.indexStream = indexStream; + this.headerLength = headerLength; } public Lucene40StoredFieldsReader(Directory d, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { @@ -78,17 +81,15 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme boolean success = false; fieldInfos = fn; try { - fieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, "", Lucene40StoredFieldsWriter.FIELDS_EXTENSION), context); - final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION); + fieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context); + final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION); indexStream = d.openInput(indexStreamFN, context); - // its a 4.0 codec: so its not too-old, its corrupt. - // TODO: change this to CodecUtil.checkHeader - if (Lucene40StoredFieldsWriter.FORMAT_CURRENT != indexStream.readInt()) { - throw new CorruptIndexException("unexpected fdx header: " + indexStream); - } - - final long indexSize = indexStream.length() - FORMAT_SIZE; + CodecUtil.checkHeader(indexStream, CODEC_NAME, VERSION_START, VERSION_CURRENT); + CodecUtil.checkHeader(fieldsStream, CODEC_NAME, VERSION_START, VERSION_CURRENT); + headerLength = fieldsStream.getFilePointer(); + assert headerLength == indexStream.getFilePointer(); + final long indexSize = indexStream.length() - headerLength; this.size = (int) (indexSize >> 3); // Verify two sources of "maxDoc" agree: if (this.size != si.docCount) { @@ -135,7 +136,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme } private void seekIndex(int docID) throws IOException { - indexStream.seek(FORMAT_SIZE + docID * 8L); + indexStream.seek(headerLength + docID * 8L); } public final void visitDocument(int n, StoredFieldVisitor visitor) throws CorruptIndexException, IOException { @@ -148,7 +149,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); int bits = fieldsStream.readByte() & 0xFF; - assert bits <= (Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_MASK | Lucene40StoredFieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits); + assert bits <= (FIELD_IS_NUMERIC_MASK | FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits); switch(visitor.needsField(fieldInfo)) { case YES: @@ -164,19 +165,19 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme } private void readField(StoredFieldVisitor visitor, FieldInfo info, int bits) throws IOException { - final int numeric = bits & Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_MASK; + final int numeric = bits & FIELD_IS_NUMERIC_MASK; if (numeric != 0) { switch(numeric) { - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_INT: + case FIELD_IS_NUMERIC_INT: visitor.intField(info, fieldsStream.readInt()); return; - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_LONG: + case FIELD_IS_NUMERIC_LONG: visitor.longField(info, fieldsStream.readLong()); return; - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_FLOAT: + case FIELD_IS_NUMERIC_FLOAT: visitor.floatField(info, Float.intBitsToFloat(fieldsStream.readInt())); return; - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + case FIELD_IS_NUMERIC_DOUBLE: visitor.doubleField(info, Double.longBitsToDouble(fieldsStream.readLong())); return; default: @@ -186,7 +187,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme final int length = fieldsStream.readVInt(); byte bytes[] = new byte[length]; fieldsStream.readBytes(bytes, 0, length); - if ((bits & Lucene40StoredFieldsWriter.FIELD_IS_BINARY) != 0) { + if ((bits & FIELD_IS_BINARY) != 0) { visitor.binaryField(info, bytes, 0, bytes.length); } else { visitor.stringField(info, new String(bytes, 0, bytes.length, IOUtils.CHARSET_UTF_8)); @@ -195,15 +196,15 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme } private void skipField(int bits) throws IOException { - final int numeric = bits & Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_MASK; + final int numeric = bits & FIELD_IS_NUMERIC_MASK; if (numeric != 0) { switch(numeric) { - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_INT: - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_FLOAT: + case FIELD_IS_NUMERIC_INT: + case FIELD_IS_NUMERIC_FLOAT: fieldsStream.readInt(); return; - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_LONG: - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + case FIELD_IS_NUMERIC_LONG: + case FIELD_IS_NUMERIC_DOUBLE: fieldsStream.readLong(); return; default: @@ -242,7 +243,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme } public static void files(SegmentInfo info, Set files) throws IOException { - files.add(IndexFileNames.segmentFileName(info.name, "", Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION)); - files.add(IndexFileNames.segmentFileName(info.name, "", Lucene40StoredFieldsWriter.FIELDS_EXTENSION)); + files.add(IndexFileNames.segmentFileName(info.name, "", FIELDS_INDEX_EXTENSION)); + files.add(IndexFileNames.segmentFileName(info.name, "", FIELDS_EXTENSION)); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java index c236d9c..40ee4e4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java @@ -34,6 +34,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; /** @@ -62,16 +63,10 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter { // currently unused: static final int FIELD_IS_NUMERIC_SHORT = 5 << _NUMERIC_BIT_SHIFT; // currently unused: static final int FIELD_IS_NUMERIC_BYTE = 6 << _NUMERIC_BIT_SHIFT; - // (Happens to be the same as for now) Lucene 3.2: NumericFields are stored in binary format - static final int FORMAT_LUCENE_3_2_NUMERIC_FIELDS = 3; + static final String CODEC_NAME = "Lucene40StoredFields"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; - // NOTE: if you introduce a new format, make it 1 higher - // than the current one, and always change this if you - // switch to a new format! - static final int FORMAT_CURRENT = FORMAT_LUCENE_3_2_NUMERIC_FIELDS; - - // when removing support for old versions, leave the last supported version here - static final int FORMAT_MINIMUM = FORMAT_LUCENE_3_2_NUMERIC_FIELDS; /** Extension of stored fields file */ public static final String FIELDS_EXTENSION = "fdt"; @@ -83,6 +78,7 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter { private final String segment; private IndexOutput fieldsStream; private IndexOutput indexStream; + private final long headerLength; public Lucene40StoredFieldsWriter(Directory directory, String segment, IOContext context) throws IOException { assert directory != null; @@ -94,9 +90,10 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter { fieldsStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context); indexStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION), context); - fieldsStream.writeInt(FORMAT_CURRENT); - indexStream.writeInt(FORMAT_CURRENT); - + CodecUtil.writeHeader(fieldsStream, CODEC_NAME, VERSION_CURRENT); + CodecUtil.writeHeader(indexStream, CODEC_NAME, VERSION_CURRENT); + headerLength = fieldsStream.getFilePointer(); + assert headerLength == indexStream.getFilePointer(); success = true; } finally { if (!success) { @@ -209,7 +206,7 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter { @Override public void finish(int numDocs) throws IOException { - if (4+((long) numDocs)*8 != indexStream.getFilePointer()) + if (headerLength+((long) numDocs)*8 != indexStream.getFilePointer()) // This is most likely a bug in Sun JRE 1.6.0_04/_05; // we detect that the bug has struck, here, and // throw an exception to prevent the corruption from diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java index c0420d1..5681e94 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java @@ -33,8 +33,6 @@ import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexFormatTooNewException; -import org.apache.lucene.index.IndexFormatTooOldException; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -43,8 +41,10 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; + /** * Lucene 4.0 Term Vectors reader. *

@@ -54,22 +54,6 @@ import org.apache.lucene.util.IOUtils; */ public class Lucene40TermVectorsReader extends TermVectorsReader { - // NOTE: if you make a new format, it must be larger than - // the current format - - // Changed strings to UTF8 with length-in-bytes not length-in-chars - static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4; - - // NOTE: always change this if you switch to a new format! - // whenever you add a new format, make it 1 larger (positive version logic)! - static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES; - - // when removing support for old versions, leave the last supported version here - static final int FORMAT_MINIMUM = FORMAT_UTF8_LENGTH_IN_BYTES; - - //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file - static final int FORMAT_SIZE = 4; - static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1; static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2; @@ -82,6 +66,12 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { /** Extension of vectors index file */ static final String VECTORS_INDEX_EXTENSION = "tvx"; + + static final String CODEC_NAME = "Lucene40TermVectors"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + private final long headerLength; private FieldInfos fieldInfos; @@ -91,17 +81,16 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { private int size; private int numTotalDocs; - private final int format; // used by clone - Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int format) { + Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, long headerLength) { this.fieldInfos = fieldInfos; this.tvx = tvx; this.tvd = tvd; this.tvf = tvf; this.size = size; this.numTotalDocs = numTotalDocs; - this.format = format; + this.headerLength = headerLength; } public Lucene40TermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context) @@ -114,18 +103,21 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { try { String idxName = IndexFileNames.segmentFileName(segment, "", VECTORS_INDEX_EXTENSION); tvx = d.openInput(idxName, context); - format = checkValidFormat(tvx); + final int tvxVersion = CodecUtil.checkHeader(tvx, CODEC_NAME, VERSION_START, VERSION_CURRENT); + headerLength = tvx.getFilePointer(); String fn = IndexFileNames.segmentFileName(segment, "", VECTORS_DOCUMENTS_EXTENSION); tvd = d.openInput(fn, context); - final int tvdFormat = checkValidFormat(tvd); + final int tvdVersion = CodecUtil.checkHeader(tvd, CODEC_NAME, VERSION_START, VERSION_CURRENT); fn = IndexFileNames.segmentFileName(segment, "", VECTORS_FIELDS_EXTENSION); tvf = d.openInput(fn, context); - final int tvfFormat = checkValidFormat(tvf); - - assert format == tvdFormat; - assert format == tvfFormat; + final int tvfVersion = CodecUtil.checkHeader(tvf, CODEC_NAME, VERSION_START, VERSION_CURRENT); + + assert headerLength == tvd.getFilePointer(); + assert headerLength == tvf.getFilePointer(); + assert tvxVersion == tvdVersion; + assert tvxVersion == tvfVersion; - numTotalDocs = (int) (tvx.length() >> 4); + numTotalDocs = (int) (tvx.length()-headerLength >> 4); this.size = numTotalDocs; assert size == 0 || numTotalDocs == size; @@ -156,13 +148,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { // Not private to avoid synthetic access$NNN methods void seekTvx(final int docNum) throws IOException { - tvx.seek(docNum * 16L + FORMAT_SIZE); - } - - boolean canReadRawDocs() { - // we can always read raw docs, unless the term vectors - // didn't exist - return format != 0; + tvx.seek(docNum * 16L + headerLength); } /** Retrieve the length (in bytes) of the tvd and tvf @@ -210,16 +196,6 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { } } - private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException - { - int format = in.readInt(); - if (format < FORMAT_MINIMUM) - throw new IndexFormatTooOldException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT); - if (format > FORMAT_CURRENT) - throw new IndexFormatTooNewException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT); - return format; - } - public void close() throws IOException { IOUtils.close(tvx, tvd, tvf); } @@ -708,7 +684,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { cloneTvf = (IndexInput) tvf.clone(); } - return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, format); + return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, headerLength); } public static void files(SegmentInfo info, Set files) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java index 372db23..3d2fdff 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java @@ -35,9 +35,13 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.StringHelper; +import static org.apache.lucene.codecs.lucene40.Lucene40TermVectorsReader.*; + + // TODO: make a new 4.0 TV format that encodes better // - use startOffset (not endOffset) as base for delta on // next startOffset because today for syns or ngrams or @@ -58,6 +62,8 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { private final Directory directory; private final String segment; private IndexOutput tvx = null, tvd = null, tvf = null; + private final long headerLength; + public Lucene40TermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException { this.directory = directory; @@ -66,11 +72,14 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { try { // Open files for TermVector storage tvx = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene40TermVectorsReader.VECTORS_INDEX_EXTENSION), context); - tvx.writeInt(Lucene40TermVectorsReader.FORMAT_CURRENT); + CodecUtil.writeHeader(tvx, CODEC_NAME, VERSION_CURRENT); tvd = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene40TermVectorsReader.VECTORS_DOCUMENTS_EXTENSION), context); - tvd.writeInt(Lucene40TermVectorsReader.FORMAT_CURRENT); + CodecUtil.writeHeader(tvd, CODEC_NAME, VERSION_CURRENT); tvf = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene40TermVectorsReader.VECTORS_FIELDS_EXTENSION), context); - tvf.writeInt(Lucene40TermVectorsReader.FORMAT_CURRENT); + CodecUtil.writeHeader(tvf, CODEC_NAME, VERSION_CURRENT); + headerLength = tvx.getFilePointer(); + assert headerLength == tvd.getFilePointer(); + assert headerLength == tvf.getFilePointer(); success = true; } finally { if (!success) { @@ -252,10 +261,7 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { TermVectorsReader vectorsReader = matchingSegmentReader.getTermVectorsReader(); if (vectorsReader != null && vectorsReader instanceof Lucene40TermVectorsReader) { - // If the TV* files are an older format then they cannot read raw docs: - if (((Lucene40TermVectorsReader)vectorsReader).canReadRawDocs()) { matchingVectorsReader = (Lucene40TermVectorsReader) vectorsReader; - } } } if (reader.liveDocs != null) { @@ -356,7 +362,7 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { @Override public void finish(int numDocs) throws IOException { - if (4+((long) numDocs)*16 != tvx.getFilePointer()) + if (headerLength+((long) numDocs)*16 != tvx.getFilePointer()) // This is most likely a bug in Sun JRE 1.6.0_04/_05; // we detect that the bug has struck, here, and // throw an exception to prevent the corruption from