diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java index ab89821..38da5e2 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java @@ -30,11 +30,14 @@ import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; import java.io.Closeable; import java.util.Set; +import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.*; + /** * Class responsible for access to stored document fields. *

@@ -44,8 +47,6 @@ import java.util.Set; * @lucene.internal */ public final class Lucene40StoredFieldsReader extends StoredFieldsReader implements Cloneable, Closeable { - private final static int FORMAT_SIZE = 4; - private final FieldInfos fieldInfos; private final IndexInput fieldsStream; private final IndexInput indexStream; @@ -78,17 +79,15 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme boolean success = false; fieldInfos = fn; try { - fieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, "", Lucene40StoredFieldsWriter.FIELDS_EXTENSION), context); - final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION); + fieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context); + final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION); indexStream = d.openInput(indexStreamFN, context); - // its a 4.0 codec: so its not too-old, its corrupt. - // TODO: change this to CodecUtil.checkHeader - if (Lucene40StoredFieldsWriter.FORMAT_CURRENT != indexStream.readInt()) { - throw new CorruptIndexException("unexpected fdx header: " + indexStream); - } - - final long indexSize = indexStream.length() - FORMAT_SIZE; + CodecUtil.checkHeader(indexStream, CODEC_NAME, VERSION_START, VERSION_CURRENT); + CodecUtil.checkHeader(fieldsStream, CODEC_NAME, VERSION_START, VERSION_CURRENT); + assert HEADER_LENGTH == fieldsStream.getFilePointer(); + assert HEADER_LENGTH == indexStream.getFilePointer(); + final long indexSize = indexStream.length() - HEADER_LENGTH; this.size = (int) (indexSize >> 3); // Verify two sources of "maxDoc" agree: if (this.size != si.docCount) { @@ -135,7 +134,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme } private void seekIndex(int docID) throws IOException { - indexStream.seek(FORMAT_SIZE + docID * 8L); + indexStream.seek(HEADER_LENGTH + docID * 8L); } public final void visitDocument(int n, StoredFieldVisitor visitor) throws CorruptIndexException, IOException { @@ -148,7 +147,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); int bits = fieldsStream.readByte() & 0xFF; - assert bits <= (Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_MASK | Lucene40StoredFieldsWriter.FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits); + assert bits <= (FIELD_IS_NUMERIC_MASK | FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits); switch(visitor.needsField(fieldInfo)) { case YES: @@ -164,19 +163,19 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme } private void readField(StoredFieldVisitor visitor, FieldInfo info, int bits) throws IOException { - final int numeric = bits & Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_MASK; + final int numeric = bits & FIELD_IS_NUMERIC_MASK; if (numeric != 0) { switch(numeric) { - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_INT: + case FIELD_IS_NUMERIC_INT: visitor.intField(info, fieldsStream.readInt()); return; - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_LONG: + case FIELD_IS_NUMERIC_LONG: visitor.longField(info, fieldsStream.readLong()); return; - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_FLOAT: + case FIELD_IS_NUMERIC_FLOAT: visitor.floatField(info, Float.intBitsToFloat(fieldsStream.readInt())); return; - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + case FIELD_IS_NUMERIC_DOUBLE: visitor.doubleField(info, Double.longBitsToDouble(fieldsStream.readLong())); return; default: @@ -186,7 +185,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme final int length = fieldsStream.readVInt(); byte bytes[] = new byte[length]; fieldsStream.readBytes(bytes, 0, length); - if ((bits & Lucene40StoredFieldsWriter.FIELD_IS_BINARY) != 0) { + if ((bits & FIELD_IS_BINARY) != 0) { visitor.binaryField(info, bytes, 0, bytes.length); } else { visitor.stringField(info, new String(bytes, 0, bytes.length, IOUtils.CHARSET_UTF_8)); @@ -195,15 +194,15 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme } private void skipField(int bits) throws IOException { - final int numeric = bits & Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_MASK; + final int numeric = bits & FIELD_IS_NUMERIC_MASK; if (numeric != 0) { switch(numeric) { - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_INT: - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_FLOAT: + case FIELD_IS_NUMERIC_INT: + case FIELD_IS_NUMERIC_FLOAT: fieldsStream.readInt(); return; - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_LONG: - case Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_DOUBLE: + case FIELD_IS_NUMERIC_LONG: + case FIELD_IS_NUMERIC_DOUBLE: fieldsStream.readLong(); return; default: @@ -242,7 +241,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme } public static void files(SegmentInfo info, Set files) throws IOException { - files.add(IndexFileNames.segmentFileName(info.name, "", Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION)); - files.add(IndexFileNames.segmentFileName(info.name, "", Lucene40StoredFieldsWriter.FIELDS_EXTENSION)); + files.add(IndexFileNames.segmentFileName(info.name, "", FIELDS_INDEX_EXTENSION)); + files.add(IndexFileNames.segmentFileName(info.name, "", FIELDS_EXTENSION)); } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java index c236d9c..fb48443 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java @@ -34,6 +34,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; /** @@ -62,16 +63,12 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter { // currently unused: static final int FIELD_IS_NUMERIC_SHORT = 5 << _NUMERIC_BIT_SHIFT; // currently unused: static final int FIELD_IS_NUMERIC_BYTE = 6 << _NUMERIC_BIT_SHIFT; - // (Happens to be the same as for now) Lucene 3.2: NumericFields are stored in binary format - static final int FORMAT_LUCENE_3_2_NUMERIC_FIELDS = 3; + static final String CODEC_NAME = "Lucene40StoredFields"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + static final long HEADER_LENGTH = CodecUtil.headerLength(CODEC_NAME); - // NOTE: if you introduce a new format, make it 1 higher - // than the current one, and always change this if you - // switch to a new format! - static final int FORMAT_CURRENT = FORMAT_LUCENE_3_2_NUMERIC_FIELDS; - // when removing support for old versions, leave the last supported version here - static final int FORMAT_MINIMUM = FORMAT_LUCENE_3_2_NUMERIC_FIELDS; /** Extension of stored fields file */ public static final String FIELDS_EXTENSION = "fdt"; @@ -83,6 +80,7 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter { private final String segment; private IndexOutput fieldsStream; private IndexOutput indexStream; + private final long headerLength; public Lucene40StoredFieldsWriter(Directory directory, String segment, IOContext context) throws IOException { assert directory != null; @@ -94,9 +92,10 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter { fieldsStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context); indexStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION), context); - fieldsStream.writeInt(FORMAT_CURRENT); - indexStream.writeInt(FORMAT_CURRENT); - + CodecUtil.writeHeader(fieldsStream, CODEC_NAME, VERSION_CURRENT); + CodecUtil.writeHeader(indexStream, CODEC_NAME, VERSION_CURRENT); + headerLength = fieldsStream.getFilePointer(); + assert headerLength == indexStream.getFilePointer(); success = true; } finally { if (!success) { @@ -209,7 +208,7 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter { @Override public void finish(int numDocs) throws IOException { - if (4+((long) numDocs)*8 != indexStream.getFilePointer()) + if (headerLength+((long) numDocs)*8 != indexStream.getFilePointer()) // This is most likely a bug in Sun JRE 1.6.0_04/_05; // we detect that the bug has struck, here, and // throw an exception to prevent the corruption from diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java index c0420d1..2d9449a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsReader.java @@ -33,8 +33,6 @@ import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexFormatTooNewException; -import org.apache.lucene.index.IndexFormatTooOldException; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; @@ -43,8 +41,10 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; + /** * Lucene 4.0 Term Vectors reader. *

@@ -54,22 +54,6 @@ import org.apache.lucene.util.IOUtils; */ public class Lucene40TermVectorsReader extends TermVectorsReader { - // NOTE: if you make a new format, it must be larger than - // the current format - - // Changed strings to UTF8 with length-in-bytes not length-in-chars - static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4; - - // NOTE: always change this if you switch to a new format! - // whenever you add a new format, make it 1 larger (positive version logic)! - static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES; - - // when removing support for old versions, leave the last supported version here - static final int FORMAT_MINIMUM = FORMAT_UTF8_LENGTH_IN_BYTES; - - //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file - static final int FORMAT_SIZE = 4; - static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1; static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2; @@ -82,6 +66,12 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { /** Extension of vectors index file */ static final String VECTORS_INDEX_EXTENSION = "tvx"; + + static final String CODEC_NAME = "Lucene40TermVectors"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + private static final long HEADER_LENGTH = CodecUtil.headerLength(CODEC_NAME); private FieldInfos fieldInfos; @@ -91,17 +81,15 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { private int size; private int numTotalDocs; - private final int format; // used by clone - Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int format) { + Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs) { this.fieldInfos = fieldInfos; this.tvx = tvx; this.tvd = tvd; this.tvf = tvf; this.size = size; this.numTotalDocs = numTotalDocs; - this.format = format; } public Lucene40TermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context) @@ -114,18 +102,21 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { try { String idxName = IndexFileNames.segmentFileName(segment, "", VECTORS_INDEX_EXTENSION); tvx = d.openInput(idxName, context); - format = checkValidFormat(tvx); + final int tvxVersion = CodecUtil.checkHeader(tvx, CODEC_NAME, VERSION_START, VERSION_CURRENT); + String fn = IndexFileNames.segmentFileName(segment, "", VECTORS_DOCUMENTS_EXTENSION); tvd = d.openInput(fn, context); - final int tvdFormat = checkValidFormat(tvd); + final int tvdVersion = CodecUtil.checkHeader(tvd, CODEC_NAME, VERSION_START, VERSION_CURRENT); fn = IndexFileNames.segmentFileName(segment, "", VECTORS_FIELDS_EXTENSION); tvf = d.openInput(fn, context); - final int tvfFormat = checkValidFormat(tvf); + final int tvfVersion = CodecUtil.checkHeader(tvf, CODEC_NAME, VERSION_START, VERSION_CURRENT); + assert HEADER_LENGTH == tvx.getFilePointer(); + assert HEADER_LENGTH == tvd.getFilePointer(); + assert HEADER_LENGTH == tvf.getFilePointer(); + assert tvxVersion == tvdVersion; + assert tvxVersion == tvfVersion; - assert format == tvdFormat; - assert format == tvfFormat; - - numTotalDocs = (int) (tvx.length() >> 4); + numTotalDocs = (int) (tvx.length()-HEADER_LENGTH >> 4); this.size = numTotalDocs; assert size == 0 || numTotalDocs == size; @@ -156,13 +147,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { // Not private to avoid synthetic access$NNN methods void seekTvx(final int docNum) throws IOException { - tvx.seek(docNum * 16L + FORMAT_SIZE); - } - - boolean canReadRawDocs() { - // we can always read raw docs, unless the term vectors - // didn't exist - return format != 0; + tvx.seek(docNum * 16L + HEADER_LENGTH); } /** Retrieve the length (in bytes) of the tvd and tvf @@ -210,16 +195,6 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { } } - private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException - { - int format = in.readInt(); - if (format < FORMAT_MINIMUM) - throw new IndexFormatTooOldException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT); - if (format > FORMAT_CURRENT) - throw new IndexFormatTooNewException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT); - return format; - } - public void close() throws IOException { IOUtils.close(tvx, tvd, tvf); } @@ -708,7 +683,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader { cloneTvf = (IndexInput) tvf.clone(); } - return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, format); + return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs); } public static void files(SegmentInfo info, Set files) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java index 372db23..3d2fdff 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java @@ -35,9 +35,13 @@ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.StringHelper; +import static org.apache.lucene.codecs.lucene40.Lucene40TermVectorsReader.*; + + // TODO: make a new 4.0 TV format that encodes better // - use startOffset (not endOffset) as base for delta on // next startOffset because today for syns or ngrams or @@ -58,6 +62,8 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { private final Directory directory; private final String segment; private IndexOutput tvx = null, tvd = null, tvf = null; + private final long headerLength; + public Lucene40TermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException { this.directory = directory; @@ -66,11 +72,14 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { try { // Open files for TermVector storage tvx = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene40TermVectorsReader.VECTORS_INDEX_EXTENSION), context); - tvx.writeInt(Lucene40TermVectorsReader.FORMAT_CURRENT); + CodecUtil.writeHeader(tvx, CODEC_NAME, VERSION_CURRENT); tvd = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene40TermVectorsReader.VECTORS_DOCUMENTS_EXTENSION), context); - tvd.writeInt(Lucene40TermVectorsReader.FORMAT_CURRENT); + CodecUtil.writeHeader(tvd, CODEC_NAME, VERSION_CURRENT); tvf = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene40TermVectorsReader.VECTORS_FIELDS_EXTENSION), context); - tvf.writeInt(Lucene40TermVectorsReader.FORMAT_CURRENT); + CodecUtil.writeHeader(tvf, CODEC_NAME, VERSION_CURRENT); + headerLength = tvx.getFilePointer(); + assert headerLength == tvd.getFilePointer(); + assert headerLength == tvf.getFilePointer(); success = true; } finally { if (!success) { @@ -252,10 +261,7 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { TermVectorsReader vectorsReader = matchingSegmentReader.getTermVectorsReader(); if (vectorsReader != null && vectorsReader instanceof Lucene40TermVectorsReader) { - // If the TV* files are an older format then they cannot read raw docs: - if (((Lucene40TermVectorsReader)vectorsReader).canReadRawDocs()) { matchingVectorsReader = (Lucene40TermVectorsReader) vectorsReader; - } } } if (reader.liveDocs != null) { @@ -356,7 +362,7 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { @Override public void finish(int numDocs) throws IOException { - if (4+((long) numDocs)*16 != tvx.getFilePointer()) + if (headerLength+((long) numDocs)*16 != tvx.getFilePointer()) // This is most likely a bug in Sun JRE 1.6.0_04/_05; // we detect that the bug has struck, here, and // throw an exception to prevent the corruption from