Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 949434) +++ lucene/CHANGES.txt (working copy) @@ -74,6 +74,10 @@ character. Furthermore, the rest of the automaton package and RegexpQuery use true Unicode codepoint representation. (Robert Muir, Mike McCandless) +* LUCENE-2480: Though not a change in backwards compatibility policy, pre-3.0 + indexes are no longer supported. You should upgrade to 3.x first, then run + optimize(), or reindex. (Shai Erera, Earwin Burrfoot) + Changes in runtime behavior * LUCENE-2421: NativeFSLockFactory does not throw LockReleaseFailedException if Index: lucene/src/java/org/apache/lucene/index/CheckIndex.java =================================================================== --- lucene/src/java/org/apache/lucene/index/CheckIndex.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/CheckIndex.java (working copy) @@ -17,6 +17,7 @@ * limitations under the License. */ +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; @@ -342,33 +343,13 @@ String sFormat = ""; boolean skip = false; - if (format == SegmentInfos.FORMAT) - sFormat = "FORMAT [Lucene Pre-2.1]"; - if (format == SegmentInfos.FORMAT_LOCKLESS) - sFormat = "FORMAT_LOCKLESS [Lucene 2.1]"; - else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE) - sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]"; - else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE) - sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]"; - else { - if (format == SegmentInfos.FORMAT_CHECKSUM) - sFormat = "FORMAT_CHECKSUM [Lucene 2.4]"; - else if (format == SegmentInfos.FORMAT_DEL_COUNT) - sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]"; - else if (format == SegmentInfos.FORMAT_HAS_PROX) - sFormat = "FORMAT_HAS_PROX [Lucene 2.4]"; - else if (format == SegmentInfos.FORMAT_USER_DATA) - sFormat = "FORMAT_USER_DATA [Lucene 2.9]"; - else if (format == SegmentInfos.FORMAT_DIAGNOSTICS) - sFormat = "FORMAT_DIAGNOSTICS [Lucene 2.9]"; - else if (format == SegmentInfos.FORMAT_FLEX_POSTINGS) - sFormat = "FORMAT_FLEX_POSTINGS [Lucene 3.1]"; - else if (format < SegmentInfos.CURRENT_FORMAT) { - sFormat = "int=" + format + " [newer version of Lucene than this tool]"; - skip = true; - } else { - sFormat = format + " [Lucene 1.3 or prior]"; - } + if (format == SegmentInfos.FORMAT_DIAGNOSTICS) + sFormat = "FORMAT_DIAGNOSTICS [Lucene 2.9]"; + else if (format == SegmentInfos.FORMAT_FLEX_POSTINGS) + sFormat = "FORMAT_FLEX_POSTINGS [Lucene 4.0]"; + else if (format < SegmentInfos.CURRENT_FORMAT) { + sFormat = "int=" + format + " [newer version of Lucene than this tool]"; + skip = true; } result.segmentsFileName = segmentsFileName; @@ -656,7 +637,7 @@ int lastDoc = -1; while(true) { final int doc = docs2.nextDoc(); - if (doc == DocsEnum.NO_MORE_DOCS) { + if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } final int freq = docs2.freq(); @@ -698,7 +679,7 @@ if (reader.hasDeletions()) { final DocsEnum docsNoDel = terms.docs(null, docs); int count = 0; - while(docsNoDel.nextDoc() != DocsEnum.NO_MORE_DOCS) { + while(docsNoDel.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { count++; } if (count != docFreq) { Index: lucene/src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (revision 949434) +++ 
lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -639,10 +639,11 @@ consumer.flush(threads, flushState); if (infoStream != null) { - SegmentInfo si = new SegmentInfo(flushState.segmentName, flushState.numDocs, directory, flushState.codec); - si.setHasProx(hasProx()); + SegmentInfo si = new SegmentInfo(flushState.segmentName, + flushState.numDocs, directory, false, -1, flushState.segmentName, + false, hasProx(), flushState.codec); final long newSegmentSize = si.sizeInBytes(); - String message = " ramUsed=" + nf.format(((double) numBytesUsed)/1024./1024.) + " MB" + + String message = " ramUsed=" + nf.format(numBytesUsed/1024./1024.) + " MB" + " newFlushedSize=" + newSegmentSize + " docs/MB=" + nf.format(numDocsInRAM/(newSegmentSize/1024./1024.)) + " new/old=" + nf.format(100.0*newSegmentSize/numBytesUsed) + "%"; Index: lucene/src/java/org/apache/lucene/index/FieldInfos.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldInfos.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/FieldInfos.java (working copy) @@ -36,9 +36,6 @@ */ public final class FieldInfos { - // Used internally (ie not written to *.fnm files) for pre-2.9 files - public static final int FORMAT_PRE = -1; - // First used in 2.9; prior to 2.9 there was no format header public static final int FORMAT_START = -2; @@ -68,29 +65,7 @@ FieldInfos(Directory d, String name) throws IOException { IndexInput input = d.openInput(name); try { - try { - read(input, name); - } catch (IOException ioe) { - if (format == FORMAT_PRE) { - // LUCENE-1623: FORMAT_PRE (before there was a - // format) may be 2.3.2 (pre-utf8) or 2.4.x (utf8) - // encoding; retry with input set to pre-utf8 - input.seek(0); - input.setModifiedUTF8StringsMode(); - byNumber.clear(); - byName.clear(); - try { - read(input, name); - } catch (Throwable t) { - // Ignore any new exception & throw original IOE - throw ioe; - } - } else { - // The IOException cannot be caused by - // LUCENE-1623, so re-throw it - throw ioe; - } - } + read(input, name); } finally { input.close(); } @@ -330,25 +305,13 @@ } private void read(IndexInput input, String fileName) throws IOException { - int firstInt = input.readVInt(); + format = input.readVInt(); - if (firstInt < 0) { - // This is a real format - format = firstInt; - } else { - format = FORMAT_PRE; - } - - if (format != FORMAT_PRE & format != FORMAT_START) { + if (format > FORMAT_START) { throw new CorruptIndexException("unrecognized format " + format + " in file \"" + fileName + "\""); } - int size; - if (format == FORMAT_PRE) { - size = firstInt; - } else { - size = input.readVInt(); //read in the size - } + final int size = input.readVInt(); //read in the size for (int i = 0; i < size; i++) { String name = StringHelper.intern(input.readString()); Index: lucene/src/java/org/apache/lucene/index/FieldsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldsReader.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/FieldsReader.java (working copy) @@ -19,7 +19,6 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.AbstractField; -import org.apache.lucene.document.CompressionTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldSelector; @@ -33,7 +32,6 @@ import java.io.IOException; import java.io.Reader; -import 
java.util.zip.DataFormatException; /** * Class responsible for access to stored document fields. @@ -41,6 +39,8 @@ * It uses <segment>.fdt and <segment>.fdx; files. */ final class FieldsReader implements Cloneable { + private final static int FORMAT_SIZE = 4; + private final FieldInfos fieldInfos; // The main fieldStream, used only for cloning. @@ -56,7 +56,6 @@ private int size; private boolean closed; private final int format; - private final int formatSize; // The docID offset where our docs begin in the index // file. This will be 0 if we have our own private file. @@ -73,17 +72,16 @@ @Override public Object clone() { ensureOpen(); - return new FieldsReader(fieldInfos, numTotalDocs, size, format, formatSize, docStoreOffset, cloneableFieldsStream, cloneableIndexStream); + return new FieldsReader(fieldInfos, numTotalDocs, size, format, docStoreOffset, cloneableFieldsStream, cloneableIndexStream); } // Used only by clone - private FieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int formatSize, - int docStoreOffset, IndexInput cloneableFieldsStream, IndexInput cloneableIndexStream) { + private FieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int docStoreOffset, + IndexInput cloneableFieldsStream, IndexInput cloneableIndexStream) { this.fieldInfos = fieldInfos; this.numTotalDocs = numTotalDocs; this.size = size; this.format = format; - this.formatSize = formatSize; this.docStoreOffset = docStoreOffset; this.cloneableFieldsStream = cloneableFieldsStream; this.cloneableIndexStream = cloneableIndexStream; @@ -95,10 +93,6 @@ this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0); } - FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException { - this(d, segment, fn, readBufferSize, -1, 0); - } - FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException { boolean success = false; isOriginal = true; @@ -108,30 +102,15 @@ cloneableFieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, "", IndexFileNames.FIELDS_EXTENSION), readBufferSize); cloneableIndexStream = d.openInput(IndexFileNames.segmentFileName(segment, "", IndexFileNames.FIELDS_INDEX_EXTENSION), readBufferSize); - // First version of fdx did not include a format - // header, but, the first int will always be 0 in that - // case - int firstInt = cloneableIndexStream.readInt(); - if (firstInt == 0) - format = 0; - else - format = firstInt; + format = cloneableIndexStream.readInt(); if (format > FieldsWriter.FORMAT_CURRENT) throw new CorruptIndexException("Incompatible format version: " + format + " expected " + FieldsWriter.FORMAT_CURRENT + " or lower"); - if (format > FieldsWriter.FORMAT) - formatSize = 4; - else - formatSize = 0; - - if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) - cloneableFieldsStream.setModifiedUTF8StringsMode(); - fieldsStream = (IndexInput) cloneableFieldsStream.clone(); - final long indexSize = cloneableIndexStream.length()-formatSize; + final long indexSize = cloneableIndexStream.length() - FORMAT_SIZE; if (docStoreOffset != -1) { // We read only a slice out of this shared fields file @@ -201,8 +180,8 @@ return size; } - private final void seekIndex(int docID) throws IOException { - indexStream.seek(formatSize + (docID + docStoreOffset) * 8L); + private void seekIndex(int docID) throws IOException { + indexStream.seek(FORMAT_SIZE + (docID + docStoreOffset) * 8L); } boolean canReadRawDocs() { @@ -226,34 +205,31 @@ 
FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name); byte bits = fieldsStream.readByte(); - assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY; + assert bits <= FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY; - boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0; - assert (compressed ? (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS) : true) - : "compressed fields are only allowed in indexes of version <= 2.9"; boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0; boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0; //TODO: Find an alternative approach here if this list continues to grow beyond the //list of 5 or 6 currently here. See Lucene 762 for discussion if (acceptField.equals(FieldSelectorResult.LOAD)) { - addField(doc, fi, binary, compressed, tokenize); + addField(doc, fi, binary, tokenize); } else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){ - addField(doc, fi, binary, compressed, tokenize); + addField(doc, fi, binary, tokenize); break;//Get out of this loop } else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) { - addFieldLazy(doc, fi, binary, compressed, tokenize); + addFieldLazy(doc, fi, binary, tokenize); } else if (acceptField.equals(FieldSelectorResult.SIZE)){ - skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed)); + skipField(addFieldSize(doc, fi, binary)); } else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){ - addFieldSize(doc, fi, binary, compressed); + addFieldSize(doc, fi, binary); break; } else { - skipField(binary, compressed); + skipField(); } } @@ -290,25 +266,20 @@ * Skip the field. We still have to read some of the information about the field, but can skip past the actual content. * This will have the most payoff on large fields. */ - private void skipField(boolean binary, boolean compressed) throws IOException { - skipField(binary, compressed, fieldsStream.readVInt()); + private void skipField() throws IOException { + skipField(fieldsStream.readVInt()); } - private void skipField(boolean binary, boolean compressed, int toRead) throws IOException { - if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) { - fieldsStream.seek(fieldsStream.getFilePointer() + toRead); - } else { - // We need to skip chars. 
This will slow us down, but still better - fieldsStream.skipChars(toRead); - } + private void skipField(int toRead) throws IOException { + fieldsStream.seek(fieldsStream.getFilePointer() + toRead); } - private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException { + private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean tokenize) throws IOException { if (binary) { int toRead = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES)); - doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary, compressed)); + doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary)); //Need to move the pointer ahead by toRead positions fieldsStream.seek(pointer + toRead); } else { @@ -317,75 +288,42 @@ Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector); AbstractField f; - if (compressed) { - int toRead = fieldsStream.readVInt(); - long pointer = fieldsStream.getFilePointer(); - f = new LazyField(fi.name, store, toRead, pointer, binary, compressed); - //skip over the part that we aren't loading - fieldsStream.seek(pointer + toRead); - f.setOmitNorms(fi.omitNorms); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - } else { - int length = fieldsStream.readVInt(); - long pointer = fieldsStream.getFilePointer(); - //Skip ahead of where we are by the length of what is stored - if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) { - fieldsStream.seek(pointer+length); - } else { - fieldsStream.skipChars(length); - } - f = new LazyField(fi.name, store, index, termVector, length, pointer, binary, compressed); - f.setOmitNorms(fi.omitNorms); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - } - + int length = fieldsStream.readVInt(); + long pointer = fieldsStream.getFilePointer(); + //Skip ahead of where we are by the length of what is stored + fieldsStream.seek(pointer+length); + f = new LazyField(fi.name, store, index, termVector, length, pointer, binary); + f.setOmitNorms(fi.omitNorms); + f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); + doc.add(f); } } - private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws CorruptIndexException, IOException { + private void addField(Document doc, FieldInfo fi, boolean binary, boolean tokenize) throws CorruptIndexException, IOException { //we have a binary stored field, and it may be compressed if (binary) { int toRead = fieldsStream.readVInt(); final byte[] b = new byte[toRead]; fieldsStream.readBytes(b, 0, b.length); - if (compressed) { - doc.add(new Field(fi.name, uncompress(b))); - } else { - doc.add(new Field(fi.name, b)); - } + doc.add(new Field(fi.name, b)); } else { Field.Store store = Field.Store.YES; Field.Index index = Field.Index.toIndex(fi.isIndexed, tokenize); Field.TermVector termVector = Field.TermVector.toTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector); AbstractField f; - if (compressed) { - int toRead = fieldsStream.readVInt(); + f = new Field(fi.name, // name + false, + fieldsStream.readString(), // read value + store, + index, + termVector); + f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); + f.setOmitNorms(fi.omitNorms); - final byte[] b = new byte[toRead]; - fieldsStream.readBytes(b, 0, b.length); - f = new 
Field(fi.name, // field name - false, - new String(uncompress(b), "UTF-8"), // uncompress the value and add as string - store, - index, - termVector); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - f.setOmitNorms(fi.omitNorms); - } else { - f = new Field(fi.name, // name - false, - fieldsStream.readString(), // read value - store, - index, - termVector); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); - f.setOmitNorms(fi.omitNorms); - } - doc.add(f); } } @@ -393,8 +331,8 @@ // Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes) // Read just the size -- caller must skip the field content to continue reading fields // Return the size in bytes or chars, depending on field type - private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed) throws IOException { - int size = fieldsStream.readVInt(), bytesize = binary || compressed ? size : 2*size; + private int addFieldSize(Document doc, FieldInfo fi, boolean binary) throws IOException { + int size = fieldsStream.readVInt(), bytesize = binary ? size : 2*size; byte[] sizebytes = new byte[4]; sizebytes[0] = (byte) (bytesize>>>24); sizebytes[1] = (byte) (bytesize>>>16); @@ -411,11 +349,8 @@ private class LazyField extends AbstractField implements Fieldable { private int toRead; private long pointer; - /** @deprecated Only kept for backward-compatbility with <3.0 indexes. Will be removed in 4.0. */ - @Deprecated - private boolean isCompressed; - public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary, boolean isCompressed) { + public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary) { super(name, store, Field.Index.NO, Field.TermVector.NO); this.toRead = toRead; this.pointer = pointer; @@ -423,10 +358,9 @@ if (isBinary) binaryLength = toRead; lazy = true; - this.isCompressed = isCompressed; } - public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary, boolean isCompressed) { + public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary) { super(name, store, index, termVector); this.toRead = toRead; this.pointer = pointer; @@ -434,7 +368,6 @@ if (isBinary) binaryLength = toRead; lazy = true; - this.isCompressed = isCompressed; } private IndexInput getFieldStream() { @@ -474,22 +407,9 @@ IndexInput localFieldsStream = getFieldStream(); try { localFieldsStream.seek(pointer); - if (isCompressed) { - final byte[] b = new byte[toRead]; - localFieldsStream.readBytes(b, 0, b.length); - fieldsData = new String(uncompress(b), "UTF-8"); - } else { - if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) { - byte[] bytes = new byte[toRead]; - localFieldsStream.readBytes(bytes, 0, toRead); - fieldsData = new String(bytes, "UTF-8"); - } else { - //read in chars b/c we already know the length we need to read - char[] chars = new char[toRead]; - localFieldsStream.readChars(chars, 0, toRead); - fieldsData = new String(chars); - } - } + byte[] bytes = new byte[toRead]; + localFieldsStream.readBytes(bytes, 0, toRead); + fieldsData = new String(bytes, "UTF-8"); } catch (IOException e) { throw new FieldReaderException(e); } @@ -498,26 +418,6 @@ } } - public long getPointer() { - ensureOpen(); - return pointer; - } - - public void setPointer(long pointer) { - ensureOpen(); - this.pointer = pointer; - } - 
- public int getToRead() { - ensureOpen(); - return toRead; - } - - public void setToRead(int toRead) { - ensureOpen(); - this.toRead = toRead; - } - @Override public byte[] getBinaryValue(byte[] result) { ensureOpen(); @@ -538,11 +438,7 @@ try { localFieldsStream.seek(pointer); localFieldsStream.readBytes(b, 0, toRead); - if (isCompressed == true) { - fieldsData = uncompress(b); - } else { - fieldsData = b; - } + fieldsData = b; } catch (IOException e) { throw new FieldReaderException(e); } @@ -556,16 +452,4 @@ return null; } } - - private byte[] uncompress(byte[] b) - throws CorruptIndexException { - try { - return CompressionTools.decompress(b); - } catch (DataFormatException e) { - // this will happen if the field is not compressed - CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString()); - newException.initCause(e); - throw newException; - } - } } Index: lucene/src/java/org/apache/lucene/index/FieldsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldsWriter.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/FieldsWriter.java (working copy) @@ -31,16 +31,6 @@ static final byte FIELD_IS_TOKENIZED = 0x1; static final byte FIELD_IS_BINARY = 0x2; - /** @deprecated Kept for backwards-compatibility with <3.0 indexes; will be removed in 4.0 */ - @Deprecated - static final byte FIELD_IS_COMPRESSED = 0x4; - - // Original format - static final int FORMAT = 0; - - // Changed strings to UTF8 - static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = 1; - // Lucene 3.0: Removal of compressed fields static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2; Index: lucene/src/java/org/apache/lucene/index/IndexFileNameFilter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexFileNameFilter.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/IndexFileNameFilter.java (working copy) @@ -62,8 +62,7 @@ return true; } } else { - if (name.equals(IndexFileNames.DELETABLE)) return true; - else if (name.startsWith(IndexFileNames.SEGMENTS)) return true; + if (name.startsWith(IndexFileNames.SEGMENTS)) return true; } return false; } Index: lucene/src/java/org/apache/lucene/index/IndexFileNames.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexFileNames.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/IndexFileNames.java (working copy) @@ -46,10 +46,6 @@ /** Name of the generation reference file name */ public static final String SEGMENTS_GEN = "segments." + GEN_EXTENSION; - /** Name of the index deletable file (only used in - * pre-lockless indices) */ - public static final String DELETABLE = "deletable"; - /** Extension of norms file */ public static final String NORMS_EXTENSION = "nrm"; @@ -80,19 +76,15 @@ /** Extension of field infos */ public static final String FIELD_INFOS_EXTENSION = "fnm"; - /** Extension of plain norms */ - public static final String PLAIN_NORMS_EXTENSION = "f"; - /** Extension of separate norms */ public static final String SEPARATE_NORMS_EXTENSION = "s"; /** * This array contains all filename extensions used by - * Lucene's index files, with two exceptions, namely the - * extension made up from .f + a number and - * from .s + a number. Also note that - * Lucene's segments_N files do not have any - * filename extension. 
+ * Lucene's index files, with one exception, namely the + * extension made up from .s + a number. + * Also note that Lucene's segments_N files + * do not have any filename extension. */ public static final String INDEX_EXTENSIONS[] = new String[] { COMPOUND_FILE_EXTENSION, @@ -146,7 +138,7 @@ * @param ext extension of the filename * @param gen generation */ - public static final String fileNameFromGeneration(String base, String ext, long gen) { + public static String fileNameFromGeneration(String base, String ext, long gen) { if (gen == SegmentInfo.NO) { return null; } else if (gen == SegmentInfo.WITHOUT_GEN) { @@ -168,7 +160,7 @@ * Returns true if the provided filename is one of the doc store files (ends * with an extension in {@link #STORE_INDEX_EXTENSIONS}). */ - public static final boolean isDocStoreFile(String fileName) { + public static boolean isDocStoreFile(String fileName) { if (fileName.endsWith(COMPOUND_FILE_STORE_EXTENSION)) return true; for (String ext : STORE_INDEX_EXTENSIONS) { @@ -193,7 +185,7 @@ * otherwise some structures may fail to handle them properly (such as if they * are added to compound files). */ - public static final String segmentFileName(String segmentName, String name, String ext) { + public static String segmentFileName(String segmentName, String name, String ext) { if (ext.length() > 0 || name.length() > 0) { assert !ext.startsWith("."); StringBuilder sb = new StringBuilder(segmentName.length() + 2 + name.length() + ext.length()); @@ -214,7 +206,7 @@ * Returns true if the given filename ends with the given extension. One * should provide a pure extension, withouth '.'. */ - public static final boolean matchesExtension(String filename, String ext) { + public static boolean matchesExtension(String filename, String ext) { // It doesn't make a difference whether we allocate a StringBuilder ourself // or not, since there's only 1 '+' operator. return filename.endsWith("." + ext); @@ -229,7 +221,7 @@ * @return the filename with the segment name removed, or the given filename * if it does not contain a '.' and '_'. */ - public static final String stripSegmentName(String filename) { + public static String stripSegmentName(String filename) { // If it is a .del file, there's an '_' after the first character int idx = filename.indexOf('_', 1); if (idx == -1) { Index: lucene/src/java/org/apache/lucene/index/IndexWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexWriter.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/IndexWriter.java (working copy) @@ -2969,8 +2969,8 @@ SegmentInfo info = null; synchronized(this) { - info = new SegmentInfo(mergedName, docCount, directory, false, true, - -1, null, false, merger.hasProx(), merger.getCodec()); + info = new SegmentInfo(mergedName, docCount, directory, false, -1, + null, false, merger.hasProx(), merger.getCodec()); setDiagnostics(info, "addIndexes(IndexReader...)"); segmentInfos.add(info); checkpoint(); @@ -3335,10 +3335,9 @@ // successfully. newSegment = new SegmentInfo(segment, flushedDocCount, - directory, false, true, - docStoreOffset, docStoreSegment, - docStoreIsCompoundFile, - docWriter.hasProx(), + directory, false, docStoreOffset, + docStoreSegment, docStoreIsCompoundFile, + docWriter.hasProx(), docWriter.getCodec()); setDiagnostics(newSegment, "flush"); @@ -3853,8 +3852,7 @@ // ConcurrentMergePolicy we keep deterministic segment // names. 
merge.info = new SegmentInfo(newSegmentName(), 0, - directory, false, true, - docStoreOffset, + directory, false, docStoreOffset, docStoreSegment, docStoreIsCompoundFile, false, Index: lucene/src/java/org/apache/lucene/index/SegmentInfo.java =================================================================== --- lucene/src/java/org/apache/lucene/index/SegmentInfo.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/SegmentInfo.java (working copy) @@ -20,17 +20,16 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.BitVector; import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.CodecProvider; import java.io.IOException; +import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Set; import java.util.HashSet; import java.util.HashMap; import java.util.ArrayList; -import java.util.Collections; /** * Information about a segment such as it's name, directory, and files related @@ -42,41 +41,30 @@ static final int NO = -1; // e.g. no norms; no deletes; static final int YES = 1; // e.g. have norms; have deletes; - static final int CHECK_DIR = 0; // e.g. must check dir to see if there are norms/deletions static final int WITHOUT_GEN = 0; // a file name that has no GEN in it. public String name; // unique name in dir public int docCount; // number of docs in seg public Directory dir; // where segment resides - private boolean preLockless; // true if this is a segments file written before - // lock-less commits (2.1) + /* + * Current generation of del file: + * - NO if there are no deletes + * - YES or higher if there are deletes at generation N + */ + private long delGen; + + /* + * Current generation of each field's norm file. If this array is null, + * means no separate norms. If this array is not null, its values mean: + * - NO says this field has no separate norms + * >= YES says this field has separate norms with the specified generation + */ + private long[] normGen; - private long delGen; // current generation of del file; NO if there - // are no deletes; CHECK_DIR if it's a pre-2.1 segment - // (and we must check filesystem); YES or higher if - // there are deletes at generation N - - private long[] normGen; // current generation of each field's norm file. - // If this array is null, for lockLess this means no - // separate norms. For preLockLess this means we must - // check filesystem. If this array is not null, its - // values mean: NO says this field has no separate - // norms; CHECK_DIR says it is a preLockLess segment and - // filesystem must be checked; >= YES says this field - // has separate norms with the specified generation + private boolean isCompoundFile; - private byte isCompoundFile; // NO if it is not; YES if it is; CHECK_DIR if it's - // pre-2.1 (ie, must check file system to see - // if .cfs and .nrm exist) - - private boolean hasSingleNormFile; // true if this segment maintains norms in a single file; - // false otherwise - // this is currently false for segments populated by DocumentWriter - // and true for newly created merged segments (both - // compound and non compound). 
- - private List files; // cached list of files that this segment uses + private List files; // cached list of files that this segment uses // in the Directory long sizeInBytes = -1; // total byte size of all of our files (computed on demand) @@ -87,8 +75,7 @@ // other segments private boolean docStoreIsCompoundFile; // whether doc store files are stored in compound file (*.cfx) - private int delCount; // How many deleted docs in this segment, or -1 if not yet known - // (if it's an older index) + private int delCount; // How many deleted docs in this segment private boolean hasProx; // True if this segment has any fields with omitTermFreqAndPositions==false @@ -97,29 +84,13 @@ private Map diagnostics; - public SegmentInfo(String name, int docCount, Directory dir, Codec codec) { + public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, int docStoreOffset, + String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx, Codec codec) { this.name = name; this.docCount = docCount; this.dir = dir; delGen = NO; - isCompoundFile = CHECK_DIR; - preLockless = true; - hasSingleNormFile = false; - docStoreOffset = -1; - docStoreSegment = name; - docStoreIsCompoundFile = false; - delCount = 0; - hasProx = true; - this.codec = codec; - } - - public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile, - int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx, - Codec codec) { - this(name, docCount, dir, codec); - this.isCompoundFile = (byte) (isCompoundFile ? YES : NO); - this.hasSingleNormFile = hasSingleNormFile; - preLockless = false; + this.isCompoundFile = isCompoundFile; this.docStoreOffset = docStoreOffset; this.docStoreSegment = docStoreSegment; this.docStoreIsCompoundFile = docStoreIsCompoundFile; @@ -137,7 +108,6 @@ name = src.name; docCount = src.docCount; dir = src.dir; - preLockless = src.preLockless; delGen = src.delGen; docStoreOffset = src.docStoreOffset; docStoreIsCompoundFile = src.docStoreIsCompoundFile; @@ -148,7 +118,6 @@ System.arraycopy(src.normGen, 0, normGen, 0, src.normGen.length); } isCompoundFile = src.isCompoundFile; - hasSingleNormFile = src.hasSingleNormFile; delCount = src.delCount; codec = src.codec; } @@ -174,98 +143,44 @@ name = input.readString(); docCount = input.readInt(); final String codecName; - if (format <= SegmentInfos.FORMAT_LOCKLESS) { - delGen = input.readLong(); - if (format <= SegmentInfos.FORMAT_SHARED_DOC_STORE) { - docStoreOffset = input.readInt(); - if (docStoreOffset != -1) { - docStoreSegment = input.readString(); - docStoreIsCompoundFile = (1 == input.readByte()); - } else { - docStoreSegment = name; - docStoreIsCompoundFile = false; - } - } else { - docStoreOffset = -1; - docStoreSegment = name; - docStoreIsCompoundFile = false; + delGen = input.readLong(); + docStoreOffset = input.readInt(); + if (docStoreOffset != -1) { + docStoreSegment = input.readString(); + docStoreIsCompoundFile = input.readByte() == YES; + } else { + docStoreSegment = name; + docStoreIsCompoundFile = false; + } + // single norms file + assert 1 == input.readByte(); + int numNormGen = input.readInt(); + if (numNormGen == NO) { + normGen = null; + } else { + normGen = new long[numNormGen]; + for(int j=0;jemptyMap(); - } - } else { - delGen = CHECK_DIR; - normGen = null; - isCompoundFile = CHECK_DIR; - preLockless = true; - hasSingleNormFile = false; - docStoreOffset = -1; - docStoreIsCompoundFile = false; - docStoreSegment = null; - delCount = -1; - 
hasProx = true; + hasProx = input.readByte() == YES; + + // System.out.println(Thread.currentThread().getName() + ": si.read hasProx=" + hasProx + " seg=" + name); + + if (format <= SegmentInfos.FORMAT_FLEX_POSTINGS) + codecName = input.readString(); + else codecName = "PreFlex"; - diagnostics = Collections.emptyMap(); - } + + diagnostics = input.readStringStringMap(); codec = codecs.lookup(codecName); } - void setNumFields(int numFields) { - if (normGen == null) { - // normGen is null if we loaded a pre-2.1 segment - // file, or, if this segments file hasn't had any - // norms set against it yet: - normGen = new long[numFields]; - - if (preLockless) { - // Do nothing: thus leaving normGen[k]==CHECK_DIR (==0), so that later we know - // we have to check filesystem for norm files, because this is prelockless. - - } else { - // This is a FORMAT_LOCKLESS segment, which means - // there are no separate norms: - for(int i=0;i= YES: this means this segment has deletions // - // delGen == CHECK_DIR: this means this segment was written by - // pre-LOCKLESS code which means we must check - // directory to see if .del file exists - // - // delGen >= YES: this means this segment was written by - // the LOCKLESS code and for certain has - // deletions - // - if (delGen == NO) { - return false; - } else if (delGen >= YES) { - return true; - } else { - return dir.fileExists(getDelFileName()); - } + return delGen != NO; } void advanceDelGen() { - // delGen 0 is reserved for pre-LOCKLESS format if (delGen == NO) { delGen = YES; } else { @@ -325,14 +223,12 @@ } @Override - public Object clone () { - SegmentInfo si = new SegmentInfo(name, docCount, dir, codec); + public Object clone() { + SegmentInfo si = new SegmentInfo(name, docCount, dir, isCompoundFile, docStoreOffset, docStoreSegment, docStoreIsCompoundFile, hasProx, codec); si.isCompoundFile = isCompoundFile; si.delGen = delGen; si.delCount = delCount; si.hasProx = hasProx; - si.preLockless = preLockless; - si.hasSingleNormFile = hasSingleNormFile; si.diagnostics = new HashMap(diagnostics); if (normGen != null) { si.normGen = normGen.clone(); @@ -350,7 +246,6 @@ // against this segment return null; } else { - // If delGen is CHECK_DIR, it's the pre-lockless-commit file format return IndexFileNames.fileNameFromGeneration(name, IndexFileNames.DELETES_EXTENSION, delGen); } } @@ -360,69 +255,34 @@ * * @param fieldNumber the field index to check */ - public boolean hasSeparateNorms(int fieldNumber) - throws IOException { - if ((normGen == null && preLockless) || (normGen != null && normGen[fieldNumber] == CHECK_DIR)) { - // Must fallback to directory file exists check: - String fileName = name + ".s" + fieldNumber; - return dir.fileExists(fileName); - } else if (normGen == null || normGen[fieldNumber] == NO) { - return false; - } else { - return true; - } + public boolean hasSeparateNorms(int fieldNumber) { + return normGen != null && normGen[fieldNumber] != NO; } /** * Returns true if any fields in this segment have separate norms. */ - public boolean hasSeparateNorms() - throws IOException { + public boolean hasSeparateNorms() { if (normGen == null) { - if (!preLockless) { - // This means we were created w/ LOCKLESS code and no - // norms are written yet: - return false; - } else { - // This means this segment was saved with pre-LOCKLESS - // code. 
So we must fallback to the original - // directory list check: - String[] result = dir.listAll(); - if (result == null) - throw new IOException("cannot read directory " + dir + ": listAll() returned null"); - - final String pattern = name + ".s\\d+"; - for(int i = 0; i < result.length; i++){ - String fileName = result[i]; - if (fileName.matches(pattern)) { - return true; - } - } - return false; - } + return false; } else { - // This means this segment was saved with LOCKLESS - // code so we first check whether any normGen's are >= 1 - // (meaning they definitely have separate norms): - for(int i=0;i= YES) { + for (long fieldNormGen : normGen) { + if (fieldNormGen >= YES) { return true; } } - // Next we look for any == 0. These cases were - // pre-LOCKLESS and must be checked in directory: - for(int i=0;i(fileSet); @@ -727,16 +517,7 @@ StringBuilder s = new StringBuilder(); s.append(name).append(':'); - char cfs; - try { - if (getUseCompoundFile()) { - cfs = 'c'; - } else { - cfs = 'C'; - } - } catch (IOException ioe) { - cfs = '?'; - } + char cfs = getUseCompoundFile() ? 'c' : 'C'; s.append(cfs); if (this.dir != dir) { @@ -744,22 +525,9 @@ } s.append(docCount); - int delCount; - try { - delCount = getDelCount(); - } catch (IOException ioe) { - delCount = -1; - } - if (delCount != -1) { - delCount += pendingDelCount; - } + int delCount = getDelCount() + pendingDelCount; if (delCount != 0) { - s.append('/'); - if (delCount == -1) { - s.append('?'); - } else { - s.append(delCount); - } + s.append('/').append(delCount); } if (docStoreOffset != -1) { Index: lucene/src/java/org/apache/lucene/index/SegmentInfos.java =================================================================== --- lucene/src/java/org/apache/lucene/index/SegmentInfos.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/SegmentInfos.java (working copy) @@ -45,47 +45,17 @@ */ public final class SegmentInfos extends Vector { - /** The file format version, a negative number. */ - /* Works since counter, the old 1st entry, is always >= 0 */ - public static final int FORMAT = -1; - - /** This format adds details used for lockless commits. It differs - * slightly from the previous format in that file names - * are never re-used (write once). Instead, each file is - * written to the next generation. For example, - * segments_1, segments_2, etc. This allows us to not use - * a commit lock. See file - * formats for details. + /* + * The file format version, a negative number. + * + * NOTE: future format numbers must always be one smaller + * than the latest. With time, support for old formats will + * be removed, however the numbers should continue to decrease. */ - public static final int FORMAT_LOCKLESS = -2; - /** This format adds a "hasSingleNormFile" flag into each segment info. - * See LUCENE-756 - * for details. - */ - public static final int FORMAT_SINGLE_NORM_FILE = -3; - - /** This format allows multiple segments to share a single - * vectors and stored fields file. */ - public static final int FORMAT_SHARED_DOC_STORE = -4; - - /** This format adds a checksum at the end of the file to - * ensure all bytes were successfully written. */ - public static final int FORMAT_CHECKSUM = -5; - - /** This format adds the deletion count for each segment. - * This way IndexWriter can efficiently report numDocs(). 
*/ - public static final int FORMAT_DEL_COUNT = -6; - - /** This format adds the boolean hasProx to record if any - * fields in the segment store prox information (ie, have - * omitTermFreqAndPositions==false) */ - public static final int FORMAT_HAS_PROX = -7; - - /** This format adds optional commit userData (String) storage. */ - public static final int FORMAT_USER_DATA = -8; - + /** Used for the segments.gen file only! */ + public static final int FORMAT_SEGMENTS_GEN_CURRENT = -2; + /** This format adds optional per-segment String * diagnostics storage, and switches userData to Map */ public static final int FORMAT_DIAGNOSTICS = -9; @@ -98,6 +68,7 @@ static final int CURRENT_FORMAT = FORMAT_FLEX_POSTINGS; public int counter = 0; // used to name new segments + /** * counts how often the index has been changed by adding or deleting docs. * starting with the current time in milliseconds forces to create unique version numbers. @@ -132,8 +103,7 @@ return -1; } long max = -1; - for (int i = 0; i < files.length; i++) { - String file = files[i]; + for (String file : files) { if (file.startsWith(IndexFileNames.SEGMENTS) && !file.equals(IndexFileNames.SEGMENTS_GEN)) { long gen = generationFromSegmentsFileName(file); if (gen > max) { @@ -248,46 +218,25 @@ try { int format = input.readInt(); - if(format < 0){ // file contains explicit format info - // check that it is a format we can understand - if (format < CURRENT_FORMAT) - throw new CorruptIndexException("Unknown format version: " + format); - version = input.readLong(); // read version - counter = input.readInt(); // read counter - } - else{ // file is in old format without explicit format info - counter = format; - } + + // check that it is a format we can understand + if (format < CURRENT_FORMAT) + throw new CorruptIndexException("Unknown (newer than us?) 
format version: " + format); + + version = input.readLong(); // read version + counter = input.readInt(); // read counter for (int i = input.readInt(); i > 0; i--) { // read segmentInfos add(new SegmentInfo(directory, format, input, codecs)); } - if(format >= 0){ // in old format the version number may be at the end of the file - if (input.getFilePointer() >= input.length()) - version = System.currentTimeMillis(); // old file format without version number - else - version = input.readLong(); // read version - } + userData = input.readStringStringMap(); - if (format <= FORMAT_USER_DATA) { - if (format <= FORMAT_DIAGNOSTICS) { - userData = input.readStringStringMap(); - } else if (0 != input.readByte()) { - userData = Collections.singletonMap("userData", input.readString()); - } else { - userData = Collections.emptyMap(); - } - } else { - userData = Collections.emptyMap(); - } + final long checksumNow = input.getChecksum(); + final long checksumThen = input.readLong(); + if (checksumNow != checksumThen) + throw new CorruptIndexException("checksum mismatch in segments file"); - if (format <= FORMAT_CHECKSUM) { - final long checksumNow = input.getChecksum(); - final long checksumThen = input.readLong(); - if (checksumNow != checksumThen) - throw new CorruptIndexException("checksum mismatch in segments file"); - } success = true; } finally { @@ -327,7 +276,7 @@ // before finishCommit is called ChecksumIndexOutput pendingSegnOutput; - private final void write(Directory directory) throws IOException { + private void write(Directory directory) throws IOException { String segmentFileName = getNextSegmentFileName(); @@ -612,7 +561,7 @@ if (genInput != null) { try { int version = genInput.readInt(); - if (version == FORMAT_LOCKLESS) { + if (version == FORMAT_SEGMENTS_GEN_CURRENT) { long gen0 = genInput.readLong(); long gen1 = genInput.readLong(); if (infoStream != null) { @@ -642,10 +591,7 @@ } // Pick the larger of the two gen's: - if (genA > genB) - gen = genA; - else - gen = genB; + gen = Math.max(genA, genB); if (gen == -1) { // Neither approach found a generation @@ -858,9 +804,7 @@ // logic in SegmentInfos to kick in and load the last // good (previous) segments_N-1 file. 
- final String fileName = IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, - "", - generation); + final String fileName = IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", generation); success = false; try { dir.sync(Collections.singleton(fileName)); @@ -880,7 +824,7 @@ try { IndexOutput genOutput = dir.createOutput(IndexFileNames.SEGMENTS_GEN); try { - genOutput.writeInt(FORMAT_LOCKLESS); + genOutput.writeInt(FORMAT_SEGMENTS_GEN_CURRENT); genOutput.writeLong(generation); genOutput.writeLong(generation); } finally { Index: lucene/src/java/org/apache/lucene/index/SegmentReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/SegmentReader.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/SegmentReader.java (working copy) @@ -728,7 +728,7 @@ } if (normsDirty) { // re-write norms - si.setNumFields(core.fieldInfos.size()); + si.initNormGen(core.fieldInfos.size()); for (final Norm norm : norms.values()) { if (norm.dirty) { norm.reWrite(si); Index: lucene/src/java/org/apache/lucene/index/TermVectorsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/TermVectorsReader.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/TermVectorsReader.java (working copy) @@ -29,11 +29,7 @@ // NOTE: if you make a new format, it must be larger than // the current format - static final int FORMAT_VERSION = 2; - // Changes to speed up bulk merging of term vectors: - static final int FORMAT_VERSION2 = 3; - // Changed strings to UTF8 with length-in-bytes not length-in-chars static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4; @@ -87,13 +83,8 @@ assert format == tvdFormat; assert format == tvfFormat; - if (format >= FORMAT_VERSION2) { - assert (tvx.length()-FORMAT_SIZE) % 16 == 0; - numTotalDocs = (int) (tvx.length() >> 4); - } else { - assert (tvx.length()-FORMAT_SIZE) % 8 == 0; - numTotalDocs = (int) (tvx.length() >> 3); - } + assert (tvx.length()-FORMAT_SIZE) % 16 == 0; + numTotalDocs = (int) (tvx.length() >> 4); if (-1 == docStoreOffset) { this.docStoreOffset = 0; @@ -133,11 +124,8 @@ return tvf; } - final private void seekTvx(final int docNum) throws IOException { - if (format < FORMAT_VERSION2) - tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE); - else - tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE); + private void seekTvx(final int docNum) throws IOException { + tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE); } boolean canReadRawDocs() { @@ -160,7 +148,7 @@ // SegmentMerger calls canReadRawDocs() first and should // not call us if that returns false. 
- if (format < FORMAT_VERSION2) + if (format < FORMAT_UTF8_LENGTH_IN_BYTES) throw new IllegalStateException("cannot read raw docs with older term vector formats"); seekTvx(startDocID); @@ -242,11 +230,7 @@ int number = 0; int found = -1; for (int i = 0; i < fieldCount; i++) { - if (format >= FORMAT_VERSION) - number = tvd.readVInt(); - else - number += tvd.readVInt(); - + number = tvd.readVInt(); if (number == fieldNumber) found = i; } @@ -255,11 +239,7 @@ // document if (found != -1) { // Compute position in the tvf file - long position; - if (format >= FORMAT_VERSION2) - position = tvx.readLong(); - else - position = tvd.readVLong(); + long position = tvx.readLong(); for (int i = 1; i <= found; i++) position += tvd.readVLong(); @@ -292,16 +272,12 @@ // Reads the String[] fields; you have to pre-seek tvd to // the right point - final private String[] readFields(int fieldCount) throws IOException { + private String[] readFields(int fieldCount) throws IOException { int number = 0; String[] fields = new String[fieldCount]; for (int i = 0; i < fieldCount; i++) { - if (format >= FORMAT_VERSION) - number = tvd.readVInt(); - else - number += tvd.readVInt(); - + number = tvd.readVInt(); fields[i] = fieldInfos.fieldName(number); } @@ -310,13 +286,9 @@ // Reads the long[] offsets into TVF; you have to pre-seek // tvx/tvd to the right point - final private long[] readTvfPointers(int fieldCount) throws IOException { + private long[] readTvfPointers(int fieldCount) throws IOException { // Compute position in the tvf file - long position; - if (format >= FORMAT_VERSION2) - position = tvx.readLong(); - else - position = tvd.readVLong(); + long position = tvx.readLong(); long[] tvfPointers = new long[fieldCount]; tvfPointers[0] = position; @@ -425,32 +397,18 @@ boolean storePositions; boolean storeOffsets; - if (format >= FORMAT_VERSION){ - byte bits = tvf.readByte(); - storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; - storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; - } - else{ - tvf.readVInt(); - storePositions = false; - storeOffsets = false; - } + byte bits = tvf.readByte(); + storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; + storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; + mapper.setExpectations(field, numTerms, storeOffsets, storePositions); int start = 0; int deltaLength = 0; int totalLength = 0; byte[] byteBuffer; - char[] charBuffer; - final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES; - // init the buffers - if (preUTF8) { - charBuffer = new char[10]; - byteBuffer = null; - } else { - charBuffer = null; - byteBuffer = new byte[20]; - } + // init the buffer + byteBuffer = new byte[20]; for (int i = 0; i < numTerms; i++) { start = tvf.readVInt(); @@ -459,26 +417,17 @@ final String term; - if (preUTF8) { - // Term stored as java chars - if (charBuffer.length < totalLength) { - charBuffer = ArrayUtil.grow(charBuffer, totalLength); - } - tvf.readChars(charBuffer, start, deltaLength); - term = new String(charBuffer, 0, totalLength); - } else { - // Term stored as utf8 bytes - if (byteBuffer.length < totalLength) { - byteBuffer = ArrayUtil.grow(byteBuffer, totalLength); - } - tvf.readBytes(byteBuffer, start, deltaLength); - term = new String(byteBuffer, 0, totalLength, "UTF-8"); + // Term stored as utf8 bytes + if (byteBuffer.length < totalLength) { + byteBuffer = ArrayUtil.grow(byteBuffer, totalLength); } + tvf.readBytes(byteBuffer, start, deltaLength); + term = new String(byteBuffer, 0, totalLength, "UTF-8"); int freq = tvf.readVInt(); 
int [] positions = null; if (storePositions) { //read in the positions //does the mapper even care about positions? - if (mapper.isIgnoringPositions() == false) { + if (!mapper.isIgnoringPositions()) { positions = new int[freq]; int prevPosition = 0; for (int j = 0; j < freq; j++) @@ -498,7 +447,7 @@ TermVectorOffsetInfo[] offsets = null; if (storeOffsets) { //does the mapper even care about offsets? - if (mapper.isIgnoringOffsets() == false) { + if (!mapper.isIgnoringOffsets()) { offsets = new TermVectorOffsetInfo[freq]; int prevOffset = 0; for (int j = 0; j < freq; j++) { Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (working copy) @@ -36,9 +36,6 @@ long size; long position = -1; - /** The file format version, a negative number. */ - public static final int FORMAT = -3; - // Changed strings to true utf8 with length-in-bytes not // length-in-chars public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; @@ -97,19 +94,11 @@ } else { indexInterval = input.readInt(); skipInterval = input.readInt(); - if (format <= FORMAT) { - // this new format introduces multi-level skipping - maxSkipLevels = input.readInt(); - } + maxSkipLevels = input.readInt(); } assert indexInterval > 0: "indexInterval=" + indexInterval + " is negative; must be > 0"; assert skipInterval > 0: "skipInterval=" + skipInterval + " is negative; must be > 0"; } - if (format > FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) { - termBuffer.setPreUTF8Strings(); - scanBuffer.setPreUTF8Strings(); - prevBuffer.setPreUTF8Strings(); - } } @Override Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (revision 949434) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (working copy) @@ -29,7 +29,6 @@ private String field; private Term term; // cached - private boolean preUTF8Strings; // true if strings are stored in modified UTF8 encoding (LUCENE-510) private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes) private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result(); @@ -42,8 +41,8 @@ return field.compareTo(other.field); } - private static final int compareChars(char[] chars1, int len1, - char[] chars2, int len2) { + private static int compareChars(char[] chars1, int len1, + char[] chars2, int len2) { final int end = len1 < len2 ? len1:len2; for (int k = 0; k < end; k++) { char c1 = chars1[k]; @@ -55,41 +54,28 @@ return len1 - len2; } - /** Call this if the IndexInput passed to {@link #read} - * stores terms in the "modified UTF8" (pre LUCENE-510) - * format. 
*/ - void setPreUTF8Strings() { - preUTF8Strings = true; - } - public final void read(IndexInput input, FieldInfos fieldInfos) throws IOException { this.term = null; // invalidate cache int start = input.readVInt(); int length = input.readVInt(); int totalLength = start + length; - if (preUTF8Strings) { - text.setLength(totalLength); - input.readChars(text.result, start, length); + if (dirty) { + // Fully convert all bytes since bytes is dirty + UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); + if (bytes.bytes.length < totalLength) + bytes.bytes = new byte[totalLength]; + bytes.length = totalLength; + input.readBytes(bytes.bytes, start, length); + UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text); + dirty = false; } else { - - if (dirty) { - // Fully convert all bytes since bytes is dirty - UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); - if (bytes.bytes.length < totalLength) - bytes.bytes = new byte[totalLength]; - bytes.length = totalLength; - input.readBytes(bytes.bytes, start, length); - UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text); - dirty = false; - } else { - // Incrementally convert only the UTF8 bytes that are new: - if (bytes.bytes.length < totalLength) - bytes.bytes = ArrayUtil.grow(bytes.bytes, totalLength); - bytes.length = totalLength; - input.readBytes(bytes.bytes, start, length); - UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text); - } + // Incrementally convert only the UTF8 bytes that are new: + if (bytes.bytes.length < totalLength) + bytes.bytes = ArrayUtil.grow(bytes.bytes, totalLength); + bytes.length = totalLength; + input.readBytes(bytes.bytes, start, length); + UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text); } this.field = fieldInfos.fieldName(input.readVInt()); } Index: lucene/src/java/org/apache/lucene/store/DataInput.java =================================================================== --- lucene/src/java/org/apache/lucene/store/DataInput.java (revision 949434) +++ lucene/src/java/org/apache/lucene/store/DataInput.java (working copy) @@ -29,8 +29,6 @@ * data types. */ public abstract class DataInput implements Cloneable { - private boolean preUTF8Strings; // true if we are reading old (modified UTF8) string format - /** Reads and returns a single byte. * @see DataOutput#writeByte(byte) */ @@ -114,89 +112,16 @@ return i; } - /** Call this if readString should read characters stored - * in the old modified UTF8 format (length in java chars - * and java's modified UTF8 encoding). This is used for - * indices written pre-2.4 See LUCENE-510 for details. */ - public void setModifiedUTF8StringsMode() { - preUTF8Strings = true; - } - /** Reads a string. * @see DataOutput#writeString(String) */ public String readString() throws IOException { - if (preUTF8Strings) - return readModifiedUTF8String(); int length = readVInt(); final byte[] bytes = new byte[length]; readBytes(bytes, 0, length); return new String(bytes, 0, length, "UTF-8"); } - private String readModifiedUTF8String() throws IOException { - int length = readVInt(); - final char[] chars = new char[length]; - readChars(chars, 0, length); - return new String(chars, 0, length); - } - - /** Reads Lucene's old "modified UTF-8" encoded - * characters into an array. 
- * @param buffer the array to read characters into - * @param start the offset in the array to start storing characters - * @param length the number of characters to read - * @see DataOutput#writeChars(String,int,int) - * @deprecated -- please use readString or readBytes - * instead, and construct the string - * from those utf8 bytes - */ - @Deprecated - public void readChars(char[] buffer, int start, int length) - throws IOException { - final int end = start + length; - for (int i = start; i < end; i++) { - byte b = readByte(); - if ((b & 0x80) == 0) - buffer[i] = (char)(b & 0x7F); - else if ((b & 0xE0) != 0xE0) { - buffer[i] = (char)(((b & 0x1F) << 6) - | (readByte() & 0x3F)); - } else { - buffer[i] = (char)(((b & 0x0F) << 12) - | ((readByte() & 0x3F) << 6) - | (readByte() & 0x3F)); - } - } - } - - /** - * Expert - * - * Similar to {@link #readChars(char[], int, int)} but does not do any conversion operations on the bytes it is reading in. It still - * has to invoke {@link #readByte()} just as {@link #readChars(char[], int, int)} does, but it does not need a buffer to store anything - * and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine - * how many more bytes to read - * @param length The number of chars to read - * @deprecated this method operates on old "modified utf8" encoded - * strings - */ - @Deprecated - public void skipChars(int length) throws IOException{ - for (int i = 0; i < length; i++) { - byte b = readByte(); - if ((b & 0x80) == 0){ - //do nothing, we only need one byte - } else if ((b & 0xE0) != 0xE0) { - readByte();//read an additional byte - } else { - //read two additional bytes. - readByte(); - readByte(); - } - } - } - /** Returns a clone of this stream. * *

<p>Clones of a stream access the same data, and are positioned at the same Index: lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (revision 949434) +++ lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (working copy) @@ -22,14 +22,11 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.ByteArrayInputStream; -import java.io.DataInputStream; import java.io.OutputStream; import java.util.Arrays; import java.util.Random; import java.util.Enumeration; import java.util.List; -import java.util.ArrayList; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; @@ -37,8 +34,6 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; -import org.apache.lucene.document.FieldSelector; -import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.document.NumericField; import org.apache.lucene.search.DocIdSetIterator; @@ -50,13 +45,12 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; import org.apache.lucene.util.BytesRef; /* - Verify we can read the pre-2.1 file format, do searches + Verify we can read the pre-4.0 file format, do searches against it, and add documents to it. */ @@ -128,94 +122,13 @@ } */ - final String[] oldNames = {"19.cfs", - "19.nocfs", - "20.cfs", - "20.nocfs", - "21.cfs", - "21.nocfs", - "22.cfs", - "22.nocfs", - "23.cfs", - "23.nocfs", - "24.cfs", - "24.nocfs", - "29.cfs", - "29.nocfs", - "30.cfs", + final String[] oldNames = {"30.cfs", "30.nocfs", "31.cfs", "31.nocfs", }; - private void assertCompressedFields29(Directory dir, boolean shouldStillBeCompressed) throws IOException { - int count = 0; - final int TEXT_PLAIN_LENGTH = TEXT_TO_COMPRESS.length() * 2; - // FieldSelectorResult.SIZE returns 2*number_of_chars for String fields: - final int BINARY_PLAIN_LENGTH = BINARY_TO_COMPRESS.length; - - IndexReader reader = IndexReader.open(dir, true); - try { - // look into sub readers and check if raw merge is on/off - List<IndexReader> readers = new ArrayList<IndexReader>(); - ReaderUtil.gatherSubReaders(readers, reader); - for (IndexReader ir : readers) { - final FieldsReader fr = ((SegmentReader) ir).getFieldsReader(); - assertTrue("for a 2.9 index, FieldsReader.canReadRawDocs() must be false and other way round for a trunk index", - shouldStillBeCompressed != fr.canReadRawDocs()); - } - - // test that decompression works correctly - for(int i=0; i<reader.maxDoc(); i++) { - if (!reader.isDeleted(i)) { - Document d = reader.document(i); - if (d.get("content3") != null) continue; - count++; - Fieldable compressed = d.getFieldable("compressed"); - if (Integer.parseInt(d.get("id")) % 2 == 0) { - assertFalse(compressed.isBinary()); - assertEquals("incorrectly decompressed string", TEXT_TO_COMPRESS, compressed.stringValue()); - } else { - assertTrue(compressed.isBinary()); - assertTrue("incorrectly decompressed binary", Arrays.equals(BINARY_TO_COMPRESS, compressed.getBinaryValue())); - } - } - } - - // check if field was decompressed after optimize - for(int i=0; i<reader.maxDoc(); i++) { - if (!reader.isDeleted(i)) { - Document d = reader.document(i, new FieldSelector() { - public FieldSelectorResult accept(String fieldName) { - return ("compressed".equals(fieldName)) ? FieldSelectorResult.SIZE : FieldSelectorResult.LOAD; - } - }); - if (d.get("content3") != null) continue; - count++; - final int actualSize = new DataInputStream(new ByteArrayInputStream(d.getFieldable("compressed").getBinaryValue())).readInt(); - final int compressedSize = Integer.parseInt(d.get("compressedSize")); - final boolean binary = Integer.parseInt(d.get("id")) % 2 > 0; - final int shouldSize = shouldStillBeCompressed ? - compressedSize : - (binary ? BINARY_PLAIN_LENGTH : TEXT_PLAIN_LENGTH); - assertEquals("size incorrect", shouldSize, actualSize); - if (!shouldStillBeCompressed) { - assertFalse("uncompressed field should have another size than recorded in index", compressedSize == actualSize); - } - } - } - assertEquals("correct number of tests", 34 * 2, count); - } finally { - reader.close(); - } - } - public void testOptimizeOldIndex() throws Exception { - int hasTested29 = 0; - Random rand = newRandom(); for(int i=0;i<oldNames.length;i++) { - // only test indexes >= 3.0 - if (oldNames[i].compareTo("30.") < 0) continue; unzip(getDataFile("index."
+ oldNames[i] + ".zip"), oldNames[i]); String fullPath = fullDir(oldNames[i]); Index: lucene/src/test/org/apache/lucene/index/TestCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestCodecs.java (revision 949434) +++ lucene/src/test/org/apache/lucene/index/TestCodecs.java (working copy) @@ -281,7 +281,7 @@ final Directory dir = new MockRAMDirectory(); this.write(fieldInfos, dir, fields); - final SegmentInfo si = new SegmentInfo(SEGMENT, 10000, dir, CodecProvider.getDefault().getWriter(null)); + final SegmentInfo si = new SegmentInfo(SEGMENT, 10000, dir, false, -1, SEGMENT, false, true, CodecProvider.getDefault().getWriter(null)); si.setHasProx(false); final FieldsProducer reader = si.getCodec().fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 64, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR)); @@ -319,7 +319,7 @@ final Directory dir = new MockRAMDirectory(); this.write(fieldInfos, dir, fields); - final SegmentInfo si = new SegmentInfo(SEGMENT, 10000, dir, CodecProvider.getDefault().getWriter(null)); + final SegmentInfo si = new SegmentInfo(SEGMENT, 10000, dir, false, -1, SEGMENT, false, true, CodecProvider.getDefault().getWriter(null)); final FieldsProducer terms = si.getCodec().fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR)); Index: lucene/src/test/org/apache/lucene/index/TestDoc.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestDoc.java (revision 949434) +++ lucene/src/test/org/apache/lucene/index/TestDoc.java (working copy) @@ -194,8 +194,7 @@ merger.closeReaders(); final SegmentInfo info = new SegmentInfo(merged, si1.docCount + si2.docCount, si1.dir, - useCompoundFile, true, -1, null, false, merger.hasProx(), - merger.getCodec()); + useCompoundFile, -1, null, false, merger.hasProx(), merger.getCodec()); if (useCompoundFile) { List filesToDelete = merger.createCompoundFile(merged + ".cfs", info); Index: lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java (revision 949434) +++ lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java (working copy) @@ -134,9 +134,6 @@ // Create a bogus fnm file when the CFS already exists: copyFile(dir, "_0.cfs", "_0.fnm"); - // Create a deletable file: - copyFile(dir, "_0.cfs", "deletable"); - // Create some old segments file: copyFile(dir, "segments_2", "segments"); copyFile(dir, "segments_2", "segments_1"); Index: lucene/src/test/org/apache/lucene/index/TestIndexInput.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexInput.java (revision 949434) +++ lucene/src/test/org/apache/lucene/index/TestIndexInput.java (working copy) @@ -79,44 +79,4 @@ assertEquals("\u0000",is.readString()); assertEquals("Lu\u0000ce\u0000ne",is.readString()); } - - /** - * Expert - * - * @throws IOException - */ - public void testSkipChars() throws IOException { - byte[] bytes = new byte[]{(byte) 0x80, 0x01, - (byte) 0xFF, 0x7F, - (byte) 0x80, (byte) 0x80, 0x01, - (byte) 0x81, (byte) 0x80, 0x01, - 0x06, 'L', 'u', 'c', 'e', 'n', 'e', - }; - String utf8Str = "\u0634\u1ea1"; - byte [] utf8Bytes = utf8Str.getBytes("UTF-8"); - byte [] theBytes = new byte[bytes.length + 1 + utf8Bytes.length]; - System.arraycopy(bytes, 0, theBytes, 0, bytes.length); - 
theBytes[bytes.length] = (byte)utf8Str.length();//Add in the number of chars we are storing, which should fit in a byte for this test - System.arraycopy(utf8Bytes, 0, theBytes, bytes.length + 1, utf8Bytes.length); - IndexInput is = new MockIndexInput(theBytes); - assertEquals(128, is.readVInt()); - assertEquals(16383, is.readVInt()); - assertEquals(16384, is.readVInt()); - assertEquals(16385, is.readVInt()); - int charsToRead = is.readVInt();//number of chars in the Lucene string - assertTrue(0x06 + " does not equal: " + charsToRead, 0x06 == charsToRead); - is.skipChars(3); - char [] chars = new char[3];//there should be 6 chars remaining - is.readChars(chars, 0, 3); - String tmpStr = new String(chars); - assertTrue(tmpStr + " is not equal to " + "ene", tmpStr.equals("ene" ) == true); - //Now read the UTF8 stuff - charsToRead = is.readVInt() - 1;//since we are skipping one - is.skipChars(1); - assertTrue(utf8Str.length() - 1 + " does not equal: " + charsToRead, utf8Str.length() - 1 == charsToRead); - chars = new char[charsToRead]; - is.readChars(chars, 0, charsToRead); - tmpStr = new String(chars); - assertTrue(tmpStr + " is not equal to " + utf8Str.substring(1), tmpStr.equals(utf8Str.substring(1)) == true); - } } Index: lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java (revision 949434) +++ lucene/src/test/org/apache/lucene/index/TestSegmentMerger.java (working copy) @@ -72,8 +72,8 @@ merger.closeReaders(); assertTrue(docsMerged == 2); //Should be able to open a new SegmentReader against the new directory - SegmentReader mergedReader = SegmentReader.get(false, mergedDir, new SegmentInfo(mergedSegment, docsMerged, mergedDir, false, true, - -1, null, false, merger.hasProx(), merger.getCodec()), BufferedIndexInput.BUFFER_SIZE, true, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, null); + SegmentReader mergedReader = SegmentReader.get(false, mergedDir, new SegmentInfo(mergedSegment, docsMerged, mergedDir, false, -1, + null, false, merger.hasProx(), merger.getCodec()), BufferedIndexInput.BUFFER_SIZE, true, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR, null); assertTrue(mergedReader != null); assertTrue(mergedReader.numDocs() == 2); Index: lucene/src/test/org/apache/lucene/index/index.19.cfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.19.nocfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.20.cfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.20.nocfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.21.cfs.zip =================================================================== Cannot display: file marked as a binary type. 
svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.21.nocfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.22.cfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.22.nocfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.23.cfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.23.nocfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.24.cfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.24.nocfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.29.cfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: lucene/src/test/org/apache/lucene/index/index.29.nocfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream
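
Note (illustrative, not part of the patch): the DataInput and TermBuffer hunks above remove the last code paths for Lucene's old "modified UTF-8" string encoding (length counted in Java chars, used by indexes written pre-2.4, see LUCENE-510). After this change readString() always expects a VInt byte length followed by standard UTF-8 bytes, and SegmentTermEnum no longer switches its term buffers into the pre-UTF-8 mode. The sketch below contrasts the two on-disk encodings. It is a minimal approximation only: the class and method names are hypothetical, a plain java.io.DataInputStream stands in for Lucene's DataInput, and the explicit length arguments stand in for the VInt prefixes the real format reads.

  import java.io.DataInputStream;
  import java.io.IOException;

  class StringFormatSketch {
    // Old pre-2.4 "modified UTF-8": a char count, then 1-3 bytes per Java char.
    // This mirrors the decoding loop of the readChars() method removed above.
    static String decodeModifiedUTF8(DataInputStream in, int numChars) throws IOException {
      final char[] chars = new char[numChars];
      for (int i = 0; i < numChars; i++) {
        byte b = in.readByte();
        if ((b & 0x80) == 0) {                    // 0xxxxxxx -> one byte
          chars[i] = (char) (b & 0x7F);
        } else if ((b & 0xE0) != 0xE0) {          // 110xxxxx 10xxxxxx -> two bytes
          chars[i] = (char) (((b & 0x1F) << 6) | (in.readByte() & 0x3F));
        } else {                                  // 1110xxxx 10xxxxxx 10xxxxxx -> three bytes
          chars[i] = (char) (((b & 0x0F) << 12)
              | ((in.readByte() & 0x3F) << 6)
              | (in.readByte() & 0x3F));
        }
      }
      return new String(chars);
    }

    // Current format, the only one the patched readString() still understands:
    // a byte count, then standard UTF-8 bytes decoded in one shot.
    static String readStandardString(DataInputStream in, int numBytes) throws IOException {
      final byte[] bytes = new byte[numBytes];
      in.readFully(bytes);
      return new String(bytes, 0, numBytes, "UTF-8");
    }
  }

Because the two encodings prefix the string with different quantities (char count vs. byte count) and encode supplementary characters differently, a pre-2.4 string cannot be decoded by the UTF-8 path; that is why the fallback hooks (setModifiedUTF8StringsMode, setPreUTF8Strings, readChars, skipChars) are removed outright rather than kept as a lenient decode.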