Index: src/test/org/apache/lucene/index/index.23.nocfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: src/test/org/apache/lucene/index/index.23.cfs.zip =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Index: src/test/org/apache/lucene/index/TestIndexInput.java =================================================================== --- src/test/org/apache/lucene/index/TestIndexInput.java (revision 633453) +++ src/test/org/apache/lucene/index/TestIndexInput.java (working copy) @@ -24,16 +24,70 @@ public class TestIndexInput extends LuceneTestCase { public void testRead() throws IOException { - IndexInput is = new MockIndexInput(new byte[]{(byte) 0x80, 0x01, - (byte) 0xFF, 0x7F, - (byte) 0x80, (byte) 0x80, 0x01, - (byte) 0x81, (byte) 0x80, 0x01, - 0x06, 'L', 'u', 'c', 'e', 'n', 'e'}); - assertEquals(128, is.readVInt()); - assertEquals(16383, is.readVInt()); - assertEquals(16384, is.readVInt()); - assertEquals(16385, is.readVInt()); - assertEquals("Lucene", is.readString()); + IndexInput is = new MockIndexInput(new byte[] { + (byte) 0x80, 0x01, + (byte) 0xFF, 0x7F, + (byte) 0x80, (byte) 0x80, 0x01, + (byte) 0x81, (byte) 0x80, 0x01, + 0x06, 'L', 'u', 'c', 'e', 'n', 'e', + + // 2-byte UTF-8 (U+00BF "INVERTED QUESTION MARK") + 0x02, (byte) 0xC2, (byte) 0xBF, + 0x0A, 'L', 'u', (byte) 0xC2, (byte) 0xBF, + 'c', 'e', (byte) 0xC2, (byte) 0xBF, + 'n', 'e', + + // 3-byte UTF-8 (U+2620 "SKULL AND CROSSBONES") + 0x03, (byte) 0xE2, (byte) 0x98, (byte) 0xA0, + 0x0C, 'L', 'u', (byte) 0xE2, (byte) 0x98, (byte) 0xA0, + 'c', 'e', (byte) 0xE2, (byte) 0x98, (byte) 0xA0, + 'n', 'e', + + // surrogate pairs + // (U+1D11E "MUSICAL SYMBOL G CLEF") + // (U+1D160 "MUSICAL SYMBOL EIGHTH NOTE") + 0x04, (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E, + 0x08, (byte) 0xF0, (byte) 
0x9D, (byte) 0x84, (byte) 0x9E, + (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0, + 0x0E, 'L', 'u', + (byte) 0xF0, (byte) 0x9D, (byte) 0x84, (byte) 0x9E, + 'c', 'e', + (byte) 0xF0, (byte) 0x9D, (byte) 0x85, (byte) 0xA0, + 'n', 'e', + + // null bytes + 0x01, 0x00, + 0x08, 'L', 'u', 0x00, 'c', 'e', 0x00, 'n', 'e', + + // Modified UTF-8 null bytes + 0x02, (byte) 0xC0, (byte) 0x80, + 0x0A, 'L', 'u', (byte) 0xC0, (byte) 0x80, + 'c', 'e', (byte) 0xC0, (byte) 0x80, + 'n', 'e', + + }); + + assertEquals(128,is.readVInt()); + assertEquals(16383,is.readVInt()); + assertEquals(16384,is.readVInt()); + assertEquals(16385,is.readVInt()); + assertEquals("Lucene",is.readString()); + + assertEquals("\u00BF",is.readString()); + assertEquals("Lu\u00BFce\u00BFne",is.readString()); + + assertEquals("\u2620",is.readString()); + assertEquals("Lu\u2620ce\u2620ne",is.readString()); + + assertEquals("\uD834\uDD1E",is.readString()); + assertEquals("\uD834\uDD1E\uD834\uDD60",is.readString()); + assertEquals("Lu\uD834\uDD1Ece\uD834\uDD60ne",is.readString()); + + assertEquals("\u0000",is.readString()); + assertEquals("Lu\u0000ce\u0000ne",is.readString()); + + assertEquals("\u0000",is.readString()); + assertEquals("Lu\u0000ce\u0000ne",is.readString()); } /** Index: src/test/org/apache/lucene/index/TestBackwardsCompatibility.java =================================================================== --- src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (revision 633453) +++ src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (working copy) @@ -421,6 +421,7 @@ Document doc = new Document(); doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED)); doc.add(new Field("id", Integer.toString(id), Field.Store.YES, Field.Index.UN_TOKENIZED)); + doc.add(new Field("utf8", "Lu\uD834\uDD1Ece\uD834\uDD60ne \u0000 \u2620", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("content2", "here is more content with 
aaa aaa aaa", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); writer.addDocument(doc); } Index: src/java/org/apache/lucene/index/FieldsReader.java =================================================================== --- src/java/org/apache/lucene/index/FieldsReader.java (revision 633453) +++ src/java/org/apache/lucene/index/FieldsReader.java (working copy) @@ -51,6 +51,8 @@ private int numTotalDocs; private int size; private boolean closed; + private final int format; + private final int formatSize; // The docID offset where our docs begin in the index // file. This will be 0 if we have our own private file. @@ -76,6 +78,26 @@ fieldsStream = (IndexInput) cloneableFieldsStream.clone(); indexStream = d.openInput(segment + ".fdx", readBufferSize); + // First version of fdx did not include a format + // header, but, the first int will always be 0 in that + // case + int firstInt = indexStream.readInt(); + if (firstInt == 0) + format = 0; + else + format = firstInt; + + if (format > FieldsWriter.FORMAT_CURRENT) + throw new CorruptIndexException("Incompatible format version: " + format + " expected " + + FieldsWriter.FORMAT_CURRENT + " or lower"); + + if (format > FieldsWriter.FORMAT) + formatSize = 4; + else + formatSize = 0; + + final long indexSize = indexStream.length()-formatSize; + if (docStoreOffset != -1) { // We read only a slice out of this shared fields file this.docStoreOffset = docStoreOffset; @@ -83,13 +105,13 @@ // Verify the file is long enough to hold all of our // docs - assert ((int) (indexStream.length() / 8)) >= size + this.docStoreOffset; + assert ((int) (indexSize / 8)) >= size + this.docStoreOffset; } else { this.docStoreOffset = 0; - this.size = (int) (indexStream.length() >> 3); + this.size = (int) (indexSize >> 3); } - numTotalDocs = (int) (indexStream.length() >> 3); + numTotalDocs = (int) (indexSize >> 3); success = true; } finally { // With lock-less commits, it's entirely possible (and @@ -142,8 +164,12 @@ 
return size; } + private final void seekIndex(int docID) throws IOException { + indexStream.seek(formatSize + (docID + docStoreOffset) * 8L); + } + final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { - indexStream.seek((n + docStoreOffset) * 8L); + seekIndex(n); long position = indexStream.readLong(); fieldsStream.seek(position); @@ -195,7 +221,7 @@ * startDocID. Returns the IndexInput (the fieldStream), * already seeked to the starting point for startDocID.*/ final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException { - indexStream.seek((docStoreOffset+startDocID) * 8L); + seekIndex(startDocID); long startOffset = indexStream.readLong(); long lastOffset = startOffset; int count = 0; @@ -225,13 +251,12 @@ } private void skipField(boolean binary, boolean compressed, int toRead) throws IOException { - if (binary || compressed) { - long pointer = fieldsStream.getFilePointer(); - fieldsStream.seek(pointer + toRead); - } else { - //We need to skip chars. This will slow us down, but still better - fieldsStream.skipChars(toRead); - } + if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) { + fieldsStream.seek(fieldsStream.getFilePointer() + toRead); + } else { + // We need to skip chars. 
This will slow us down, but still better + fieldsStream.skipChars(toRead); + } } private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException { @@ -265,7 +290,10 @@ int length = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); //Skip ahead of where we are by the length of what is stored - fieldsStream.skipChars(length); + if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) + fieldsStream.seek(pointer+length); + else + fieldsStream.skipChars(length); f = new LazyField(fi.name, store, index, termVector, length, pointer); f.setOmitNorms(fi.omitNorms); } @@ -464,10 +492,16 @@ localFieldsStream.readBytes(b, 0, b.length); fieldsData = new String(uncompress(b), "UTF-8"); } else { - //read in chars b/c we already know the length we need to read - char[] chars = new char[toRead]; - localFieldsStream.readChars(chars, 0, toRead); - fieldsData = new String(chars); + if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) { + //read in bytes b/c we already know the length we need to read + byte[] bytes = new byte[toRead]; + localFieldsStream.readBytes(bytes, 0, toRead); + fieldsData = new String(bytes, "UTF-8"); + } else { + char[] chars = new char[toRead]; + localFieldsStream.readChars(chars, 0, toRead); + fieldsData = new String(chars); + } } } catch (IOException e) { throw new FieldReaderException(e); Index: src/java/org/apache/lucene/index/FieldsWriter.java =================================================================== --- src/java/org/apache/lucene/index/FieldsWriter.java (revision 633453) +++ src/java/org/apache/lucene/index/FieldsWriter.java (working copy) @@ -33,6 +33,17 @@ static final byte FIELD_IS_TOKENIZED = 0x1; static final byte FIELD_IS_BINARY = 0x2; static final byte FIELD_IS_COMPRESSED = 0x4; + + // Original format + static final int FORMAT = 0; + + // Changed strings to true utf8 with length-in-bytes not length-in-chars + static final int
FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = 1; + + // NOTE: if you introduce a new format, make it 1 higher + // than the current one, and always change this if you + // switch to a new format! + static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; private FieldInfos fieldInfos; @@ -44,8 +55,34 @@ FieldsWriter(Directory d, String segment, FieldInfos fn) throws IOException { fieldInfos = fn; - fieldsStream = d.createOutput(segment + ".fdt"); - indexStream = d.createOutput(segment + ".fdx"); + + boolean success = false; + final String fieldsName = segment + "." + IndexFileNames.FIELDS_EXTENSION; + try { + fieldsStream = d.createOutput(fieldsName); + fieldsStream.writeInt(FORMAT_CURRENT); + success = true; + } finally { + if (!success) { + close(); + d.deleteFile(fieldsName); + } + } + + success = false; + final String indexName = segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION; + try { + indexStream = d.createOutput(indexName); + indexStream.writeInt(FORMAT_CURRENT); + success = true; + } finally { + if (!success) { + close(); + d.deleteFile(fieldsName); + d.deleteFile(indexName); + } + } + doClose = true; } @@ -73,8 +110,10 @@ final void close() throws IOException { if (doClose) { - fieldsStream.close(); - indexStream.close(); + if (fieldsStream != null) + fieldsStream.close(); + if (indexStream != null) + indexStream.close(); } } Index: src/java/org/apache/lucene/index/SegmentTermEnum.java =================================================================== --- src/java/org/apache/lucene/index/SegmentTermEnum.java (revision 633453) +++ src/java/org/apache/lucene/index/SegmentTermEnum.java (working copy) @@ -61,8 +61,8 @@ format = firstInt; // check that it is a format we can understand - if (format < TermInfosWriter.FORMAT) - throw new CorruptIndexException("Unknown format version:" + format); + if (format < TermInfosWriter.FORMAT_CURRENT) + throw new CorruptIndexException("Unknown format version:" + format + " expected " + 
TermInfosWriter.FORMAT_CURRENT + " or higher"); size = input.readLong(); // read the size @@ -77,13 +77,14 @@ } else { indexInterval = input.readInt(); skipInterval = input.readInt(); - if (format == -3) { + if (format <= TermInfosWriter.FORMAT) { // this new format introduces multi-level skipping maxSkipLevels = input.readInt(); } } } - + if (format > TermInfosWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) + termBuffer.setLegacyStringsMode(); } protected Object clone() { Index: src/java/org/apache/lucene/index/TermInfosReader.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosReader.java (revision 633453) +++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy) @@ -58,12 +58,12 @@ segment = seg; fieldInfos = fis; - origEnum = new SegmentTermEnum(directory.openInput(segment + ".tis", + origEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_EXTENSION, readBufferSize), fieldInfos, false); size = origEnum.size; totalIndexInterval = origEnum.indexInterval; - indexEnum = new SegmentTermEnum(directory.openInput(segment + ".tii", + indexEnum = new SegmentTermEnum(directory.openInput(segment + "." 
+ IndexFileNames.TERMS_INDEX_EXTENSION, readBufferSize), fieldInfos, true); success = true; Index: src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 633453) +++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -857,6 +857,7 @@ if (fieldsWriter == null) { assert docStoreSegment == null; assert segment != null; + files = null; docStoreSegment = segment; // If we hit an exception while init'ing the // fieldsWriter, we must abort this segment @@ -867,7 +868,6 @@ } catch (Throwable t) { throw new AbortException(t, DocumentsWriter.this); } - files = null; } localFieldsWriter = new FieldsWriter(null, fdtLocal, fieldInfos); } @@ -877,17 +877,18 @@ if (docHasVectors) { if (tvx == null) { assert docStoreSegment != null; + files = null; // If we hit an exception while init'ing the term // vector output files, we must abort this segment // because those files will be in an unknown // state: try { tvx = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); - tvx.writeInt(TermVectorsReader.FORMAT_VERSION2); + tvx.writeInt(TermVectorsReader.FORMAT_CURRENT); tvd = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); - tvd.writeInt(TermVectorsReader.FORMAT_VERSION2); + tvd.writeInt(TermVectorsReader.FORMAT_CURRENT); tvf = directory.createOutput(docStoreSegment + "." 
+ IndexFileNames.VECTORS_FIELDS_EXTENSION); - tvf.writeInt(TermVectorsReader.FORMAT_VERSION2); + tvf.writeInt(TermVectorsReader.FORMAT_CURRENT); // We must "catch up" for all docs before us // that had no vectors: @@ -899,7 +900,6 @@ } catch (Throwable t) { throw new AbortException(t, DocumentsWriter.this); } - files = null; } numVectorFields = 0; } @@ -1925,7 +1925,7 @@ doVectorSort(postingsVectors, numPostingsVectors); - Posting lastPosting = null; + byte[] lastTermBytes = null; final ByteSliceReader reader = vectorSliceReader; @@ -1934,40 +1934,32 @@ Posting posting = vector.p; final int freq = posting.docFreq; - final int prefix; final char[] text2 = charPool.buffers[posting.textStart >> CHAR_BLOCK_SHIFT]; final int start2 = posting.textStart & CHAR_BLOCK_MASK; + int pos2 = start2; + while(text2[pos2] != 0xffff) + pos2++; + final byte[] termBytes = new String(text2, start2, pos2-start2).getBytes("UTF-8"); + // Compute common prefix between last term and // this term - if (lastPosting == null) - prefix = 0; - else { - final char[] text1 = charPool.buffers[lastPosting.textStart >> CHAR_BLOCK_SHIFT]; - final int start1 = lastPosting.textStart & CHAR_BLOCK_MASK; - int pos1 = start1; - while(true) { - final char c1 = text1[pos1]; - final char c2 = text2[pos2]; - if (c1 != c2 || c1 == 0xffff) { - prefix = pos1-start1; + int prefix = 0; + if (lastTermBytes != null) + while(prefix < lastTermBytes.length && prefix < termBytes.length) { + final byte b1 = lastTermBytes[prefix]; + final byte b2 = termBytes[prefix]; + if (b1 != b2) break; - } - pos1++; - pos2++; + prefix++; } - } - lastPosting = posting; + lastTermBytes = termBytes; - // Compute length - while(text2[pos2] != 0xffff) - pos2++; - - final int suffix = pos2 - start2 - prefix; + final int suffix = termBytes.length - prefix; tvfLocal.writeVInt(prefix); tvfLocal.writeVInt(suffix); - tvfLocal.writeChars(text2, start2 + prefix, suffix); + tvfLocal.writeBytes(termBytes, prefix, suffix); tvfLocal.writeVInt(freq); if 
(doVectorPositions) { @@ -2383,7 +2375,11 @@ // Write term termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer)); - termsOut.add(fieldNumber, text, start, pos-start, termInfo); + + //termsOut.add(fieldNumber, text, start, pos-start, termInfo); + termsOut.add(fieldNumber, + new String(text, start, pos-start).getBytes("UTF-8"), + termInfo); } } Index: src/java/org/apache/lucene/index/TermVectorsReader.java =================================================================== --- src/java/org/apache/lucene/index/TermVectorsReader.java (revision 633453) +++ src/java/org/apache/lucene/index/TermVectorsReader.java (working copy) @@ -32,8 +32,16 @@ // NOTE: if you make a new format, it must be larger than // the current format static final int FORMAT_VERSION = 2; + + // Changes to speed up bulk merging of term vectors: static final int FORMAT_VERSION2 = 3; + // Changed strings to true utf8 with length-in-bytes not length-in-chars + static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = 4; + + // NOTE: always change this if you switch to a new format! + static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; + //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file static final int FORMAT_SIZE = 4; @@ -52,7 +60,7 @@ // file. This will be 0 if we have our own private file. 
private int docStoreOffset; - private final int format; + final int format; TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos) throws CorruptIndexException, IOException { @@ -133,7 +141,7 @@ } boolean canReadRawDocs() { - return format >= FORMAT_VERSION2; + return format == FORMAT_CURRENT; } /** Retrieve the length (in bytes) of the tvd and tvf @@ -188,9 +196,9 @@ private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException { int format = in.readInt(); - if (format > FORMAT_VERSION2) { + if (format > FORMAT_CURRENT) { throw new CorruptIndexException("Incompatible format version: " + format + " expected " - + FORMAT_VERSION2 + " or less"); + + FORMAT_CURRENT + " or less"); } return format; } @@ -432,24 +440,46 @@ int start = 0; int deltaLength = 0; int totalLength = 0; - char [] buffer = new char[10]; // init the buffer with a length of 10 character - char[] previousBuffer = {}; - + byte[] byteBuffer; + char[] charBuffer; + final boolean legacy = format < FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; + + // init the buffers + if (legacy) { + charBuffer = new char[10]; + byteBuffer = null; + } else { + charBuffer= null; + byteBuffer = new byte[20]; + } + for (int i = 0; i < numTerms; i++) { start = tvf.readVInt(); deltaLength = tvf.readVInt(); totalLength = start + deltaLength; - if (buffer.length < totalLength) { // increase buffer - buffer = null; // give a hint to garbage collector - buffer = new char[totalLength]; - - if (start > 0) // just copy if necessary - System.arraycopy(previousBuffer, 0, buffer, 0, start); + + final String term; + + if (legacy) { + // Term stored as java chars + if (charBuffer.length < totalLength) { + char[] newCharBuffer = new char[(int) (1.5*totalLength)]; + System.arraycopy(charBuffer, 0, newCharBuffer, 0, start); + charBuffer = newCharBuffer; + } + tvf.readChars(charBuffer, start, deltaLength); + term = new String(charBuffer, 0, totalLength); + } else { + // Term stored as utf8 bytes + if 
(byteBuffer.length < totalLength) { + byte[] newByteBuffer = new byte[(int) (1.5*totalLength)]; + System.arraycopy(byteBuffer, 0, newByteBuffer, 0, start); + byteBuffer = newByteBuffer; + } + tvf.readBytes(byteBuffer, start, deltaLength); + term = new String(byteBuffer, 0, totalLength, "UTF-8"); } - - tvf.readChars(buffer, start, deltaLength); - String term = new String(buffer, 0, totalLength); - previousBuffer = buffer; + int freq = tvf.readVInt(); int [] positions = null; if (storePositions) { //read in the positions Index: src/java/org/apache/lucene/index/TermBuffer.java =================================================================== --- src/java/org/apache/lucene/index/TermBuffer.java (revision 633453) +++ src/java/org/apache/lucene/index/TermBuffer.java (working copy) @@ -21,70 +21,94 @@ import org.apache.lucene.store.IndexInput; final class TermBuffer implements Cloneable { - private static final char[] NO_CHARS = new char[0]; + private static final byte[] NO_BYTES = new byte[0]; private String field; + + private byte[] bytes = NO_BYTES; + private int bytesLength; + + // Only used when reading legacy index (LUCENE-510) + private static final char[] NO_CHARS = new char[0]; private char[] text = NO_CHARS; - private int textLength; + private Term term; // cached + private boolean legacyStrings; // true if strings are stored as chars not bytes (LUCENE-510) public final int compareTo(TermBuffer other) { if (field == other.field) // fields are interned - return compareChars(text, textLength, other.text, other.textLength); + return compareBytes(bytes, bytesLength, other.bytes, other.bytesLength); else return field.compareTo(other.field); } - private static final int compareChars(char[] v1, int len1, - char[] v2, int len2) { + private static final int compareBytes(byte[] bytes1, int len1, + byte[] bytes2, int len2) { int end = Math.min(len1, len2); for (int k = 0; k < end; k++) { - char c1 = v1[k]; - char c2 = v2[k]; - if (c1 != c2) { - return c1 - c2; + int b1 = 
(bytes1[k] & 0xFF); + int b2 = (bytes2[k] & 0xFF); + if (b1 != b2) { + return b1 - b2; } } return len1 - len2; } - private final void setTextLength(int newLength) { - if (text.length < newLength) { - char[] newText = new char[newLength]; - System.arraycopy(text, 0, newText, 0, textLength); - text = newText; + private final void setBytesLength(int newLength) { + if (bytes.length < newLength) { + byte[] newBytes = new byte[newLength]; + System.arraycopy(bytes, 0, newBytes, 0, bytesLength); + bytes = newBytes; } - textLength = newLength; + bytesLength = newLength; } + void setLegacyStringsMode() { + legacyStrings = true; + } + public final void read(IndexInput input, FieldInfos fieldInfos) throws IOException { this.term = null; // invalidate cache int start = input.readVInt(); int length = input.readVInt(); int totalLength = start + length; - setTextLength(totalLength); - input.readChars(this.text, start, length); + if (legacyStrings) { + if (totalLength > text.length) { + char[] newText = new char[(int) (totalLength*1.5)]; + System.arraycopy(text, 0, newText, 0, text.length); + text = newText; + } + input.readChars(text, start, length); + byte[] stringBytes = new String(text, 0, totalLength).getBytes("UTF-8"); + setBytesLength(stringBytes.length); + System.arraycopy(stringBytes, 0, bytes, 0, stringBytes.length); + } else { + setBytesLength(totalLength); + input.readBytes(bytes, start, length); + } this.field = fieldInfos.fieldName(input.readVInt()); } - public final void set(Term term) { - if (term == null) { + public final void set(Term t) { + if (t == null) { reset(); return; } - // copy text into the buffer - setTextLength(term.text().length()); - term.text().getChars(0, term.text().length(), text, 0); - - this.field = term.field(); - this.term = term; + // convert chars into UTF-8 bytes, store in buffer + try { + bytes = t.text().getBytes("UTF-8"); + } catch (java.io.UnsupportedEncodingException e) { } + setBytesLength(bytes.length); + this.field = t.field(); + 
this.term = t; } public final void set(TermBuffer other) { - setTextLength(other.textLength); - System.arraycopy(other.text, 0, text, 0, textLength); + setBytesLength(other.bytesLength); + System.arraycopy(other.bytes, 0, bytes, 0, bytesLength); this.field = other.field; this.term = other.term; @@ -92,7 +116,7 @@ public void reset() { this.field = null; - this.textLength = 0; + this.bytesLength = 0; this.term = null; } @@ -101,7 +125,10 @@ return null; if (term == null) - term = new Term(field, new String(text, 0, textLength), false); + try { + term = new Term(field, + new String(bytes, 0, bytesLength, "UTF-8"), false ); + } catch (java.io.UnsupportedEncodingException e) { } return term; } @@ -112,8 +139,8 @@ clone = (TermBuffer)super.clone(); } catch (CloneNotSupportedException e) {} - clone.text = new char[text.length]; - System.arraycopy(text, 0, clone.text, 0, textLength); + clone.bytes = new byte[bytes.length]; + System.arraycopy(bytes, 0, clone.bytes, 0, bytesLength); return clone; } Index: src/java/org/apache/lucene/index/TermInfosWriter.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosWriter.java (revision 633453) +++ src/java/org/apache/lucene/index/TermInfosWriter.java (working copy) @@ -29,6 +29,13 @@ /** The file format version, a negative number. */ public static final int FORMAT = -3; + // Changed strings to true utf8 with length-in-bytes not + // length-in-chars + public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; + + // NOTE: always change this if you switch to a new format! 
+ public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; + private FieldInfos fieldInfos; private IndexOutput output; private TermInfo lastTi = new TermInfo(); @@ -62,12 +69,10 @@ private long lastIndexPointer; private boolean isIndex; - private char[] lastTermText = new char[10]; - private int lastTermTextLength; + private static byte[] NO_BYTES = new byte[0]; + private byte[] lastTermBytes = NO_BYTES; private int lastFieldNumber = -1; - private char[] termTextBuffer = new char[10]; - private TermInfosWriter other; TermInfosWriter(Directory directory, String segment, FieldInfos fis, @@ -89,27 +94,19 @@ fieldInfos = fis; isIndex = isi; output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis")); - output.writeInt(FORMAT); // write format + output.writeInt(FORMAT_CURRENT); // write format output.writeLong(0); // leave space for size - output.writeInt(indexInterval); // write indexInterval - output.writeInt(skipInterval); // write skipInterval - output.writeInt(maxSkipLevels); // write maxSkipLevels + output.writeInt(indexInterval); // write indexInterval + output.writeInt(skipInterval); // write skipInterval + output.writeInt(maxSkipLevels); // write maxSkipLevels } void add(Term term, TermInfo ti) throws IOException { - - final int length = term.text.length(); - if (termTextBuffer.length < length) - termTextBuffer = new char[(int) (length*1.25)]; - - term.text.getChars(0, length, termTextBuffer, 0); - - add(fieldInfos.fieldNumber(term.field), termTextBuffer, 0, length, ti); + add(fieldInfos.fieldNumber(term.field), term.text.getBytes("UTF-8"), ti); } // Currently used only by assert statement - private int compareToLastTerm(int fieldNumber, char[] termText, int start, int length) { - int pos = 0; + private int compareToLastTerm(int fieldNumber, byte[] termBytes) { if (lastFieldNumber != fieldNumber) { final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); @@ -121,45 +118,46 @@ return 
cmp; } - while(pos < length && pos < lastTermTextLength) { - final char c1 = lastTermText[pos]; - final char c2 = termText[pos + start]; - if (c1 < c2) + int pos = 0; + while(pos < termBytes.length && pos < lastTermBytes.length) { + final int b1 = lastTermBytes[pos] & 0xff; + final int b2 = termBytes[pos] & 0xff; + if (b1 < b2) return -1; - else if (c1 > c2) + else if (b1 > b2) return 1; pos++; } - if (pos < lastTermTextLength) + if (pos < lastTermBytes.length) // Last term was longer return 1; - else if (pos < length) + else if (pos < termBytes.length) // Last term was shorter return -1; else return 0; } - /** Adds a new <, TermInfo> pair to the set. + /** Adds a new <, TermInfo> pair to the set. Term must be lexicographically greater than all previous Terms added. TermInfo pointers must be positive and greater than all previous.*/ - void add(int fieldNumber, char[] termText, int termTextStart, int termTextLength, TermInfo ti) + void add(int fieldNumber, byte[] termBytes, TermInfo ti) throws IOException { - assert compareToLastTerm(fieldNumber, termText, termTextStart, termTextLength) < 0 || - (isIndex && termTextLength == 0 && lastTermTextLength == 0) : + assert compareToLastTerm(fieldNumber, termBytes) < 0 || + (isIndex && termBytes.length == 0 && lastTermBytes.length == 0) : "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + - " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + - " text=" + new String(termText, termTextStart, termTextLength) + " lastText=" + new String(lastTermText, 0, lastTermTextLength); + " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + + " text=" + new String(termBytes, "UTF-8") + " lastText=" + new String(lastTermBytes, "UTF-8"); assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; assert ti.proxPointer >= lastTi.proxPointer: 
"proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; if (!isIndex && size % indexInterval == 0) - other.add(lastFieldNumber, lastTermText, 0, lastTermTextLength, lastTi); // add an index term + other.add(lastFieldNumber, lastTermBytes, lastTi); // add an index term - writeTerm(fieldNumber, termText, termTextStart, termTextLength); // write term + writeTerm(fieldNumber, termBytes); // write term output.writeVInt(ti.docFreq); // write doc freq output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers @@ -174,33 +172,29 @@ lastIndexPointer = other.output.getFilePointer(); // write pointer } - if (lastTermText.length < termTextLength) - lastTermText = new char[(int) (termTextLength*1.25)]; - System.arraycopy(termText, termTextStart, lastTermText, 0, termTextLength); - lastTermTextLength = termTextLength; + lastTermBytes = termBytes; lastFieldNumber = fieldNumber; - lastTi.set(ti); size++; } - private void writeTerm(int fieldNumber, char[] termText, int termTextStart, int termTextLength) + private void writeTerm(int fieldNumber, byte[] termBytes) throws IOException { // Compute prefix in common with last term: int start = 0; - final int limit = termTextLength < lastTermTextLength ? termTextLength : lastTermTextLength; + final int limit = termBytes.length < lastTermBytes.length ? 
termBytes.length : lastTermBytes.length; while(start < limit) { - if (termText[termTextStart+start] != lastTermText[start]) + if (termBytes[start] != lastTermBytes[start]) break; start++; } - int length = termTextLength - start; + int length = termBytes.length - start; output.writeVInt(start); // write shared prefix length output.writeVInt(length); // write delta length - output.writeChars(termText, start+termTextStart, length); // write delta chars + output.writeBytes(termBytes, start, length); // write delta bytes output.writeVInt(fieldNumber); // write field num } Index: src/java/org/apache/lucene/index/TermVectorsWriter.java =================================================================== --- src/java/org/apache/lucene/index/TermVectorsWriter.java (revision 633453) +++ src/java/org/apache/lucene/index/TermVectorsWriter.java (working copy) @@ -27,17 +27,18 @@ private IndexOutput tvx = null, tvd = null, tvf = null; private FieldInfos fieldInfos; + private static byte[] NO_BYTES = new byte[0]; public TermVectorsWriter(Directory directory, String segment, FieldInfos fieldInfos) throws IOException { // Open files for TermVector storage tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); - tvx.writeInt(TermVectorsReader.FORMAT_VERSION2); + tvx.writeInt(TermVectorsReader.FORMAT_CURRENT); tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); - tvd.writeInt(TermVectorsReader.FORMAT_VERSION2); + tvd.writeInt(TermVectorsReader.FORMAT_CURRENT); tvf = directory.createOutput(segment + "." 
+ IndexFileNames.VECTORS_FIELDS_EXTENSION); - tvf.writeInt(TermVectorsReader.FORMAT_VERSION2); + tvf.writeInt(TermVectorsReader.FORMAT_CURRENT); this.fieldInfos = fieldInfos; } @@ -97,15 +98,16 @@ final String[] terms = vectors[i].getTerms(); final int[] freqs = vectors[i].getTermFrequencies(); - String lastTermText = ""; + byte[] lastTermBytes = NO_BYTES; for (int j=0; j= TermVectorsReader.FORMAT_CURRENT; + long tvdPosition = tvd.getFilePointer(); long tvfPosition = tvf.getFilePointer(); long tvdStart = tvdPosition; Index: src/java/org/apache/lucene/store/IndexInput.java =================================================================== --- src/java/org/apache/lucene/store/IndexInput.java (revision 633453) +++ src/java/org/apache/lucene/store/IndexInput.java (working copy) @@ -24,7 +24,9 @@ * @see Directory */ public abstract class IndexInput implements Cloneable { - private char[] chars; // used by readString() + private byte[] bytes; // used by readString() + private char[] chars; // used by readLegacyString() + private boolean legacyStrings; // true if we are reading old string format /** Reads and returns a single byte. * @see IndexOutput#writeByte(byte) @@ -102,11 +104,28 @@ return i; } + /** Call this if readString should read characters stored + * in the legacy format (length-in-chars and not quite + * UTF8 encoding). See LUCENE-510 for details. */ + public void setLegacyStringsMode() { + legacyStrings = true; + } + /** Reads a string.
* @see IndexOutput#writeString(String) */ public String readString() throws IOException { + if (legacyStrings) + return readLegacyString(); int length = readVInt(); + if (bytes == null || length > bytes.length) + bytes = new byte[(int) (length*1.25)]; + readBytes(bytes, 0, length); + return new String(bytes, 0, length, "UTF-8"); + } + + private String readLegacyString() throws IOException { + int length = readVInt(); if (chars == null || length > chars.length) chars = new char[length]; readChars(chars, 0, length); @@ -118,24 +137,66 @@ * @param start the offset in the array to start storing characters * @param length the number of characters to read * @see IndexOutput#writeChars(String,int,int) + * @deprecated -- please use readString or readBytes + * instead, and construct the utf8 string + * from those bytes */ - public void readChars(char[] buffer, int start, int length) - throws IOException { + public void readChars(char[] buffer, int start, int length) throws IOException { final int end = start + length; for (int i = start; i < end; i++) { byte b = readByte(); if ((b & 0x80) == 0) - buffer[i] = (char)(b & 0x7F); - else if ((b & 0xE0) != 0xE0) { - buffer[i] = (char)(((b & 0x1F) << 6) - | (readByte() & 0x3F)); - } else - buffer[i] = (char)(((b & 0x0F) << 12) - | ((readByte() & 0x3F) << 6) - | (readByte() & 0x3F)); + buffer[i] = (char)(b & 0x7F); + else if ((b & 0xE0) != 0xE0) + buffer[i] = (char)(((b & 0x1F) << 6) + | (readByte() & 0x3F)); + else + buffer[i] = (char)(((b & 0x0F) << 12) + | ((readByte() & 0x3F) << 6) + | (readByte() & 0x3F)); } } + /** Reads UTF-8 encoded characters into an array. 
+ * @param buffer the array to read characters into + * @param start the offset in the array to start storing characters + * @param length the number of characters to read + * @see IndexOutput#writeChars(String,int,int) + */ + + /* + private void readUTF8Chars(char[] buffer, int start, int length) + throws IOException { + final int end = start + length; + for (int i = start; i < end; i++) { + byte b = readByte(); + switch (StringHelper.TRAILING_BYTES_FOR_UTF8[b & 0xFF]) { + case 0: + buffer[i] = (char)(b & 0x7F); + break; + case 1: + buffer[i] = (char)(((b & 0x1F) << 6) + | (readByte() & 0x3F)); + break; + case 2: + buffer[i] = (char)(((b & 0x0F) << 12) + | ((readByte() & 0x3F) << 6) + | (readByte() & 0x3F)); + break; + case 3: + int utf32 = (((b & 0x0F) << 18) + | ((readByte() & 0x3F) << 12) + | ((readByte() & 0x3F) << 6) + | (readByte() & 0x3F)); + buffer[i] = (char)((utf32 >> 10) + 0xD7C0); + i++; + buffer[i] = (char)((utf32 & 0x03FF) + 0xDC00); + break; + } + } + } + */ + /** * Expert * @@ -144,6 +205,8 @@ * and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine * how many more bytes to read * @param length The number of chars to read + * @deprecated this method operates on legacy non-utf8 encoded + * strings and will be removed in 3.0 */ public void skipChars(int length) throws IOException{ for (int i = 0; i < length; i++) { @@ -194,7 +257,8 @@ clone = (IndexInput)super.clone(); } catch (CloneNotSupportedException e) {} clone.chars = null; + clone.bytes = null; return clone; } Index: src/java/org/apache/lucene/store/IndexOutput.java =================================================================== --- src/java/org/apache/lucene/store/IndexOutput.java (revision 633453) +++ src/java/org/apache/lucene/store/IndexOutput.java (working copy) @@ -18,6 +18,7 @@ */ import java.io.IOException; +import org.apache.lucene.util.StringHelper; /** Abstract base class for output to a file in a 
Directory. A random-access * output stream. Used for all Lucene index output operations. @@ -96,9 +97,9 @@ * @see IndexInput#readString() */ public void writeString(String s) throws IOException { - int length = s.length(); - writeVInt(length); - writeChars(s, 0, length); + int byteCount = StringHelper.countUTF8Bytes(s); + writeVInt(byteCount); + writeChars(s, 0, s.length()); } /** Writes a sequence of UTF-8 encoded characters from a string. @@ -112,15 +113,37 @@ final int end = start + length; for (int i = start; i < end; i++) { final int code = (int)s.charAt(i); - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); + if (code < 0x80) + writeByte((byte)code); + else if (code < 0x800) { + writeByte((byte)(0xC0 | (code >> 6))); + writeByte((byte)(0x80 | (code & 0x3F))); + } else if (code < 0xD800 || code > 0xDFFF) { + writeByte((byte)(0xE0 | (code >> 12))); + writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); + writeByte((byte)(0x80 | (code & 0x3F))); } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); + // surrogate pair + int utf32; + // confirm valid high surrogate + if (code < 0xDC00 && (i < end-1)) { + utf32 = ((int)s.charAt(i+1)); + // confirm valid low surrogate and write pair + if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { + utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); + i++; + writeByte((byte)(0xF0 | (utf32 >> 18))); + writeByte((byte)(0x80 | ((utf32 >> 12) & 0x3F))); + writeByte((byte)(0x80 | ((utf32 >> 6) & 0x3F))); + writeByte((byte)(0x80 | (utf32 & 0x3F))); + continue; + } + } + // replace unpaired surrogate or out-of-order low surrogate + // with substitution character + writeByte((byte)0xEF); + writeByte((byte)0xBF); + writeByte((byte)0xBD); } } } @@ -136,15 +159,37 @@ final int end = start + length; for (int i 
= start; i < end; i++) { final int code = (int)s[i]; - if (code >= 0x01 && code <= 0x7F) - writeByte((byte)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((byte)(0xC0 | (code >> 6))); - writeByte((byte)(0x80 | (code & 0x3F))); + if (code < 0x80) + writeByte((byte)code); + else if (code < 0x800) { + writeByte((byte)(0xC0 | (code >> 6))); + writeByte((byte)(0x80 | (code & 0x3F))); + } else if (code < 0xD800 || code > 0xDFFF) { + writeByte((byte)(0xE0 | (code >> 12))); + writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); + writeByte((byte)(0x80 | (code & 0x3F))); } else { - writeByte((byte)(0xE0 | (code >>> 12))); - writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); - writeByte((byte)(0x80 | (code & 0x3F))); + // surrogate pair + int utf32; + // confirm valid high surrogate + if (code < 0xDC00 && (i < end-1)) { + utf32 = ((int)s[i+1]); + // confirm valid low surrogate and write pair + if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { + utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); + i++; + writeByte((byte)(0xF0 | (utf32 >> 18))); + writeByte((byte)(0x80 | ((utf32 >> 12) & 0x3F))); + writeByte((byte)(0x80 | ((utf32 >> 6) & 0x3F))); + writeByte((byte)(0x80 | (utf32 & 0x3F))); + continue; + } + } + // replace unpaired surrogate or out-of-order low surrogate + // with substitution character + writeByte((byte)0xEF); + writeByte((byte)0xBF); + writeByte((byte)0xBD); } } } Index: src/java/org/apache/lucene/util/StringHelper.java =================================================================== --- src/java/org/apache/lucene/util/StringHelper.java (revision 633453) +++ src/java/org/apache/lucene/util/StringHelper.java (working copy) @@ -17,7 +17,6 @@ * limitations under the License. */ - /** * Methods for manipulating strings. * @@ -26,6 +25,87 @@ public abstract class StringHelper { /** + * Compares two byte[] arrays, element by element, and returns the + * number of elements common to both arrays. 
+ * + * @param bytes1 The first byte[] to compare + * @param bytes2 The second byte[] to compare + * @return The number of common elements. + */ + public static final int bytesDifference(byte[] bytes1, byte[] bytes2) { + int len1 = bytes1.length; + int len2 = bytes2.length; + int len = len1 < len2 ? len1 : len2; + for (int i = 0; i < len; i++) { + if (bytes1[i] != bytes2[i]) { + return i; + } + } + return len; + } + + public static final byte[] TRAILING_BYTES_FOR_UTF8 = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3 + }; + + /** + * Count the number of bytes which would be occupied by this string + * were it to be converted to UTF-8. 
+ * + * @param s The string to operate against + * @return The number of UTF-8 bytes + */ + public static final int countUTF8Bytes(String s) { + int end = s.length(); + int byteCount = end; // start with 1 byte per char + for (int i = 0; i < end; i++) { + // add the number of trailing bytes for each char + final int code = (int)s.charAt(i); + if (code < 0x80) + continue; + else if (code < 0x800) { + byteCount += 1; + } else if (code < 0xD800 || code > 0xDFFF) { + byteCount += 2; + } else { + // surrogate pair + int utf32; + // confirm valid high surrogate + if (code < 0xDC00 && (i < end-1)) { + utf32 = ((int)s.charAt(i+1)); + // confirm valid low surrogate + if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { + byteCount += 2; // not 3; compensate for extra char + i++; + continue; + } + } + // replace unpaired surrogate or out-of-order low surrogate + // with substitution character, which is 3 bytes in UTF-8 + byteCount += 2; + } + } + return byteCount; + } + + + /** * Compares two strings, character by character, and returns the * first position where the two strings differ from one another. * Index: src/site/src/documentation/content/xdocs/fileformats.xml =================================================================== --- src/site/src/documentation/content/xdocs/fileformats.xml (revision 633453) +++ src/site/src/documentation/content/xdocs/fileformats.xml (working copy) @@ -748,8 +748,9 @@
String

- Lucene writes strings as a VInt representing the length, followed by - the character data. + Lucene writes strings as a VInt representing the + length of the string in Java chars (UTF-16 code + units), followed by the character data.