Index: src/java/org/apache/lucene/index/DocumentsWriterFieldMergeState.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriterFieldMergeState.java (revision 641983) +++ src/java/org/apache/lucene/index/DocumentsWriterFieldMergeState.java (working copy) @@ -30,6 +30,7 @@ private Posting p; char[] text; int textOffset; + int textLen; private int postingUpto = -1; @@ -49,6 +50,7 @@ text = field.threadState.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; textOffset = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK; + textLen = p.textLen; if (p.freqUpto > p.freqStart) freq.init(field.threadState.postingsPool, p.freqStart, p.freqUpto); Index: src/java/org/apache/lucene/index/DocumentsWriterFieldData.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriterFieldData.java (revision 641983) +++ src/java/org/apache/lucene/index/DocumentsWriterFieldData.java (working copy) @@ -311,6 +311,9 @@ * current tokenText. */ boolean postingEquals(final char[] tokenText, final int tokenTextLen) { + if (p.textLen != tokenTextLen) + return false; + final char[] text = threadState.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; assert text != null; int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK; @@ -319,7 +322,7 @@ for(;tokenPos DocumentsWriter.CHAR_BLOCK_SIZE) { - if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) { + if (tokenTextLen + threadState.charPool.byteUpto >= DocumentsWriter.CHAR_BLOCK_SIZE) { + if (tokenTextLen > DocumentsWriter.CHAR_BLOCK_SIZE) { // Just skip this term, to remain as robust as // possible during indexing. A TokenFilter // can be inserted into the analyzer chain if @@ -485,11 +487,12 @@ p = threadState.postingsFreeList[--threadState.postingsFreeCount]; p.textStart = textUpto + threadState.charPool.byteOffset; - threadState.charPool.byteUpto += textLen1; + p.textLen = (short) tokenTextLen; + threadState.charPool.byteUpto += tokenTextLen; + assert text != null || 0 == tokenTextLen; + System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen); - - text[textUpto+tokenTextLen] = 0xffff; assert postingsHash[hashPos] == null; @@ -682,9 +685,7 @@ if (p0 != null) { final int start = p0.textStart & DocumentsWriter.CHAR_BLOCK_MASK; final char[] text = threadState.charPool.buffers[p0.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; - int pos = start; - while(text[pos] != 0xffff) - pos++; + int pos = start+p0.textLen; int code = 0; while (pos > start) code = (code*31) + text[--pos]; @@ -757,7 +758,7 @@ final UnicodeUtil.UTF8Result utf8Result = threadState.utf8Results[encoderUpto]; // TODO: we could do this incrementally - UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result); + UnicodeUtil.UTF16toUTF8(text2, start2, posting.textLen, utf8Result); final int termBytesCount = utf8Result.length; // TODO: UTF16toUTF8 could tell us this prefix Index: src/java/org/apache/lucene/index/Posting.java =================================================================== --- src/java/org/apache/lucene/index/Posting.java (revision 641983) +++ src/java/org/apache/lucene/index/Posting.java (working copy) @@ -24,6 +24,7 @@ * this is how RAM usage is measured. */ final class Posting { int textStart; // Address into char[] blocks where our text is stored + int textLen; // Text length int docFreq; // # times this term occurs in the current doc int freqStart; // Address of first byte[] slice for freq int freqUpto; // Next write address for freq Index: src/java/org/apache/lucene/index/DocumentsWriterThreadState.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriterThreadState.java (revision 641983) +++ src/java/org/apache/lucene/index/DocumentsWriterThreadState.java (working copy) @@ -675,22 +675,23 @@ int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK; assert text1 != text2 || pos1 != pos2; + int pos = 0; - while(true) { + final int len1 = p1.textLen; + final int len2 = p2.textLen; + final int limit = pos1 + (len1 < len2 ? len1 : len2); + + while(pos1 < limit) { final char c1 = text1[pos1++]; final char c2 = text2[pos2++]; - if (c1 != c2) { - if (0xffff == c2) - return 1; - else if (0xffff == c1) - return -1; - else - return c1-c2; - } else - // This method should never compare equal postings - // unless p1==p2 - assert c1 != 0xffff; + if (c1 != c2) + return c1-c2; } + + // This method should never compare equal postings + // unless p1==p2 + assert len1 != len2; + return len1 - len2; } String lastVectorFieldName; Index: src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 641983) +++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -755,20 +755,17 @@ return segment + "." + extension; } - private static int compareText(final char[] text1, int pos1, final char[] text2, int pos2) { - while(true) { + private static int compareText(final char[] text1, int len1, int pos1, final char[] text2, int len2, int pos2) { + final int len = len1 < len2 ? len1 : len2; + int pos = 0; + final int end = pos1 + len; + while(pos1 < end) { final char c1 = text1[pos1++]; final char c2 = text2[pos2++]; - if (c1 != c2) { - if (0xffff == c2) - return 1; - else if (0xffff == c1) - return -1; - else - return c1-c2; - } else if (0xffff == c1) - return 0; + if (c1 != c2) + return c1-c2; } + return len1 - len2; } private final TermInfo termInfo = new TermInfo(); // minimize consing @@ -815,7 +812,8 @@ for(int i=1;i out.length) { - byte[] newOut = new byte[2*out.length]; - assert newOut.length >= upto+4; - System.arraycopy(out, 0, newOut, 0, upto); - result.result = out = newOut; - } - if (code < 0x80) - out[upto++] = (byte) code; - else if (code < 0x800) { - out[upto++] = (byte) (0xC0 | (code >> 6)); - out[upto++] = (byte)(0x80 | (code & 0x3F)); - } else if (code < 0xD800 || code > 0xDFFF) { - if (code == 0xffff) - // END - break; - out[upto++] = (byte)(0xE0 | (code >> 12)); - out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); - out[upto++] = (byte)(0x80 | (code & 0x3F)); - } else { - // surrogate pair - // confirm valid high surrogate - if (code < 0xDC00 && source[i] != 0xffff) { - int utf32 = (int) source[i]; - // confirm valid low surrogate and write pair - if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { - utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); - i++; - out[upto++] = (byte)(0xF0 | (utf32 >> 18)); - out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); - out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); - out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); - continue; - } - } - // replace unpaired surrogate or out-of-order low surrogate - // with substitution character - out[upto++] = (byte) 0xEF; - out[upto++] = (byte) 0xBF; - out[upto++] = (byte) 0xBD; - } - } - //assert matches(source, offset, i-offset-1, out, upto); - result.length = upto; - } - - /** Encode characters from a char[] source, starting at * offset for length chars. Returns the number of bytes * written to bytesOut. */ public static void UTF16toUTF8(final char[] source, final int offset, final int length, UTF8Result result) {