Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/TestExternalCodecs.java (revision 955903) +++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java (working copy) @@ -179,7 +179,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override @@ -263,7 +263,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override Index: lucene/src/test/org/apache/lucene/index/TestIndexWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (revision 955903) +++ lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) @@ -4621,8 +4621,7 @@ private void checkTermsOrder(IndexReader r, Set allTerms, boolean isTop) throws IOException { TermsEnum terms = MultiFields.getFields(r).terms("f").iterator(); - char[] last = new char[2]; - int lastLength = 0; + BytesRef last = new BytesRef(); Set seenTerms = new HashSet(); @@ -4632,27 +4631,13 @@ if (term == null) { break; } - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); - assertTrue(utf16.length <= 2); - // Make sure last term comes before current one, in - // UTF16 sort order - int i = 0; - for(i=0;i pairs in a + Directory. A TermInfos can be written once, in order. */ + +final class TermInfosWriter { + /** The file format version, a negative number. */ + public static final int FORMAT = -3; + + // Changed strings to true utf8 with length-in-bytes not + // length-in-chars + public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; + + // NOTE: always change this if you switch to a new format! 
+ public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; + + private FieldInfos fieldInfos; + private IndexOutput output; + private TermInfo lastTi = new TermInfo(); + private long size; + + // TODO: the default values for these two parameters should be settable from + // IndexWriter. However, once that's done, folks will start setting them to + // ridiculous values and complaining that things don't work well, as with + // mergeFactor. So, let's wait until a number of folks find that alternate + // values work better. Note that both of these values are stored in the + // segment, so that it's safe to change these w/o rebuilding all indexes. + + /** Expert: The fraction of terms in the "dictionary" which should be stored + * in RAM. Smaller values use more memory, but make searching slightly + * faster, while larger values use less memory and make searching slightly + * slower. Searching is typically not dominated by dictionary lookup, so + * tweaking this is rarely useful.*/ + int indexInterval = 128; + + /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, + * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in + * smaller indexes, greater acceleration, but fewer accelerable cases, while + * smaller values result in bigger indexes, less acceleration and more + * accelerable cases. More detailed experiments would be useful here. */ + int skipInterval = 16; + + /** Expert: The maximum number of skip levels. Smaller values result in + * slightly smaller indexes, but slower skipping in big posting lists. 
+ */ + int maxSkipLevels = 10; + + private long lastIndexPointer; + private boolean isIndex; + private byte[] lastTermBytes = new byte[10]; + private int lastTermBytesLength = 0; + private int lastFieldNumber = -1; + + private TermInfosWriter other; + private BytesRef utf8Result = new BytesRef(10); + + TermInfosWriter(Directory directory, String segment, FieldInfos fis, + int interval) + throws IOException { + initialize(directory, segment, fis, interval, false); + other = new TermInfosWriter(directory, segment, fis, interval, true); + other.other = this; + } + + private TermInfosWriter(Directory directory, String segment, FieldInfos fis, + int interval, boolean isIndex) throws IOException { + initialize(directory, segment, fis, interval, isIndex); + } + + private void initialize(Directory directory, String segment, FieldInfos fis, + int interval, boolean isi) throws IOException { + indexInterval = interval; + fieldInfos = fis; + isIndex = isi; + output = directory.createOutput(segment + (isIndex ? 
".tii" : ".tis")); + output.writeInt(FORMAT_CURRENT); // write format + output.writeLong(0); // leave space for size + output.writeInt(indexInterval); // write indexInterval + output.writeInt(skipInterval); // write skipInterval + output.writeInt(maxSkipLevels); // write maxSkipLevels + assert initUTF16Results(); + } + + void add(Term term, TermInfo ti) throws IOException { + UnicodeUtil.UTF16toUTF8(term.text(), 0, term.text().length(), utf8Result); + add(fieldInfos.fieldNumber(term.field()), utf8Result.bytes, utf8Result.length, ti); + } + + // Currently used only by assert statements + UnicodeUtil.UTF16Result utf16Result1; + UnicodeUtil.UTF16Result utf16Result2; + + // Currently used only by assert statements + private boolean initUTF16Results() { + utf16Result1 = new UnicodeUtil.UTF16Result(); + utf16Result2 = new UnicodeUtil.UTF16Result(); + return true; + } + + // Currently used only by assert statement + private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { + + if (lastFieldNumber != fieldNumber) { + final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); + // If there is a field named "" (empty string) then we + // will get 0 on this comparison, yet, it's "OK". But + // it's not OK if two different field numbers map to + // the same name. + if (cmp != 0 || lastFieldNumber != -1) + return cmp; + } + + UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); + UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); + final int len; + if (utf16Result1.length < utf16Result2.length) + len = utf16Result1.length; + else + len = utf16Result2.length; + + System.out.println("cmp len1=" + utf16Result1.length + " len2=" + utf16Result2.length); + for(int i=0;i, TermInfo> pair to the set. + Term must be lexicographically greater than all previous Terms added. 
+ TermInfo pointers must be positive and greater than all previous.*/ + void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) + throws IOException { + + assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || + (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : + "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + + " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + + " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); + + assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; + assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; + + if (!isIndex && size % indexInterval == 0) + other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term + + writeTerm(fieldNumber, termBytes, termBytesLength); // write term + + output.writeVInt(ti.docFreq); // write doc freq + output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers + output.writeVLong(ti.proxPointer - lastTi.proxPointer); + + if (ti.docFreq >= skipInterval) { + output.writeVInt(ti.skipOffset); + } + + if (isIndex) { + output.writeVLong(other.output.getFilePointer() - lastIndexPointer); + lastIndexPointer = other.output.getFilePointer(); // write pointer + } + + lastFieldNumber = fieldNumber; + lastTi.set(ti); + size++; + } + + private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) + throws IOException { + + // TODO: UTF16toUTF8 could tell us this prefix + // Compute prefix in common with last term: + int start = 0; + final int limit = termBytesLength < lastTermBytesLength ? 
termBytesLength : lastTermBytesLength; + while(start < limit) { + if (termBytes[start] != lastTermBytes[start]) + break; + start++; + } + + final int length = termBytesLength - start; + output.writeVInt(start); // write shared prefix length + output.writeVInt(length); // write delta length + output.writeBytes(termBytes, start, length); // write delta bytes + output.writeVInt(fieldNumber); // write field num + if (lastTermBytes.length < termBytesLength) { + lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength); + } + System.arraycopy(termBytes, start, lastTermBytes, start, length); + lastTermBytesLength = termBytesLength; + } + + /** Called to complete TermInfos creation. */ + void close() throws IOException { + output.seek(4); // write size after format + output.writeLong(size); + output.close(); + + if (!isIndex) + other.close(); + } + +} Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 0) +++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 0) @@ -0,0 +1,131 @@ +package org.apache.lucene.index.codecs.preflex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.store.*; +import org.apache.lucene.index.*; +import org.apache.lucene.index.codecs.*; +import org.apache.lucene.util.*; + +import java.util.*; +import java.io.IOException; + +public class TestSurrogates extends LuceneTestCase { + + private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec) throws IOException { + + // nocommit + // final int numField = _TestUtil.nextInt(r, 2, 5); + final int numField = 1; + + List terms = new ArrayList(); + + for(int f=0;f byName = new HashMap(); private int format; - FieldInfos() { } + public FieldInfos() { } /** * Construct a FieldInfos object using the directory and the name of the file @@ -62,7 +62,7 @@ * @param name The name of the file to open the IndexInput from in the Directory * @throws IOException */ - FieldInfos(Directory d, String name) throws IOException { + public FieldInfos(Directory d, String name) throws IOException { IndexInput input = d.openInput(name); try { read(input, name); Index: lucene/src/java/org/apache/lucene/index/TermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/TermsEnum.java (revision 955903) +++ lucene/src/java/org/apache/lucene/index/TermsEnum.java (working copy) @@ -145,7 +145,9 @@ @Override public Comparator getComparator() { // return an unused dummy to prevent NPE - return BytesRef.getUTF8SortedAsUTF16Comparator(); + // nocommit -- ok? 
+ return null; + //return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override Index: lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java =================================================================== --- lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (revision 955903) +++ lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (working copy) @@ -130,7 +130,7 @@ // TODO: we may want to make this sort in same order // as Codec's terms dict? - final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator()); + final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator()); tvf.writeVInt(numPostings); byte bits = 0x0; Index: lucene/src/java/org/apache/lucene/index/CheckIndex.java =================================================================== --- lucene/src/java/org/apache/lucene/index/CheckIndex.java (revision 955903) +++ lucene/src/java/org/apache/lucene/index/CheckIndex.java (working copy) @@ -598,6 +598,7 @@ while(true) { + // nocommit -- assert order? 
final BytesRef term = terms.next(); if (term == null) { break; Index: lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (revision 955903) +++ lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (working copy) @@ -80,7 +80,7 @@ // Terms dict success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -111,7 +111,7 @@ state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUTF16Comparator()); + BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -126,7 +126,7 @@ state.dir, state.fieldInfos, state.segmentInfo.name, pulsingReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUTF16Comparator(), + BytesRef.getUTF8SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; Index: lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (revision 955903) +++ lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (working copy) @@ -63,7 +63,7 @@ success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -95,7 +95,7 @@ state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - 
BytesRef.getUTF8SortedAsUTF16Comparator()); + BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -111,7 +111,7 @@ state.segmentInfo.name, postingsReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUTF16Comparator(), + BytesRef.getUTF8SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; Index: lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (revision 955903) +++ lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (working copy) @@ -58,7 +58,7 @@ success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUTF16Comparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -85,7 +85,7 @@ state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUTF16Comparator()); + BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -101,7 +101,7 @@ state.segmentInfo.name, postings, state.readBufferSize, - BytesRef.getUTF8SortedAsUTF16Comparator(), + BytesRef.getUTF8SortedAsUnicodeComparator(), TERMS_CACHE_SIZE); success = true; return ret; Index: lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (revision 955903) +++ lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (working copy) @@ -67,7 +67,7 @@ success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); + 
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -95,7 +95,7 @@ state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUTF16Comparator()); + BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -111,7 +111,7 @@ state.segmentInfo.name, postingsReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUTF16Comparator(), + BytesRef.getUTF8SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (revision 955903) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (working copy) @@ -39,6 +39,8 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.ArrayUtil; /** Exposes flex API on a pre-flex index, as a codec. 
* @lucene.experimental */ @@ -174,7 +176,6 @@ private class PreFlexFieldsEnum extends FieldsEnum { final Iterator it; private final PreTermsEnum termsEnum; - private int count; FieldInfo current; public PreFlexFieldsEnum() throws IOException { @@ -185,7 +186,6 @@ @Override public String next() { if (it.hasNext()) { - count++; current = it.next(); return current.name; } else { @@ -195,7 +195,7 @@ @Override public TermsEnum terms() throws IOException { - termsEnum.reset(current, count == 1); + termsEnum.reset(current); return termsEnum; } } @@ -209,14 +209,15 @@ @Override public TermsEnum iterator() throws IOException { PreTermsEnum termsEnum = new PreTermsEnum(); - termsEnum.reset(fieldInfo, false); + termsEnum.reset(fieldInfo); return termsEnum; } @Override public Comparator getComparator() { - // Pre-flex indexes always sorted in UTF16 order - return BytesRef.getUTF8SortedAsUTF16Comparator(); + // Pre-flex indexes always sorted in UTF16 order, but + // we remap on-the-fly to unicode order + return BytesRef.getUTF8SortedAsUnicodeComparator(); } } @@ -227,37 +228,147 @@ private BytesRef current; private final BytesRef scratchBytesRef = new BytesRef(); - void reset(FieldInfo fieldInfo, boolean isFirstField) throws IOException { + private int[] surrogateSeekPending = new int[1]; + private int surrogateSeekUpto; + private char[] pendingPrefix; + + private SegmentTermEnum seekTermEnum; + private Term protoTerm; + private int newSuffixStart; + + void reset(FieldInfo fieldInfo) throws IOException { this.fieldInfo = fieldInfo; + protoTerm = new Term(fieldInfo.name); if (termEnum == null) { - // First time reset is called - if (isFirstField) { - termEnum = getTermsDict().terms(); - skipNext = false; - } else { - termEnum = getTermsDict().terms(new Term(fieldInfo.name, "")); - skipNext = true; - } + termEnum = getTermsDict().terms(protoTerm); + seekTermEnum = getTermsDict().terms(protoTerm); } else { - final Term t = termEnum.term(); - if (t != null && t.field() == 
fieldInfo.name) { - // No need to seek -- we have already advanced onto - // this field. We must be @ first term because - // flex API will not advance this enum further, on - // seeing a different field. - } else { - assert t == null || !t.field().equals(fieldInfo.name); // make sure field name is interned - final TermInfosReader tis = getTermsDict(); - tis.seekEnum(termEnum, new Term(fieldInfo.name, "")); + getTermsDict().seekEnum(termEnum, protoTerm); + getTermsDict().seekEnum(seekTermEnum, protoTerm); + } + skipNext = true; + + surrogateSeekPending[0] = 0; + surrogateSeekUpto = 1; + newSuffixStart = 0; + + surrogatesDance(); + } + + private void surrogatesDance() throws IOException { + + // Tricky: prior to 4.0, Lucene index sorted terms in + // UTF16 order, but as of 4.0 we sort by Unicode code + // point order. These orders differ because of the + // surrogates; so we have to fix up our enum, here, by + // carefully first seeking past the surrogates and + // then back again at the end. We use a stack to + // record the pending seek-backs. + + boolean any; + do { + any = false; + any |= popPendingSeek(); + any |= pushNewSurrogate(); + } while(any); + } + + private boolean popPendingSeek() throws IOException { + // if a .next() has advanced beyond the + // after-surrogates range we had last seeked to, we + // must seek back to the start and resume .next from + // there. this pops the pending seek off the stack. 
+ final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1]; + if (newSuffixStart < seekPrefix) { + assert pendingPrefix != null; + assert pendingPrefix.length > seekPrefix; + pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START; + getTermsDict().seekEnum(termEnum, protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix))); + surrogateSeekUpto--; + + // +2 because we don't want to re-check the + // surrogates we just seek'd back to + newSuffixStart = seekPrefix + 2; + return true; + } else { + return false; + } + } + + private boolean pushNewSurrogate() throws IOException { + final String text = termEnum.term().text(); + final int textLen = text.length(); + + for(int i=newSuffixStart;i= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) { + + // the next() that we just did read in a new + // suffix, containing a surrogate pair + + // seek forward to see if there are any terms with + // this same prefix, but with characters after the + // surrogate range; if so, we must first iterate + // them, then seek back to the surrogates + + pendingPrefix = new char[i+1]; + for(int j=0;j getComparator() { - // Pre-flex indexes always sorted in UTF16 order - return BytesRef.getUTF8SortedAsUTF16Comparator(); + // Pre-flex indexes always sorted in UTF16 order, but + // we remap on-the-fly to unicode order + return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override @@ -274,7 +385,11 @@ public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { skipNext = false; final TermInfosReader tis = getTermsDict(); - final Term t0 = new Term(fieldInfo.name, term.utf8ToString()); + final Term t0 = protoTerm.createTerm(term.utf8ToString()); + + // nocommit -- termEnum should never be null? 
+ assert termEnum != null; + if (termEnum == null) { termEnum = tis.terms(t0); } else { @@ -290,6 +405,8 @@ tr = null; } + surrogateSeekUpto = 1; + if (t != null && t.field() == fieldInfo.name && term.bytesEquals(tr)) { current = tr; return SeekStatus.FOUND; @@ -316,6 +433,7 @@ if (termEnum.next()) { final Term t = termEnum.term(); if (t.field() == fieldInfo.name) { + surrogatesDance(); scratchBytesRef.copy(t.text()); current = scratchBytesRef; return current; Index: lucene/src/java/org/apache/lucene/util/BytesRef.java =================================================================== --- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 955903) +++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy) @@ -217,14 +217,7 @@ bytes = ArrayUtil.grow(bytes, newLength); } - private final static Comparator utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator(); - - public static Comparator getUTF8SortedAsUTF16Comparator() { - return utf8SortedAsUTF16SortOrder; - } - /** Unsigned byte order comparison */ - /* public int compareTo(BytesRef other) { if (this == other) return 0; @@ -245,52 +238,18 @@ // One is a prefix of the other, or, they are equal: return this.length - other.length; } - */ - /** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change - * in the future to unsigned byte comparison. 
*/ - public int compareTo(BytesRef other) { - if (this == other) return 0; + private final static Comparator utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator(); - final byte[] aBytes = this.bytes; - int aUpto = this.offset; - final byte[] bBytes = other.bytes; - int bUpto = other.offset; - - final int aStop = aUpto + Math.min(this.length, other.length); - - while(aUpto < aStop) { - int aByte = aBytes[aUpto++] & 0xff; - int bByte = bBytes[bUpto++] & 0xff; - if (aByte != bByte) { - - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 0x10; - } - } - return aByte - bByte; - } - } - - // One is a prefix of the other, or, they are equal: - return this.length - other.length; + public static Comparator getUTF8SortedAsUnicodeComparator() { + return utf8SortedAsUnicodeSortOrder; } - private static class UTF8SortedAsUTF16Comparator implements Comparator { + private static class UTF8SortedAsUnicodeComparator implements Comparator { // Only singleton - private UTF8SortedAsUTF16Comparator() {}; + private UTF8SortedAsUnicodeComparator() {}; public int compare(BytesRef a, BytesRef b) { - final byte[] aBytes = a.bytes; int aUpto = a.offset; final byte[] bBytes = b.bytes; @@ -307,32 +266,15 @@ int aByte = aBytes[aUpto++] & 0xff; int bByte = bBytes[bUpto++] & 0xff; - if (aByte != bByte) { - - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 
0x10; - } - } - return aByte - bByte; + int diff = aByte - bByte; + if (diff != 0) { + return diff; } } // One is a prefix of the other, or, they are equal: return a.length - b.length; - } - - public boolean equals(Object other) { - return this == other; - } + } } public void writeExternal(ObjectOutput out) Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java =================================================================== --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (revision 955903) +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (working copy) @@ -426,7 +426,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } }; } Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java =================================================================== --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (revision 955903) +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (working copy) @@ -123,7 +123,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } } Index: lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java =================================================================== --- lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 955903) +++ lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (working copy) @@ -808,7 +808,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return 
BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override @@ -903,7 +903,7 @@ @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } }