Index: lucene/src/test/org/apache/lucene/index/TestAddIndexes.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (revision 957599) +++ lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (working copy) @@ -464,7 +464,7 @@ private void verifyTermDocs(Directory dir, Term term, int numDocs) throws IOException { IndexReader reader = IndexReader.open(dir, true); - DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, term.field, new BytesRef(term.text)); + DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, term.field, term.bytes); int count = 0; while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) count++; Index: lucene/src/test/org/apache/lucene/index/TestPayloads.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestPayloads.java (revision 957599) +++ lucene/src/test/org/apache/lucene/index/TestPayloads.java (working copy) @@ -188,7 +188,7 @@ Term[] terms = generateTerms(fieldName, numTerms); StringBuilder sb = new StringBuilder(); for (int i = 0; i < terms.length; i++) { - sb.append(terms[i].text); + sb.append(terms[i].text()); sb.append(" "); } String content = sb.toString(); Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 957599) +++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (working copy) @@ -109,8 +109,15 @@ fieldInfos.write(dir, segName); // sorts in UTF16 order, just like preflex: - Collections.sort(terms); + Comparator<Term> utf16comparator = new Comparator<Term>() { + @Override + public int compare(Term o1, Term o2) { + return o1.compareToUTF16(o2); + } + }; + Collections.sort(terms, utf16comparator); + TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 
128); TermInfo ti = new TermInfo(); BytesRef utf8 = new BytesRef(10); Index: lucene/src/java/org/apache/lucene/index/Term.java =================================================================== --- lucene/src/java/org/apache/lucene/index/Term.java (revision 957599) +++ lucene/src/java/org/apache/lucene/index/Term.java (working copy) @@ -17,6 +17,9 @@ * limitations under the License. */ +import java.util.Comparator; + +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; /** @@ -29,15 +32,22 @@ public final class Term implements Comparable<Term>, java.io.Serializable { String field; - String text; + BytesRef bytes; /** Constructs a Term with the given field and text. * <p>Note that a null field or null text value results in undefined * behavior for most Lucene APIs that accept a Term parameter. */ - public Term(String fld, String txt) { + public Term(String fld, BytesRef bytes) { field = fld == null ? null : StringHelper.intern(fld); - text = txt; + this.bytes = bytes; } + + /** Constructs a Term with the given field and text. + * <p>Note that a null field or null text value results in undefined + * behavior for most Lucene APIs that accept a Term parameter. */ + public Term(String fld, String text) { + this(fld, new BytesRef(text)); + } /** Constructs a Term with the given field and empty text. * This serves two purposes: 1) reuse of a Term with the same field. @@ -46,15 +56,20 @@ * @param fld */ public Term(String fld) { - this(fld, "", true); + this(fld, BytesRef.EMPTY, true); } /** @lucene.experimental */ - public Term(String fld, String txt, boolean intern) { + public Term(String fld, BytesRef bytes, boolean intern) { field = intern ? StringHelper.intern(fld) : fld; // field names are interned - text = txt; // unless already known to be + this.bytes = bytes; // unless already known to be } + /** @lucene.experimental */ + public Term(String fld, String text, boolean intern) { + this(fld, new BytesRef(text), intern); + } + /** Returns the field of this term, an interned string. The field indicates the part of a document which this term came from. */ public final String field() { return field; } @@ -62,11 +77,25 @@ /** Returns the text of this term. In the case of words, this is simply the text of the word. In the case of dates and other types, this is an encoding of the object as a string. */ - public final String text() { return text; } - + public final String text() { return bytes.utf8ToString(); } + + /** Returns the bytes of this term. 
*/ + public final BytesRef bytes() { return bytes; } + /** * Optimized construction of new Terms by reusing same field as this Term * - avoids field.intern() overhead + * @param bytes The bytes of the new term (field is implicitly same as this Term instance) + * @return A new Term + */ + public Term createTerm(BytesRef bytes) + { + return new Term(field,bytes,false); + } + + /** + * Optimized construction of new Terms by reusing same field as this Term + * - avoids field.intern() overhead * @param text The text of the new term (field is implicitly same as this Term instance) * @return A new Term */ @@ -89,10 +118,10 @@ return false; } else if (!field.equals(other.field)) return false; - if (text == null) { - if (other.text != null) + if (bytes == null) { + if (other.bytes != null) return false; - } else if (!text.equals(other.text)) + } else if (!bytes.equals(other.bytes)) return false; return true; } @@ -102,7 +131,7 @@ final int prime = 31; int result = 1; result = prime * result + ((field == null) ? 0 : field.hashCode()); - result = prime * result + ((text == null) ? 0 : text.hashCode()); + result = prime * result + ((bytes == null) ? 0 : bytes.hashCode()); return result; } @@ -113,19 +142,41 @@ The ordering of terms is first by field, then by text.*/ public final int compareTo(Term other) { if (field == other.field) // fields are interned - return text.compareTo(other.text); + return bytes.compareTo(other.bytes); else return field.compareTo(other.field); } + + @Deprecated + private static final Comparator<BytesRef> legacyComparator = + BytesRef.getUTF8SortedAsUTF16Comparator(); + /** + * @deprecated For internal backwards compatibility use only + * @lucene.internal + */ + @Deprecated + public final int compareToUTF16(Term other) { + if (field == other.field) // fields are interned + return legacyComparator.compare(this.bytes, other.bytes); + else + return field.compareTo(other.field); + } + /** Resets the field and text of a Term. 
*/ + final void set(String fld, BytesRef bytes) { + field = fld; + this.bytes = bytes; + } + + /** Resets the field and text of a Term. */ final void set(String fld, String txt) { field = fld; - text = txt; + this.bytes = new BytesRef(txt); } @Override - public final String toString() { return field + ":" + text; } + public final String toString() { return field + ":" + bytes.utf8ToString(); } private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException Index: lucene/src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (revision 957599) +++ lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -1166,7 +1166,7 @@ num.setNum(docIDUpto); deletesInRAM.numTerms++; - deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.text.length()*CHAR_NUM_BYTE); + deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.bytes.length); } // Buffer a specific docID for deletion. 
Currently only Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java (revision 957599) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java (working copy) @@ -189,7 +189,7 @@ while (hi >= lo) { int mid = (lo + hi) >>> 1; - int delta = term.compareTo(indexTerms[mid]); + int delta = term.compareToUTF16(indexTerms[mid]); if (delta < 0) hi = mid - 1; else if (delta > 0) @@ -234,17 +234,17 @@ // optimize sequential access: first try scanning cached enum w/o seeking if (enumerator.term() != null // term is at or past current - && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) - || term.compareTo(enumerator.term()) >= 0)) { + && ((enumerator.prev() != null && term.compareToUTF16(enumerator.prev())> 0) + || term.compareToUTF16(enumerator.term()) >= 0)) { int enumOffset = (int)(enumerator.position/totalIndexInterval)+1; if (indexTerms.length == enumOffset // but before end of block - || term.compareTo(indexTerms[enumOffset]) < 0) { + || term.compareToUTF16(indexTerms[enumOffset]) < 0) { // no need to seek final TermInfo ti; int numScans = enumerator.scanTo(term); - if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { + if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); if (numScans > 1) { // we only want to put this TermInfo into the cache if @@ -279,7 +279,7 @@ seekEnum(enumerator, indexPos); enumerator.scanTo(term); final TermInfo ti; - if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { + if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); if (tiOrd == null) { termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, (int) enumerator.position)); @@ -328,9 +328,9 @@ SegmentTermEnum 
enumerator = getThreadResources().termEnum; seekEnum(enumerator, indexOffset); - while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {} + while(term.compareToUTF16(enumerator.term()) > 0 && enumerator.next()) {} - if (term.compareTo(enumerator.term()) == 0) + if (term.compareToUTF16(enumerator.term()) == 0) return enumerator.position; else return -1; Index: lucene/src/java/org/apache/lucene/util/BytesRef.java =================================================================== --- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 957599) +++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy) @@ -32,6 +32,8 @@ public static final byte[] EMPTY_BYTES = new byte[0]; + public static final BytesRef EMPTY = new BytesRef(0); + /** The contents of the BytesRef. Should never be {@code null}. */ public byte[] bytes; @@ -277,6 +279,62 @@ } } + private final static Comparator<BytesRef> utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator(); + + public static Comparator<BytesRef> getUTF8SortedAsUTF16Comparator() { + return utf8SortedAsUTF16SortOrder; + } + + private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> { + // Only singleton + private UTF8SortedAsUTF16Comparator() {}; + + public int compare(BytesRef a, BytesRef b) { + + final byte[] aBytes = a.bytes; + int aUpto = a.offset; + final byte[] bBytes = b.bytes; + int bUpto = b.offset; + + final int aStop; + if (a.length < b.length) { + aStop = aUpto + a.length; + } else { + aStop = aUpto + b.length; + } + + while(aUpto < aStop) { + int aByte = aBytes[aUpto++] & 0xff; + int bByte = bBytes[bUpto++] & 0xff; + + if (aByte != bByte) { + + // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order + + // We know the terms are not equal, but, we may + // have to carefully fixup the bytes at the + // difference to match UTF16's sort order: + if (aByte >= 0xee && bByte >= 0xee) { + if ((aByte & 0xfe) == 0xee) { + aByte += 0x10; + } + if ((bByte&0xfe) == 
0xee) { + bByte += 0x10; + } + } + return aByte - bByte; + } + } + + // One is a prefix of the other, or, they are equal: + return a.length - b.length; + } + + public boolean equals(Object other) { + return this == other; + } + } + public void writeExternal(ObjectOutput out) throws IOException {