Index: lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java =================================================================== --- lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (revision 958138) +++ lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java (working copy) @@ -213,7 +213,7 @@ totalVariantDocFreqs+=fe.docFreq(); float score=boostAtt.getBoost(); if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){ - ScoreTerm st=new ScoreTerm(new Term(startTerm.field(), possibleMatch.utf8ToString()),score,startTerm); + ScoreTerm st=new ScoreTerm(new Term(startTerm.field(), new BytesRef(possibleMatch)),score,startTerm); variantsQ.insertWithOverflow(st); minScore = variantsQ.top().score; // maintain minScore } Index: lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java =================================================================== --- lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java (revision 958138) +++ lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java (working copy) @@ -77,7 +77,7 @@ } if (terms != null) { - br.copy(term.text()); + br.copy(term.bytes()); if (termsEnum.seek(br) == TermsEnum.SeekStatus.FOUND) { docs = termsEnum.docs(delDocs, docs); while(docs.nextDoc() != DocsEnum.NO_MORE_DOCS) { Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (revision 958138) +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (working copy) @@ -237,7 +237,6 @@ private FieldInfo fieldInfo; private boolean skipNext; private BytesRef current; - private final BytesRef scratchBytesRef = new BytesRef(); private int[] surrogateSeekPending = new int[1]; private boolean[] surrogateDidSeekBack = new boolean[1]; @@ -319,7 
+318,8 @@ assert pendingPrefix != null; assert pendingPrefix.length > seekPrefix; pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START; - Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix)); + pendingPrefix[1+seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START; + Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 2+seekPrefix)); if (DEBUG_SURROGATES) { System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text())); } @@ -334,7 +334,7 @@ assert pendingPrefix != null; assert pendingPrefix.length > seekPrefix; pendingPrefix[seekPrefix] = 0xffff; - Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix)); + Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 1+seekPrefix)); if (DEBUG_SURROGATES) { System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text())); } @@ -358,6 +358,9 @@ return false; } + private UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result(); + private UnicodeUtil.UTF16Result seekBuffer = new UnicodeUtil.UTF16Result(); + private boolean pushNewSurrogate() throws IOException { if (DEBUG_SURROGATES) { System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack()); @@ -366,11 +369,12 @@ if (t == null || t.field() != fieldInfo.name) { return false; } - final String text = t.text(); - final int textLen = text.length(); - for(int i=Math.max(0,newSuffixStart);i= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) { if (DEBUG_SURROGATES) { @@ -385,24 +389,27 @@ // surrogate range; if so, we must first iterate // them, then seek back to the surrogates - char[] testPrefix = new char[i+1]; + char[] testPrefix = new char[i+2]; for(int j=0;j= lo) { int mid = (lo + hi) >>> 1; - int delta = term.compareTo(indexTerms[mid]); + int delta = term.compareToUTF16(indexTerms[mid]); if (delta < 0) hi = mid - 1; else if (delta > 0) @@ -234,17 
+234,17 @@ // optimize sequential access: first try scanning cached enum w/o seeking if (enumerator.term() != null // term is at or past current - && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) - || term.compareTo(enumerator.term()) >= 0)) { + && ((enumerator.prev() != null && term.compareToUTF16(enumerator.prev())> 0) + || term.compareToUTF16(enumerator.term()) >= 0)) { int enumOffset = (int)(enumerator.position/totalIndexInterval)+1; if (indexTerms.length == enumOffset // but before end of block - || term.compareTo(indexTerms[enumOffset]) < 0) { + || term.compareToUTF16(indexTerms[enumOffset]) < 0) { // no need to seek final TermInfo ti; int numScans = enumerator.scanTo(term); - if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { + if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); if (numScans > 1) { // we only want to put this TermInfo into the cache if @@ -279,7 +279,7 @@ seekEnum(enumerator, indexPos); enumerator.scanTo(term); final TermInfo ti; - if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { + if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); if (tiOrd == null) { termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, (int) enumerator.position)); @@ -328,9 +328,9 @@ SegmentTermEnum enumerator = getThreadResources().termEnum; seekEnum(enumerator, indexOffset); - while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {} + while(term.compareToUTF16(enumerator.term()) > 0 && enumerator.next()) {} - if (term.compareTo(enumerator.term()) == 0) + if (term.compareToUTF16(enumerator.term()) == 0) return enumerator.position; else return -1; Index: lucene/src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (revision 958138) +++ 
lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -1073,7 +1073,6 @@ TermsEnum termsEnum = null; String currentField = null; - BytesRef termRef = new BytesRef(); DocsEnum docs = null; for (Entry entry: deletesFlushed.terms.entrySet()) { @@ -1097,9 +1096,7 @@ } assert checkDeleteTerm(term); - termRef.copy(term.text()); - - if (termsEnum.seek(termRef, false) == TermsEnum.SeekStatus.FOUND) { + if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) { DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs); if (docsEnum != null) { @@ -1166,7 +1163,7 @@ num.setNum(docIDUpto); deletesInRAM.numTerms++; - deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.text.length()*CHAR_NUM_BYTE); + deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.bytes.length); } // Buffer a specific docID for deletion. Currently only Index: lucene/src/java/org/apache/lucene/index/IndexReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexReader.java (revision 958138) +++ lucene/src/java/org/apache/lucene/index/IndexReader.java (working copy) @@ -883,7 +883,7 @@ public abstract Fields fields() throws IOException; public int docFreq(Term term) throws IOException { - return docFreq(term.field(), new BytesRef(term.text())); + return docFreq(term.field(), term.bytes()); } /** Returns the number of documents containing the term @@ -1000,7 +1000,7 @@ DocsEnum docs = MultiFields.getTermDocsEnum(this, MultiFields.getDeletedDocs(this), term.field(), - new BytesRef(term.text())); + term.bytes()); if (docs == null) return 0; int n = 0; int doc; Index: lucene/src/java/org/apache/lucene/index/Term.java =================================================================== --- lucene/src/java/org/apache/lucene/index/Term.java (revision 958138) +++ lucene/src/java/org/apache/lucene/index/Term.java (working copy) @@ -17,6 +17,9 @@ * limitations under the License. 
*/ +import java.util.Comparator; + +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; /** @@ -29,14 +32,26 @@ public final class Term implements Comparable, java.io.Serializable { String field; - String text; + BytesRef bytes; + /** Constructs a Term with the given field and bytes. + *

Note that a null field or null bytes value results in undefined + * behavior for most Lucene APIs that accept a Term parameter. + *

WARNING: the provided BytesRef is not copied, but used directly. + * Therefore the bytes should not be modified after construction, for + * example, you should clone a copy rather than pass reused bytes from + * a TermsEnum. + */ + public Term(String fld, BytesRef bytes) { + field = fld == null ? null : StringHelper.intern(fld); + this.bytes = bytes; + } + /** Constructs a Term with the given field and text. *

Note that a null field or null text value results in undefined * behavior for most Lucene APIs that accept a Term parameter. */ - public Term(String fld, String txt) { - field = fld == null ? null : StringHelper.intern(fld); - text = txt; + public Term(String fld, String text) { + this(fld, new BytesRef(text)); } /** Constructs a Term with the given field and empty text. @@ -46,15 +61,27 @@ * @param fld */ public Term(String fld) { - this(fld, "", true); + this(fld, new BytesRef(), true); } - /** @lucene.experimental */ - public Term(String fld, String txt, boolean intern) { + /** + * WARNING: the provided BytesRef is not copied, but used directly. + * Therefore the bytes should not be modified after construction, for + * example, you should clone a copy rather than pass reused bytes from + * a TermsEnum. + * + * @lucene.experimental + */ + public Term(String fld, BytesRef bytes, boolean intern) { field = intern ? StringHelper.intern(fld) : fld; // field names are interned - text = txt; // unless already known to be + this.bytes = bytes; // unless already known to be } + /** @lucene.experimental */ + public Term(String fld, String text, boolean intern) { + this(fld, new BytesRef(text), intern); + } + /** Returns the field of this term, an interned string. The field indicates the part of a document which this term came from. */ public final String field() { return field; } @@ -62,11 +89,29 @@ /** Returns the text of this term. In the case of words, this is simply the text of the word. In the case of dates and other types, this is an encoding of the object as a string. */ - public final String text() { return text; } - + public final String text() { return bytes.utf8ToString(); } + + /** Returns the bytes of this term. */ + public final BytesRef bytes() { return bytes; } + /** * Optimized construction of new Terms by reusing same field as this Term * - avoids field.intern() overhead + *

WARNING: the provided BytesRef is not copied, but used directly. + * Therefore the bytes should not be modified after construction, for + * example, you should clone a copy rather than pass reused bytes from + * a TermsEnum. + * @param bytes The bytes of the new term (field is implicitly same as this Term instance) + * @return A new Term + */ + public Term createTerm(BytesRef bytes) + { + return new Term(field,bytes,false); + } + + /** + * Optimized construction of new Terms by reusing same field as this Term + * - avoids field.intern() overhead * @param text The text of the new term (field is implicitly same as this Term instance) * @return A new Term */ @@ -89,10 +134,10 @@ return false; } else if (!field.equals(other.field)) return false; - if (text == null) { - if (other.text != null) + if (bytes == null) { + if (other.bytes != null) return false; - } else if (!text.equals(other.text)) + } else if (!bytes.equals(other.bytes)) return false; return true; } @@ -102,7 +147,7 @@ final int prime = 31; int result = 1; result = prime * result + ((field == null) ? 0 : field.hashCode()); - result = prime * result + ((text == null) ? 0 : text.hashCode()); + result = prime * result + ((bytes == null) ?
0 : bytes.hashCode()); return result; } @@ -113,19 +158,47 @@ The ordering of terms is first by field, then by text.*/ public final int compareTo(Term other) { if (field == other.field) // fields are interned - return text.compareTo(other.text); + return bytes.compareTo(other.bytes); else return field.compareTo(other.field); } + + @Deprecated + private static final Comparator legacyComparator = + BytesRef.getUTF8SortedAsUTF16Comparator(); + /** + * @deprecated For internal backwards compatibility use only + * @lucene.internal + */ + @Deprecated + public final int compareToUTF16(Term other) { + if (field == other.field) // fields are interned + return legacyComparator.compare(this.bytes, other.bytes); + else + return field.compareTo(other.field); + } + + /** + * Resets the field and text of a Term. + *

WARNING: the provided BytesRef is not copied, but used directly. + * Therefore the bytes should not be modified after construction, for + * example, you should clone a copy rather than pass reused bytes from + * a TermsEnum. + */ + final void set(String fld, BytesRef bytes) { + field = fld; + this.bytes = bytes; + } + /** Resets the field and text of a Term. */ final void set(String fld, String txt) { field = fld; - text = txt; + this.bytes = new BytesRef(txt); } @Override - public final String toString() { return field + ":" + text; } + public final String toString() { return field + ":" + bytes.utf8ToString(); } private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException Index: lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java (revision 958138) +++ lucene/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java (working copy) @@ -95,13 +95,20 @@ public class FieldCacheTermsFilter extends Filter { private String field; - private String[] terms; + private BytesRef[] terms; - public FieldCacheTermsFilter(String field, String... terms) { + public FieldCacheTermsFilter(String field, BytesRef... terms) { this.field = field; this.terms = terms; } + public FieldCacheTermsFilter(String field, String... 
terms) { + this.field = field; + this.terms = new BytesRef[terms.length]; + for (int i = 0; i < terms.length; i++) + this.terms[i] = new BytesRef(terms[i]); + } + public FieldCache getFieldCache() { return FieldCache.DEFAULT; } @@ -121,7 +128,7 @@ openBitSet = new OpenBitSet(this.fcsi.size()); final BytesRef spare = new BytesRef(); for (int i=0;i 0) { openBitSet.fastSet(termNumber); } Index: lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (revision 958138) +++ lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (working copy) @@ -499,14 +499,13 @@ List docsEnums = new LinkedList(); final Bits delDocs = MultiFields.getDeletedDocs(indexReader); for (int i = 0; i < terms.length; i++) { - final BytesRef text = new BytesRef(terms[i].text()); DocsAndPositionsEnum postings = indexReader.termPositionsEnum(delDocs, terms[i].field(), - text); + terms[i].bytes()); if (postings != null) { docsEnums.add(postings); } else { - if (MultiFields.getTermDocsEnum(indexReader, delDocs, terms[i].field(), text) != null) { + if (MultiFields.getTermDocsEnum(indexReader, delDocs, terms[i].field(), terms[i].bytes()) != null) { // term does exist, but has no positions throw new IllegalStateException("field \"" + terms[i].field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + terms[i].text() + ")"); } Index: lucene/src/java/org/apache/lucene/search/MultiTermQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/MultiTermQuery.java (revision 958138) +++ lucene/src/java/org/apache/lucene/search/MultiTermQuery.java (working copy) @@ -32,6 +32,7 @@ import org.apache.lucene.queryParser.QueryParser; // for javadoc import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; +import 
org.apache.lucene.util.PagedBytes; /** * An abstract {@link Query} that matches documents @@ -177,11 +178,6 @@ private abstract static class BooleanQueryRewrite extends RewriteMethod { protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { - - if (query.field == null) { - throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery."); - } - final Fields fields = MultiFields.getFields(reader); if (fields == null) { // reader has no fields @@ -203,10 +199,9 @@ termsEnum.attributes().addAttribute(BoostAttribute.class); collector.boostAtt = boostAtt; int count = 0; - BytesRef term; - final Term placeholderTerm = new Term(query.field); - while ((term = termsEnum.next()) != null) { - if (collector.collect(placeholderTerm.createTerm(term.utf8ToString()), boostAtt.getBoost())) { + BytesRef bytes; + while ((bytes = termsEnum.next()) != null) { + if (collector.collect(bytes, boostAtt.getBoost())) { count++; } else { break; @@ -217,15 +212,15 @@ } protected static abstract class TermCollector { - /** this field is only set if a boostAttribute is used (e.g. 
{@link FuzzyTermsEnum}) */ private BoostAttribute boostAtt = null; /** return false to stop collecting */ - public abstract boolean collect(Term t, float boost) throws IOException; + public abstract boolean collect(BytesRef bytes, float boost) throws IOException; /** set the minimum boost as a hint for the term producer */ protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { - if (boostAtt != null) boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); + assert boostAtt != null; + boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); } } } @@ -234,9 +229,11 @@ @Override public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { final BooleanQuery result = new BooleanQuery(true); + final Term placeholderTerm = new Term(query.field); query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() { - public boolean collect(Term t, float boost) { - TermQuery tq = new TermQuery(t); // found a match + public boolean collect(BytesRef bytes, float boost) { + // add new TQ, we must clone the term, else it may get overwritten! 
+ TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes))); tq.setBoost(query.getBoost() * boost); // set the boost result.add(tq, BooleanClause.Occur.SHOULD); // add to query return true; @@ -297,16 +294,16 @@ protected abstract Query getQuery(Term term); @Override - public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { + public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount()); final PriorityQueue stQueue = new PriorityQueue(); collectTerms(reader, query, new TermCollector() { - public boolean collect(Term t, float boost) { + public boolean collect(BytesRef bytes, float boost) { // ignore uncompetetive hits if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost) return true; - // add new entry in PQ - st.term = t; + // add new entry in PQ, we must clone the term, else it may get overwritten! + st.bytes.copy(bytes); st.boost = boost; stQueue.offer(st); // possibly drop entries from queue @@ -319,9 +316,11 @@ private ScoreTerm st = new ScoreTerm(); }); + final Term placeholderTerm = new Term(query.field); final BooleanQuery bq = new BooleanQuery(true); for (final ScoreTerm st : stQueue) { - Query tq = getQuery(st.term); // found a match + // add new query, we must clone the term, else it may get overwritten! + Query tq = getQuery(placeholderTerm.createTerm(st.bytes)); tq.setBoost(query.getBoost() * st.boost); // set the boost bq.add(tq, BooleanClause.Occur.SHOULD); // add to query } @@ -348,12 +347,13 @@ } private static class ScoreTerm implements Comparable { - public Term term; + public final BytesRef bytes = new BytesRef(); public float boost; public int compareTo(ScoreTerm other) { if (this.boost == other.boost) - return other.term.compareTo(this.term); + // TODO: is it OK to use default compare here? 
+ return other.bytes.compareTo(this.bytes); else return Float.compare(this.boost, other.boost); } @@ -530,58 +530,67 @@ final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc()); final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); - final CutOffTermCollector col = new CutOffTermCollector(reader, docCountCutoff, termCountLimit); + final CutOffTermCollector col = new CutOffTermCollector(reader, query.field, docCountCutoff, termCountLimit); collectTerms(reader, query, col); if (col.hasCutOff) { return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query); + } else if (col.termCount == 0) { + return new BooleanQuery(true); } else { - final Query result; - if (col.pendingTerms.isEmpty()) { - result = new BooleanQuery(true); - } else { - BooleanQuery bq = new BooleanQuery(true); - for(Term term : col.pendingTerms) { - TermQuery tq = new TermQuery(term); - bq.add(tq, BooleanClause.Occur.SHOULD); + final PagedBytes.Reader bytesReader = col.pendingTerms.freeze(false); + try { + final BooleanQuery bq = new BooleanQuery(true); + final Term placeholderTerm = new Term(query.field); + long start = col.startOffset; + for(int i = 0; i < col.termCount; i++) { + final BytesRef bytes = new BytesRef(); + start = bytesReader.fillUsingLengthPrefix3(bytes, start); + bq.add(new TermQuery(placeholderTerm.createTerm(bytes)), BooleanClause.Occur.SHOULD); } // Strip scores - result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); + final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); result.setBoost(query.getBoost()); + query.incTotalNumberOfTerms(col.termCount); + return result; + } finally { + bytesReader.close(); } - query.incTotalNumberOfTerms(col.pendingTerms.size()); - return result; } } private static final class CutOffTermCollector extends TermCollector { - CutOffTermCollector(IndexReader reader, int docCountCutoff, int termCountLimit) { + CutOffTermCollector(IndexReader reader, String field, int 
docCountCutoff, int termCountLimit) { this.reader = reader; + this.field = field; this.docCountCutoff = docCountCutoff; this.termCountLimit = termCountLimit; } - public boolean collect(Term t, float boost) throws IOException { - pendingTerms.add(t); - if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { + public boolean collect(BytesRef bytes, float boost) throws IOException { + termCount++; + if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) { hasCutOff = true; return false; } + pendingTerms.copyUsingLengthPrefix(bytes); // Loading the TermInfo from the terms dict here // should not be costly, because 1) the // query/filter will load the TermInfo when it // runs, and 2) the terms dict has a cache: - // @deprecated: in 4.0 use BytesRef for collectTerms() - docVisitCount += reader.docFreq(t); + docVisitCount += reader.docFreq(field, bytes); return true; } int docVisitCount = 0; boolean hasCutOff = false; + int termCount = 0; final IndexReader reader; + final String field; final int docCountCutoff, termCountLimit; - final ArrayList pendingTerms = new ArrayList(); + final PagedBytes pendingTerms = new PagedBytes(15); // max term size is 32 KiB + final long startOffset = pendingTerms.getPointer(); } @Override @@ -647,20 +656,9 @@ */ public MultiTermQuery(final String field) { this.field = field; + assert field != null; } - /** - * Constructs a query matching terms that cannot be represented with a single - * Term. - * @deprecated Use {@link #MultiTermQuery(String)}, as the flex branch can - * only work on one field per terms enum. If you override - * {@link #getTermsEnum(IndexReader)}, you cannot use this ctor. 
- */ - @Deprecated - public MultiTermQuery() { - this(null); - } - /** Returns the field name for this query */ public final String getField() { return field; } Index: lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java =================================================================== --- lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (revision 958138) +++ lucene/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java (working copy) @@ -106,10 +106,6 @@ */ @Override public DocIdSet getDocIdSet(IndexReader reader) throws IOException { - if (query.field == null) { - throw new NullPointerException("If you implement getTermsEnum(), you must specify a non-null field in the constructor of MultiTermQuery."); - } - final Fields fields = MultiFields.getFields(reader); if (fields == null) { // reader has no fields Index: lucene/src/java/org/apache/lucene/search/PhraseQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/PhraseQuery.java (revision 958138) +++ lucene/src/java/org/apache/lucene/search/PhraseQuery.java (working copy) @@ -184,15 +184,14 @@ final Bits delDocs = MultiFields.getDeletedDocs(reader); for (int i = 0; i < terms.size(); i++) { final Term t = terms.get(i); - final BytesRef text = new BytesRef(t.text()); DocsAndPositionsEnum postingsEnum = MultiFields.getTermPositionsEnum(reader, delDocs, t.field(), - text); + t.bytes()); // PhraseQuery on a field that did not index // positions. 
if (postingsEnum == null) { - if (MultiFields.getTermDocsEnum(reader, delDocs, t.field(), text) != null) { + if (MultiFields.getTermDocsEnum(reader, delDocs, t.field(), t.bytes()) != null) { // term does exist, but has no positions throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")"); } else { @@ -200,7 +199,7 @@ return null; } } - postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), text), positions.get(i).intValue()); + postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue()); } // sort by increasing docFreq order Index: lucene/src/java/org/apache/lucene/search/PrefixQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/PrefixQuery.java (revision 958138) +++ lucene/src/java/org/apache/lucene/search/PrefixQuery.java (working copy) @@ -46,7 +46,7 @@ @Override protected TermsEnum getTermsEnum(IndexReader reader) throws IOException { - if (prefix.text().length() == 0) { + if (prefix.bytes().length == 0) { // no prefix -- match all terms for this field: final Terms terms = MultiFields.getTerms(reader, getField()); return (terms != null) ? 
terms.iterator() : TermsEnum.EMPTY; Index: lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java (revision 958138) +++ lucene/src/java/org/apache/lucene/search/PrefixTermsEnum.java (working copy) @@ -36,7 +36,7 @@ public PrefixTermsEnum(IndexReader reader, Term prefix) throws IOException { super(reader, prefix.field()); - setInitialSeekTerm(prefixRef = new BytesRef(prefix.text())); + setInitialSeekTerm(prefixRef = prefix.bytes()); } @Override Index: lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java (revision 958138) +++ lucene/src/java/org/apache/lucene/search/SingleTermsEnum.java (working copy) @@ -41,7 +41,7 @@ */ public SingleTermsEnum(IndexReader reader, Term singleTerm) throws IOException { super(reader, singleTerm.field()); - singleRef = new BytesRef(singleTerm.text()); + singleRef = singleTerm.bytes(); setInitialSeekTerm(singleRef); } Index: lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java (revision 958138) +++ lucene/src/java/org/apache/lucene/search/spans/SpanTermQuery.java (working copy) @@ -85,16 +85,15 @@ public Spans getSpans(final IndexReader reader) throws IOException { // NOTE: debateably, the caller should never pass in a // multi reader... 
- final BytesRef textBytes = new BytesRef(term.text()); final DocsAndPositionsEnum postings = MultiFields.getTermPositionsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), - textBytes); + term.bytes()); if (postings != null) { return new TermSpans(postings, term); } else { - if (MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), textBytes) != null) { + if (MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), term.bytes()) != null) { // term does exist, but has no positions throw new IllegalStateException("field \"" + term.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run SpanTermQuery (term=" + term.text() + ")"); } else { Index: lucene/src/java/org/apache/lucene/search/TermQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/TermQuery.java (revision 958138) +++ lucene/src/java/org/apache/lucene/search/TermQuery.java (working copy) @@ -75,7 +75,7 @@ public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { // NOTE: debateably, the caller should never pass in a // multi reader... 
- DocsEnum docs = MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), new BytesRef(term.text())); + DocsEnum docs = MultiFields.getTermDocsEnum(reader, MultiFields.getDeletedDocs(reader), term.field(), term.bytes()); if (docs == null) { return null; } @@ -118,7 +118,7 @@ Explanation tfExplanation = new Explanation(); int tf = 0; - DocsEnum docs = reader.termDocsEnum(MultiFields.getDeletedDocs(reader), term.field(), new BytesRef(term.text())); + DocsEnum docs = reader.termDocsEnum(MultiFields.getDeletedDocs(reader), term.field(), term.bytes()); if (docs != null) { int newDoc = docs.advance(doc); if (newDoc == doc) { Index: lucene/src/java/org/apache/lucene/util/BytesRef.java =================================================================== --- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 958138) +++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy) @@ -77,6 +77,16 @@ this(); copy(text); } + + /** + * @param text Initialize the byte[] from the UTF8 bytes + * for the provided array. This must be well-formed + * unicode text, with no unpaired surrogates or U+FFFF. + */ + public BytesRef(char text[], int offset, int length) { + this(length * 4); + copy(text, offset, length); + } public BytesRef(BytesRef other) { this(); @@ -106,6 +116,15 @@ UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this); } + /** + * Copies the UTF8 bytes for this string. + * + * @param text Must be well-formed unicode text, with no + * unpaired surrogates or invalid UTF16 code units. 
+ */ + public void copy(char text[], int offset, int length) { + UnicodeUtil.UTF16toUTF8(text, offset, length, this); + } public boolean bytesEquals(BytesRef other) { if (length == other.length) { int otherUpto = other.offset; @@ -277,6 +296,62 @@ } } + private final static Comparator utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator(); + + public static Comparator getUTF8SortedAsUTF16Comparator() { + return utf8SortedAsUTF16SortOrder; + } + + private static class UTF8SortedAsUTF16Comparator implements Comparator { + // Only singleton + private UTF8SortedAsUTF16Comparator() {}; + + public int compare(BytesRef a, BytesRef b) { + + final byte[] aBytes = a.bytes; + int aUpto = a.offset; + final byte[] bBytes = b.bytes; + int bUpto = b.offset; + + final int aStop; + if (a.length < b.length) { + aStop = aUpto + a.length; + } else { + aStop = aUpto + b.length; + } + + while(aUpto < aStop) { + int aByte = aBytes[aUpto++] & 0xff; + int bByte = bBytes[bUpto++] & 0xff; + + if (aByte != bByte) { + + // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order + + // We know the terms are not equal, but, we may + // have to carefully fixup the bytes at the + // difference to match UTF16's sort order: + if (aByte >= 0xee && bByte >= 0xee) { + if ((aByte & 0xfe) == 0xee) { + aByte += 0x10; + } + if ((bByte&0xfe) == 0xee) { + bByte += 0x10; + } + } + return aByte - bByte; + } + } + + // One is a prefix of the other, or, they are equal: + return a.length - b.length; + } + + public boolean equals(Object other) { + return this == other; + } + } + public void writeExternal(ObjectOutput out) throws IOException { Index: lucene/src/java/org/apache/lucene/util/PagedBytes.java =================================================================== --- lucene/src/java/org/apache/lucene/util/PagedBytes.java (revision 958138) +++ lucene/src/java/org/apache/lucene/util/PagedBytes.java (working copy) @@ -125,7 +125,27 @@ return index; } + /** 
@lucene.internal Reads length as 1 or 2 byte vInt prefix, starting @ start. + * Returns the start offset of the next part, suitable as start parameter on next call + * to sequentially read all BytesRefs. */ + public long fillUsingLengthPrefix3(BytesRef b, long start) { + final int index = (int) (start >> blockBits); + final int offset = (int) (start & blockMask); + final byte[] block = b.bytes = blocks[index]; + if ((block[offset] & 128) == 0) { + b.length = block[offset]; + b.offset = offset+1; + start += 1L + b.length; + } else { + b.length = (((int) (block[offset] & 0x7f)) << 8) | (block[1+offset] & 0xff); + b.offset = offset+2; + start += 2L + b.length; + assert b.length > 0; + } + return start; + } + /** @lucene.internal */ public byte[][] getBlocks() { return blocks; @@ -230,7 +250,7 @@ /** Commits final byte[], trimming it if necessary and if trim=true */ public Reader freeze(boolean trim) { - if (upto < blockSize) { + if (trim && upto < blockSize) { final byte[] newBlock = new byte[upto]; System.arraycopy(currentBlock, 0, newBlock, 0, upto); currentBlock = newBlock; Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (revision 958138) +++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (working copy) @@ -76,7 +76,6 @@ private int lastFieldNumber = -1; private TermInfosWriter other; - private BytesRef utf8Result = new BytesRef(10); TermInfosWriter(Directory directory, String segment, FieldInfos fis, int interval) @@ -106,8 +105,7 @@ } void add(Term term, TermInfo ti) throws IOException { - UnicodeUtil.UTF16toUTF8(term.text(), 0, term.text().length(), utf8Result); - add(fieldInfos.fieldNumber(term.field()), utf8Result.bytes, utf8Result.length, ti); + add(fieldInfos.fieldNumber(term.field()), term.bytes().bytes, term.bytes().length, ti); } // Currently used 
only by assert statements Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java =================================================================== --- lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 958138) +++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (working copy) @@ -27,27 +27,8 @@ public class TestSurrogates extends LuceneTestCase { - private static final boolean DEBUG = false; + private static final boolean DEBUG = VERBOSE; - // like Term, but uses BytesRef for text - private static class FieldAndText implements Comparable { - String field; - BytesRef text; - - public FieldAndText(Term t) { - field = t.field(); - text = new BytesRef(t.text()); - } - - public int compareTo(FieldAndText other) { - if (other.field == field) { - return text.compareTo(other.text); - } else { - return field.compareTo(other.field); - } - } - } - // chooses from a very limited alphabet to exacerbate the // surrogate seeking required private static String makeDifficultRandomUnicodeString(Random r) { @@ -75,7 +56,7 @@ return new String(buffer, 0, end); } - private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List fieldTerms) throws IOException { + private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List fieldTerms) throws IOException { final int numField = _TestUtil.nextInt(r, 2, 5); @@ -109,11 +90,14 @@ fieldInfos.write(dir, segName); // sorts in UTF16 order, just like preflex: - Collections.sort(terms); + Collections.sort(terms, new Comparator() { + public int compare(Term o1, Term o2) { + return o1.compareToUTF16(o2); + } + }); TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128); TermInfo ti = new TermInfo(); - BytesRef utf8 = new BytesRef(10); String lastText = null; int uniqueTermCount = 0; if (DEBUG) { @@ -126,23 +110,22 @@ if (lastText != 
null && lastText.equals(text)) { continue; } - fieldTerms.add(new FieldAndText(t)); + fieldTerms.add(t); uniqueTermCount++; lastText = text; - UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8); if (DEBUG) { System.out.println(" " + toHexString(t)); } - w.add(fi.number, utf8.bytes, utf8.length, ti); + w.add(fi.number, t.bytes().bytes, t.bytes().length, ti); } w.close(); Collections.sort(fieldTerms); if (DEBUG) { System.out.println("\nTEST: codepoint order"); - for(FieldAndText t: fieldTerms) { - System.out.println(" " + t.field + ":" + UnicodeUtil.toHexString(t.text.utf8ToString())); + for(Term t: fieldTerms) { + System.out.println(" " + t.field() + ":" + toHexString(t)); } } @@ -164,7 +147,7 @@ Random r = newRandom(); FieldInfos fieldInfos = new FieldInfos(); - List fieldTerms = new ArrayList(); + List fieldTerms = new ArrayList(); SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); // hack alert!! @@ -186,8 +169,8 @@ BytesRef text; BytesRef lastText = null; while((text = termsEnum.next()) != null) { - UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16); if (DEBUG) { + UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16); System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length))); System.out.println(); } @@ -197,8 +180,8 @@ assertTrue(lastText.compareTo(text) < 0); lastText.copy(text); } - assertEquals(fieldTerms.get(termCount).field, field); - assertEquals(fieldTerms.get(termCount).text, text); + assertEquals(fieldTerms.get(termCount).field(), field); + assertEquals(fieldTerms.get(termCount).bytes(), text); termCount++; } if (DEBUG) { Index: lucene/src/test/org/apache/lucene/index/TestAddIndexes.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (revision 958138) +++ lucene/src/test/org/apache/lucene/index/TestAddIndexes.java (working copy) @@ -464,7 +464,7 
@@ private void verifyTermDocs(Directory dir, Term term, int numDocs) throws IOException { IndexReader reader = IndexReader.open(dir, true); - DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, term.field, new BytesRef(term.text)); + DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, term.field, term.bytes); int count = 0; while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) count++; Index: lucene/src/test/org/apache/lucene/index/TestPayloads.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestPayloads.java (revision 958138) +++ lucene/src/test/org/apache/lucene/index/TestPayloads.java (working copy) @@ -188,7 +188,7 @@ Term[] terms = generateTerms(fieldName, numTerms); StringBuilder sb = new StringBuilder(); for (int i = 0; i < terms.length; i++) { - sb.append(terms[i].text); + sb.append(terms[i].text()); sb.append(" "); } String content = sb.toString(); Index: solr/src/java/org/apache/solr/search/SolrIndexSearcher.java =================================================================== --- solr/src/java/org/apache/solr/search/SolrIndexSearcher.java (revision 958138) +++ solr/src/java/org/apache/solr/search/SolrIndexSearcher.java (working copy) @@ -480,7 +480,7 @@ if (fields == null) return -1; Terms terms = fields.terms(t.field()); if (terms == null) return -1; - BytesRef termBytes = new BytesRef(t.text()); + BytesRef termBytes = t.bytes(); DocsEnum docs = terms.docs(MultiFields.getDeletedDocs(reader), termBytes, null); if (docs == null) return -1; int id = docs.nextDoc(); @@ -754,7 +754,7 @@ Fields fields = sir.fields(); Terms terms = fields.terms(t.field()); - BytesRef termBytes = new BytesRef(t.text()); + BytesRef termBytes = t.bytes(); Bits skipDocs = sir.getDeletedDocs(); DocsEnum docsEnum = terms==null ? 
null : terms.docs(skipDocs, termBytes, null); Index: solr/src/java/org/apache/solr/update/DirectUpdateHandler.java =================================================================== --- solr/src/java/org/apache/solr/update/DirectUpdateHandler.java (revision 958138) +++ solr/src/java/org/apache/solr/update/DirectUpdateHandler.java (working copy) @@ -118,7 +118,7 @@ DocsEnum tdocs = MultiFields.getTermDocsEnum(ir, MultiFields.getDeletedDocs(ir), idTerm.field(), - new BytesRef(idTerm.text())); + idTerm.bytes()); if (tdocs != null) { return tdocs.nextDoc() != DocsEnum.NO_MORE_DOCS; } else {