Index: lucene/src/java/org/apache/lucene/search/MultiTermQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/MultiTermQuery.java (revision 1005475) +++ lucene/src/java/org/apache/lucene/search/MultiTermQuery.java (working copy) @@ -20,6 +20,10 @@ import java.io.IOException; import java.io.Serializable; import java.util.PriorityQueue; +import java.util.HashMap; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; @@ -31,7 +35,10 @@ import org.apache.lucene.queryParser.QueryParser; // for javadoc import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; -import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.RecyclingByteBlockAllocator; +import org.apache.lucene.util.ReaderUtil; /** * An abstract {@link Query} that matches documents @@ -177,51 +184,48 @@ private abstract static class BooleanQueryRewrite extends RewriteMethod { protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { - final Fields fields = MultiFields.getFields(reader); - if (fields == null) { - // reader has no fields - return 0; - } + final List subReaders = new ArrayList(); + ReaderUtil.gatherSubReaders(subReaders, reader); + int count = 0; + + for (IndexReader r : subReaders) { + final Fields fields = r.fields(); + if (fields == null) { + // reader has no fields + continue; + } - final Terms terms = fields.terms(query.field); - if (terms == null) { - // field does not exist - return 0; - } + final Terms terms = fields.terms(query.field); + if (terms == null) { + // field does not exist + continue; + } - final TermsEnum termsEnum = query.getTermsEnum(reader); - assert termsEnum != null; + final TermsEnum termsEnum = 
query.getTermsEnum(r); + assert termsEnum != null; - if (termsEnum == TermsEnum.EMPTY) - return 0; - final BoostAttribute boostAtt = - termsEnum.attributes().addAttribute(BoostAttribute.class); - collector.boostAtt = boostAtt; - int count = 0; - BytesRef bytes; - while ((bytes = termsEnum.next()) != null) { - if (collector.collect(termsEnum, bytes, boostAtt.getBoost())) { - termsEnum.cacheCurrentTerm(); - count++; - } else { - break; + if (termsEnum == TermsEnum.EMPTY) + continue; + collector.setNextEnum(termsEnum); + BytesRef bytes; + while ((bytes = termsEnum.next()) != null) { + if (collector.collect(bytes)) { + termsEnum.cacheCurrentTerm(); + count++; + } else { + return count; // interrupt whole term collection, so also don't iterate other subReaders + } } } - collector.boostAtt = null; return count; } - protected static abstract class TermCollector { - private BoostAttribute boostAtt = null; - + protected static interface TermCollector { /** return false to stop collecting */ - public abstract boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException; + boolean collect(BytesRef bytes) throws IOException; - /** set the minimum boost as a hint for the term producer */ - protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { - assert boostAtt != null; - boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); - } + /** the next segment's {@link TermsEnum} that is used to collect terms */ + void setNextEnum(TermsEnum termsEnum) throws IOException; } } @@ -230,16 +234,37 @@ public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { final BooleanQuery result = new BooleanQuery(true); final Term placeholderTerm = new Term(query.field); + final Map terms = new HashMap(); query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() { - @Override - public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) { - // add new TQ, we must clone the term, 
else it may get overwritten! - TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)), termsEnum.docFreq()); - tq.setBoost(query.getBoost() * boost); // set the boost - result.add(tq, BooleanClause.Occur.SHOULD); // add to query + private TermsEnum termsEnum; + private BoostAttribute boostAtt; + + public void setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); + } + + public boolean collect(BytesRef bytes) { + final TermFreqBoost tfb = terms.get(bytes); + if (tfb != null) { + tfb.docFreq += termsEnum.docFreq(); + assert tfb.boost == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums"; + } else { + // we must clone the term, else it may get overwritten! + terms.put(new BytesRef(bytes), new TermFreqBoost(termsEnum.docFreq(), boostAtt.getBoost())); + } + if (terms.size() > BooleanQuery.getMaxClauseCount()) + throw new BooleanQuery.TooManyClauses(); return true; } })); + + for (Map.Entry e : terms.entrySet()) { + final Term t = placeholderTerm.createTerm(e.getKey()); + TermQuery tq = new TermQuery(t, e.getValue().docFreq); + tq.setBoost(query.getBoost() * e.getValue().boost); + result.add(tq, BooleanClause.Occur.SHOULD); + } return result; } @@ -247,6 +272,16 @@ protected Object readResolve() { return SCORING_BOOLEAN_QUERY_REWRITE; } + + private final class TermFreqBoost { + TermFreqBoost(int docFreq, float boost) { + this.docFreq = docFreq; + this.boost = boost; + } + + int docFreq; + float boost; + } } /** A rewrite method that first translates each term into @@ -291,8 +326,17 @@ final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount()); final PriorityQueue stQueue = new PriorityQueue(); collectTerms(reader, query, new TermCollector() { - @Override - public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) { + private TermsEnum termsEnum; + private BoostAttribute boostAtt; + + public void 
setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); + boostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); + } + + public boolean collect(BytesRef bytes) { + final float boost = boostAtt.getBoost(); // ignore uncompetetive hits if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost) return true; @@ -300,15 +344,28 @@ st.bytes.copy(bytes); st.boost = boost; st.docFreq = termsEnum.docFreq(); - stQueue.offer(st); - // possibly drop entries from queue - st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm(); - setMaxNonCompetitiveBoost((stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); + final ScoreTerm st2 = visitedTerms.get(st.bytes); + if (st2 != null) { + // if the term is already in the queue, only update docFreq + st2.docFreq += st.docFreq; + assert st2.boost == st.boost : "boost should be equal in all segment TermsEnums"; + } else { + stQueue.offer(st); + // possibly drop entries from queue + if (stQueue.size() > maxSize) { + st = stQueue.poll(); + visitedTerms.remove(st.bytes); + } else { + st = new ScoreTerm(); + } + boostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); + } return true; } // reusable instance private ScoreTerm st = new ScoreTerm(); + private final Map visitedTerms = new HashMap(); }); final Term placeholderTerm = new Term(query.field); @@ -341,7 +398,7 @@ return true; } - private static class ScoreTerm implements Comparable { + private static final class ScoreTerm implements Comparable { public final BytesRef bytes = new BytesRef(); public float boost; public int docFreq; @@ -510,63 +567,56 @@ final int docCountCutoff = (int) ((docCountPercent / 100.) 
* reader.maxDoc());
     final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);
     
-    final CutOffTermCollector col = new CutOffTermCollector(reader, query.field, docCountCutoff, termCountLimit);
+    final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
     collectTerms(reader, query, col);
     
     if (col.hasCutOff) {
       return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
-    } else if (col.termCount == 0) {
+    } else if (col.pendingTerms.size() == 0) {
       return new BooleanQuery(true);
     } else {
-      final PagedBytes.Reader bytesReader = col.pendingTerms.freeze(false);
-      try {
-        final BooleanQuery bq = new BooleanQuery(true);
-        final Term placeholderTerm = new Term(query.field);
-        long start = col.startOffset;
-        for(int i = 0; i < col.termCount; i++) {
-          final BytesRef bytes = new BytesRef();
-          start = bytesReader.fillUsingLengthPrefix3(bytes, start);
-          bq.add(new TermQuery(placeholderTerm.createTerm(bytes)), BooleanClause.Occur.SHOULD);
-        }
-        // Strip scores
-        final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
-        result.setBoost(query.getBoost());
-        query.incTotalNumberOfTerms(col.termCount);
-        return result;
-      } finally {
-        bytesReader.close();
+      final BooleanQuery bq = new BooleanQuery(true);
+      final Term placeholderTerm = new Term(query.field);
+      for(int i = 0, c = col.pendingTerms.size(); i < c; i++) {
+        final BytesRef bytes = new BytesRef(col.pendingTerms.get(i));
+        // docFreq is not used for constant score here, we pass 1
+        // to explicitly set a fake value, so it's not calculated
+        bq.add(new TermQuery(placeholderTerm.createTerm(bytes), 1), BooleanClause.Occur.SHOULD);
       }
+      // Strip scores
+      final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
+      result.setBoost(query.getBoost());
+      query.incTotalNumberOfTerms(col.pendingTerms.size());
+      return result;
     }
   }
   
-  private static final class CutOffTermCollector extends TermCollector {
-    CutOffTermCollector(IndexReader reader, String 
field, int docCountCutoff, int termCountLimit) { - this.reader = reader; - this.field = field; + private static final class CutOffTermCollector implements TermCollector { + CutOffTermCollector(int docCountCutoff, int termCountLimit) { this.docCountCutoff = docCountCutoff; this.termCountLimit = termCountLimit; } - public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException { - termCount++; - if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) { + public void setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + } + + public boolean collect(BytesRef bytes) throws IOException { + if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { hasCutOff = true; return false; } - pendingTerms.copyUsingLengthPrefix(bytes); + pendingTerms.add(bytes); docVisitCount += termsEnum.docFreq(); return true; } int docVisitCount = 0; boolean hasCutOff = false; - int termCount = 0; - - final IndexReader reader; - final String field; + TermsEnum termsEnum; + final int docCountCutoff, termCountLimit; - final PagedBytes pendingTerms = new PagedBytes(15); // max term size is 32 KiB - final long startOffset = pendingTerms.getPointer(); + final BytesRefHash pendingTerms = new BytesRefHash(new ByteBlockPool(new RecyclingByteBlockAllocator())); } @Override