Index: lucene/src/java/org/apache/lucene/search/MultiTermQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/MultiTermQuery.java (revision 1006192) +++ lucene/src/java/org/apache/lucene/search/MultiTermQuery.java (working copy) @@ -20,6 +20,10 @@ import java.io.IOException; import java.io.Serializable; import java.util.PriorityQueue; +import java.util.HashMap; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; @@ -31,7 +35,11 @@ import org.apache.lucene.queryParser.QueryParser; // for javadoc import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; -import org.apache.lucene.util.PagedBytes; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.util.FloatUtil; +import org.apache.lucene.util.RecyclingByteBlockAllocator; +import org.apache.lucene.util.ReaderUtil; /** * An abstract {@link Query} that matches documents @@ -177,51 +185,48 @@ private abstract static class BooleanQueryRewrite extends RewriteMethod { protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { - final Fields fields = MultiFields.getFields(reader); - if (fields == null) { - // reader has no fields - return 0; - } + final List subReaders = new ArrayList(); + ReaderUtil.gatherSubReaders(subReaders, reader); + int count = 0; + + for (IndexReader r : subReaders) { + final Fields fields = r.fields(); + if (fields == null) { + // reader has no fields + continue; + } - final Terms terms = fields.terms(query.field); - if (terms == null) { - // field does not exist - return 0; - } + final Terms terms = fields.terms(query.field); + if (terms == null) { + // field does not exist + continue; + } - final TermsEnum termsEnum = query.getTermsEnum(reader); - assert termsEnum != null; + final TermsEnum termsEnum = query.getTermsEnum(r); + assert termsEnum != null; - if (termsEnum == TermsEnum.EMPTY) - return 0; - final BoostAttribute boostAtt = - termsEnum.attributes().addAttribute(BoostAttribute.class); - collector.boostAtt = boostAtt; - int count = 0; - BytesRef bytes; - while ((bytes = termsEnum.next()) != null) { - if (collector.collect(termsEnum, bytes, boostAtt.getBoost())) { - termsEnum.cacheCurrentTerm(); - count++; - } else { - break; + if (termsEnum == TermsEnum.EMPTY) + continue; + collector.setNextEnum(termsEnum); + BytesRef bytes; + while ((bytes = termsEnum.next()) != null) { + if (collector.collect(bytes)) { + termsEnum.cacheCurrentTerm(); + count++; + } else { + return count; // interrupt whole term collection, so also don't iterate other subReaders + } } } - collector.boostAtt = null; return count; } - protected static abstract class TermCollector { - private BoostAttribute boostAtt = null; - + protected static interface TermCollector { /** return false to stop collecting */ - public abstract boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException; + boolean collect(BytesRef bytes) throws IOException; - /** set the minimum boost as a hint for the term producer */ - protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { - assert boostAtt != null; - boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); - } + /** the next segment's {@link TermsEnum} that is used to collect terms */ + void setNextEnum(TermsEnum termsEnum) throws IOException; } } @@ -230,16 +235,37 @@ public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { final BooleanQuery result = new BooleanQuery(true); final Term placeholderTerm = new Term(query.field); + final Map terms = new HashMap(); query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() { - @Override - public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) { - // add new TQ, we must clone the term, else it may get overwritten! - TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)), termsEnum.docFreq()); - tq.setBoost(query.getBoost() * boost); // set the boost - result.add(tq, BooleanClause.Occur.SHOULD); // add to query + private TermsEnum termsEnum; + private BoostAttribute boostAtt; + + public void setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); + } + + public boolean collect(BytesRef bytes) { + final TermFreqBoost tfb = terms.get(bytes); + if (tfb != null) { + tfb.docFreq += termsEnum.docFreq(); + assert tfb.boost == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums"; + } else { + // we must clone the term, else it may get overwritten! + terms.put(new BytesRef(bytes), new TermFreqBoost(termsEnum.docFreq(), boostAtt.getBoost())); + } + if (terms.size() > BooleanQuery.getMaxClauseCount()) + throw new BooleanQuery.TooManyClauses(); return true; } })); + + for (Map.Entry e : terms.entrySet()) { + final Term t = placeholderTerm.createTerm(e.getKey()); + TermQuery tq = new TermQuery(t, e.getValue().docFreq); + tq.setBoost(query.getBoost() * e.getValue().boost); + result.add(tq, BooleanClause.Occur.SHOULD); + } return result; } @@ -247,6 +273,16 @@ protected Object readResolve() { return SCORING_BOOLEAN_QUERY_REWRITE; } + + private final class TermFreqBoost { + TermFreqBoost(int docFreq, float boost) { + this.docFreq = docFreq; + this.boost = boost; + } + + int docFreq; + float boost; + } } /** A rewrite method that first translates each term into @@ -291,24 +327,55 @@ final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount()); final PriorityQueue stQueue = new PriorityQueue(); collectTerms(reader, query, new TermCollector() { - @Override - public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) { + private TermsEnum termsEnum; + private BoostAttribute boostAtt; + private float boostLie; + + public void setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); + float minBoost = (stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY; + boostLie = minBoost; + boostAtt.setMaxNonCompetitiveBoost(FloatUtil.nextAfter(minBoost, Double.NEGATIVE_INFINITY)); + } + + public boolean collect(BytesRef bytes) { + final float boost = boostAtt.getBoost(); // ignore uncompetetive hits - if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost) + if (stQueue.size() >= maxSize && boost != boostLie && boost <= stQueue.peek().boost) return true; // add new entry in PQ, we must clone the term, else it may get overwritten! st.bytes.copy(bytes); st.boost = boost; st.docFreq = termsEnum.docFreq(); - stQueue.offer(st); - // possibly drop entries from queue - st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm(); - setMaxNonCompetitiveBoost((stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); + final ScoreTerm st2 = visitedTerms.get(st.bytes); + if (st2 != null) { + // if the term is already in the queue, only update docFreq + st2.docFreq += st.docFreq; + assert st2.boost == st.boost : "boost should be equal in all segment TermsEnums"; + } else { + visitedTerms.put(st.bytes, st); + stQueue.offer(st); + // possibly drop entries from queue + if (stQueue.size() > maxSize) { + st = stQueue.poll(); + visitedTerms.remove(st.bytes); + } else { + st = new ScoreTerm(); + } + float minBoost = (stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY; + if (minBoost == boostLie) { + boostAtt.setMaxNonCompetitiveBoost(FloatUtil.nextAfter(minBoost, Float.NEGATIVE_INFINITY)); + } else { + boostAtt.setMaxNonCompetitiveBoost(minBoost); + } + } return true; } // reusable instance private ScoreTerm st = new ScoreTerm(); + private final Map visitedTerms = new HashMap(); }); final Term placeholderTerm = new Term(query.field); @@ -341,7 +408,7 @@ return true; } - private static class ScoreTerm implements Comparable { + private static final class ScoreTerm implements Comparable { public final BytesRef bytes = new BytesRef(); public float boost; public int docFreq; @@ -510,63 +577,56 @@ final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc()); final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); - final CutOffTermCollector col = new CutOffTermCollector(reader, query.field, docCountCutoff, termCountLimit); + final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit); collectTerms(reader, query, col); if (col.hasCutOff) { return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query); - } else if (col.termCount == 0) { + } else if (col.pendingTerms.size() == 0) { return new BooleanQuery(true); } else { - final PagedBytes.Reader bytesReader = col.pendingTerms.freeze(false); - try { - final BooleanQuery bq = new BooleanQuery(true); - final Term placeholderTerm = new Term(query.field); - long start = col.startOffset; - for(int i = 0; i < col.termCount; i++) { - final BytesRef bytes = new BytesRef(); - start = bytesReader.fillUsingLengthPrefix3(bytes, start); - bq.add(new TermQuery(placeholderTerm.createTerm(bytes)), BooleanClause.Occur.SHOULD); - } - // Strip scores - final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); - result.setBoost(query.getBoost()); - query.incTotalNumberOfTerms(col.termCount); - return result; - } finally { - bytesReader.close(); + final BooleanQuery bq = new BooleanQuery(true); + final Term placeholderTerm = new Term(query.field); + for(int i = 0, c = col.pendingTerms.size(); i < c; i++) { + final BytesRef bytes = new BytesRef(col.pendingTerms.get(i)); + // docFreq is not used for constant score here, we pass 1 + // to explicitely set a fake value, so it's not calculated + bq.add(new TermQuery(placeholderTerm.createTerm(bytes), 1), BooleanClause.Occur.SHOULD); } + // Strip scores + final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); + result.setBoost(query.getBoost()); + query.incTotalNumberOfTerms(col.pendingTerms.size()); + return result; } } - private static final class CutOffTermCollector extends TermCollector { - CutOffTermCollector(IndexReader reader, String field, int docCountCutoff, int termCountLimit) { - this.reader = reader; - this.field = field; + private static final class CutOffTermCollector implements TermCollector { + CutOffTermCollector(int docCountCutoff, int termCountLimit) { this.docCountCutoff = docCountCutoff; this.termCountLimit = termCountLimit; } - public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException { - termCount++; - if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) { + public void setNextEnum(TermsEnum termsEnum) throws IOException { + this.termsEnum = termsEnum; + } + + public boolean collect(BytesRef bytes) throws IOException { + if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { hasCutOff = true; return false; } - pendingTerms.copyUsingLengthPrefix(bytes); + pendingTerms.add(bytes); docVisitCount += termsEnum.docFreq(); return true; } int docVisitCount = 0; boolean hasCutOff = false; - int termCount = 0; - - final IndexReader reader; - final String field; + TermsEnum termsEnum; + final int docCountCutoff, termCountLimit; - final PagedBytes pendingTerms = new PagedBytes(15); // max term size is 32 KiB - final long startOffset = pendingTerms.getPointer(); + final BytesRefHash pendingTerms = new BytesRefHash(new ByteBlockPool(new RecyclingByteBlockAllocator())); } @Override Index: lucene/src/java/org/apache/lucene/util/FloatUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/FloatUtil.java (revision 0) +++ lucene/src/java/org/apache/lucene/util/FloatUtil.java (revision 0) @@ -0,0 +1,75 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class FloatUtil { + /** + * Answers a float next to the first given float value in the direction of + * the second given double value. + * + * @param start + * the float value to start + * @param direction + * the double indicating the direction + * @return a float next to the first given float value in the direction of + * the second given double. + * + * @since 1.6 + */ + @SuppressWarnings("boxing") + public static float nextAfter(float start, double direction) { + /* this implementation is from apache harmony's java 6 branch (StrictMath) */ + if (Float.isNaN(start) || Double.isNaN(direction)) { + return Float.NaN; + } + if (0 == start && 0 == direction) { + return new Float(direction); + } + if ((start == Float.MIN_VALUE && direction < start) + || (start == -Float.MIN_VALUE && direction > start)) { + return (start > 0 ? 0f : -0f); + } + if (Float.isInfinite(start) && (direction != start)) { + return (start > 0 ? Float.MAX_VALUE : -Float.MAX_VALUE); + } + if ((start == Float.MAX_VALUE && direction > start) + || (start == -Float.MAX_VALUE && direction < start)) { + return (start > 0 ? Float.POSITIVE_INFINITY + : Float.NEGATIVE_INFINITY); + } + if (direction > start) { + if (start > 0) { + return Float.intBitsToFloat(Float.floatToIntBits(start) + 1); + } + if (start < 0) { + return Float.intBitsToFloat(Float.floatToIntBits(start) - 1); + } + return +Float.MIN_VALUE; + } + if (direction < start) { + if (start > 0) { + return Float.intBitsToFloat(Float.floatToIntBits(start) - 1); + } + if (start < 0) { + return Float.intBitsToFloat(Float.floatToIntBits(start) + 1); + } + return -Float.MIN_VALUE; + } + return Double.valueOf(direction).floatValue(); + } +}