diff -r b9a32a0862d2 lucene/src/java/org/apache/lucene/search/BooleanQuery.java
--- a/lucene/src/java/org/apache/lucene/search/BooleanQuery.java	Sat Oct 09 08:51:15 2010 -0400
+++ b/lucene/src/java/org/apache/lucene/search/BooleanQuery.java	Sat Oct 09 23:01:29 2010 -0400
@@ -31,7 +31,7 @@
   */
 public class BooleanQuery extends Query implements Iterable<BooleanClause> {

-  private static int maxClauseCount = 1024;
+  private static int maxClauseCount = 10240;

   /** Thrown when an attempt is made to add more than {@link
    * #getMaxClauseCount()} clauses. This typically happens if
@@ -368,8 +368,34 @@
     return new BooleanWeight(searcher);
   }

+  static final Comparator<BooleanClause> tqComp = new Comparator<BooleanClause>() {
+    public int compare(BooleanClause one, BooleanClause two) {
+      Query oneQ = one.getQuery();
+      Query twoQ = two.getQuery();
+      boolean oneTQ = oneQ instanceof TermQuery;
+      boolean twoTQ = twoQ instanceof TermQuery;
+      if (oneTQ && twoTQ) {
+        return ((TermQuery) oneQ).getTerm().compareTo(((TermQuery) twoQ).getTerm());
+      } else {
+        //return oneQ.hashCode() - twoQ.hashCode();
+        return 0;
+        /*
+      } else if (oneTQ) {
+        return -1;
+      } else if (twoTQ) {
+        return 1;
+      } else {
+        return 0;
+        */
+
+
+      }
+    }
+  };
+
   @Override
   public Query rewrite(IndexReader reader) throws IOException {
+
     if (minNrShouldMatch == 0 && clauses.size() == 1) {                    // optimize 1-clause queries
       BooleanClause c = clauses.get(0);
       if (!c.isProhibited()) {                  // just return clause
@@ -386,20 +412,18 @@
       }
     }

-    BooleanQuery clone = null;                    // recursively rewrite
+    BooleanQuery clone = (BooleanQuery)this.clone();
+
     for (int i = 0 ; i < clauses.size(); i++) {
       BooleanClause c = clauses.get(i);
       Query query = c.getQuery().rewrite(reader);
       if (query != c.getQuery()) {                     // clause rewrote: must clone
-        if (clone == null)
-          clone = (BooleanQuery)this.clone();
         clone.clauses.set(i, new BooleanClause(query, c.getOccur()));
       }
     }
-    if (clone != null) {
-      return clone;                               // some clauses rewrote
-    } else
-      return this;                                // no clauses rewrote
+
+    Collections.sort(clone.clauses, tqComp);
+    return clone.equals(this) ? this : clone;     // some clauses rewrote
   }

   // inherit javadoc
diff -r b9a32a0862d2 lucene/src/java/org/apache/lucene/search/MultiTermQuery.java
--- a/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java	Sat Oct 09 08:51:15 2010 -0400
+++ b/lucene/src/java/org/apache/lucene/search/MultiTermQuery.java	Sat Oct 09 23:01:29 2010 -0400
@@ -20,6 +20,10 @@
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.PriorityQueue;
+import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;

 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
@@ -31,7 +35,11 @@
 import org.apache.lucene.queryParser.QueryParser; // for javadoc
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.FloatUtil;
+import org.apache.lucene.util.RecyclingByteBlockAllocator;
+import org.apache.lucene.util.ReaderUtil;

 /**
  * An abstract {@link Query} that matches documents
@@ -177,51 +185,48 @@

   private abstract static class BooleanQueryRewrite extends RewriteMethod {
     protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
-      final Fields fields = MultiFields.getFields(reader);
-      if (fields == null) {
-        // reader has no fields
-        return 0;
-      }
+      final List<IndexReader> subReaders = new ArrayList<IndexReader>();
+      ReaderUtil.gatherSubReaders(subReaders, reader);
+      int count = 0;
+
+      for (IndexReader r : subReaders) {
+        final Fields fields = r.fields();
+        if (fields == null) {
+          // reader has no fields
+          continue;
+        }

-      final Terms terms = fields.terms(query.field);
-      if (terms == null) {
-        // field does not exist
-        return 0;
-      }
+        final Terms terms = fields.terms(query.field);
+        if (terms == null) {
+          // field does not exist
+          continue;
+        }

-      final TermsEnum termsEnum = query.getTermsEnum(reader);
-      assert termsEnum != null;
+        final TermsEnum termsEnum = query.getTermsEnum(r);
+        assert termsEnum != null;

-      if (termsEnum == TermsEnum.EMPTY)
-        return 0;
-      final BoostAttribute boostAtt =
-        termsEnum.attributes().addAttribute(BoostAttribute.class);
-      collector.boostAtt = boostAtt;
-      int count = 0;
-      BytesRef bytes;
-      while ((bytes = termsEnum.next()) != null) {
-        if (collector.collect(termsEnum, bytes, boostAtt.getBoost())) {
-          termsEnum.cacheCurrentTerm();
-          count++;
-        } else {
-          break;
+        if (termsEnum == TermsEnum.EMPTY)
+          continue;
+        collector.setNextEnum(termsEnum);
+        BytesRef bytes;
+        while ((bytes = termsEnum.next()) != null) {
+          if (collector.collect(bytes)) {
+            termsEnum.cacheCurrentTerm();
+            count++;
+          } else {
+            return count; // interrupt whole term collection, so also don't iterate other subReaders
+          }
         }
       }
-      collector.boostAtt = null;
       return count;
     }

-    protected static abstract class TermCollector {
-      private BoostAttribute boostAtt = null;
-
+    protected static interface TermCollector {
       /** return false to stop collecting */
-      public abstract boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException;
+      boolean collect(BytesRef bytes) throws IOException;

-      /** set the minimum boost as a hint for the term producer */
-      protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) {
-        assert boostAtt != null;
-        boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost);
-      }
+      /** the next segment's {@link TermsEnum} that is used to collect terms */
+      void setNextEnum(TermsEnum termsEnum) throws IOException;
     }
   }

@@ -230,16 +235,38 @@
     public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException {
       final BooleanQuery result = new BooleanQuery(true);
       final Term placeholderTerm = new Term(query.field);
-      query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() {
-        @Override
-        public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) {
-          // add new TQ, we must clone the term, else it may get overwritten!
-          TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes)), termsEnum.docFreq());
-          tq.setBoost(query.getBoost() * boost); // set the boost
-          result.add(tq, BooleanClause.Occur.SHOULD); // add to query
+      final Map<BytesRef,TermFreqBoost> terms = new HashMap<BytesRef,TermFreqBoost>();
+      collectTerms(reader, query, new TermCollector() {
+        private TermsEnum termsEnum;
+        private BoostAttribute boostAtt;
+
+        public void setNextEnum(TermsEnum termsEnum) throws IOException {
+          this.termsEnum = termsEnum;
+          this.boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
+        }
+
+        public boolean collect(BytesRef bytes) {
+          final TermFreqBoost tfb = terms.get(bytes);
+          if (tfb != null) {
+            tfb.docFreq += termsEnum.docFreq();
+            assert tfb.boost == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
+          } else {
+            // we must clone the term, else it may get overwritten!
+            terms.put(new BytesRef(bytes), new TermFreqBoost(termsEnum.docFreq(), boostAtt.getBoost()));
+          }
+          if (terms.size() > BooleanQuery.getMaxClauseCount())
+            throw new BooleanQuery.TooManyClauses();
           return true;
         }
-      }));
+      });
+
+      for (Map.Entry<BytesRef,TermFreqBoost> e : terms.entrySet()) {
+        final Term t = placeholderTerm.createTerm(e.getKey());
+        TermQuery tq = new TermQuery(t, e.getValue().docFreq);
+        tq.setBoost(query.getBoost() * e.getValue().boost);
+        result.add(tq, BooleanClause.Occur.SHOULD);
+      }
+      query.incTotalNumberOfTerms(terms.size());
       return result;
     }

@@ -247,6 +274,16 @@
     protected Object readResolve() {
       return SCORING_BOOLEAN_QUERY_REWRITE;
     }
+
+    private final class TermFreqBoost {
+      TermFreqBoost(int docFreq, float boost) {
+        this.docFreq = docFreq;
+        this.boost = boost;
+      }
+
+      int docFreq;
+      float boost;
+    }
   }

   /** A rewrite method that first translates each term into
@@ -291,24 +328,55 @@
       final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
       final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
       collectTerms(reader, query, new TermCollector() {
-        @Override
-        public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) {
+        private TermsEnum termsEnum;
+        private BoostAttribute boostAtt;
+        private float boostLie;
+
+        public void setNextEnum(TermsEnum termsEnum) throws IOException {
+          this.termsEnum = termsEnum;
+          boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
+          float minBoost = (stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY;
+          boostLie = minBoost;
+          boostAtt.setMaxNonCompetitiveBoost(FloatUtil.nextAfter(minBoost, Double.NEGATIVE_INFINITY));
+        }
+
+        public boolean collect(BytesRef bytes) {
+          final float boost = boostAtt.getBoost();
           // ignore uncompetetive hits
-          if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost)
+          if (stQueue.size() >= maxSize && boost != boostLie && boost <= stQueue.peek().boost)
             return true;
           // add new entry in PQ, we must clone the term, else it may get overwritten!
           st.bytes.copy(bytes);
           st.boost = boost;
           st.docFreq = termsEnum.docFreq();
-          stQueue.offer(st);
-          // possibly drop entries from queue
-          st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm();
-          setMaxNonCompetitiveBoost((stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
+          final ScoreTerm st2 = visitedTerms.get(st.bytes);
+          if (st2 != null) {
+            // if the term is already in the queue, only update docFreq
+            st2.docFreq += st.docFreq;
+            assert st2.boost == st.boost : "boost should be equal in all segment TermsEnums";
+          } else {
+            visitedTerms.put(st.bytes, st);
+            stQueue.offer(st);
+            // possibly drop entries from queue
+            if (stQueue.size() > maxSize) {
+              st = stQueue.poll();
+              visitedTerms.remove(st.bytes);
+            } else {
+              st = new ScoreTerm();
+            }
+            float minBoost = (stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY;
+            if (minBoost == boostLie) {
+              boostAtt.setMaxNonCompetitiveBoost(FloatUtil.nextAfter(minBoost, Float.NEGATIVE_INFINITY));
+            } else {
+              boostAtt.setMaxNonCompetitiveBoost(minBoost);
+            }
+          }
           return true;
         }

         // reusable instance
         private ScoreTerm st = new ScoreTerm();
+        private final Map<BytesRef,ScoreTerm> visitedTerms = new HashMap<BytesRef,ScoreTerm>();
       });

       final Term placeholderTerm = new Term(query.field);
@@ -341,7 +409,7 @@
       return true;
     }

-    private static class ScoreTerm implements Comparable<ScoreTerm> {
+    private static final class ScoreTerm implements Comparable<ScoreTerm> {
       public final BytesRef bytes = new BytesRef();
       public float boost;
       public int docFreq;
@@ -510,63 +578,56 @@
       final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc());
       final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff);

-      final CutOffTermCollector col = new CutOffTermCollector(reader, query.field, docCountCutoff, termCountLimit);
+      final CutOffTermCollector col = new CutOffTermCollector(docCountCutoff, termCountLimit);
       collectTerms(reader, query, col);

       if (col.hasCutOff) {
         return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query);
-      } else if (col.termCount == 0) {
+      } else if (col.pendingTerms.size() == 0) {
         return new BooleanQuery(true);
       } else {
-        final PagedBytes.Reader bytesReader = col.pendingTerms.freeze(false);
-        try {
-          final BooleanQuery bq = new BooleanQuery(true);
-          final Term placeholderTerm = new Term(query.field);
-          long start = col.startOffset;
-          for(int i = 0; i < col.termCount; i++) {
-            final BytesRef bytes = new BytesRef();
-            start = bytesReader.fillUsingLengthPrefix3(bytes, start);
-            bq.add(new TermQuery(placeholderTerm.createTerm(bytes)), BooleanClause.Occur.SHOULD);
-          }
-          // Strip scores
-          final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
-          result.setBoost(query.getBoost());
-          query.incTotalNumberOfTerms(col.termCount);
-          return result;
-        } finally {
-          bytesReader.close();
+        final BooleanQuery bq = new BooleanQuery(true);
+        final Term placeholderTerm = new Term(query.field);
+        for(int i = 0, c = col.pendingTerms.size(); i < c; i++) {
+          final BytesRef bytes = new BytesRef(col.pendingTerms.get(i));
+          // docFreq is not used for constant score here, we pass 1
+          // to explicitely set a fake value, so it's not calculated
+          bq.add(new TermQuery(placeholderTerm.createTerm(bytes), 1), BooleanClause.Occur.SHOULD);
         }
+        // Strip scores
+        final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq));
+        result.setBoost(query.getBoost());
+        query.incTotalNumberOfTerms(col.pendingTerms.size());
+        return result;
       }
     }

-    private static final class CutOffTermCollector extends TermCollector {
-      CutOffTermCollector(IndexReader reader, String field, int docCountCutoff, int termCountLimit) {
-        this.reader = reader;
-        this.field = field;
+    private static final class CutOffTermCollector implements TermCollector {
+      CutOffTermCollector(int docCountCutoff, int termCountLimit) {
         this.docCountCutoff = docCountCutoff;
         this.termCountLimit = termCountLimit;
       }

-      public boolean collect(TermsEnum termsEnum, BytesRef bytes, float boost) throws IOException {
-        termCount++;
-        if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) {
+      public void setNextEnum(TermsEnum termsEnum) throws IOException {
+        this.termsEnum = termsEnum;
+      }
+
+      public boolean collect(BytesRef bytes) throws IOException {
+        if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) {
          hasCutOff = true;
          return false;
        }
-        pendingTerms.copyUsingLengthPrefix(bytes);
+        pendingTerms.add(bytes);
        docVisitCount += termsEnum.docFreq();
        return true;
      }

       int docVisitCount = 0;
       boolean hasCutOff = false;
-      int termCount = 0;
-
-      final IndexReader reader;
-      final String field;
+      TermsEnum termsEnum;
+
       final int docCountCutoff, termCountLimit;
-      final PagedBytes pendingTerms = new PagedBytes(15); // max term size is 32 KiB
-      final long startOffset = pendingTerms.getPointer();
+      final BytesRefHash pendingTerms = new BytesRefHash(new ByteBlockPool(new RecyclingByteBlockAllocator()));
     }

     @Override