Index: lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java (revision 1448625) +++ lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java (working copy) @@ -342,7 +342,7 @@ // return BooleanScorer for topScorer): // Check if we can return a BooleanScorer - if (!scoreDocsInOrder && topScorer && required.size() == 0) { + if (false && !scoreDocsInOrder && topScorer && required.size() == 0) { return new BooleanScorer(this, disableCoord, minNrShouldMatch, optional, prohibited, maxCoord); } Index: lucene/core/src/java/org/apache/lucene/search/BooleanScorer2.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/BooleanScorer2.java (revision 1448625) +++ lucene/core/src/java/org/apache/lucene/search/BooleanScorer2.java (working copy) @@ -152,23 +152,43 @@ private Scorer countingDisjunctionSumScorer(final List scorers, int minNrShouldMatch) throws IOException { // each scorer from the list counted as a single matcher - return new DisjunctionSumScorer(weight, scorers, minNrShouldMatch) { - private int lastScoredDoc = -1; - // Save the score of lastScoredDoc, so that we don't compute it more than - // once in score(). - private float lastDocScore = Float.NaN; - @Override public float score() throws IOException { - int doc = docID(); - if (doc >= lastScoredDoc) { - if (doc > lastScoredDoc) { - lastDocScore = super.score(); - lastScoredDoc = doc; + if (minNrShouldMatch > 1) { + return new MinShouldMatchScorer(weight, scorers, minNrShouldMatch) { + private int lastScoredDoc = -1; + // Save the score of lastScoredDoc, so that we don't compute it more than + // once in score(). 
+ private float lastDocScore = Float.NaN; + @Override public float score() throws IOException { + int doc = docID(); + if (doc >= lastScoredDoc) { + if (doc > lastScoredDoc) { + lastDocScore = super.score(); + lastScoredDoc = doc; + } + coordinator.nrMatchers += super.nrMatchers; } - coordinator.nrMatchers += super.nrMatchers; + return lastDocScore; } + }; + } else { + return new DisjunctionSumScorer(weight, scorers) { + private int lastScoredDoc = -1; + // Save the score of lastScoredDoc, so that we don't compute it more than + // once in score(). + private float lastDocScore = Float.NaN; + @Override public float score() throws IOException { + int doc = docID(); + if (doc >= lastScoredDoc) { + if (doc > lastScoredDoc) { + lastDocScore = super.score(); + lastScoredDoc = doc; + } + coordinator.nrMatchers += super.nrMatchers; + } return lastDocScore; - } - }; + } + }; + } } private Scorer countingConjunctionSumScorer(boolean disableCoord, @@ -270,7 +290,7 @@ : new ReqExclScorer(requiredCountingSumScorer, ((prohibitedScorers.size() == 1) ? prohibitedScorers.get(0) - : new DisjunctionSumScorer(weight, prohibitedScorers))); + : new MinShouldMatchScorer(weight, prohibitedScorers))); } /** Scores and collects all matching documents. Index: lucene/core/src/java/org/apache/lucene/search/MinShouldMatchScorer.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/MinShouldMatchScorer.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/search/MinShouldMatchScorer.java (working copy) @@ -0,0 +1,321 @@ +package org.apache.lucene.search; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** A Scorer for OR like queries, counterpart of ConjunctionScorer. + * This Scorer implements {@link Scorer#advance(int)} and uses advance() on the given Scorers. + * + * This implementation uses the minimumMatch constraint actively to efficiently + * prune the number of candidates, it is hence a mixture between a pure DisjunctionScorer + * and a ConjunctionScorer. + * + * TODO It might be more efficient to fix the mm-1 lists with largest df, i.e. not requiring a stack: + * This would reduce sorting overhead, but also does not exploit within-list docid distributions. + * Lucene might benefit from Scorers being able to estimate their sparseness. + */ +class MinShouldMatchScorer extends Scorer { + + /** The overall number of non-finalized scorers */ + private int numScorers; + /** The minimum number of scorers that should match */ + private final int mm; + + private final Scorer subScorers[]; // the first numScorers-nrInStack entries are valid + private int nrInHeap; // 0..(numScorers-nrInStack-1) + + /** stack is supposed to contain the most dense subScorers as currently + * indicated by next docid + */ + private final Scorer mmStack[]; // of size mm-1: 0..mm-2 + private int nrInStack; // 0..mm-1 + + /** The document number of the current match. 
*/ + private int doc = -1; + /** The number of subscorers that provide the current match. */ + protected int nrMatchers = -1; + private double score = Float.NaN; + + /** Construct a DisjunctionScorer. + * @param weight The weight to be used. + * @param subScorers A collection of at least two subscorers. + * @param minimumNrMatchers The positive minimum number of subscorers that should + * match to match this query. + *
<br>When minimumNrMatchers is bigger than + * the number of subScorers, + * no matches will be produced. + * <br>
When minimumNrMatchers equals the number of subScorers, + * it more efficient to use ConjunctionScorer. + */ + public MinShouldMatchScorer(Weight weight, List subScorers, int minimumNrMatchers) throws IOException { + super(weight); + this.nrInHeap = this.numScorers = subScorers.size(); + + if (minimumNrMatchers <= 0) { + throw new IllegalArgumentException("Minimum nr of matchers must be positive"); + } + if (numScorers <= 1) { + throw new IllegalArgumentException("There must be at least 2 subScorers"); + } + + this.mm = minimumNrMatchers; + this.subScorers = subScorers.toArray(new Scorer[this.nrInHeap]); + this.mmStack = new Scorer[this.mm-1]; + this.nrInStack = 0; + minheapHeapify(); + } + + /** Construct a DisjunctionScorer, using one as the minimum number + * of matching subscorers. + */ + public MinShouldMatchScorer(Weight weight, List subScorers) throws IOException { + this(weight, subScorers, 1); + } + + @Override + public final Collection getChildren() { + ArrayList children = new ArrayList(numScorers); + for (int i = 0; i < numScorers; i++) { + children.add(new ChildScorer(subScorers[i], "SHOULD")); + } + return children; + } + + @Override + public int nextDoc() throws IOException { + assert doc != NO_MORE_DOCS; + while(true) { + // 1. within heap, call next() on all subScorers on current doc + while (subScorers[0].docID() == doc) { + if (subScorers[0].nextDoc() != NO_MORE_DOCS) { + minheapAdjust(0); + } else { + minheapRemoveRoot(); + numScorers--; + if (numScorers < mm) { + return doc = NO_MORE_DOCS; + } + } + } + // 2. fill up stack with smallest subScorers in heap (here the local docid distribution is used as indicator for subScorer sparseness) + for (int i = nrInStack; i < mm - 1; i++) { + mmStack[i] = subScorers[0]; + minheapRemoveRoot(); + nrInStack++; + } + + evaluateSmallestDocInHeap(); + + // 3. 
keep stack order and iterate, then either + // advance all: throw all back into heap (always guaranteed to consider smallest set of candidates, but does more skipping) + // TODO short-circuit: throw visited back into heap (only visited non-matching subScorer have chance to determine next candidate) + for (int i = nrInStack - 1; i >= 0; i--) { // assume full stack, advance first sparsest subScorer as indicated by next doc + if (mmStack[i].docID() == doc || mmStack[i].advance(doc) != NO_MORE_DOCS) { + if (mmStack[i].docID() == doc) { + nrMatchers++; + score += mmStack[i].score(); + } + // push back into heap, no matter if on doc or after + minheapAdd(mmStack[i]); + nrInStack--; + } else { // subScorer exhausted + nrInStack--; + numScorers--; + if (numScorers < mm) { // too few subScorers left + return doc = NO_MORE_DOCS; + } + } + } + if (nrMatchers >= mm) { // doc satisfies mm constraint + break; + } + } + return doc; + } + + private void evaluateSmallestDocInHeap() throws IOException { + // within heap, subScorer[0] now contains the next candidate doc + doc = subScorers[0].docID(); + if (doc == NO_MORE_DOCS) { + nrMatchers = Integer.MAX_VALUE; // stop looping TODO is this really necessary? TestBooleanMinShouldMatch can live without it + } else { + // within heap, score and count number of matching subScorers + score = subScorers[0].score(); + nrMatchers = 1; + countMatches(1); + countMatches(2); + } + } + + // TODO: this currently scores, but so did the previous impl + // TODO: remove recursion. 
+ // TODO: if we separate scoring, out of here, modify this + // and afterNext() to terminate when nrMatchers == minimumNrMatchers + // then also change freq() to just always compute it from scratch + private void countMatches(int root) throws IOException { + if (root < nrInHeap && subScorers[root].docID() == doc) { + nrMatchers++; + score += subScorers[root].score(); + countMatches((root<<1)+1); + countMatches((root<<1)+2); + } + } + + /** Returns the score of the current document matching the query. + * Initially invalid, until {@link #nextDoc()} is called the first time. + */ + @Override + public float score() throws IOException { + return (float)score; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int freq() throws IOException { + return nrMatchers; + } + + /** + * Advances to the first match beyond the current whose document number is + * greater than or equal to a given target.
+ * The implementation uses the advance() method on the subscorers. + * + * @param target The target document number. + * @return the document whose number is greater than or equal to the given + * target, or -1 if none exist. + */ + @Override + public int advance(int target) throws IOException { + if (numScorers < mm) + return doc = NO_MORE_DOCS; + // advance all Scorers at smaller docs to at least target + while (subScorers[0].docID() < target) { + if (subScorers[0].advance(target) != NO_MORE_DOCS) { + minheapAdjust(0); + } else { + minheapRemoveRoot(); + numScorers--; + if (numScorers < mm ) { + return doc = NO_MORE_DOCS; + } + } + } + + evaluateSmallestDocInHeap(); + + if (nrMatchers >= mm) { + return doc; + } else { + return nextDoc(); + } + } + + + /** + * Organize subScorers into a min heap with scorers generating the earliest document on top. + */ + protected final void minheapHeapify() { + for (int i = (nrInHeap >> 1) - 1; i >= 0; i--) { + minheapAdjust(i); + } + } + + /** + * The subtree of subScorers at root is a min heap except possibly for its root element. + * Bubble the root down as required to make the subtree a heap. 
+ */ + protected final void minheapAdjust(int root) { + Scorer scorer = subScorers[root]; + int doc = scorer.docID(); + int i = root; + while (i <= (nrInHeap >> 1) - 1) { + int lchild = (i << 1) + 1; + Scorer lscorer = subScorers[lchild]; + int ldoc = lscorer.docID(); + int rdoc = Integer.MAX_VALUE, rchild = (i << 1) + 2; + Scorer rscorer = null; + if (rchild < nrInHeap) { + rscorer = subScorers[rchild]; + rdoc = rscorer.docID(); + } + if (ldoc < doc) { + if (rdoc < ldoc) { + subScorers[i] = rscorer; + subScorers[rchild] = scorer; + i = rchild; + } else { + subScorers[i] = lscorer; + subScorers[lchild] = scorer; + i = lchild; + } + } else if (rdoc < doc) { + subScorers[i] = rscorer; + subScorers[rchild] = scorer; + i = rchild; + } else { + return; + } + } + } + + /** + * Remove the root Scorer from subScorers and re-establish it as a heap + */ + protected final void minheapRemoveRoot() { + if (nrInHeap == 1) { + subScorers[0] = null; + nrInHeap = 0; + } else { + subScorers[0] = subScorers[nrInHeap - 1]; + subScorers[nrInHeap - 1] = null; + --nrInHeap; + minheapAdjust(0); + } + } + + /** + * Adds the given Scorer to the heap by adding it at the end and bubbling it up + */ + protected final void minheapAdd(Scorer scorer) { + int i = nrInHeap; + nrInHeap++; + int doc = scorer.docID(); + // find right place for scorer + while (i > 0) { + int parent = (i - 1) >> 1; + Scorer pscorer = subScorers[parent]; + int pdoc = pscorer.docID(); + if (pdoc > doc) { // move root down, make space + subScorers[i] = subScorers[parent]; + i = parent; + } else { // done, found right place + break; + } + } + subScorers[i] = scorer; + } + +} Property changes on: lucene/core/src/java/org/apache/lucene/search/MinShouldMatchScorer.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property