Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt (revision 985492)
+++ lucene/CHANGES.txt (working copy)
@@ -201,6 +201,11 @@
 * LUCENE-2559: Added SegmentReader.reopen methods (John Wang via Mike
   McCandless)
 
+* LUCENE-2590: Added Scorer.visitSubScorers, and Scorer.freq. Along
+  with a custom Collector these experimental methods make it possible
+  to gather the hit-count per sub-clause and per document while a
+  search is running. (Mike McCandless)
+
 Optimizations
 
 * LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.
Index: lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java (revision 985492)
+++ lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java (working copy)
@@ -90,7 +90,7 @@
       }
 
     }};
-    BooleanScorer bs = new BooleanScorer(sim, 1, Arrays.asList(scorers), null);
+    BooleanScorer bs = new BooleanScorer(null, sim, 1, Arrays.asList(scorers), null);
 
     assertEquals("should have received 3000", 3000, bs.nextDoc());
     assertEquals("should have received NO_MORE_DOCS", DocIdSetIterator.NO_MORE_DOCS, bs.nextDoc());
Index: lucene/src/test/org/apache/lucene/search/TestSubScorerFreqs.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestSubScorerFreqs.java (revision 0)
+++ lucene/src/test/org/apache/lucene/search/TestSubScorerFreqs.java (revision 0)
@@ -0,0 +1,141 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.util.*;
+import org.apache.lucene.store.*;
+import java.util.*;
+import java.io.*;
+
+import org.junit.Test;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import static org.junit.Assert.*;
+
+public class TestSubScorerFreqs extends LuceneTestCaseJ4 {
+
+  private static Directory dir;
+  private static IndexSearcher s;
+
+  @BeforeClass
+  public static void makeIndex() throws Exception {
+    dir = new MockRAMDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(newStaticRandom(TestSubScorerFreqs.class), dir);
+    Document doc = new Document();
+    doc.add(new Field("f", "a b c d b c d c d d", Field.Store.NO, Field.Index.ANALYZED));
+    w.addDocument(doc);
+
+    doc = new Document();
+    doc.add(new Field("f", "a b c d", Field.Store.NO, Field.Index.ANALYZED));
+    w.addDocument(doc);
+
+    s = new IndexSearcher(w.getReader());
+    w.close();
+  }
+
+  @AfterClass
+  public static void finish() throws Exception {
+    s.getIndexReader().close();
+    s.close();
+    dir.close();
+  }
+
+  private static class CountingCollector extends Collector {
+    private final Collector other;
+    private int docBase;
+
+    private Map<Query, Scorer> subScorers;
+    public final Map<Integer, Map<Query, Float>> docCounts = new HashMap<Integer, Map<Query, Float>>();
+
+    public CountingCollector(Collector other) {
+      this.other = other;
+    }
+
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      other.setScorer(scorer);
+      subScorers = new HashMap<Query, Scorer>();
+      scorer.visitSubScorers(new Scorer.VisitSubScorer() {
+        public void visit(Query parent, BooleanClause.Occur relationship, Query child, Scorer childScorer) {
+          subScorers.put(child, childScorer);
+        }
+      });
+    }
+
+    @Override
+    public void collect(int doc) throws IOException {
+      final Map<Query, Float> freqs = new HashMap<Query, Float>();
+      for(Map.Entry<Query, Scorer> ent : subScorers.entrySet()) {
+        freqs.put(ent.getKey(), ent.getValue().freq());
+      }
+      docCounts.put(doc+docBase, freqs);
+      other.collect(doc);
+    }
+
+    @Override
+    public void setNextReader(IndexReader reader, int docBase) throws IOException {
+      this.docBase = docBase;
+      other.setNextReader(reader, docBase);
+    }
+
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      return other.acceptsDocsOutOfOrder();
+    }
+  }
+
+  private static final float FLOAT_TOLERANCE = 0.00001F;
+
+  @Test
+  public void testTermQuery() throws Exception {
+    TermQuery q = new TermQuery(new Term("f", "d"));
+    CountingCollector c = new CountingCollector(TopScoreDocCollector.create(10, true));
+    s.search(q, null, c);
+
+    assertEquals(2, c.docCounts.size());
+
+    Map<Query, Float> doc0 = c.docCounts.get(0);
+    assertEquals(1, doc0.size());
+    assertEquals(4.0F, doc0.get(q), FLOAT_TOLERANCE);
+
+    Map<Query, Float> doc1 = c.docCounts.get(1);
+    assertEquals(1, doc1.size());
+    assertEquals(1.0F, doc1.get(q), FLOAT_TOLERANCE);
+  }
+
+  @Test
+  public void testPhraseQuery() throws Exception {
+    PhraseQuery q = new PhraseQuery();
+    q.add(new Term("f", "b"));
+    q.add(new Term("f", "c"));
+    CountingCollector c = new CountingCollector(TopScoreDocCollector.create(10, true));
+    s.search(q, null, c);
+
+    assertEquals(2, c.docCounts.size());
+
+    Map<Query, Float> doc0 = c.docCounts.get(0);
+    assertEquals(1, doc0.size());
+    assertEquals(2.0F, doc0.get(q), FLOAT_TOLERANCE);
+
+    Map<Query, Float> doc1 = c.docCounts.get(1);
+    assertEquals(1, doc1.size());
+    assertEquals(1.0F, doc1.get(q), FLOAT_TOLERANCE);
+  }
+}
Index: lucene/src/java/org/apache/lucene/search/BooleanScorer2.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/BooleanScorer2.java (revision 985492)
+++ lucene/src/java/org/apache/lucene/search/BooleanScorer2.java (working copy)
@@ -59,7 +59,8 @@
 
   /** The number of optionalScorers that need to match (if there are any) */
   private final int minNrShouldMatch;
-
+  private final BooleanQuery q;
+
   private int doc = -1;
 
   /**
@@ -80,12 +81,13 @@
    * @param optional
    *          the list of optional scorers.
    */
-  public BooleanScorer2(Similarity similarity, int minNrShouldMatch,
+  public BooleanScorer2(BooleanQuery q, Similarity similarity, int minNrShouldMatch,
       List<Scorer> required, List<Scorer> prohibited, List<Scorer> optional) throws IOException {
     super(similarity);
     if (minNrShouldMatch < 0) {
       throw new IllegalArgumentException("Minimum number of optional scorers should not be negative");
     }
+    this.q = q;
     coordinator = new Coordinator();
     this.minNrShouldMatch = minNrShouldMatch;
 
@@ -305,9 +307,27 @@
   }
 
   @Override
+  public float freq() {
+    return coordinator.nrMatchers;
+  }
+
+  @Override
   public int advance(int target) throws IOException {
     return doc = countingSumScorer.advance(target);
   }
 
 
-}
+  @Override
+  protected void visitSubScorers(Query parent, BooleanClause.Occur relationship, VisitSubScorer visit) {
+    visit.visit(parent, relationship, q, this);
+    for(Scorer s : optionalScorers) {
+      s.visitSubScorers(q, BooleanClause.Occur.SHOULD, visit);
+    }
+    for(Scorer s : prohibitedScorers) {
+      s.visitSubScorers(q, BooleanClause.Occur.MUST_NOT, visit);
+    }
+    for(Scorer s : requiredScorers) {
+      s.visitSubScorers(q, BooleanClause.Occur.MUST, visit);
+    }
+  }
+}
Index: lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (revision 985492)
+++ lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (working copy)
@@ -271,11 +271,7 @@
       int d = scorer.advance(doc);
       float phraseFreq;
       if (d == doc) {
-        if (slop == 0) {
-          phraseFreq = ((ExactPhraseScorer) scorer).currentFreq();
-        } else {
-          phraseFreq = ((SloppyPhraseScorer) scorer).currentFreq();
-        }
+        phraseFreq = scorer.freq();
       } else {
         phraseFreq = 0.0f;
       }
Index: lucene/src/java/org/apache/lucene/search/PhraseScorer.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/PhraseScorer.java (revision 985492)
+++ lucene/src/java/org/apache/lucene/search/PhraseScorer.java (working copy)
@@ -129,8 +129,11 @@
   /**
    * phrase frequency in current doc as computed by phraseFreq().
    */
-  public final float currentFreq() { return freq; }
-
+  @Override
+  public final float freq() {
+    return freq;
+  }
+
   /**
    * For a document containing all the phrase query terms, compute the
    * frequency of the phrase in that document.
@@ -180,4 +183,8 @@
   @Override
   public String toString() { return "scorer(" + weight + ")"; }
 
+  @Override
+  protected void visitSubScorers(Query parent, BooleanClause.Occur relationship, VisitSubScorer visit) {
+    visit.visit(parent, relationship, weight.getQuery(), this);
+  }
 }
Index: lucene/src/java/org/apache/lucene/search/BooleanQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/BooleanQuery.java (revision 985492)
+++ lucene/src/java/org/apache/lucene/search/BooleanQuery.java (working copy)
@@ -319,7 +319,7 @@
 
       // Check if we can return a BooleanScorer
       if (!scoreDocsInOrder && topScorer && required.size() == 0 && prohibited.size() < 32) {
-        return new BooleanScorer(similarity, minNrShouldMatch, optional, prohibited);
+        return new BooleanScorer(BooleanQuery.this, similarity, minNrShouldMatch, optional, prohibited);
       }
 
       if (required.size() == 0 && optional.size() == 0) {
@@ -333,7 +333,7 @@
       }
 
       // Return a BooleanScorer2
-      return new BooleanScorer2(similarity, minNrShouldMatch, required, prohibited, optional);
+      return new BooleanScorer2(BooleanQuery.this, similarity, minNrShouldMatch, required, prohibited, optional);
     }
 
     @Override
Index: lucene/src/java/org/apache/lucene/search/PhraseQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/PhraseQuery.java (revision 985492)
+++ lucene/src/java/org/apache/lucene/search/PhraseQuery.java (working copy)
@@ -275,11 +275,7 @@
       int d = scorer.advance(doc);
       float phraseFreq;
       if (d == doc) {
-        if (slop == 0) {
-          phraseFreq = ((ExactPhraseScorer) scorer).currentFreq();
-        } else {
-          phraseFreq = ((SloppyPhraseScorer) scorer).currentFreq();
-        }
+        phraseFreq = scorer.freq();
       } else {
         phraseFreq = 0.0f;
       }
Index: lucene/src/java/org/apache/lucene/search/BooleanScorer.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/BooleanScorer.java (revision 985492)
+++ lucene/src/java/org/apache/lucene/search/BooleanScorer.java (working copy)
@@ -115,6 +115,7 @@
 
     float score;
     int doc = NO_MORE_DOCS;
+    int freq;
 
     public BucketScorer() { super(null); }
 
@@ -125,6 +126,9 @@
     public int docID() { return doc; }
 
     @Override
+    public float freq() { return freq; }
+
+    @Override
     public int nextDoc() throws IOException { return NO_MORE_DOCS; }
 
     @Override
@@ -159,7 +163,8 @@
 
   static final class SubScorer {
     public Scorer scorer;
-    public boolean required = false;
+    // TODO: re-enable this if BQ ever sends us required clauses
+    //public boolean required = false;
     public boolean prohibited = false;
     public Collector collector;
     public SubScorer next;
@@ -167,8 +172,12 @@
     public SubScorer(Scorer scorer, boolean required, boolean prohibited,
         Collector collector, SubScorer next)
       throws IOException {
+      if (required) {
+        throw new IllegalArgumentException("this scorer cannot handle required=true");
+      }
       this.scorer = scorer;
-      this.required = required;
+      // TODO: re-enable this if BQ ever sends us required clauses
+      //this.required = required;
       this.prohibited = prohibited;
       this.collector = collector;
       this.next = next;
@@ -179,17 +188,20 @@
   private BucketTable bucketTable = new BucketTable();
   private int maxCoord = 1;
   private final float[] coordFactors;
-  private int requiredMask = 0;
+  // TODO: re-enable this if BQ ever sends us required clauses
+  //private int requiredMask = 0;
   private int prohibitedMask = 0;
   private int nextMask = 1;
   private final int minNrShouldMatch;
   private int end;
   private Bucket current;
   private int doc = -1;
+  private final BooleanQuery q;
 
-  BooleanScorer(Similarity similarity, int minNrShouldMatch,
+  BooleanScorer(BooleanQuery q, Similarity similarity, int minNrShouldMatch,
       List<Scorer> optionalScorers, List<Scorer> prohibitedScorers) throws IOException {
     super(similarity);
+    this.q = q;
     this.minNrShouldMatch = minNrShouldMatch;
 
     if (optionalScorers != null && optionalScorers.size() > 0) {
@@ -233,8 +245,11 @@
       while (current != null) { // more queued
 
         // check prohibited & required
-        if ((current.bits & prohibitedMask) == 0 &&
-            (current.bits & requiredMask) == requiredMask) {
+        if ((current.bits & prohibitedMask) == 0) {
+
+          // TODO: re-enable this if BQ ever sends us required
+          // clauses
+          //&& (current.bits & requiredMask) == requiredMask) {
 
           if (current.doc >= max){
             tmp = current;
@@ -247,6 +262,7 @@
           if (current.coord >= minNrShouldMatch) {
             bs.score = current.score * coordFactors[current.coord];
             bs.doc = current.doc;
+            bs.freq = current.coord;
             collector.collect(current.doc);
           }
         }
@@ -296,8 +312,9 @@
 
           // check prohibited & required, and minNrShouldMatch
           if ((current.bits & prohibitedMask) == 0 &&
-              (current.bits & requiredMask) == requiredMask &&
               current.coord >= minNrShouldMatch) {
+            // TODO: re-enable this if BQ ever sends us required clauses
+            // (current.bits & requiredMask) == requiredMask &&
             return doc = current.doc;
           }
         }
@@ -342,4 +359,24 @@
     return buffer.toString();
   }
 
+  @Override
+  protected void visitSubScorers(Query parent, BooleanClause.Occur relationship, VisitSubScorer visit) {
+    visit.visit(parent, relationship, q, this);
+    SubScorer sub = scorers;
+    while(sub != null) {
+      // TODO: re-enable this if BQ ever sends us required
+      //clauses
+      //if (sub.required) {
+      //sub.scorer.visitSubScorers(q, BooleanClause.Occur.MUST, visit);
+      if (!sub.prohibited) {
+        sub.scorer.visitSubScorers(q, BooleanClause.Occur.SHOULD, visit);
+      } else {
+        // TODO: maybe it's pointless to do this, but, it is
+        // possible the doc may still be collected, eg foo
+        // OR (bar -fee)
+        sub.scorer.visitSubScorers(q, BooleanClause.Occur.MUST_NOT, visit);
+      }
+      sub = sub.next;
+    }
+  }
 }
Index: lucene/src/java/org/apache/lucene/search/Scorer.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/Scorer.java (revision 985492)
+++ lucene/src/java/org/apache/lucene/search/Scorer.java (working copy)
@@ -94,4 +94,36 @@
    */
   public abstract float score() throws IOException;
 
+  /** Returns number of matches for the current document.
+   *  This returns a float (not int) because
+   *  SloppyPhraseScorer discounts its freq according to how
+   *  "sloppy" the match was.
+   *
+   * @lucene.experimental */
+  public float freq() throws IOException {
+    throw new UnsupportedOperationException(this + " does not implement freq()");
+  }
+
+  /** Expert: used by {@link Scorer#visitSubScorers(VisitSubScorer)}.
+   *
+   * @lucene.experimental */
+  public interface VisitSubScorer {
+    public void visit(Query parent, BooleanClause.Occur relationship, Query child, Scorer childScorer);
+  }
+
+  /** Expert: call this to gather details for all
+   *  sub-scorers for this query. This can be used, in
+   *  conjunction with a custom {@link Collector} to gather
+   *  details about how each sub-query matched the current
+   *  hit.
+   *
+   * @lucene.experimental */
+  public void visitSubScorers(VisitSubScorer callback) {
+    visitSubScorers(null, null, callback);
+  }
+
+  /** @lucene.experimental */
+  protected void visitSubScorers(Query parent, BooleanClause.Occur relationship, VisitSubScorer callback) {
+    throw new UnsupportedOperationException(this + " does not implement visitSubScorers()");
+  }
 }
Index: lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java (revision 985492)
+++ lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java (working copy)
@@ -193,8 +193,8 @@
     return "ExactPhraseScorer(" + weight + ")";
   }
 
-  // used by MultiPhraseQuery
-  float currentFreq() {
+  @Override
+  public float freq() {
     return freq;
   }
 
@@ -331,4 +331,9 @@
 
     return freq;
   }
+
+  @Override
+  protected void visitSubScorers(Query parent, BooleanClause.Occur relationship, VisitSubScorer visit) {
+    visit.visit(parent, relationship, weight.getQuery(), this);
+  }
 }
Index: lucene/src/java/org/apache/lucene/search/TermScorer.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/TermScorer.java (revision 985492)
+++ lucene/src/java/org/apache/lucene/search/TermScorer.java (working copy)
@@ -103,6 +103,11 @@
     return doc;
   }
 
+  @Override
+  public float freq() {
+    return freq;
+  }
+
   /**
    * Advances to the next document matching the query.
    * The iterator over the matching documents is buffered using
@@ -172,4 +177,10 @@
   /** Returns a string representation of this TermScorer. */
   @Override
   public String toString() { return "scorer(" + weight + ")"; }
+
+  @Override
+  protected void visitSubScorers(Query parent, BooleanClause.Occur relationship, VisitSubScorer visit) {
+    visit.visit(parent, relationship, weight.getQuery(), this);
+  }
+
 }