Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 988223) +++ CHANGES.txt (working copy) @@ -204,6 +204,11 @@ * LUCENE-2559: Added SegmentReader.reopen methods (John Wang via Mike McCandless) +* LUCENE-2590: Added Scorer.visitSubScorers, and Scorer.freq. Along + with a custom Collector these experimental methods make it possible + to gather the hit-count per sub-clause and per document while a + search is running. (Simon Willnauer, Mike McCandless) + Optimizations * LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching. Index: src/java/org/apache/lucene/search/BooleanQuery.java =================================================================== --- src/java/org/apache/lucene/search/BooleanQuery.java (revision 988223) +++ src/java/org/apache/lucene/search/BooleanQuery.java (working copy) @@ -319,7 +319,7 @@ // Check if we can return a BooleanScorer if (!scoreDocsInOrder && topScorer && required.size() == 0 && prohibited.size() < 32) { - return new BooleanScorer(similarity, minNrShouldMatch, optional, prohibited); + return new BooleanScorer(BooleanQuery.this, similarity, minNrShouldMatch, optional, prohibited); } if (required.size() == 0 && optional.size() == 0) { @@ -333,7 +333,7 @@ } // Return a BooleanScorer2 - return new BooleanScorer2(similarity, minNrShouldMatch, required, prohibited, optional); + return new BooleanScorer2(BooleanQuery.this, similarity, minNrShouldMatch, required, prohibited, optional); } @Override Index: src/java/org/apache/lucene/search/BooleanScorer.java =================================================================== --- src/java/org/apache/lucene/search/BooleanScorer.java (revision 988223) +++ src/java/org/apache/lucene/search/BooleanScorer.java (working copy) @@ -21,6 +21,7 @@ import java.util.List; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.BooleanClause.Occur; /* Description from Doug Cutting (excerpted from * LUCENE-1483): @@ 
-115,6 +116,7 @@ float score; int doc = NO_MORE_DOCS; + int freq; public BucketScorer() { super(null); } @@ -125,6 +127,9 @@ public int docID() { return doc; } @Override + public float freq() { return freq; } + + @Override public int nextDoc() throws IOException { return NO_MORE_DOCS; } @Override @@ -159,7 +164,8 @@ static final class SubScorer { public Scorer scorer; - public boolean required = false; + // TODO: re-enable this if BQ ever sends us required clauses + //public boolean required = false; public boolean prohibited = false; public Collector collector; public SubScorer next; @@ -167,8 +173,12 @@ public SubScorer(Scorer scorer, boolean required, boolean prohibited, Collector collector, SubScorer next) throws IOException { + if (required) { + throw new IllegalArgumentException("this scorer cannot handle required=true"); + } this.scorer = scorer; - this.required = required; + // TODO: re-enable this if BQ ever sends us required clauses + //this.required = required; this.prohibited = prohibited; this.collector = collector; this.next = next; @@ -179,17 +189,20 @@ private BucketTable bucketTable = new BucketTable(); private int maxCoord = 1; private final float[] coordFactors; - private int requiredMask = 0; + // TODO: re-enable this if BQ ever sends us required clauses + //private int requiredMask = 0; private int prohibitedMask = 0; private int nextMask = 1; private final int minNrShouldMatch; private int end; private Bucket current; private int doc = -1; + private final BooleanQuery q; - BooleanScorer(Similarity similarity, int minNrShouldMatch, + BooleanScorer(BooleanQuery q, Similarity similarity, int minNrShouldMatch, List optionalScorers, List prohibitedScorers) throws IOException { super(similarity); + this.q = q; this.minNrShouldMatch = minNrShouldMatch; if (optionalScorers != null && optionalScorers.size() > 0) { @@ -233,8 +246,11 @@ while (current != null) { // more queued // check prohibited & required - if ((current.bits & prohibitedMask) == 0 && - 
(current.bits & requiredMask) == requiredMask) { + if ((current.bits & prohibitedMask) == 0) { + + // TODO: re-enable this if BQ ever sends us required + // clauses + //&& (current.bits & requiredMask) == requiredMask) { if (current.doc >= max){ tmp = current; @@ -247,6 +263,7 @@ if (current.coord >= minNrShouldMatch) { bs.score = current.score * coordFactors[current.coord]; bs.doc = current.doc; + bs.freq = current.coord; collector.collect(current.doc); } } @@ -296,8 +313,9 @@ // check prohibited & required, and minNrShouldMatch if ((current.bits & prohibitedMask) == 0 && - (current.bits & requiredMask) == requiredMask && current.coord >= minNrShouldMatch) { + // TODO: re-enable this if BQ ever sends us required clauses + // (current.bits & requiredMask) == requiredMask && return doc = current.doc; } } @@ -341,5 +359,38 @@ buffer.append(")"); return buffer.toString(); } + + /** Reports this scorer to the visitor exactly once, under the given relationship, then visits every clause scorer with this query as parent (SHOULD unless the clause is prohibited, in which case MUST_NOT). */ + @Override + protected void visitSubScorers(Query parent, Occur relationship, ScorerVisitor visitor) { + switch(relationship){ + case MUST: + visitor.visitRequired(parent,q, this); + break; + case MUST_NOT: + visitor.visitProhibited(parent,q, this); + break; + case SHOULD: + visitor.visitOptional(parent,q, this); + break; + } + SubScorer sub = scorers; + while(sub != null) { + // TODO: re-enable this if BQ ever sends us required + //clauses + //if (sub.required) { + //relationship = Occur.MUST; + if (!sub.prohibited) { + relationship = Occur.SHOULD; + } else { + // TODO: maybe it's pointless to do this, but, it is + // possible the doc may still be collected, eg foo + // OR (bar -fee) + relationship = Occur.MUST_NOT; + } + sub.scorer.visitSubScorers(q, relationship, visitor); + sub = sub.next; + } + } } Index: src/java/org/apache/lucene/search/BooleanScorer2.java =================================================================== --- src/java/org/apache/lucene/search/BooleanScorer2.java (revision 988223) +++
src/java/org/apache/lucene/search/BooleanScorer2.java (working copy) @@ -21,6 +21,8 @@ import java.util.ArrayList; import java.util.List; +import org.apache.lucene.search.BooleanClause.Occur; + /* See the description in BooleanScorer.java, comparing * BooleanScorer & BooleanScorer2 */ @@ -59,7 +61,8 @@ /** The number of optionalScorers that need to match (if there are any) */ private final int minNrShouldMatch; - + private final BooleanQuery q; + private int doc = -1; /** @@ -80,12 +83,13 @@ * @param optional * the list of optional scorers. */ - public BooleanScorer2(Similarity similarity, int minNrShouldMatch, + public BooleanScorer2(BooleanQuery q, Similarity similarity, int minNrShouldMatch, List required, List prohibited, List optional) throws IOException { super(similarity); if (minNrShouldMatch < 0) { throw new IllegalArgumentException("Minimum number of optional scorers should not be negative"); } + this.q = q; coordinator = new Coordinator(); this.minNrShouldMatch = minNrShouldMatch; @@ -305,9 +309,36 @@ } @Override + public float freq() { + return coordinator.nrMatchers; + } + + @Override public int advance(int target) throws IOException { return doc = countingSumScorer.advance(target); } -} - + @Override + protected void visitSubScorers(Query parent, Occur relationship, ScorerVisitor visitor) { + switch(relationship){ + case MUST: + visitor.visitRequired(parent,q, this); + break; + case MUST_NOT: + visitor.visitProhibited(parent,q, this); + break; + case SHOULD: + visitor.visitOptional(parent,q, this); + break; + } + for (Scorer s : optionalScorers) { + s.visitSubScorers(q, Occur.SHOULD, visitor); + } + for (Scorer s : prohibitedScorers) { + s.visitSubScorers(q, Occur.MUST_NOT, visitor); + } + for (Scorer s : requiredScorers) { + s.visitSubScorers(q, Occur.MUST, visitor); + } + } +} Index: src/java/org/apache/lucene/search/ExactPhraseScorer.java =================================================================== --- 
src/java/org/apache/lucene/search/ExactPhraseScorer.java (revision 988223) +++ src/java/org/apache/lucene/search/ExactPhraseScorer.java (working copy) @@ -21,6 +21,7 @@ import java.util.Arrays; import org.apache.lucene.index.*; +import org.apache.lucene.search.BooleanClause.Occur; final class ExactPhraseScorer extends Scorer { private final Weight weight; @@ -193,8 +194,8 @@ return "ExactPhraseScorer(" + weight + ")"; } - // used by MultiPhraseQuery - float currentFreq() { + @Override + public float freq() { return freq; } @@ -331,4 +332,20 @@ return freq; } + + @Override + protected void visitSubScorers(Query parent, Occur relationship, ScorerVisitor visitor) { + final Query q = weight.getQuery(); + switch(relationship){ + case MUST: + visitor.visitRequired(parent,q, this); + break; + case MUST_NOT: + visitor.visitProhibited(parent,q, this); + break; + case SHOULD: + visitor.visitOptional(parent,q, this); + break; + } + } } Index: src/java/org/apache/lucene/search/MultiPhraseQuery.java =================================================================== --- src/java/org/apache/lucene/search/MultiPhraseQuery.java (revision 988223) +++ src/java/org/apache/lucene/search/MultiPhraseQuery.java (working copy) @@ -271,11 +271,7 @@ int d = scorer.advance(doc); float phraseFreq; if (d == doc) { - if (slop == 0) { - phraseFreq = ((ExactPhraseScorer) scorer).currentFreq(); - } else { - phraseFreq = ((SloppyPhraseScorer) scorer).currentFreq(); - } + phraseFreq = scorer.freq(); } else { phraseFreq = 0.0f; } Index: src/java/org/apache/lucene/search/PhraseQuery.java =================================================================== --- src/java/org/apache/lucene/search/PhraseQuery.java (revision 988223) +++ src/java/org/apache/lucene/search/PhraseQuery.java (working copy) @@ -275,11 +275,7 @@ int d = scorer.advance(doc); float phraseFreq; if (d == doc) { - if (slop == 0) { - phraseFreq = ((ExactPhraseScorer) scorer).currentFreq(); - } else { - phraseFreq = ((SloppyPhraseScorer) 
scorer).currentFreq(); - } + phraseFreq = scorer.freq(); } else { phraseFreq = 0.0f; } Index: src/java/org/apache/lucene/search/PhraseScorer.java =================================================================== --- src/java/org/apache/lucene/search/PhraseScorer.java (revision 988223) +++ src/java/org/apache/lucene/search/PhraseScorer.java (working copy) @@ -19,6 +19,8 @@ import java.io.IOException; +import org.apache.lucene.search.BooleanClause.Occur; + /** Expert: Scoring functionality for phrase queries. *
A document is considered matching if it contains the phrase-query terms * at "valid" positions. What "valid positions" are @@ -129,8 +131,11 @@ /** * phrase frequency in current doc as computed by phraseFreq(). */ - public final float currentFreq() { return freq; } - + @Override + public final float freq() { + return freq; + } + /** * For a document containing all the phrase query terms, compute the * frequency of the phrase in that document. @@ -180,4 +185,19 @@ @Override public String toString() { return "scorer(" + weight + ")"; } + @Override + protected void visitSubScorers(Query parent, Occur relationship, ScorerVisitor visitor) { + final Query q = weight.getQuery(); + switch(relationship){ + case MUST: + visitor.visitRequired(parent,q, this); + break; + case MUST_NOT: + visitor.visitProhibited(parent,q, this); + break; + case SHOULD: + visitor.visitOptional(parent,q, this); + break; + } + } } Index: src/java/org/apache/lucene/search/Scorer.java =================================================================== --- src/java/org/apache/lucene/search/Scorer.java (revision 988223) +++ src/java/org/apache/lucene/search/Scorer.java (working copy) @@ -19,6 +19,8 @@ import java.io.IOException; +import org.apache.lucene.search.BooleanClause.Occur; + /** * Expert: Common scoring functionality for different types of queries. * @@ -94,4 +96,71 @@ */ public abstract float score() throws IOException; + /** Returns number of matches for the current document. + * This returns a float (not int) because + * SloppyPhraseScorer discounts its freq according to how + * "sloppy" the match was. + * + * @lucene.experimental */ + public float freq() throws IOException { + throw new UnsupportedOperationException(this + " does not implement freq()"); + } + + /** + * A callback to gather information from a scorer and its sub-scorers. 
The + * top-level scorer as well as each of its sub-scorers is passed to + * one of the visit methods, depending on its boolean relationship in + * the query. + * @lucene.experimental + */ + public static abstract class ScorerVisitor<P extends Query, C extends Query, S extends Scorer>

{ + /** + * Invoked for all optional scorers + * + * @param parent the parent query of the child query or null if the child is a top-level query + * @param child the query of the currently visited scorer + * @param scorer the current scorer + */ + public void visitOptional(P parent, C child, S scorer) {} + + /** + * Invoked for all required scorers + * + * @param parent the parent query of the child query or null if the child is a top-level query + * @param child the query of the currently visited scorer + * @param scorer the current scorer + */ + public void visitRequired(P parent, C child, S scorer) {} + + /** + * Invoked for all prohibited scorers + * + * @param parent the parent query of the child query or null if the child is a top-level query + * @param child the query of the currently visited scorer + * @param scorer the current scorer + */ + public void visitProhibited(P parent, C child, S scorer) {} + } + + /** + * Expert: call this to gather details for all sub-scorers for this query. + * This can be used, in conjunction with a custom {@link Collector} to gather + * details about how each sub-query matched the current hit.
+ * + * @param visitor a callback executed for each sub-scorer + * @lucene.experimental + */ + public void visitScorers(ScorerVisitor visitor) { + visitSubScorers(null, Occur.MUST/*MUST is the default*/, visitor); + } + + /** + * {@link Scorer} subclass should implement this method to support gathering + * details for sub-scorers via {@link ScorerVisitor} + * + * @lucene.experimental + */ + protected void visitSubScorers(Query parent, Occur relationship, ScorerVisitor visitor) { + throw new UnsupportedOperationException(); + } } Index: src/java/org/apache/lucene/search/TermScorer.java =================================================================== --- src/java/org/apache/lucene/search/TermScorer.java (revision 988223) +++ src/java/org/apache/lucene/search/TermScorer.java (working copy) @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.search.BooleanClause.Occur; /** Expert: A Scorer for documents matching a Term. */ @@ -103,6 +104,11 @@ return doc; } + @Override + public float freq() { + return freq; + } + /** * Advances to the next document matching the query.
* The iterator over the matching documents is buffered using @@ -172,4 +178,20 @@ /** Returns a string representation of this TermScorer. */ @Override public String toString() { return "scorer(" + weight + ")"; } + + @Override + protected void visitSubScorers(Query parent, Occur relationship, ScorerVisitor visitor) { + final Query q = weight.getQuery(); + switch(relationship){ + case MUST: + visitor.visitRequired(parent,q, this); + break; + case MUST_NOT: + visitor.visitProhibited(parent,q, this); + break; + case SHOULD: + visitor.visitOptional(parent,q, this); + break; + } + } } Index: src/test/org/apache/lucene/search/TestBooleanScorer.java =================================================================== --- src/test/org/apache/lucene/search/TestBooleanScorer.java (revision 988223) +++ src/test/org/apache/lucene/search/TestBooleanScorer.java (working copy) @@ -90,7 +90,7 @@ } }}; - BooleanScorer bs = new BooleanScorer(sim, 1, Arrays.asList(scorers), null); + BooleanScorer bs = new BooleanScorer(null, sim, 1, Arrays.asList(scorers), null); assertEquals("should have received 3000", 3000, bs.nextDoc()); assertEquals("should have received NO_MORE_DOCS", DocIdSetIterator.NO_MORE_DOCS, bs.nextDoc()); Index: src/test/org/apache/lucene/search/TestSubScorerFreqs.java =================================================================== --- src/test/org/apache/lucene/search/TestSubScorerFreqs.java (revision 0) +++ src/test/org/apache/lucene/search/TestSubScorerFreqs.java (revision 0) @@ -0,0 +1,227 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.document.*; +import org.apache.lucene.index.*; +import org.apache.lucene.util.*; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.Scorer.ScorerVisitor; +import org.apache.lucene.store.*; + +import java.util.*; +import java.io.*; + +import org.junit.Test; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +import static org.junit.Assert.*; + +public class TestSubScorerFreqs extends LuceneTestCaseJ4 { + + private static Directory dir; + private static IndexSearcher s; + + @BeforeClass + public static void makeIndex() throws Exception { + dir = new RAMDirectory(); + RandomIndexWriter w = new RandomIndexWriter( + newStaticRandom(TestSubScorerFreqs.class), dir); + // make sure we have more than one segment occasionally + for (int i = 0; i < 31 * RANDOM_MULTIPLIER; i++) { + Document doc = new Document(); + doc.add(new Field("f", "a b c d b c d c d d", Field.Store.NO, + Field.Index.ANALYZED)); + w.addDocument(doc); + + doc = new Document(); + doc.add(new Field("f", "a b c d", Field.Store.NO, Field.Index.ANALYZED)); + w.addDocument(doc); + } + + s = new IndexSearcher(w.getReader()); + w.close(); + } + + @AfterClass + public static void finish() throws Exception { + s.getIndexReader().close(); + s.close(); + dir.close(); + } + + private static class CountingCollector extends Collector { + private final Collector other; + private int docBase; + + public final Map<Integer, Map<Query, Float>> docCounts = new HashMap<Integer, Map<Query, Float>>(); + + private final Map<Query, Scorer> subScorers = new HashMap<Query, Scorer>(); + private final ScorerVisitor<Query, Query, Scorer> visitor =
new MockScorerVisitor(); + private final EnumSet<Occur> collect; + + private class MockScorerVisitor extends ScorerVisitor<Query, Query, Scorer> { + + @Override + public void visitOptional(Query parent, Query child, Scorer scorer) { + if (collect.contains(Occur.SHOULD)) + subScorers.put(child, scorer); + } + + @Override + public void visitProhibited(Query parent, Query child, Scorer scorer) { + if (collect.contains(Occur.MUST_NOT)) + subScorers.put(child, scorer); + } + + @Override + public void visitRequired(Query parent, Query child, Scorer scorer) { + if (collect.contains(Occur.MUST)) + subScorers.put(child, scorer); + } + + } + + public CountingCollector(Collector other) { + this(other, EnumSet.allOf(Occur.class)); + } + + public CountingCollector(Collector other, EnumSet<Occur> collect) { + this.other = other; + this.collect = collect; + } + + @Override + public void setScorer(Scorer scorer) throws IOException { + other.setScorer(scorer); + scorer.visitScorers(visitor); + } + + @Override + public void collect(int doc) throws IOException { + final Map<Query, Float> freqs = new HashMap<Query, Float>(); + for (Map.Entry<Query, Scorer> ent : subScorers.entrySet()) { + Scorer value = ent.getValue(); + int matchId = value.docID(); + freqs.put(ent.getKey(), matchId == doc ?
value.freq() : 0.0f); + } + docCounts.put(doc + docBase, freqs); + other.collect(doc); + } + + @Override + public void setNextReader(IndexReader reader, int docBase) + throws IOException { + this.docBase = docBase; + other.setNextReader(reader, docBase); + } + + @Override + public boolean acceptsDocsOutOfOrder() { + return other.acceptsDocsOutOfOrder(); + } + } + + private static final float FLOAT_TOLERANCE = 0.00001F; + + @Test + public void testTermQuery() throws Exception { + TermQuery q = new TermQuery(new Term("f", "d")); + CountingCollector c = new CountingCollector(TopScoreDocCollector.create(10, + true)); + s.search(q, null, c); + final int maxDocs = s.maxDoc(); + assertEquals(maxDocs, c.docCounts.size()); + for (int i = 0; i < maxDocs; i++) { + Map<Query, Float> doc0 = c.docCounts.get(i); + assertEquals(1, doc0.size()); + assertEquals(4.0F, doc0.get(q), FLOAT_TOLERANCE); + + Map<Query, Float> doc1 = c.docCounts.get(++i); + assertEquals(1, doc1.size()); + assertEquals(1.0F, doc1.get(q), FLOAT_TOLERANCE); + } + } + + @SuppressWarnings("unchecked") + @Test + public void testBooleanQuery() throws Exception { + TermQuery aQuery = new TermQuery(new Term("f", "a")); + TermQuery dQuery = new TermQuery(new Term("f", "d")); + TermQuery cQuery = new TermQuery(new Term("f", "c")); + TermQuery yQuery = new TermQuery(new Term("f", "y")); + + BooleanQuery query = new BooleanQuery(); + BooleanQuery inner = new BooleanQuery(); + + inner.add(cQuery, Occur.SHOULD); + inner.add(yQuery, Occur.MUST_NOT); + query.add(inner, Occur.MUST); + query.add(aQuery, Occur.MUST); + query.add(dQuery, Occur.MUST); + EnumSet<Occur>[] occurList = new EnumSet[] {EnumSet.of(Occur.MUST), EnumSet.of(Occur.MUST, Occur.SHOULD)}; + for (EnumSet<Occur> occur : occurList) { + + CountingCollector c = new CountingCollector(TopScoreDocCollector.create( + 10, true), occur); + s.search(query, null, c); + final int maxDocs = s.maxDoc(); + assertEquals(maxDocs, c.docCounts.size()); + boolean includeOptional = occur.contains(Occur.SHOULD); + for (int
i = 0; i < maxDocs; i++) { + Map<Query, Float> doc0 = c.docCounts.get(i); + assertEquals(includeOptional ? 5 : 4, doc0.size()); + assertEquals(1.0F, doc0.get(aQuery), FLOAT_TOLERANCE); + assertEquals(4.0F, doc0.get(dQuery), FLOAT_TOLERANCE); + if (includeOptional) + assertEquals(3.0F, doc0.get(cQuery), FLOAT_TOLERANCE); + + Map<Query, Float> doc1 = c.docCounts.get(++i); + assertEquals(includeOptional ? 5 : 4, doc1.size()); + assertEquals(1.0F, doc1.get(aQuery), FLOAT_TOLERANCE); + assertEquals(1.0F, doc1.get(dQuery), FLOAT_TOLERANCE); + if (includeOptional) + assertEquals(1.0F, doc1.get(cQuery), FLOAT_TOLERANCE); + + } + } + } + + @Test + public void testPhraseQuery() throws Exception { + PhraseQuery q = new PhraseQuery(); + q.add(new Term("f", "b")); + q.add(new Term("f", "c")); + CountingCollector c = new CountingCollector(TopScoreDocCollector.create(10, + true)); + s.search(q, null, c); + final int maxDocs = s.maxDoc(); + assertEquals(maxDocs, c.docCounts.size()); + for (int i = 0; i < maxDocs; i++) { + Map<Query, Float> doc0 = c.docCounts.get(i); + assertEquals(1, doc0.size()); + assertEquals(2.0F, doc0.get(q), FLOAT_TOLERANCE); + + Map<Query, Float> doc1 = c.docCounts.get(++i); + assertEquals(1, doc1.size()); + assertEquals(1.0F, doc1.get(q), FLOAT_TOLERANCE); + } + + } +} Property changes on: src/test/org/apache/lucene/search/TestSubScorerFreqs.java ___________________________________________________________________ Added: svn:eol-style + native Added: svn:keywords + Date Author Id Revision HeadURL