Index: lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java (revision 0)
+++ lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java (working copy)
@@ -0,0 +1,74 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestProximityRescorer extends LuceneTestCase {
+  public void testBasic() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+
+    Document doc = new Document();
+    doc.add(newStringField("id", "0", Field.Store.YES));
+    doc.add(newTextField("field", "wizard the the the the the oz", Field.Store.NO));
+    w.addDocument(doc);
+    doc = new Document();
+    doc.add(newStringField("id", "1", Field.Store.YES));
+    // 1 extra token, but wizard and oz are now adjacent:
+    doc.add(newTextField("field", "wizard oz the the the the the the", Field.Store.NO));
+    w.addDocument(doc);
+    IndexReader r = w.getReader();
+    w.close();
+
+    // Do ordinary BooleanQuery:
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
+    bq.add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
+
+    IndexSearcher s = newSearcher(r);
+
+    TopDocs hits = s.search(bq, 10);
+    assertEquals(2, hits.totalHits);
+    assertEquals("0", s.doc(hits.scoreDocs[0].doc).get("id"));
+    assertEquals("1", s.doc(hits.scoreDocs[1].doc).get("id"));
+
+    // Now, do same query, but boost by prox:
+    RescoringCollector c = new RescoringCollector(new ProximityRescorer(), 10);
+    bq = new BooleanQuery();
+    bq.add(new ProxTermQuery(new Term("field", "wizard")), Occur.SHOULD);
+    bq.add(new ProxTermQuery(new Term("field", "oz")), Occur.SHOULD);
+
+    s.search(bq, c);
+    hits = c.topDocs();
+    assertEquals(2, hits.totalHits);
+    // The sort is reversed now, thanks to proximity effects:
+    assertEquals("1", s.doc(hits.scoreDocs[0].doc).get("id"));
+    assertEquals("0", s.doc(hits.scoreDocs[1].doc).get("id"));
+
+    r.close();
+    dir.close();
+  }
+}
Property changes on: lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/core/src/java/org/apache/lucene/search/TermScorer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/TermScorer.java (revision 1533359)
+++ lucene/core/src/java/org/apache/lucene/search/TermScorer.java (working copy)
@@ -25,7 +25,7 @@
 /** Expert: A Scorer for documents matching a Term.
  */
 final class TermScorer extends Scorer {
-  private final DocsEnum docsEnum;
+  final DocsEnum docsEnum;
   private final Similarity.SimScorer docScorer;
 
   /**
Index: lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java (revision 0)
+++ lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java (working copy)
@@ -0,0 +1,238 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.util.PriorityQueue;
+
+// nocommit payloads?
+
+/** A {@link RescoringCollector.Rescorer} that boosts scores
+ * according to proximity of the term occurrences. Note
+ * that you must use {@link ProxTermQuery}, instead of
+ * {@link TermQuery}, for those terms that should
+ * participate in the proximity boosting for the query.
+ *
+ * @lucene.experimental */
+
+public class ProximityRescorer extends RescoringCollector.Rescorer {
+
+  final List<PosEnumAndPos> posEnums = new ArrayList<PosEnumAndPos>();
+  private Scorer scorer;
+  private ProxScorerQueue proxQueue;
+  private ProxScorer proxScorer;
+
+  /** Computes the actual boosting due to proximity
+   * effects. */
+  public static abstract class ProxScorer {
+
+    /** Called once for each term occurrence in the current
+     * hit.
+     * @param pos The position of the current term's
+     *        occurrence.
+     * @param termOrd Which term occurred; termOrd=0 is the
+     *        first term added to the query, termOrd=1 is
+     *        the second, etc. */
+    public abstract void addPosition(int pos, int termOrd);
+
+    /** Called after all positions have been visited from
+     * the current hit. This should fold in the proximity
+     * boost over the provided query score, and return the
+     * total score, and then reset the class in
+     * preparation for the next hit. */
+    public abstract float score(float queryScore);
+
+    /** Return true if the queryScore, after proximity
+     * boosting, could be competitive, i.e. may score
+     * better than bottomCombinedScore. */
+    public abstract boolean competes(float queryScore, float bottomCombinedScore);
+  }
+
+  private static class PosEnumAndPos {
+    final int termOrd;
+    final DocsAndPositionsEnum posEnum;
+    int pos;
+    int posUpto;
+    int posLimit;
+
+    public PosEnumAndPos(int termOrd, DocsAndPositionsEnum posEnum) {
+      this.termOrd = termOrd;
+      this.posEnum = posEnum;
+    }
+  }
+
+  // nocommit total hack!!
+  private static class DefaultProxScorer extends ProxScorer {
+    // Position of the previous occurrence in the current hit; -1 if none yet:
+    int lastPos = -1;
+    float boost;
+
+    @Override
+    public void addPosition(int pos, int termOrd) {
+      if (lastPos != -1 && pos > lastPos) {
+        int distance = pos-lastPos;
+        // Multiply as float so a large distance cannot overflow int:
+        boost += 1.0f/((float) distance*distance);
+      }
+      lastPos = pos;
+    }
+
+    @Override
+    public float score(float queryScore) {
+      float score = queryScore + boost;
+      // Reset per-hit state for the next hit:
+      boost = 0f;
+      lastPos = -1;
+      return score;
+    }
+
+    @Override
+    public boolean competes(float queryScore, float bottomCombinedScore) {
+      // nocommit fixme to sometimes return false
+      return true;
+    }
+  }
+
+  /** Returns a {@link ProxScorer} that computes proximity
+   * boosts; subclass can override this to change how
+   * proximity effects are scored. The provided {@code
+   * SimWeights} can be used by a custom ProxScorer to
+   * make use of each term's stats. */
+  protected ProxScorer getProxScorer(List<Similarity.SimWeight> weights) {
+    // nocommit what default? this is hack:
+    return new DefaultProxScorer();
+  }
+
+  @Override
+  public void setScorer(Scorer scorer) {
+    this.scorer = scorer;
+    posEnums.clear();
+    List<Similarity.SimWeight> weights = new ArrayList<Similarity.SimWeight>();
+    gatherChildren(scorer, 0, weights);
+    proxScorer = getProxScorer(weights);
+    proxQueue = new ProxScorerQueue(posEnums.size(), proxScorer);
+  }
+
+  @Override
+  public void setNextReader(AtomicReaderContext context) {
+  }
+
+  @Override
+  public float score(int docID, float bottomScore) throws IOException {
+    float score = scorer.score();
+    assert score != Float.NEGATIVE_INFINITY;
+    assert Float.isNaN(score) == false;
+    if (proxScorer.competes(score, bottomScore)) {
+      // nocommit this is ... badly wasteful. E.g. for a
+      // BooleanQuery with 1000 SHOULD terms ... if only a
+      // few terms match for each hit, we are still O(1000)
+      // in this loop:
+      for(PosEnumAndPos pos : posEnums) {
+        if (pos.posEnum.docID() == docID) {
+          proxQueue.addEnum(pos);
+        }
+      }
+      proxQueue.visitPositions();
+      return proxScorer.score(score);
+    } else {
+      return Float.NEGATIVE_INFINITY;
+    }
+  }
+
+  private int gatherChildren(Scorer scorer, int termOrd, List<Similarity.SimWeight> weights) {
+
+    // nocommit this does not necessarily preserve order:
+    for(Scorer.ChildScorer subScorer : scorer.getChildren()) {
+      if ("MUST_NOT".equals(subScorer.relationship)) {
+        continue;
+      }
+      if (subScorer.child instanceof TermScorer) {
+        TermScorer ts = (TermScorer) subScorer.child;
+        // NOTE: we silently skip any non-pos-aware
+        // TermQuery; e.g., maybe the query has some
+        // TermQuery against a title field w/o prox, and
+        // others against the body field with prox:
+        if (ts.docsEnum instanceof DocsAndPositionsEnum) {
+          weights.add(((TermQuery.TermWeight) ts.getWeight()).stats);
+          posEnums.add(new PosEnumAndPos(termOrd++, (DocsAndPositionsEnum) ts.docsEnum));
+        }
+      } else {
+        termOrd = gatherChildren(subScorer.child, termOrd, weights);
+      }
+    }
+
+    return termOrd;
+  }
+
+  /** Utility class to merge-sort multiple
+   * DocsAndPositionsEnum currently "on" a single doc, and
+   * compute resulting score "boost". */
+  private static class ProxScorerQueue extends PriorityQueue<PosEnumAndPos> {
+
+    final ProxScorer proxScorer;
+
+    public ProxScorerQueue(int numScorers, ProxScorer proxScorer) {
+      super(numScorers);
+      this.proxScorer = proxScorer;
+    }
+
+    @Override
+    protected boolean lessThan(PosEnumAndPos a, PosEnumAndPos b) {
+      return a.pos < b.pos;
+    }
+
+    public void addEnum(PosEnumAndPos posEnum) throws IOException {
+      // All enums should be on the same doc:
+      assert size() == 0 || posEnum.posEnum.docID() == top().posEnum.docID();
+
+      posEnum.posLimit = posEnum.posEnum.freq();
+      posEnum.pos = posEnum.posEnum.nextPosition();
+      posEnum.posUpto = 1;
+
+      // nocommit need test w/ syns @ same position as other tokens
+
+      super.add(posEnum);
+    }
+
+    /** Visits all positions, invoking {@code
+     * ProxScorer.addPosition}. */
+    public void visitPositions() throws IOException {
+      while (true) {
+        PosEnumAndPos min = top();
+        if (min == null) {
+          break;
+        }
+        proxScorer.addPosition(min.pos, min.termOrd);
+
+        if (min.posUpto < min.posLimit) {
+          min.pos = min.posEnum.nextPosition();
+          min.posUpto++;
+          updateTop();
+        } else {
+          pop();
+        }
+      }
+    }
+  }
+}
Property changes on: lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/core/src/java/org/apache/lucene/search/TermQuery.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/TermQuery.java (revision 1533359)
+++ lucene/core/src/java/org/apache/lucene/search/TermQuery.java (working copy)
@@ -41,10 +41,11 @@
   private final Term term;
   private final int docFreq;
   private final TermContext
perReaderTermState;
+  private final boolean doPositions;
 
   final class TermWeight extends Weight {
     private final Similarity similarity;
-    private final Similarity.SimWeight stats;
+    final Similarity.SimWeight stats;
     private final TermContext termStates;
 
     public TermWeight(IndexSearcher searcher, TermContext termStates)
@@ -82,7 +83,15 @@
       if (termsEnum == null) {
         return null;
       }
-      DocsEnum docs = termsEnum.docs(acceptDocs, null);
+      DocsEnum docs;
+      if (doPositions) {
+        docs = termsEnum.docsAndPositions(acceptDocs, null);
+        if (docs == null) {
+          throw new IllegalArgumentException("field \"" + term.field() + "\" was not indexed with positions");
+        }
+      } else {
+        docs = termsEnum.docs(acceptDocs, null);
+      }
       assert docs != null;
       return new TermScorer(this, docs, similarity.simScorer(stats, context));
     }
@@ -139,19 +148,31 @@
    * provided docFreq instead of looking up the docFreq
    * against the searcher. */
   public TermQuery(Term t, int docFreq) {
-    term = t;
-    this.docFreq = docFreq;
-    perReaderTermState = null;
+    this(t, docFreq, false);
   }
 
   /** Expert: constructs a TermQuery that will use the
    * provided docFreq instead of looking up the docFreq
    * against the searcher. */
   public TermQuery(Term t, TermContext states) {
+    this(t, states, false);
+  }
+
+  /** Used by ProxTermQuery; when doPositions is true the scorer is built on docsAndPositions. */
+  TermQuery(Term t, int docFreq, boolean doPositions) {
+    term = t;
+    this.docFreq = docFreq;
+    perReaderTermState = null;
+    this.doPositions = doPositions;
+  }
+
+  /** Used by ProxTermQuery; when doPositions is true the scorer is built on docsAndPositions. */
+  TermQuery(Term t, TermContext states, boolean doPositions) {
     assert states != null;
     term = t;
     docFreq = states.docFreq();
     perReaderTermState = states;
+    this.doPositions = doPositions;
   }
 
   /** Returns the term of this query.
*/
Index: lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java (revision 0)
+++ lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java (working copy)
@@ -0,0 +1,104 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.AtomicReaderContext;
+
+/** A collector that returns top scoring hits, and allows
+ * you to re-score each hit before collecting.
+ *
+ * @lucene.experimental
+ */
+
+public final class RescoringCollector extends TopDocsCollector<ScoreDoc> {
+
+  private final Rescorer rescorer;
+
+  private ScoreDoc pqTop;
+  private int docBase;
+
+  /** Computes the score that each hit is collected with. */
+  public static abstract class Rescorer {
+    /** Compute the boosted score; use {@code
+     * scorer.score()} to get the first pass score, if
+     * necessary. The bottomScore parameter is the current worst score
+     * in the queue; you can use this to avoid rescoring if
+     * it's clear the current hit's score, after boosting,
+     * would not compete; if so, return
+     * Float.NEGATIVE_INFINITY. */
+    public abstract float score(int docID, float bottomScore) throws IOException;
+
+    /** Called once for each segment with the first-pass
+     * scorer. */
+    public abstract void setScorer(Scorer scorer) throws IOException;
+
+    /** Called for each segment; e.g., perhaps you need
+     * to load some doc values fields for this reader. */
+    public abstract void setNextReader(AtomicReaderContext context) throws IOException;
+  }
+
+  public RescoringCollector(Rescorer rescorer, int numHits) {
+    super(new HitQueue(numHits, true));
+    this.rescorer = rescorer;
+
+    // HitQueue implements getSentinelObject to return a ScoreDoc, so we know
+    // that at this point top() is already initialized.
+    pqTop = pq.top();
+  }
+
+  @Override
+  public void setScorer(Scorer scorer) throws IOException {
+    rescorer.setScorer(scorer);
+  }
+
+  @Override
+  public void setNextReader(AtomicReaderContext context) throws IOException {
+    // Remember the segment's base so collect() can record top-level docIDs:
+    docBase = context.docBase;
+    rescorer.setNextReader(context);
+  }
+
+  @Override
+  public boolean acceptsDocsOutOfOrder() {
+    // Return false so we force "doc at a time"
+    // BooleanScorer2 to be used:
+    return false;
+  }
+
+  @Override
+  public void collect(int doc) throws IOException {
+    float score = rescorer.score(doc, pqTop.score);
+    assert Float.isNaN(score) == false;
+    // nocommit what about NEGATIVE_INFINITY? i javadoc
+    // that rescorer should return that in order to skip
+    // collection ...
+
+    totalHits++;
+    if (score <= pqTop.score) {
+      // Since we are collecting docs in order, if the score
+      // ties with the worst score in the queue then its
+      // docID will be higher and it cannot compete:
+      return;
+    }
+    pqTop.doc = doc + docBase;
+    pqTop.score = score;
+    pqTop = pq.updateTop();
+  }
+}
Property changes on: lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/core/src/java/org/apache/lucene/search/ProxTermQuery.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/ProxTermQuery.java (revision 0)
+++ lucene/core/src/java/org/apache/lucene/search/ProxTermQuery.java (working copy)
@@ -0,0 +1,48 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+
+/** Just like {@link TermQuery}, except this query can
+ * optionally visit positions information per-hit when used
+ * from {@link ProximityRescorer} passed to {@link
+ * RescoringCollector}.
+ *
+ * @lucene.experimental */
+
+public class ProxTermQuery extends TermQuery {
+  /** Constructs a query for {@code t} that asks its scorer for positions. */
+  public ProxTermQuery(Term t) {
+    this(t, -1);
+  }
+
+  /** Expert: constructs a ProxTermQuery that will use the provided
+   * docFreq instead of looking up the docFreq against the searcher. */
+  public ProxTermQuery(Term t, int docFreq) {
+    super(t, docFreq, true);
+  }
+
+  /** Expert: constructs a ProxTermQuery that will use the
+   * docFreq of the provided {@link TermContext} instead of
+   * looking up the docFreq against the searcher. */
+  public ProxTermQuery(Term t, TermContext states) {
+    super(t, states, true);
+  }
+}
\ No newline at end of file
Property changes on: lucene/core/src/java/org/apache/lucene/search/ProxTermQuery.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property