Index: lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java (revision 0)
+++ lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java (working copy)
@@ -0,0 +1,74 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestProximityRescorer extends LuceneTestCase {
+  /** Checks that proximity rescoring flips the ranking a plain BooleanQuery produces. */
+  public void testBasic() throws Exception {
+    Directory directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+
+    // wizard and oz are far apart here:
+    Document farDoc = new Document();
+    farDoc.add(newStringField("id", "0", Field.Store.YES));
+    farDoc.add(newTextField("field", "wizard the the the the the oz", Field.Store.NO));
+    writer.addDocument(farDoc);
+    // 1 extra token, but wizard and oz are now close:
+    Document nearDoc = new Document();
+    nearDoc.add(newStringField("id", "1", Field.Store.YES));
+    nearDoc.add(newTextField("field", "wizard oz the the the the the the", Field.Store.NO));
+    writer.addDocument(nearDoc);
+    IndexReader reader = writer.getReader();
+    writer.close();
+    IndexSearcher searcher = newSearcher(reader);
+
+    // Ordinary BooleanQuery ranks the far doc first:
+    BooleanQuery plainQuery = new BooleanQuery();
+    plainQuery.add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
+    plainQuery.add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
+    TopDocs plainHits = searcher.search(plainQuery, 10);
+    assertEquals(2, plainHits.totalHits);
+    assertEquals("0", searcher.doc(plainHits.scoreDocs[0].doc).get("id"));
+    assertEquals("1", searcher.doc(plainHits.scoreDocs[1].doc).get("id"));
+
+    // Same terms via ProxTermQuery, rescored by proximity:
+    BooleanQuery proxQuery = new BooleanQuery();
+    proxQuery.add(new ProxTermQuery(new Term("field", "wizard")), Occur.SHOULD);
+    proxQuery.add(new ProxTermQuery(new Term("field", "oz")), Occur.SHOULD);
+    RescoringCollector collector = new RescoringCollector(new ProximityRescorer(), 10);
+    searcher.search(proxQuery, collector);
+    TopDocs proxHits = collector.topDocs();
+    assertEquals(2, proxHits.totalHits);
+
+    // The sort is reversed now, thanks to proximity effects:
+    assertEquals("1", searcher.doc(proxHits.scoreDocs[0].doc).get("id"));
+    assertEquals("0", searcher.doc(proxHits.scoreDocs[1].doc).get("id"));
+
+    reader.close();
+    directory.close();
+  }
+}
Property changes on: lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/core/src/java/org/apache/lucene/search/TermScorer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/TermScorer.java (revision 1533359)
+++ lucene/core/src/java/org/apache/lucene/search/TermScorer.java (working copy)
@@ -25,7 +25,7 @@
/** Expert: A Scorer for documents matching a Term.
*/
final class TermScorer extends Scorer {
- private final DocsEnum docsEnum;
+ final DocsEnum docsEnum;
private final Similarity.SimScorer docScorer;
/**
Index: lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java (revision 0)
+++ lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java (working copy)
@@ -0,0 +1,238 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.util.PriorityQueue;
+
+// nocommit payloads?
+
+/** A {@link RescoringCollector.Rescorer} that boosts scores
+ *  according to proximity of the term occurrences.  Note
+ *  that you must use {@link ProxTermQuery}, instead of
+ *  {@link TermQuery}, for those terms that should
+ *  participate in the proximity boosting for the query.
+ *
+ * @lucene.experimental */
+
+public class ProximityRescorer extends RescoringCollector.Rescorer {
+
+  // One entry per positions-aware TermScorer gathered from the current scorer:
+  final List<PosEnumAndPos> posEnums = new ArrayList<PosEnumAndPos>();
+  private Scorer scorer;
+  private ProxScorerQueue proxQueue;
+  private ProxScorer proxScorer;
+
+  /** Computes the actual boosting due to proximity
+   *  effects. */
+  public static abstract class ProxScorer {
+
+    /** Called once for each term occurrence in the current
+     *  hit.
+     *  @param pos The position of the current term's
+     *         occurrence.
+     *  @param termOrd Which term occurred; termOrd=0 is the
+     *         first term added to the query, termOrd=1 is
+     *         the second, etc. */
+    public abstract void addPosition(int pos, int termOrd);
+
+    /** Called after all positions have been visited from
+     *  the current hit.  This should fold in the proximity
+     *  boost over the provided query score, and return the
+     *  total score, and then reset the class in
+     *  preparation for the next hit. */
+    public abstract float score(float queryScore);
+
+    /** Return true if the queryScore, after proximity
+     *  boosting, could be competitive, i.e. may score
+     *  better than bottomCombinedScore. */
+    public abstract boolean competes(float queryScore, float bottomCombinedScore);
+  }
+
+  /** Pairs one term's positions enum with its iteration state within the current hit. */
+  private static class PosEnumAndPos {
+    final int termOrd;
+    final DocsAndPositionsEnum posEnum;
+    int pos;       // most recent value returned by posEnum.nextPosition()
+    int posUpto;   // how many positions have been read for the current doc
+    int posLimit;  // posEnum.freq() for the current doc
+
+    public PosEnumAndPos(int termOrd, DocsAndPositionsEnum posEnum) {
+      this.termOrd = termOrd;
+      this.posEnum = posEnum;
+    }
+  }
+
+  // nocommit total hack!!
+  /** Default boost: adds 1/distance^2 for each occurrence that lies after the previously visited one. */
+  private static class DefaultProxScorer extends ProxScorer {
+    // Position of the previously visited occurrence in the current hit, or -1 if none yet:
+    private int lastPos = -1;
+    private float boost;
+
+    @Override
+    public void addPosition(int pos, int termOrd) {
+      if (lastPos != -1 && pos > lastPos) {
+        // Square in float; an int distance*distance overflows once the gap exceeds 46340:
+        float distance = pos - lastPos;
+        boost += 1.0f / (distance * distance);
+      }
+      lastPos = pos;
+    }
+
+    @Override
+    public float score(float queryScore) {
+      float score = queryScore + boost;
+      // Reset the per-hit state so the next hit starts clean:
+      boost = 0f;
+      lastPos = -1;
+      return score;
+    }
+
+    @Override
+    public boolean competes(float queryScore, float bottomCombinedScore) {
+      // nocommit fixme to sometimes return false
+      return true;
+    }
+  }
+
+  /** Returns a {@link ProxScorer} that computes proximity
+   *  boosts; subclass can override this to change how
+   *  proximity effects are scored.  The provided {@code
+   *  SimWeights} can be used by a custom ProxScorer to
+   *  make use of each term's stats. */
+  protected ProxScorer getProxScorer(List<Similarity.SimWeight> weights) {
+    // nocommit what default?  this is hack:
+    return new DefaultProxScorer();
+  }
+
+  @Override
+  public void setScorer(Scorer scorer) {
+    this.scorer = scorer;
+    // Rebuild the list of positions enums from this scorer's children:
+    posEnums.clear();
+    List<Similarity.SimWeight> weights = new ArrayList<Similarity.SimWeight>();
+    gatherChildren(scorer, 0, weights);
+    proxScorer = getProxScorer(weights);
+    proxQueue = new ProxScorerQueue(posEnums.size(), proxScorer);
+  }
+
+  @Override
+  public void setNextReader(AtomicReaderContext context) {
+    // No per-segment state is needed here.
+  }
+
+  @Override
+  public float score(int docID, float bottomScore) throws IOException {
+    float score = scorer.score();
+    assert score != Float.NEGATIVE_INFINITY;
+    assert Float.isNaN(score) == false;
+    if (proxScorer.competes(score, bottomScore) == false) {
+      return Float.NEGATIVE_INFINITY;
+    }
+    // nocommit this is ... badly wasteful.  E.g. for a
+    // BooleanQuery with 1000 SHOULD terms ... if only a
+    // few terms match for each hit, we are still O(1000)
+    // in this loop:
+    for (PosEnumAndPos pos : posEnums) {
+      if (pos.posEnum.docID() == docID) {
+        proxQueue.addEnum(pos);
+      }
+    }
+    proxQueue.visitPositions();
+    return proxScorer.score(score);
+  }
+
+  /** Recursively collects the positions-aware TermScorers under scorer, skipping
+   *  MUST_NOT children, and returns the next unused termOrd. */
+  private int gatherChildren(Scorer scorer, int termOrd, List<Similarity.SimWeight> weights) {
+    // nocommit this does not necessarily preserve order:
+    for (Scorer.ChildScorer subScorer : scorer.getChildren()) {
+      if (subScorer.relationship.equals("MUST_NOT")) {
+        continue;
+      }
+      if (subScorer.child instanceof TermScorer) {
+        TermScorer ts = (TermScorer) subScorer.child;
+        // NOTE: we silently skip any non-pos-aware
+        // TermQuery; e.g., maybe the query has some
+        // TermQuery against a title field w/o prox, and
+        // others against the body field with prox:
+        if (ts.docsEnum instanceof DocsAndPositionsEnum) {
+          weights.add(((TermQuery.TermWeight) ts.getWeight()).stats);
+          posEnums.add(new PosEnumAndPos(termOrd++, (DocsAndPositionsEnum) ts.docsEnum));
+        }
+      } else {
+        termOrd = gatherChildren(subScorer.child, termOrd, weights);
+      }
+    }
+    return termOrd;
+  }
+
+  /** Utility class to merge-sort multiple
+   *  DocsAndPositionsEnum currently "on" a single doc, and
+   *  compute resulting score "boost". */
+  private static class ProxScorerQueue extends PriorityQueue<PosEnumAndPos> {
+    final ProxScorer proxScorer;
+
+    public ProxScorerQueue(int numScorers, ProxScorer proxScorer) {
+      super(numScorers);
+      this.proxScorer = proxScorer;
+    }
+
+    @Override
+    protected boolean lessThan(PosEnumAndPos a, PosEnumAndPos b) {
+      return a.pos < b.pos;
+    }
+
+    /** Reads posEnum's freq and first position for the current doc, then adds it to the queue. */
+    public void addEnum(PosEnumAndPos posEnum) throws IOException {
+      // All enums should be on the same doc:
+      assert size() == 0 || posEnum.posEnum.docID() == top().posEnum.docID();
+
+      posEnum.posLimit = posEnum.posEnum.freq();
+      posEnum.pos = posEnum.posEnum.nextPosition();
+      posEnum.posUpto = 1;
+
+      // nocommit need test w/ syns @ same position as other tokens
+
+      super.add(posEnum);
+    }
+
+    /** Visits all positions, smallest first, invoking {@code
+     *  ProxScorer.addPosition}; the queue is empty on return. */
+    public void visitPositions() throws IOException {
+      for (PosEnumAndPos min = top(); min != null; min = top()) {
+        proxScorer.addPosition(min.pos, min.termOrd);
+
+        if (min.posUpto < min.posLimit) {
+          min.pos = min.posEnum.nextPosition();
+          min.posUpto++;
+          updateTop();
+        } else {
+          pop();
+        }
+      }
+    }
+  }
+}
Property changes on: lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/core/src/java/org/apache/lucene/search/TermQuery.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/TermQuery.java (revision 1533359)
+++ lucene/core/src/java/org/apache/lucene/search/TermQuery.java (working copy)
@@ -41,10 +41,11 @@
private final Term term;
private final int docFreq;
private final TermContext perReaderTermState;
+ private final boolean doPositions;
final class TermWeight extends Weight {
private final Similarity similarity;
- private final Similarity.SimWeight stats;
+ final Similarity.SimWeight stats;
private final TermContext termStates;
public TermWeight(IndexSearcher searcher, TermContext termStates)
@@ -82,7 +83,15 @@
if (termsEnum == null) {
return null;
}
- DocsEnum docs = termsEnum.docs(acceptDocs, null);
+ DocsEnum docs;
+ if (doPositions) {
+ docs = termsEnum.docsAndPositions(acceptDocs, null);
+ if (docs == null) {
+ throw new IllegalArgumentException("field \"" + term.field() + "\" was not indexed with positions");
+ }
+ } else {
+ docs = termsEnum.docs(acceptDocs, null);
+ }
assert docs != null;
return new TermScorer(this, docs, similarity.simScorer(stats, context));
}
@@ -139,19 +148,31 @@
* provided docFreq instead of looking up the docFreq
* against the searcher. */
public TermQuery(Term t, int docFreq) {
- term = t;
- this.docFreq = docFreq;
- perReaderTermState = null;
+ this(t, docFreq, false);
}
/** Expert: constructs a TermQuery that will use the
* provided docFreq instead of looking up the docFreq
* against the searcher. */
public TermQuery(Term t, TermContext states) {
+ this(t, states, false);
+ }
+
+ /** Used by ProxTermQuery; when doPositions is true the scorer pulls docsAndPositions instead of docs. */
+ TermQuery(Term t, int docFreq, boolean doPositions) {
+ term = t;
+ this.docFreq = docFreq;
+ perReaderTermState = null;
+ this.doPositions = doPositions;
+ }
+
+ /** Used by ProxTermQuery; takes docFreq from states (asserted non-null) and records doPositions. */
+ TermQuery(Term t, TermContext states, boolean doPositions) {
 assert states != null;
 term = t;
 docFreq = states.docFreq();
 perReaderTermState = states;
+ this.doPositions = doPositions;
 }
/** Returns the term of this query. */
Index: lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java (revision 0)
+++ lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java (working copy)
@@ -0,0 +1,101 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.AtomicReaderContext;
+
+/** A collector that returns top scoring hits, and allows
+ *  you to re-score each hit before collecting.
+ *
+ * @lucene.experimental */
+public final class RescoringCollector extends TopDocsCollector<ScoreDoc> {
+
+  private final Rescorer rescorer;
+  private ScoreDoc pqTop;
+  private int docBase;
+  // Hits actually inserted into the queue; fewer than totalHits if the rescorer skipped some:
+  private int numInserted;
+
+  /** Re-scores each hit before it is collected. */
+  public static abstract class Rescorer {
+    /** Compute the boosted score; use {@code
+     *  scorer.score()} to get the first pass score, if
+     *  necessary.  The bottomScore parameter is the current worst score
+     *  in the queue; you can use this to avoid rescoring if
+     *  it's clear the current hit's score, after boosting,
+     *  would not compete; if so, return
+     *  Float.NEGATIVE_INFINITY. */
+    public abstract float score(int docID, float bottomScore) throws IOException;
+
+    /** Called once for each segment with the first-pass scorer. */
+    public abstract void setScorer(Scorer scorer) throws IOException;
+
+    /** Called for each segment; e.g., perhaps you need
+     *  to load some doc values fields for this reader. */
+    public abstract void setNextReader(AtomicReaderContext context) throws IOException;
+  }
+
+  public RescoringCollector(Rescorer rescorer, int numHits) {
+    super(new HitQueue(numHits, true));
+    this.rescorer = rescorer;
+    // HitQueue's getSentinelObject returns a ScoreDoc, so top() is already initialized here:
+    pqTop = pq.top();
+  }
+
+  @Override
+  public void setScorer(Scorer scorer) throws IOException {
+    rescorer.setScorer(scorer);
+  }
+
+  @Override
+  public void setNextReader(AtomicReaderContext context) throws IOException {
+    // Remember the segment's base so collect() can turn segment-relative docIDs into top-level ones:
+    docBase = context.docBase;
+    rescorer.setNextReader(context);
+  }
+
+  @Override
+  public boolean acceptsDocsOutOfOrder() {
+    // Return false so we force "doc at a time" BooleanScorer2 to be used:
+    return false;
+  }
+
+  @Override
+  public void collect(int doc) throws IOException {
+    float score = rescorer.score(doc, pqTop.score);
+    assert Float.isNaN(score) == false;
+    totalHits++;
+    if (score <= pqTop.score) {
+      // Either the rescorer skipped this hit (NEGATIVE_INFINITY), or it ties/loses against
+      // the queue bottom; docs arrive in order, so a tie has a higher docID and cannot compete:
+      return;
+    }
+    pqTop.doc = doc + docBase;
+    pqTop.score = score;
+    numInserted++;
+    pqTop = pq.updateTop();
+  }
+
+  @Override
+  protected int topDocsSize() {
+    // Count only real entries, so sentinels left behind by skipped hits are never returned:
+    return Math.min(numInserted, pq.size());
+  }
+}
Property changes on: lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/core/src/java/org/apache/lucene/search/ProxTermQuery.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/ProxTermQuery.java (revision 0)
+++ lucene/core/src/java/org/apache/lucene/search/ProxTermQuery.java (working copy)
@@ -0,0 +1,48 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+
+/** Just like {@link TermQuery}, except the scorer pulls
+ * positions (the field must be indexed with them), so
+ * {@link ProximityRescorer}, passed to {@link
+ * RescoringCollector}, can visit them per-hit.
+ *
+ * @lucene.experimental */
+
+public class ProxTermQuery extends TermQuery {
+ /** Constructs a query for the given term, passing
+ * docFreq=-1 to the two-argument constructor. */
+ public ProxTermQuery(Term t) {
+ this(t, -1);
+ }
+
+ /** Expert: constructs a ProxTermQuery that hands the
+ * provided docFreq to {@link TermQuery}. */
+ public ProxTermQuery(Term t, int docFreq) {
+ super(t, docFreq, true);
+ }
+
+ /** Expert: constructs a ProxTermQuery from the
+ * provided {@link TermContext}. */
+ public ProxTermQuery(Term t, TermContext states) {
+ super(t, states, true);
+ }
+}
\ No newline at end of file
Property changes on: lucene/core/src/java/org/apache/lucene/search/ProxTermQuery.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property