Index: lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java (revision 0)
+++ lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java (working copy)
@@ -0,0 +1,78 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/** Tests {@link ProximityRescorer} together with {@link RescoringCollector}. */
+public class TestProximityRescorer extends LuceneTestCase {
+  /** Checks that proximity rescoring flips the order of two hits. */
+  public void testBasic() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+
+    Document doc = new Document();
+    doc.add(newStringField("id", "0", Field.Store.YES));
+    doc.add(newTextField("field", "wizard the the the the the oz", Field.Store.NO));
+    w.addDocument(doc);
+    doc = new Document();
+    doc.add(newStringField("id", "1", Field.Store.YES));
+    // 1 extra token, but wizard and oz are now close;
+    doc.add(newTextField("field", "wizard oz the the the the the the", Field.Store.NO));
+    w.addDocument(doc);
+    IndexReader r = w.getReader();
+    w.close();
+
+    // Do ordinary BooleanQuery:
+    BooleanQuery bq = new BooleanQuery();
+    bq.add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
+    bq.add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
+
+    IndexSearcher s = newSearcher(r);
+
+    TopDocs hits = s.search(bq, 10);
+    assertEquals(2, hits.totalHits);
+    assertEquals("0", s.doc(hits.scoreDocs[0].doc).get("id"));
+    assertEquals("1", s.doc(hits.scoreDocs[1].doc).get("id"));
+
+    // Now, do same query, but boost by prox:
+    bq = new BooleanQuery();
+    bq.add(new TermQuery(new Term("field", "wizard")), Occur.SHOULD);
+    bq.add(new TermQuery(new Term("field", "oz")), Occur.SHOULD);
+
+    RescoringCollector c = new RescoringCollector(new ProximityRescorer(bq), 10);
+
+    s.search(bq, c);
+    hits = c.topDocs();
+    assertEquals(2, hits.totalHits);
+    // The sort is reversed now, thanks to proximity
+    // effects:
+    assertEquals("1", s.doc(hits.scoreDocs[0].doc).get("id"));
+    assertEquals("0", s.doc(hits.scoreDocs[1].doc).get("id"));
+
+    r.close();
+    dir.close();
+  }
+}

Property changes on: lucene/core/src/test/org/apache/lucene/search/TestProximityRescorer.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java (revision 0)
+++ lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java (working copy)
@@ -0,0 +1,110 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.AtomicReaderContext;
+
+/** A collector that returns top scoring hits, and allows
+ * you to re-score each hit before collecting.
+ *
+ * @lucene.experimental
+ */
+
+public final class RescoringCollector extends TopDocsCollector<ScoreDoc> {
+
+  private final Rescorer rescorer;
+
+  // Current worst entry in the queue:
+  private ScoreDoc pqTop;
+  // Added to per-segment doc IDs to make them index-wide:
+  private int docBase;
+
+  /** Computes the final score for each hit matched by the
+   * first-pass query. */
+  public static abstract class Rescorer {
+    /** Compute the boosted score; use {@code
+     * scorer.score()} to get the first pass score, if
+     * necessary. The bottomScore parameter is the current worst score
+     * in the queue; you can use this to avoid rescoring if
+     * it's clear the current hit's score, after boosting,
+     * would not compete; if so, return
+     * Float.NEGATIVE_INFINITY. */
+    public abstract float score(int docID, float bottomScore) throws IOException;
+
+    /** Called once for each segment with the first-pass
+     * scorer. */
+    public abstract void setScorer(Scorer scorer) throws IOException;
+
+    /** Called for each segment; e.g., perhaps you need
+     * to load some doc values fields for this reader. */
+    public abstract void setNextReader(AtomicReaderContext context) throws IOException;
+  }
+
+  /** Creates a collector that keeps the {@code numHits}
+   * best hits, as scored by {@code rescorer}. */
+  public RescoringCollector(Rescorer rescorer, int numHits) {
+    super(new HitQueue(numHits, true));
+    this.rescorer = rescorer;
+
+    // HitQueue implements getSentinelObject to return a ScoreDoc, so we know
+    // that at this point top() is already initialized.
+    pqTop = pq.top();
+  }
+
+  @Override
+  public void setScorer(Scorer scorer) throws IOException {
+    rescorer.setScorer(scorer);
+  }
+
+  @Override
+  public void setNextReader(AtomicReaderContext context) throws IOException {
+    // collect() receives segment-relative doc IDs; remember
+    // this segment's offset so they can be re-based:
+    docBase = context.docBase;
+    rescorer.setNextReader(context);
+  }
+
+  @Override
+  public boolean acceptsDocsOutOfOrder() {
+    // Return false so we force "doc at a time"
+    // BooleanScorer2 to be used:
+    return false;
+  }
+
+  @Override
+  public void collect(int doc) throws IOException {
+    float score = rescorer.score(doc, pqTop.score);
+    assert Float.isNaN(score) == false;
+    // NEGATIVE_INFINITY (the "skip this hit" return value)
+    // needs no special case: it is never greater than
+    // pqTop.score, so the check below rejects it.
+
+    totalHits++;
+    if (score <= pqTop.score) {
+      // Since we are collecting docs in order, if the score
+      // ties with the worst score in the queue then its
+      // docID will be higher and it cannot compete:
+      return;
+    }
+    pqTop.doc = doc + docBase;
+    pqTop.score = score;
+    pqTop = pq.updateTop();
+  }
+}

Property changes on: lucene/core/src/java/org/apache/lucene/search/RescoringCollector.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java (revision 0)
+++ lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java (working copy)
@@ -0,0 +1,273 @@
+package org.apache.lucene.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.util.PriorityQueue;
+
+// nocommit payloads?
+
+/** A {@link RescoringCollector.Rescorer} that boosts scores
+ * according to proximity of the term occurrences. Note
+ * that you must use {@link ProxTermQuery}, instead of
+ * {@link TermQuery}, for those terms that should
+ * participate in the proximity boosting for the query.
+ *
+ * @lucene.experimental */
+
+public class ProximityRescorer extends RescoringCollector.Rescorer {
+
+  // NOTE(review): raw List; the element type is not visible
+  // in this patch (presumably PosEnumAndPos) -- confirm.
+  final List posEnums = new ArrayList();
+  private Scorer scorer;
+  private ProxScorerQueue proxQueue;
+  private ProxScorer proxScorer;
+
+  // Query terms, (roughly?) in order that they occurred in
+  // the query:
+  private final Term[] queryTerms;
+  private final String queryField;
+
+  /** Computes the actual boosting due to proximity
+   * effects. */
+  public static abstract class ProxScorer {
+
+    /** Called once for each term occurrence in the current
+     * hit.
+     * @param pos The position of the current term's
+     * occurrence.
+     * @param termOrd Which term occurred; termOrd=0 is the
+     * first term added to the query, termOrd=1 is
+     * the second, etc. */
+    public abstract void addPosition(int pos, int termOrd);
+
+    /** Called after all positions have been visited from
+     * the current hit. This should fold in the proximity
+     * boost over the provided query score, and return the
+     * total score, and then reset the class in
+     * preparation for the next hit. */
+    public abstract float score(float queryScore);
+
+    /** Return true if the queryScore, after proximity
+     * boosting, could be competitive, i.e. may score
+     * better than bottomCombinedScore. */
+    public abstract boolean competes(float queryScore, float bottomCombinedScore);
+  }
+
+  /** Creates a rescorer for the terms extracted from the
+   * provided query.
+   * @throws IllegalArgumentException if the terms are not
+   * all in the same field. */
+  public ProximityRescorer(Query query) {
+    Set<Term> terms = new LinkedHashSet<Term>();
+    // nocommit is this "really" in order?
+    query.extractTerms(terms);
+    queryTerms = terms.toArray(new Term[terms.size()]);
+    // nocommit can/should we support multiple fields?
+    if (queryTerms.length > 0) {
+      String field = queryTerms[0].field();
+      for(Term term : queryTerms) {
+        if (term.field().equals(field) == false) {
+          throw new IllegalArgumentException("all query terms must be in the same field");
+        }
+      }
+      this.queryField = field;
+    } else {
+      this.queryField = null;
+    }
+  }
+
+  // nocommit also ctors where app can specify terms /
+  // "prox information need" (which pairs to score, what max
+  // window size, etc.)
+
+  /** One query term's positions enum, plus its state
+   * within the current document. */
+  private static class PosEnumAndPos {
+    final int termOrd;
+    final DocsAndPositionsEnum posEnum;
+    int pos;       // Most recent position read from posEnum
+    int posUpto;   // How many positions have been read in the current doc
+    int posLimit;  // posEnum.freq() for the current doc
+
+    public PosEnumAndPos(int termOrd, DocsAndPositionsEnum posEnum) {
+      this.termOrd = termOrd;
+      this.posEnum = posEnum;
+    }
+  }
+
+  // nocommit total hack!!
+  private static class DefaultProxScorer extends ProxScorer {
+    // Previous position seen in the current hit, or -1 if none yet:
+    int lastPos = -1;
+    float boost;
+
+    @Override
+    public void addPosition(int pos, int termOrd) {
+      if (lastPos != -1 && pos > lastPos) {
+        // Multiply as float so a large gap cannot overflow int:
+        float distance = pos-lastPos;
+        boost += 1.0f/(distance*distance);
+      }
+      lastPos = pos;
+    }
+
+    @Override
+    public float score(float queryScore) {
+      float score = queryScore + boost;
+      // Reset for the next hit:
+      boost = 0f;
+      lastPos = -1;
+      return score;
+    }
+
+    @Override
+    public boolean competes(float queryScore, float bottomCombinedScore) {
+      // nocommit fixme to sometimes return false
+      return true;
+    }
+  }
+
+  /** Returns a {@link ProxScorer} that computes proximity
+   * boosts; subclass can override this to change how
+   * proximity effects are scored. */
+  protected ProxScorer getProxScorer(AtomicReaderContext context) {
+    // nocommit what default? this is hack:
+    return new DefaultProxScorer();
+  }
+
+  @Override
+  public void setScorer(Scorer scorer) {
+    this.scorer = scorer;
+  }
+
+  @Override
+  public void setNextReader(AtomicReaderContext context) throws IOException {
+    posEnums.clear();
+    AtomicReader reader = context.reader();
+    Terms terms = reader.fields().terms(queryField);
+    if (terms != null) {
+      TermsEnum termsEnum = terms.iterator(null);
+      // NOTE(review): the text of this patch appears to be
+      // truncated on the next line (the rest of this loop
+      // through the header of the queue class below is
+      // missing) -- recover it from the original patch.
+      for(int termOrd=0;termOrd {
+
+    final ProxScorer proxScorer;
+
+    public ProxScorerQueue(int numScorers, ProxScorer proxScorer) {
+      super(numScorers);
+      this.proxScorer = proxScorer;
+    }
+
+    @Override
+    protected boolean lessThan(PosEnumAndPos a, PosEnumAndPos b) {
+      return a.pos < b.pos;
+    }
+
+    /** Positions the enum on its first position in the
+     * current doc and adds it to the queue. */
+    public void addEnum(PosEnumAndPos posEnum) throws IOException {
+      // All enums should be on the same doc:
+      assert size() == 0 || posEnum.posEnum.docID() == top().posEnum.docID();
+
+      posEnum.posLimit = posEnum.posEnum.freq();
+      posEnum.pos = posEnum.posEnum.nextPosition();
+      posEnum.posUpto = 1;
+
+      // nocommit need test w/ syns @ same position as other tokens
+
+      super.add(posEnum);
+    }
+
+    /** Visits all positions of the added enums, smallest
+     * position first, invoking {@code
+     * ProxScorer.addPosition} for each. */
+    public void visitPositions() throws IOException {
+      while (true) {
+        PosEnumAndPos min = top();
+        if (min == null) {
+          break;
+        }
+        proxScorer.addPosition(min.pos, min.termOrd);
+
+        if (min.posUpto < min.posLimit) {
+          min.pos = min.posEnum.nextPosition();
+          min.posUpto++;
+          updateTop();
+        } else {
+          pop();
+        }
+      }
+    }
+  }
+}

Property changes on: lucene/core/src/java/org/apache/lucene/search/ProximityRescorer.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property