Index: solr/src/test/org/apache/solr/schema/IndexSchemaTest.java
===================================================================
--- solr/src/test/org/apache/solr/schema/IndexSchemaTest.java (revision 1059498)
+++ solr/src/test/org/apache/solr/schema/IndexSchemaTest.java (working copy)
@@ -28,6 +28,7 @@
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.SimilarityProvider;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -83,7 +84,7 @@
@Test
public void testSimilarityFactory() {
SolrCore core = h.getCore();
- Similarity similarity = core.getSchema().getSimilarity();
+ SimilarityProvider similarity = core.getSchema().getSimilarity();
assertTrue("wrong class", similarity instanceof MockConfigurableSimilarity);
assertEquals("is there an echo?", ((MockConfigurableSimilarity)similarity).getPassthrough());
}
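
For the instanceof assertion above to keep passing, MockConfigurableSimilarity has to be a SimilarityProvider under the new API. A minimal sketch of the shape that implies, assuming DefaultSimilarity implements SimilarityProvider as it does elsewhere in this patch; the body here is illustrative, not the actual test class:

    public class MockConfigurableSimilarity extends DefaultSimilarity {
      private final String passthrough;

      public MockConfigurableSimilarity(String passthrough) {
        this.passthrough = passthrough;
      }

      // echoed back by testSimilarityFactory() above ("is there an echo?")
      public String getPassthrough() {
        return passthrough;
      }
    }
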
Index: solr/src/java/org/apache/solr/schema/IndexSchema.java
===================================================================
--- solr/src/java/org/apache/solr/schema/IndexSchema.java (revision 1059498)
+++ solr/src/java/org/apache/solr/schema/IndexSchema.java (working copy)
@@ -20,7 +20,9 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.Version;
import org.apache.solr.common.ResourceLoader;
@@ -192,7 +194,7 @@
/**
* Returns the Similarity used for this index
*/
- public Similarity getSimilarity() { return similarityFactory.getSimilarity(); }
+ public SimilarityProvider getSimilarity() { return similarityFactory.getSimilarity(); }
/**
* Returns the SimilarityFactory used for this index
@@ -496,8 +498,8 @@
Node node = (Node) xpath.evaluate("/schema/similarity", document, XPathConstants.NODE);
if (node==null) {
similarityFactory = new SimilarityFactory() {
- public Similarity getSimilarity() {
- return Similarity.getDefault();
+ public SimilarityProvider getSimilarity() {
+ return IndexSearcher.getDefaultProvider();
}
};
log.debug("using default similarity");
@@ -509,10 +511,10 @@
similarityFactory = (SimilarityFactory)obj;
similarityFactory.init(params);
} else {
- // just like always, assume it's a Similarlity and get a ClassCastException - reasonable error handling
+ // just like always, assume it's a SimilarityProvider and get a ClassCastException - reasonable error handling
similarityFactory = new SimilarityFactory() {
- public Similarity getSimilarity() {
- return (Similarity) obj;
+ public SimilarityProvider getSimilarity() {
+ return (SimilarityProvider) obj;
}
};
}
Index: solr/src/java/org/apache/solr/schema/LatLonType.java
===================================================================
--- solr/src/java/org/apache/solr/schema/LatLonType.java (revision 1059498)
+++ solr/src/java/org/apache/solr/schema/LatLonType.java (working copy)
@@ -371,7 +371,7 @@
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
- return new SpatialScorer(getSimilarity(searcher), context, this);
+ return new SpatialScorer(context, this);
}
@Override
@@ -404,8 +404,7 @@
int lastDistDoc;
double lastDist;
- public SpatialScorer(Similarity similarity, AtomicReaderContext readerContext, SpatialWeight w) throws IOException {
- super(similarity);
+ public SpatialScorer(AtomicReaderContext readerContext, SpatialWeight w) throws IOException {
this.weight = w;
this.qWeight = w.getValue();
this.reader = readerContext.reader;
Index: solr/src/java/org/apache/solr/schema/SimilarityFactory.java
===================================================================
--- solr/src/java/org/apache/solr/schema/SimilarityFactory.java (revision 1059498)
+++ solr/src/java/org/apache/solr/schema/SimilarityFactory.java (working copy)
@@ -16,7 +16,7 @@
* limitations under the License.
*/
-import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.SimilarityProvider;
import org.apache.solr.common.params.SolrParams;
public abstract class SimilarityFactory {
@@ -25,5 +25,5 @@
public void init(SolrParams params) { this.params = params; }
public SolrParams getParams() { return params; }
- public abstract Similarity getSimilarity();
+ public abstract SimilarityProvider getSimilarity();
}
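
A custom factory under this new contract returns a provider rather than a single Similarity. A minimal sketch, assuming the SimilarityProvider/FieldSimilarity API introduced by this patch (the class name and the flat 1f values are illustrative placeholders):

    public class CustomSimilarityFactory extends SimilarityFactory {
      @Override
      public SimilarityProvider getSimilarity() {
        return new SimilarityProvider() {
          // query-level factors stay on the provider
          public float coord(int overlap, int maxOverlap) { return 1f; }
          public float queryNorm(float sumOfSquaredWeights) { return 1f; }
          // per-field scoring is delegated field by field
          public FieldSimilarity get(String field) {
            return new DefaultSimilarity().get(field);
          }
        };
      }
    }
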
Index: solr/src/java/org/apache/solr/search/function/FunctionQuery.java
===================================================================
--- solr/src/java/org/apache/solr/search/function/FunctionQuery.java (revision 1059498)
+++ solr/src/java/org/apache/solr/search/function/FunctionQuery.java (working copy)
@@ -95,7 +95,7 @@
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
- return new AllScorer(getSimilarity(searcher), context, this);
+ return new AllScorer(context, this);
}
@Override
@@ -114,8 +114,7 @@
final boolean hasDeletions;
final Bits delDocs;
- public AllScorer(Similarity similarity, AtomicReaderContext context, FunctionWeight w) throws IOException {
- super(similarity);
+ public AllScorer(AtomicReaderContext context, FunctionWeight w) throws IOException {
this.weight = w;
this.qWeight = w.getValue();
this.reader = context.reader;
Index: solr/src/java/org/apache/solr/search/function/IDFValueSource.java
===================================================================
--- solr/src/java/org/apache/solr/search/function/IDFValueSource.java (revision 1059498)
+++ solr/src/java/org/apache/solr/search/function/IDFValueSource.java (working copy)
@@ -19,6 +19,7 @@
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.FieldSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.BytesRef;
@@ -41,7 +42,7 @@
@Override
public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException {
IndexSearcher searcher = (IndexSearcher)context.get("searcher");
- Similarity sim = searcher.getSimilarity();
+ FieldSimilarity sim = searcher.getSimilarity().get(field);
// todo: we need docFreq that takes a BytesRef
String strVal = ByteUtils.UTF8toUTF16(indexedBytes);
int docfreq = searcher.docFreq(new Term(indexedField, strVal));
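
TFValueSource and NormValueSource below receive the same treatment; the recurring migration is a one-liner, sketched here with field standing in for the value source's field name:

    // before: one global Similarity governed scoring for all fields
    Similarity sim = ((IndexSearcher)context.get("searcher")).getSimilarity();
    // after: the provider hands back the similarity for this specific field
    FieldSimilarity fieldSim = ((IndexSearcher)context.get("searcher")).getSimilarity().get(field);
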
Index: solr/src/java/org/apache/solr/search/function/BoostedQuery.java
===================================================================
--- solr/src/java/org/apache/solr/search/function/BoostedQuery.java (revision 1059498)
+++ solr/src/java/org/apache/solr/search/function/BoostedQuery.java (working copy)
@@ -96,7 +96,7 @@
if(subQueryScorer == null) {
return null;
}
- return new BoostedQuery.CustomScorer(getSimilarity(searcher), context, this, subQueryScorer, boostVal);
+ return new BoostedQuery.CustomScorer(context, this, subQueryScorer, boostVal);
}
@Override
@@ -123,9 +123,8 @@
private final DocValues vals;
private final AtomicReaderContext readerContext;
- private CustomScorer(Similarity similarity, AtomicReaderContext readerContext, BoostedQuery.BoostedWeight w,
+ private CustomScorer(AtomicReaderContext readerContext, BoostedQuery.BoostedWeight w,
Scorer scorer, ValueSource vs) throws IOException {
- super(similarity);
this.weight = w;
this.qWeight = w.getValue();
this.scorer = scorer;
Index: solr/src/java/org/apache/solr/search/function/TFValueSource.java
===================================================================
--- solr/src/java/org/apache/solr/search/function/TFValueSource.java (revision 1059498)
+++ solr/src/java/org/apache/solr/search/function/TFValueSource.java (working copy)
@@ -3,8 +3,8 @@
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.FieldSimilarity;
import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
@@ -25,7 +25,7 @@
public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException {
Fields fields = readerContext.reader.fields();
final Terms terms = fields.terms(field);
- final Similarity similarity = ((IndexSearcher)context.get("searcher")).getSimilarity();
+ final FieldSimilarity similarity = ((IndexSearcher)context.get("searcher")).getSimilarity().get(field);
return new FloatDocValues(this) {
DocsEnum docs ;
Index: solr/src/java/org/apache/solr/search/function/NormValueSource.java
===================================================================
--- solr/src/java/org/apache/solr/search/function/NormValueSource.java (revision 1059498)
+++ solr/src/java/org/apache/solr/search/function/NormValueSource.java (working copy)
@@ -18,6 +18,7 @@
package org.apache.solr.search.function;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.search.FieldSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import java.io.IOException;
@@ -46,7 +47,7 @@
@Override
public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException {
IndexSearcher searcher = (IndexSearcher)context.get("searcher");
- final Similarity similarity = searcher.getSimilarity();
+ final FieldSimilarity similarity = searcher.getSimilarity().get(field);
final byte[] norms = readerContext.reader.norms(field);
if (norms == null) {
return new ConstDoubleDocValues(0.0, this);
Index: solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java
===================================================================
--- solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java (revision 1059498)
+++ solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java (working copy)
@@ -55,13 +55,11 @@
}
protected class ConstantWeight extends Weight {
- private Similarity similarity;
private float queryNorm;
private float queryWeight;
private Map context;
public ConstantWeight(IndexSearcher searcher) throws IOException {
- this.similarity = getSimilarity(searcher);
this.context = ValueSource.newContext(searcher);
if (filter instanceof SolrFilter)
((SolrFilter)filter).createWeight(context, searcher);
@@ -91,13 +89,13 @@
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
- return new ConstantScorer(similarity, context, this);
+ return new ConstantScorer(context, this);
}
@Override
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
- ConstantScorer cs = new ConstantScorer(similarity, context, this);
+ ConstantScorer cs = new ConstantScorer(context, this);
boolean exists = cs.docIdSetIterator.advance(doc) == doc;
ComplexExplanation result = new ComplexExplanation();
@@ -124,8 +122,7 @@
final float theScore;
int doc = -1;
- public ConstantScorer(Similarity similarity, AtomicReaderContext context, ConstantWeight w) throws IOException {
- super(similarity);
+ public ConstantScorer(AtomicReaderContext context, ConstantWeight w) throws IOException {
theScore = w.getValue();
DocIdSet docIdSet = filter instanceof SolrFilter ? ((SolrFilter)filter).getDocIdSet(w.context, context) : filter.getDocIdSet(context);
if (docIdSet == null) {
Index: lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java (revision 1059498)
+++ lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java (working copy)
@@ -25,6 +25,7 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanQuery.BooleanWeight;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@@ -68,8 +69,14 @@
// 'more' variable to work properly, and this test ensures that if the logic
// changes, we have a test to back it up.
- Similarity sim = Similarity.getDefault();
- Scorer[] scorers = new Scorer[] {new Scorer(sim) {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random, directory);
+ writer.commit();
+ IndexReader ir = writer.getReader();
+ writer.close();
+ IndexSearcher searcher = new IndexSearcher(ir);
+
+ Scorer[] scorers = new Scorer[] {new Scorer() {
private int doc = -1;
@Override public float score() throws IOException { return 0; }
@Override public int docID() { return doc; }
@@ -83,10 +90,15 @@
}
}};
- BooleanScorer bs = new BooleanScorer(null, false, sim, 1, Arrays.asList(scorers), null, scorers.length);
+ BooleanWeight weight = (BooleanWeight) new BooleanQuery().createWeight(searcher);
+ BooleanScorer bs = new BooleanScorer(weight, false, 1, Arrays.asList(scorers), null, scorers.length);
assertEquals("should have received 3000", 3000, bs.nextDoc());
assertEquals("should have received NO_MORE_DOCS", DocIdSetIterator.NO_MORE_DOCS, bs.nextDoc());
+ searcher.close();
+ ir.close();
+ directory.close();
+
}
}
Index: lucene/src/test/org/apache/lucene/search/spans/TestSpans.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/spans/TestSpans.java (revision 1059498)
+++ lucene/src/test/org/apache/lucene/search/spans/TestSpans.java (working copy)
@@ -23,6 +23,7 @@
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Weight.ScorerContext;
@@ -410,20 +411,21 @@
}
};
- SpanNearQuery snq = new SpanNearQuery(
+ final SimilarityProvider oldSim = searcher.getSimilarity();
+ Scorer spanScorer;
+ try {
+ searcher.setSimilarity(sim);
+ SpanNearQuery snq = new SpanNearQuery(
new SpanQuery[] {
makeSpanTermQuery("t1"),
makeSpanTermQuery("t2") },
slop,
- ordered) {
- @Override
- public Similarity getSimilarity(IndexSearcher s) {
- return sim;
- }
- };
+ ordered);
- Scorer spanScorer = snq.weight(searcher).scorer(new AtomicReaderContext(new SlowMultiReaderWrapper(searcher.getIndexReader())), ScorerContext.def());
-
+ spanScorer = snq.weight(searcher).scorer(new AtomicReaderContext(new SlowMultiReaderWrapper(searcher.getIndexReader())), ScorerContext.def());
+ } finally {
+ searcher.setSimilarity(oldSim);
+ }
assertTrue("first doc", spanScorer.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals("first doc number", spanScorer.docID(), 11);
float score = spanScorer.score();
Index: lucene/src/test/org/apache/lucene/search/TestSetNorm.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestSetNorm.java (revision 1059498)
+++ lucene/src/test/org/apache/lucene/search/TestSetNorm.java (working copy)
@@ -51,10 +51,11 @@
// reset the boost of each instance of this document
IndexReader reader = IndexReader.open(store, false);
- reader.setNorm(0, "field", Similarity.getDefault().encodeNormValue(1.0f));
- reader.setNorm(1, "field", Similarity.getDefault().encodeNormValue(2.0f));
- reader.setNorm(2, "field", Similarity.getDefault().encodeNormValue(4.0f));
- reader.setNorm(3, "field", Similarity.getDefault().encodeNormValue(16.0f));
+ FieldSimilarity similarity = new DefaultSimilarity().get("field");
+ reader.setNorm(0, "field", similarity.encodeNormValue(1.0f));
+ reader.setNorm(1, "field", similarity.encodeNormValue(2.0f));
+ reader.setNorm(2, "field", similarity.encodeNormValue(4.0f));
+ reader.setNorm(3, "field", similarity.encodeNormValue(16.0f));
reader.close();
// check that searches are ordered by this boost
Index: lucene/src/test/org/apache/lucene/search/TestBoolean2.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestBoolean2.java (revision 1059498)
+++ lucene/src/test/org/apache/lucene/search/TestBoolean2.java (working copy)
@@ -208,7 +208,7 @@
public void testQueries10() throws Exception {
String queryText = "+w3 +xx +w2 zz";
int[] expDocNrs = {2, 3};
- Similarity oldSimilarity = searcher.getSimilarity();
+ SimilarityProvider oldSimilarity = searcher.getSimilarity();
try {
searcher.setSimilarity(new DefaultSimilarity(){
@Override
Index: lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java (revision 1059498)
+++ lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java (working copy)
@@ -69,7 +69,7 @@
assertEquals("one", ir.document(hits[2].doc).get("key"));
// change norm & retest
- ir.setNorm(0, "key", Similarity.getDefault().encodeNormValue(400f));
+ ir.setNorm(0, "key", is.getSimilarity().get("key").encodeNormValue(400f));
normsQuery = new MatchAllDocsQuery("key");
hits = is.search(normsQuery, null, 1000).scoreDocs;
assertEquals(3, hits.length);
Index: lucene/src/test/org/apache/lucene/search/JustCompileSearch.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/JustCompileSearch.java (revision 1059498)
+++ lucene/src/test/org/apache/lucene/search/JustCompileSearch.java (working copy)
@@ -188,7 +188,7 @@
static final class JustCompilePhraseScorer extends PhraseScorer {
JustCompilePhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
- Similarity similarity, byte[] norms) {
+ FieldSimilarity similarity, byte[] norms) {
super(weight, postings, similarity, norms);
}
@@ -210,8 +210,7 @@
static final class JustCompileScorer extends Scorer {
- protected JustCompileScorer(Similarity similarity) {
- super(similarity);
+ protected JustCompileScorer() {
}
@Override
Index: lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java (revision 0)
+++ lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java (revision 0)
@@ -0,0 +1,154 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.MultiNorms;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestSimilarityProvider extends LuceneTestCase {
+ private Directory directory;
+ private IndexReader reader;
+ private IndexSearcher searcher;
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ directory = newDirectory();
+ SimilarityProvider sim = new ExampleSimilarityProvider();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT,
+ new MockAnalyzer()).setSimilarity(sim);
+ RandomIndexWriter iw = new RandomIndexWriter(random, directory, iwc);
+ Document doc = new Document();
+ Field field = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
+ doc.add(field);
+ Field field2 = newField("bar", "", Field.Store.NO, Field.Index.ANALYZED);
+ doc.add(field2);
+
+ field.setValue("quick brown fox");
+ field2.setValue("quick brown fox");
+ iw.addDocument(doc);
+ field.setValue("jumps over lazy brown dog");
+ field2.setValue("jumps over lazy brown dog");
+ iw.addDocument(doc);
+ reader = iw.getReader();
+ iw.close();
+ searcher = new IndexSearcher(reader);
+ searcher.setSimilarity(sim);
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ searcher.close();
+ reader.close();
+ directory.close();
+ super.tearDown();
+ }
+
+ public void testBasics() throws Exception {
+ // sanity check of norms writer
+ byte fooNorms[] = MultiNorms.norms(reader, "foo");
+ byte barNorms[] = MultiNorms.norms(reader, "bar");
+ for (int i = 0; i < fooNorms.length; i++) {
+ assertFalse(fooNorms[i] == barNorms[i]);
+ }
+
+ // sanity check of searching
+ TopDocs foodocs = searcher.search(new TermQuery(new Term("foo", "brown")), 10);
+ assertTrue(foodocs.totalHits > 0);
+ TopDocs bardocs = searcher.search(new TermQuery(new Term("bar", "brown")), 10);
+ assertTrue(bardocs.totalHits > 0);
+ assertTrue(foodocs.scoreDocs[0].score < bardocs.scoreDocs[0].score);
+ }
+
+ private class ExampleSimilarityProvider implements SimilarityProvider {
+ private FieldSimilarity sim1 = new Sim1();
+ private FieldSimilarity sim2 = new Sim2();
+
+ @Override
+ public float coord(int overlap, int maxOverlap) {
+ return 1f;
+ }
+
+ @Override
+ public float queryNorm(float sumOfSquaredWeights) {
+ return 1f;
+ }
+
+ @Override
+ public FieldSimilarity get(String field) {
+ if (field.equals("foo")) {
+ return sim1;
+ } else {
+ return sim2;
+ }
+ }
+ }
+
+ private class Sim1 extends FieldSimilarity {
+ @Override
+ public float computeNorm(String field, FieldInvertState state) {
+ return 1f;
+ }
+
+ @Override
+ public float sloppyFreq(int distance) {
+ return 1f;
+ }
+
+ @Override
+ public float tf(float freq) {
+ return 1f;
+ }
+
+ @Override
+ public float idf(int docFreq, int numDocs) {
+ return 1f;
+ }
+ }
+
+ private class Sim2 extends FieldSimilarity {
+ @Override
+ public float computeNorm(String field, FieldInvertState state) {
+ return 10f;
+ }
+
+ @Override
+ public float sloppyFreq(int distance) {
+ return 10f;
+ }
+
+ @Override
+ public float tf(float freq) {
+ return 10f;
+ }
+
+ @Override
+ public float idf(int docFreq, int numDocs) {
+ return 10f;
+ }
+ }
+}
Property changes on: lucene\src\test\org\apache\lucene\search\TestSimilarityProvider.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java (revision 1059498)
+++ lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java (working copy)
@@ -18,6 +18,8 @@
*/
import org.apache.lucene.index.SegmentReader.Norm;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.FieldSimilarity;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -272,13 +274,14 @@
* @throws Exception
*/
private void performDefaultTests(IndexReader r1) throws Exception {
- float norm1 = Similarity.getDefault().decodeNormValue(MultiNorms.norms(r1, "field1")[4]);
+ FieldSimilarity sim = new DefaultSimilarity().get("field1");
+ float norm1 = sim.decodeNormValue(MultiNorms.norms(r1, "field1")[4]);
IndexReader pr1Clone = (IndexReader) r1.clone();
pr1Clone.deleteDocument(10);
- pr1Clone.setNorm(4, "field1", Similarity.getDefault().encodeNormValue(0.5f));
- assertTrue(Similarity.getDefault().decodeNormValue(MultiNorms.norms(r1, "field1")[4]) == norm1);
- assertTrue(Similarity.getDefault().decodeNormValue(MultiNorms.norms(pr1Clone, "field1")[4]) != norm1);
+ pr1Clone.setNorm(4, "field1", sim.encodeNormValue(0.5f));
+ assertTrue(sim.decodeNormValue(MultiNorms.norms(r1, "field1")[4]) == norm1);
+ assertTrue(sim.decodeNormValue(MultiNorms.norms(pr1Clone, "field1")[4]) != norm1);
final Bits delDocs = MultiFields.getDeletedDocs(r1);
assertTrue(delDocs == null || !delDocs.get(10));
@@ -327,7 +330,8 @@
TestIndexReaderReopen.createIndex(random, dir1, false);
SegmentReader origSegmentReader = getOnlySegmentReader(IndexReader.open(dir1, false));
origSegmentReader.deleteDocument(1);
- origSegmentReader.setNorm(4, "field1", Similarity.getDefault().encodeNormValue(0.5f));
+ FieldSimilarity sim = new DefaultSimilarity().get("field1");
+ origSegmentReader.setNorm(4, "field1", sim.encodeNormValue(0.5f));
SegmentReader clonedSegmentReader = (SegmentReader) origSegmentReader
.clone();
@@ -426,8 +430,9 @@
final Directory dir1 = newDirectory();
TestIndexReaderReopen.createIndex(random, dir1, false);
IndexReader orig = IndexReader.open(dir1, false);
- orig.setNorm(1, "field1", Similarity.getDefault().encodeNormValue(17.0f));
- final byte encoded = Similarity.getDefault().encodeNormValue(17.0f);
+ FieldSimilarity sim = new DefaultSimilarity().get("field1");
+ orig.setNorm(1, "field1", sim.encodeNormValue(17.0f));
+ final byte encoded = sim.encodeNormValue(17.0f);
assertEquals(encoded, MultiNorms.norms(orig, "field1")[1]);
// the cloned segmentreader should have 2 references, 1 to itself, and 1 to
Index: lucene/src/test/org/apache/lucene/index/TestIndexReader.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestIndexReader.java (revision 1059498)
+++ lucene/src/test/org/apache/lucene/index/TestIndexReader.java (working copy)
@@ -39,8 +39,10 @@
import org.apache.lucene.index.IndexReader.FieldOption;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
+import org.apache.lucene.search.FieldSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Similarity;
@@ -464,8 +466,9 @@
// expected
}
+ FieldSimilarity sim = new DefaultSimilarity().get("aaa");
try {
- reader.setNorm(5, "aaa", Similarity.getDefault().encodeNormValue(2.0f));
+ reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f));
fail("setNorm after close failed to throw IOException");
} catch (AlreadyClosedException e) {
// expected
@@ -504,8 +507,9 @@
// expected
}
+ FieldSimilarity sim = new DefaultSimilarity().get("aaa");
try {
- reader.setNorm(5, "aaa", Similarity.getDefault().encodeNormValue(2.0f));
+ reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f));
fail("setNorm should have hit LockObtainFailedException");
} catch (LockObtainFailedException e) {
// expected
@@ -535,7 +539,8 @@
// now open reader & set norm for doc 0
IndexReader reader = IndexReader.open(dir, false);
- reader.setNorm(0, "content", Similarity.getDefault().encodeNormValue(2.0f));
+ FieldSimilarity sim = new DefaultSimilarity().get("content");
+ reader.setNorm(0, "content", sim.encodeNormValue(2.0f));
// we should be holding the write lock now:
assertTrue("locked", IndexWriter.isLocked(dir));
@@ -549,7 +554,7 @@
IndexReader reader2 = IndexReader.open(dir, false);
// set norm again for doc 0
- reader.setNorm(0, "content", Similarity.getDefault().encodeNormValue(3.0f));
+ reader.setNorm(0, "content", sim.encodeNormValue(3.0f));
assertTrue("locked", IndexWriter.isLocked(dir));
reader.close();
@@ -579,15 +584,16 @@
addDoc(writer, searchTerm.text());
writer.close();
+ FieldSimilarity sim = new DefaultSimilarity().get("content");
// now open reader & set norm for doc 0 (writes to
// _0_1.s0)
reader = IndexReader.open(dir, false);
- reader.setNorm(0, "content", Similarity.getDefault().encodeNormValue(2.0f));
+ reader.setNorm(0, "content", sim.encodeNormValue(2.0f));
reader.close();
// now open reader again & set norm for doc 0 (writes to _0_2.s0)
reader = IndexReader.open(dir, false);
- reader.setNorm(0, "content", Similarity.getDefault().encodeNormValue(2.0f));
+ reader.setNorm(0, "content", sim.encodeNormValue(2.0f));
reader.close();
assertFalse("failed to remove first generation norms file on writing second generation",
dir.fileExists("_0_1.s0"));
@@ -949,13 +955,13 @@
dir.setMaxSizeInBytes(thisDiskFree);
dir.setRandomIOExceptionRate(rate);
-
+ FieldSimilarity sim = new DefaultSimilarity().get("content");
try {
if (0 == x) {
int docId = 12;
for(int i=0;i<13;i++) {
reader.deleteDocument(docId);
- reader.setNorm(docId, "content", Similarity.getDefault().encodeNormValue(2.0f));
+ reader.setNorm(docId, "content", sim.encodeNormValue(2.0f));
docId += 12;
}
}
@@ -1113,8 +1119,9 @@
}
reader = IndexReader.open(dir, false);
+ FieldSimilarity sim = new DefaultSimilarity().get("content");
try {
- reader.setNorm(1, "content", Similarity.getDefault().encodeNormValue(2.0f));
+ reader.setNorm(1, "content", sim.encodeNormValue(2.0f));
fail("did not hit exception when calling setNorm on an invalid doc number");
} catch (ArrayIndexOutOfBoundsException e) {
// expected
Index: lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java (revision 1059498)
+++ lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java (working copy)
@@ -18,6 +18,8 @@
*/
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.FieldSimilarity;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
@@ -67,9 +69,9 @@
Term searchTerm = new Term("id", "7");
int delCount = reader.deleteDocuments(searchTerm);
assertEquals("didn't delete the right number of documents", 1, delCount);
-
+ FieldSimilarity sim = new DefaultSimilarity().get("content");
// Set one norm so we get a .s0 file:
- reader.setNorm(21, "content", Similarity.getDefault().encodeNormValue(1.5f));
+ reader.setNorm(21, "content", sim.encodeNormValue(1.5f));
reader.close();
// Now, artificially create an extra .del file & extra
Index: lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java (revision 1059498)
+++ lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java (working copy)
@@ -29,6 +29,7 @@
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
@@ -67,7 +68,8 @@
assertEquals(IndexWriterConfig.UNLIMITED_FIELD_LENGTH, conf.getMaxFieldLength());
assertEquals(ConcurrentMergeScheduler.class, conf.getMergeScheduler().getClass());
assertEquals(OpenMode.CREATE_OR_APPEND, conf.getOpenMode());
- assertTrue(Similarity.getDefault() == conf.getSimilarity());
+ // we don't need to assert this, it should be unspecified
+ assertTrue(IndexSearcher.getDefaultProvider() == conf.getSimilarity());
assertEquals(IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL, conf.getTermIndexInterval());
assertEquals(IndexWriterConfig.getDefaultWriteLockTimeout(), conf.getWriteLockTimeout());
assertEquals(IndexWriterConfig.WRITE_LOCK_TIMEOUT, IndexWriterConfig.getDefaultWriteLockTimeout());
@@ -186,12 +188,13 @@
conf.setMergeScheduler(null);
assertEquals(ConcurrentMergeScheduler.class, conf.getMergeScheduler().getClass());
- // Test Similarity
- assertTrue(Similarity.getDefault() == conf.getSimilarity());
+ // Test Similarity:
+ // we shouldn't assert what the default is, just that it's not null.
+ assertTrue(IndexSearcher.getDefaultProvider() == conf.getSimilarity());
conf.setSimilarity(new MySimilarity());
assertEquals(MySimilarity.class, conf.getSimilarity().getClass());
conf.setSimilarity(null);
- assertTrue(Similarity.getDefault() == conf.getSimilarity());
+ assertTrue(IndexSearcher.getDefaultProvider() == conf.getSimilarity());
// Test IndexingChain
assertTrue(DocumentsWriter.defaultIndexingChain == conf.getIndexingChain());
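
The fallback asserted above also shapes how a provider is wired in at indexing time; a short usage sketch mirroring the test fixtures in this patch (TEST_VERSION_CURRENT, MockAnalyzer, and MySimilarity as used in the tests above):

    // a custom provider travels through IndexWriterConfig;
    // setSimilarity(null) restores IndexSearcher.getDefaultProvider()
    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())
        .setSimilarity(new MySimilarity());
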
Index: lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java (revision 1059498)
+++ lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java (working copy)
@@ -31,7 +31,9 @@
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.SegmentReader.Norm;
import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.FieldSimilarity;
import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@@ -203,19 +205,20 @@
IndexReader reader4C = (IndexReader) reader3C.clone();
SegmentReader segmentReader4C = getOnlySegmentReader(reader4C);
assertEquals(4, reader3CCNorm.bytesRef().get());
- reader4C.setNorm(5, "field1", Similarity.getDefault().encodeNormValue(0.33f));
+ FieldSimilarity sim = new DefaultSimilarity().get("field1");
+ reader4C.setNorm(5, "field1", sim.encodeNormValue(0.33f));
// generate a cannot update exception in reader1
try {
- reader3C.setNorm(1, "field1", Similarity.getDefault().encodeNormValue(0.99f));
+ reader3C.setNorm(1, "field1", sim.encodeNormValue(0.99f));
fail("did not hit expected exception");
} catch (Exception ex) {
// expected
}
// norm values should be different
- assertTrue(Similarity.getDefault().decodeNormValue(segmentReader3C.norms("field1")[5])
- != Similarity.getDefault().decodeNormValue(segmentReader4C.norms("field1")[5]));
+ assertTrue(sim.decodeNormValue(segmentReader3C.norms("field1")[5])
+ != sim.decodeNormValue(segmentReader4C.norms("field1")[5]));
Norm reader4CCNorm = segmentReader4C.norms.get("field1");
assertEquals(3, reader3CCNorm.bytesRef().get());
assertEquals(1, reader4CCNorm.bytesRef().get());
@@ -223,7 +226,7 @@
IndexReader reader5C = (IndexReader) reader4C.clone();
SegmentReader segmentReader5C = getOnlySegmentReader(reader5C);
Norm reader5CCNorm = segmentReader5C.norms.get("field1");
- reader5C.setNorm(5, "field1", Similarity.getDefault().encodeNormValue(0.7f));
+ reader5C.setNorm(5, "field1", sim.encodeNormValue(0.7f));
assertEquals(1, reader5CCNorm.bytesRef().get());
reader5C.close();
@@ -256,8 +259,9 @@
// System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm);
modifiedNorms.set(i, Float.valueOf(newNorm));
modifiedNorms.set(k, Float.valueOf(origNorm));
- ir.setNorm(i, "f" + 1, Similarity.getDefault().encodeNormValue(newNorm));
- ir.setNorm(k, "f" + 1, Similarity.getDefault().encodeNormValue(origNorm));
+ FieldSimilarity sim = new DefaultSimilarity().get("f" + 1);
+ ir.setNorm(i, "f" + 1, sim.encodeNormValue(newNorm));
+ ir.setNorm(k, "f" + 1, sim.encodeNormValue(origNorm));
// System.out.println("setNorm i: "+i);
// break;
}
@@ -277,7 +281,8 @@
assertEquals("number of norms mismatches", numDocNorms, b.length);
ArrayList
Similarity defines the components of Lucene scoring. - * Overriding computation of these components is a convenient - * way to alter Lucene scoring. - * - *
Suggested reading: - * - * Introduction To Information Retrieval, Chapter 6. - * - *
The following describes how Lucene scoring evolves from - * underlying information retrieval models to (efficient) implementation. - * We first brief on VSM Score, - * then derive from it Lucene's Conceptual Scoring Formula, - * from which, finally, evolves Lucene's Practical Scoring Function - * (the latter is connected directly with Lucene classes and methods). - * - *
Lucene combines - * - * Boolean model (BM) of Information Retrieval - * with - * - * Vector Space Model (VSM) of Information Retrieval - - * documents "approved" by BM are scored by VSM. - * - *
In VSM, documents and queries are represented as - * weighted vectors in a multi-dimensional space, - * where each distinct index term is a dimension, - * and weights are - * Tf-idf values. - * - *
VSM does not require weights to be Tf-idf values, - * but Tf-idf values are believed to produce search results of high quality, - * and so Lucene is using Tf-idf. - * Tf and Idf are described in more detail below, - * but for now, for completion, let's just say that - * for given term t and document (or query) x, - * Tf(t,x) varies with the number of occurrences of term t in x - * (when one increases so does the other) and - * idf(t) similarly varies with the inverse of the - * number of index documents containing term t. - * - *
VSM score of document d for query q is the
- *
- * Cosine Similarity
- * of the weighted query vectors V(q) and V(d):
- *
- *
- *
- *
| ||||||
|
- * |
Note: the above equation can be viewed as the dot product of - * the normalized weighted vectors, in the sense that dividing - * V(q) by its euclidean norm is normalizing it to a unit vector. - * - *
Lucene refines VSM score for both search quality and usability: - *
Under the simplifying assumption of a single field in the index,
- * we get Lucene's Conceptual scoring formula:
- *
- *
- *
- *
| |||||||
|
- * |
The conceptual formula is a simplification in the sense that (1) terms and documents - * are fielded and (2) boosts are usually per query term rather than per query. - * - *
We now describe how Lucene implements this conceptual scoring formula, and - * derive from it Lucene's Practical Scoring Function. - * - *
For efficient score computation some scoring components - * are computed and aggregated in advance: - * - *
Lucene's Practical Scoring Function is derived from the above. - * The color codes demonstrate how it relates - * to those of the conceptual formula: - * - *
- *
- *
| |||||||
|
- * |
where - *
| - * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} = - * | - *- * frequency½ - * | - *
| - * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)} = - * | - *- * 1 + log ( - * | - *
- *
|
- * - * ) - * | - *
| - * queryNorm(q) = - * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) queryNorm(sumOfSquaredWeights)} - * = - * | - *
- *
|
- *
| - * {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights} = - * {@link org.apache.lucene.search.Query#getBoost() q.getBoost()} 2 - * · - * | - *- * ∑ - * | - *- * ( - * idf(t) · - * t.getBoost() - * ) 2 - * | - *
| - * | t in q | - *- * |
- * When a document is added to the index, all the above factors are multiplied.
- * If the document has multiple fields with the same name, all their boosts are multiplied together:
- *
- *
- *
| - * norm(t,d) = - * {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()} - * · - * lengthNorm - * · - * | - *- * ∏ - * | - *- * {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}() - * | - *
| - * | field f in d named as t | - *- * |
This is initially an instance of {@link DefaultSimilarity}. - * - * @see IndexSearcher#setSimilarity(Similarity) - * @see org.apache.lucene.index.IndexWriterConfig#setSimilarity(Similarity) - */ - public static Similarity getDefault() { - return Similarity.defaultImpl; - } - - /** Cache of decoded bytes. */ - private static final float[] NORM_TABLE = new float[256]; - - static { - for (int i = 0; i < 256; i++) - NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); - } - - /** Decodes a normalization factor stored in an index. - * @see #encodeNormValue(float) - */ - public float decodeNormValue(byte b) { - return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127 - } - - /** - * Computes the normalization value for a field, given the accumulated - * state of term processing for this field (see {@link FieldInvertState}). - * - *
Implementations should calculate a float value based on the field - * state and then return that value. - * - *
Matches in longer fields are less precise, so implementations of this
- * method usually return smaller values when state.getLength() is large,
- * and larger values when state.getLength() is small.
- *
- *
Note that the return values are computed under - * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)} - * and then stored using - * {@link #encodeNormValue(float)}. - * Thus they have limited precision, and documents - * must be re-indexed if this method is altered. - * - *
For backward compatibility this method by default calls - * {@link #lengthNorm(String, int)} passing - * {@link FieldInvertState#getLength()} as the second argument, and - * then multiplies this value by {@link FieldInvertState#getBoost()}.
- * - * @lucene.experimental - * - * @param field field name - * @param state current processing state for this field - * @return the calculated float norm - */ - public abstract float computeNorm(String field, FieldInvertState state); - - /** Computes the normalization value for a field given the total number of - * terms contained in a field. These values, together with field boosts, are - * stored in an index and multipled into scores for hits on each field by the - * search code. - * - *Matches in longer fields are less precise, so implementations of this
- * method usually return smaller values when numTokens is large,
- * and larger values when numTokens is small.
- *
- *
Note that the return values are computed under - * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)} - * and then stored using - * {@link #encodeNormValue(float)}. - * Thus they have limited precision, and documents - * must be re-indexed if this method is altered. - * - * @param fieldName the name of the field - * @param numTokens the total number of tokens contained in fields named - * fieldName of doc. - * @return a normalization factor for hits on this field of this document - * - * @see org.apache.lucene.document.Field#setBoost(float) - * - * @deprecated Please override computeNorm instead - */ - @Deprecated - public final float lengthNorm(String fieldName, int numTokens) { - throw new UnsupportedOperationException("please use computeNorm instead"); - } - - /** Computes the normalization value for a query given the sum of the squared - * weights of each of the query terms. This value is multiplied into the - * weight of each query term. While the classic query normalization factor is - * computed as 1/sqrt(sumOfSquaredWeights), other implementations might - * completely ignore sumOfSquaredWeights (ie return 1). - * - *
This does not affect ranking, but the default implementation does make scores - * from different queries more comparable than they would be by eliminating the - * magnitude of the Query vector as a factor in the score. - * - * @param sumOfSquaredWeights the sum of the squares of query term weights - * @return a normalization factor for query weights - */ - public abstract float queryNorm(float sumOfSquaredWeights); - - /** Encodes a normalization factor for storage in an index. - * - *
The encoding uses a three-bit mantissa, a five-bit exponent, and - * the zero-exponent point at 15, thus - * representing values from around 7x10^9 to 2x10^-9 with about one - * significant decimal digit of accuracy. Zero is also represented. - * Negative numbers are rounded up to zero. Values too large to represent - * are rounded down to the largest representable value. Positive values too - * small to represent are rounded up to the smallest positive representable - * value. - * @see org.apache.lucene.document.Field#setBoost(float) - * @see org.apache.lucene.util.SmallFloat - */ - public byte encodeNormValue(float f) { - return SmallFloat.floatToByte315(f); - } - - /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} - * factor for each term in the query and these products are then summed to - * form the initial score for a document. - * - *
Terms and phrases repeated in a document indicate the topic of the
- * document, so implementations of this method usually return larger values
- * when freq is large, and smaller values when freq
- * is small.
- *
- *
The default implementation calls {@link #tf(float)}. - * - * @param freq the frequency of a term within a document - * @return a score factor based on a term's within-document frequency - */ - public float tf(int freq) { - return tf((float)freq); - } - - /** Computes the amount of a sloppy phrase match, based on an edit distance. - * This value is summed for each sloppy phrase match in a document to form - * the frequency that is passed to {@link #tf(float)}. - * - *
A phrase match with a small edit distance to a document passage more - * closely matches the document, so implementations of this method usually - * return larger values when the edit distance is small and smaller values - * when it is large. - * - * @see PhraseQuery#setSlop(int) - * @param distance the edit distance of this sloppy phrase match - * @return the frequency increment for this match - */ - public abstract float sloppyFreq(int distance); - - /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} - * factor for each term in the query and these products are then summed to - * form the initial score for a document. - * - *
Terms and phrases repeated in a document indicate the topic of the
- * document, so implementations of this method usually return larger values
- * when freq is large, and smaller values when freq
- * is small.
- *
- * @param freq the frequency of a term within a document
- * @return a score factor based on a term's within-document frequency
- */
- public abstract float tf(float freq);
-
- /**
- * Computes a score factor for a simple term and returns an explanation
- * for that score factor.
- *
- *
- * The default implementation uses: - * - *
- * idf(docFreq, searcher.maxDoc()); - *- * - * Note that {@link IndexSearcher#maxDoc()} is used instead of - * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also - * {@link IndexSearcher#docFreq(Term)} is used, and when the latter - * is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction. - * In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute - * - * @param term the term in question - * @param searcher the document collection being searched - * @param docFreq externally computed docFreq for this term - * @return an IDFExplain object that includes both an idf score factor - and an explanation for the term. - * @throws IOException - */ - public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher, int docFreq) throws IOException { - final int df = docFreq; - final int max = searcher.maxDoc(); - final float idf = idf(df, max); - return new IDFExplanation() { - @Override - public String explain() { - return "idf(docFreq=" + df + - ", maxDocs=" + max + ")"; - } - @Override - public float getIdf() { - return idf; - }}; - } - - /** - * This method forwards to {@link - * #idfExplain(Term,IndexSearcher,int)} by passing - *
searcher.docFreq(term) as the docFreq.
- */
- public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher) throws IOException {
- return idfExplain(term, searcher, searcher.docFreq(term));
- }
-
- /**
- * Computes a score factor for a phrase.
- *
- *
- * The default implementation sums the idf factor for
- * each term in the phrase.
- *
- * @param terms the terms in the phrase
- * @param searcher the document collection being searched
- * @return an IDFExplain object that includes both an idf
- * score factor for the phrase and an explanation
- * for each term.
- * @throws IOException
- */
- public IDFExplanation idfExplain(Collection Terms that occur in fewer documents are better indicators of topic, so
- * implementations of this method usually return larger values for rare terms,
- * and smaller values for common terms.
- *
- * @param docFreq the number of documents which contain the term
- * @param numDocs the total number of documents in the collection
- * @return a score factor based on the term's document frequency
- */
- public abstract float idf(int docFreq, int numDocs);
-
- /** Computes a score factor based on the fraction of all query terms that a
- * document contains. This value is multiplied into scores.
- *
- * The presence of a large portion of the query terms indicates a better
- * match with the query, so implementations of this method usually return
- * larger values when the ratio between these parameters is large and smaller
- * values when the ratio between them is small.
- *
- * @param overlap the number of query terms matched in the document
- * @param maxOverlap the total number of terms in the query
- * @return a score factor based on term overlap with the query
- */
- public abstract float coord(int overlap, int maxOverlap);
-
- /**
- * Calculate a scoring factor based on the data in the payload. Overriding implementations
- * are responsible for interpreting what is in the payload. Lucene makes no assumptions about
- * what is in the byte array.
- *
- * The default implementation returns 1.
- *
- * @param docId The docId currently being scored. If this value is {@link #NO_DOC_ID_PROVIDED}, then it should be assumed that the PayloadQuery implementation does not provide document information
- * @param fieldName The fieldName of the term this payload belongs to
- * @param start The start position of the payload
- * @param end The end position of the payload
- * @param payload The payload byte array to be scored
- * @param offset The offset into the payload array
- * @param length The length in the array
- * @return An implementation dependent float to be used as a scoring factor
- *
- */
- // TODO: maybe switch this API to BytesRef?
- public float scorePayload(int docId, String fieldName, int start, int end, byte [] payload, int offset, int length)
- {
- return 1;
- }
-
}
Index: lucene/src/java/org/apache/lucene/search/FieldSimilarity.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/FieldSimilarity.java (revision 0)
+++ lucene/src/java/org/apache/lucene/search/FieldSimilarity.java (revision 0)
@@ -0,0 +1,801 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Collection;
+
+import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Explanation.IDFExplanation;
+import org.apache.lucene.util.SmallFloat;
+
+
+/**
+ * Expert: Scoring API.
+ *
+ * Similarity defines the components of Lucene scoring.
+ * Overriding computation of these components is a convenient
+ * way to alter Lucene scoring.
+ *
+ * Suggested reading:
+ *
+ * Introduction To Information Retrieval, Chapter 6.
+ *
+ * The following describes how Lucene scoring evolves from
+ * underlying information retrieval models to (efficient) implementation.
+ * We first brief on VSM Score,
+ * then derive from it Lucene's Conceptual Scoring Formula,
+ * from which, finally, evolves Lucene's Practical Scoring Function
+ * (the latter is connected directly with Lucene classes and methods).
+ *
+ * Lucene combines
+ *
+ * Boolean model (BM) of Information Retrieval
+ * with
+ *
+ * Vector Space Model (VSM) of Information Retrieval -
+ * documents "approved" by BM are scored by VSM.
+ *
+ * In VSM, documents and queries are represented as
+ * weighted vectors in a multi-dimensional space,
+ * where each distinct index term is a dimension,
+ * and weights are
+ * Tf-idf values.
+ *
+ * VSM does not require weights to be Tf-idf values,
+ * but Tf-idf values are believed to produce search results of high quality,
+ * and so Lucene is using Tf-idf.
+ * Tf and Idf are described in more detail below,
+ * but for now, for completion, let's just say that
+ * for given term t and document (or query) x,
+ * Tf(t,x) varies with the number of occurrences of term t in x
+ * (when one increases so does the other) and
+ * idf(t) similarly varies with the inverse of the
+ * number of index documents containing term t.
+ *
+ * VSM score of document d for query q is the
+ *
+ * Cosine Similarity
+ * of the weighted query vectors V(q) and V(d):
+ *
+ *   cosine-similarity(q,d)  =  V(q) · V(d) / ( |V(q)| · |V(d)| )
+ *
+ * Note: the above equation can be viewed as the dot product of
+ * the normalized weighted vectors, in the sense that dividing
+ * V(q) by its euclidean norm is normalizing it to a unit vector.
+ *
+ * Lucene refines VSM score for both search quality and usability:
+ * V(d) is normalized by a document length normalization factor doc-len-norm(d)
+ * rather than by its euclidean norm, and index-time document boosts doc-boost(d),
+ * query-time boosts query-boost(q), and a coordination factor coord-factor(q,d)
+ * for matched query terms also enter the score.
+ * Under the simplifying assumption of a single field in the index,
+ * we get Lucene's Conceptual scoring formula:
+ *
+ *   score(q,d)  =  coord-factor(q,d) · query-boost(q) · ( V(q) · V(d) / |V(q)| ) · doc-len-norm(d) · doc-boost(d)
+ *
+ * The conceptual formula is a simplification in the sense that (1) terms and documents
+ * are fielded and (2) boosts are usually per query term rather than per query.
+ *
+ * We now describe how Lucene implements this conceptual scoring formula, and
+ * derive from it Lucene's Practical Scoring Function.
+ *
+ * For efficient score computation some scoring components
+ * are computed and aggregated in advance:
+ *
+ * Lucene's Practical Scoring Function is derived from the above.
+ * The color codes demonstrate how it relates
+ * to those of the conceptual formula:
+ *
+ *   score(q,d)  =  coord(q,d) · queryNorm(q) · SUM for t in q of ( tf(t in d) · idf(t)^2 · t.getBoost() · norm(t,d) )
+ *
+ * where
+ *
+ *   tf(t in d)   =  frequency^(1/2)   ({@link DefaultSimilarity#tf(float)})
+ *   idf(t)       =  1 + log( numDocs / (docFreq + 1) )   ({@link DefaultSimilarity#idf(int, int)})
+ *   queryNorm(q) =  queryNorm(sumOfSquaredWeights)  =  1 / sumOfSquaredWeights^(1/2)   ({@link DefaultSimilarity#queryNorm(float)})
+ *   sumOfSquaredWeights  =  q.getBoost()^2 · SUM for t in q of ( idf(t) · t.getBoost() )^2
+ *
+ * When a document is added to the index, all the above factors are multiplied.
+ * If the document has multiple fields with the same name, all their boosts are multiplied together:
+ *
+ *   norm(t,d)  =  doc.getBoost() · lengthNorm · PRODUCT for field f in d named as t of f.getBoost()
+ *
+ */
+public abstract class FieldSimilarity implements Serializable {
+
+ /** Cache of decoded bytes. */
+ private static final float[] NORM_TABLE = new float[256];
+
+ static {
+ for (int i = 0; i < 256; i++)
+ NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
+ }
+
+ /** Decodes a normalization factor stored in an index.
+ * @see #encodeNormValue(float)
+ */
+ public float decodeNormValue(byte b) {
+ return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
+ }
+
+ /**
+ * Computes the normalization value for a field, given the accumulated
+ * state of term processing for this field (see {@link FieldInvertState}).
+ *
+ * Implementations should calculate a float value based on the field
+ * state and then return that value.
+ *
+ * Matches in longer fields are less precise, so implementations of this
+ * method usually return smaller values when state.getLength() is large,
+ * and larger values when state.getLength() is small.
+ *
+ * Note that the return values are computed under
+ * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)}
+ * and then stored using
+ * {@link #encodeNormValue(float)}.
+ * Thus they have limited precision, and documents
+ * must be re-indexed if this method is altered.
+ *
+ * For backward compatibility this method by default calls
+ * {@link #lengthNorm(String, int)} passing
+ * {@link FieldInvertState#getLength()} as the second argument, and
+ * then multiplies this value by {@link FieldInvertState#getBoost()}.
+ *
+ * @lucene.experimental
+ *
+ * @param field field name
+ * @param state current processing state for this field
+ * @return the calculated float norm
+ */
+ public abstract float computeNorm(String field, FieldInvertState state);
+
+ /** Computes the normalization value for a field given the total number of
+ * terms contained in a field. These values, together with field boosts, are
+ * stored in an index and multiplied into scores for hits on each field by the
+ * search code.
+ *
+ * Matches in longer fields are less precise, so implementations of this
+ * method usually return smaller values when Note that the return values are computed under
+ * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)}
+ * and then stored using
+ * {@link #encodeNormValue(float)}.
+ * Thus they have limited precision, and documents
+ * must be re-indexed if this method is altered.
+ *
+ * @param fieldName the name of the field
+ * @param numTokens the total number of tokens contained in fields named
+ * fieldName of doc.
+ * @return a normalization factor for hits on this field of this document
+ *
+ * @see org.apache.lucene.document.Field#setBoost(float)
+ *
+ * @deprecated Please override computeNorm instead
+ */
+ @Deprecated
+ public final float lengthNorm(String fieldName, int numTokens) {
+ throw new UnsupportedOperationException("please use computeNorm instead");
+ }
+
+ /** Encodes a normalization factor for storage in an index.
+ *
+ * The encoding uses a three-bit mantissa, a five-bit exponent, and
+ * the zero-exponent point at 15, thus
+ * representing values from around 7x10^9 to 2x10^-9 with about one
+ * significant decimal digit of accuracy. Zero is also represented.
+ * Negative numbers are rounded up to zero. Values too large to represent
+ * are rounded down to the largest representable value. Positive values too
+ * small to represent are rounded up to the smallest positive representable
+ * value.
+ * @see org.apache.lucene.document.Field#setBoost(float)
+ * @see org.apache.lucene.util.SmallFloat
+ */
+ public byte encodeNormValue(float f) {
+ return SmallFloat.floatToByte315(f);
+ }
+
+ /** Computes a score factor based on a term or phrase's frequency in a
+ * document. This value is multiplied by the {@link #idf(int, int)}
+ * factor for each term in the query and these products are then summed to
+ * form the initial score for a document.
+ *
+ * Terms and phrases repeated in a document indicate the topic of the
+ * document, so implementations of this method usually return larger values
+ * when freq is large, and smaller values when freq is small.
+ *
+ * The default implementation calls {@link #tf(float)}.
+ *
+ * @param freq the frequency of a term within a document
+ * @return a score factor based on a term's within-document frequency
+ */
+ public float tf(int freq) {
+ return tf((float)freq);
+ }
+
+ /** Computes the amount of a sloppy phrase match, based on an edit distance.
+ * This value is summed for each sloppy phrase match in a document to form
+ * the frequency that is passed to {@link #tf(float)}.
+ *
+ * A phrase match with a small edit distance to a document passage more
+ * closely matches the document, so implementations of this method usually
+ * return larger values when the edit distance is small and smaller values
+ * when it is large.
+ *
+ * @see PhraseQuery#setSlop(int)
+ * @param distance the edit distance of this sloppy phrase match
+ * @return the frequency increment for this match
+ */
+ public abstract float sloppyFreq(int distance);
+
+ /** Computes a score factor for a phrase.
+ *
+ * The default implementation sums the idf factor for
+ * each term in the phrase.
+ *
+ * @param terms the terms in the phrase
+ * @param searcher the document collection being searched
+ * @return an IDFExplain object that includes both an idf
+ * score factor for the phrase and an explanation
+ * for each term.
+ * @throws IOException
+ */
+ public IDFExplanation idfExplain(Collection<Term> terms, IndexSearcher searcher) throws IOException {
+ final int max = searcher.maxDoc();
+ float idf = 0.0f;
+ final StringBuilder exp = new StringBuilder();
+ for (final Term term : terms) {
+ final int df = searcher.docFreq(term);
+ idf += idf(df, max);
+ exp.append(' ').append(term.text()).append('=').append(df);
+ }
+ final float fIdf = idf;
+ return new IDFExplanation() {
+ @Override
+ public float getIdf() {
+ return fIdf;
+ }
+ @Override
+ public String explain() {
+ return exp.toString();
+ }
+ };
+ }
+
+ /** Computes a score factor based on a term's document frequency (the number
+ * of documents which contain the term). This value is multiplied by the
+ * {@link #tf(int)} factor for each term in the query and these products are
+ * then summed to form the initial score for a document.
+ *
+ * Terms that occur in fewer documents are better indicators of topic, so
+ * implementations of this method usually return larger values for rare terms,
+ * and smaller values for common terms.
+ *
+ * @param docFreq the number of documents which contain the term
+ * @param numDocs the total number of documents in the collection
+ * @return a score factor based on the term's document frequency
+ */
+ public abstract float idf(int docFreq, int numDocs);
+
+ /**
+ * Calculate a scoring factor based on the data in the payload. Overriding implementations
+ * are responsible for interpreting what is in the payload. Lucene makes no assumptions about
+ * what is in the byte array.
+ *
+ * The default implementation returns 1.
+ *
+ * @param docId The docId currently being scored. If this value is {@link #NO_DOC_ID_PROVIDED}, then it should be assumed that the PayloadQuery implementation does not provide document information
+ * @param fieldName The fieldName of the term this payload belongs to
+ * @param start The start position of the payload
+ * @param end The end position of the payload
+ * @param payload The payload byte array to be scored
+ * @param offset The offset into the payload array
+ * @param length The length in the array
+ * @return An implementation dependent float to be used as a scoring factor
+ *
+ */
+ // TODO: maybe switch this API to BytesRef?
+ public float scorePayload(int docId, String fieldName, int start, int end, byte [] payload, int offset, int length)
+ {
+ return 1;
+ }
+
+}
Property changes on: lucene\src\java\org\apache\lucene\search\FieldSimilarity.java
___________________________________________________________________
Added: svn:eol-style
+ native
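Worked example (not part of the patch): the byte encoding described above is easy to probe through org.apache.lucene.util.SmallFloat, which backs encodeNormValue/decodeNormValue and is unchanged by this patch. The class name below is ours; the 0.89 -> 0.75 round trip is the one the javadoc cites.

import org.apache.lucene.util.SmallFloat;

public class NormPrecisionDemo {
  public static void main(String[] args) {
    // Round-trip a few field norms through the 3-bit-mantissa/5-bit-exponent encoding.
    for (float norm : new float[] {0.89f, 0.75f, 0.5f, 1.0f/3}) {
      byte b = SmallFloat.floatToByte315(norm);      // what encodeNormValue(float) does
      float decoded = SmallFloat.byte315ToFloat(b);  // what decodeNormValue(byte) does
      System.out.println(norm + " -> " + decoded);   // e.g. 0.89 -> 0.75 (lossy)
    }
  }
}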
Index: lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java (revision 1059498)
+++ lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java (working copy)
@@ -19,9 +19,9 @@
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Explanation;
+import org.apache.lucene.search.FieldSimilarity;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.spans.NearSpansOrdered;
import org.apache.lucene.search.spans.NearSpansUnordered;
@@ -153,10 +153,9 @@
Spans spans;
protected float payloadScore;
private int payloadsSeen;
- Similarity similarity = getSimilarity();
protected PayloadNearSpanScorer(Spans spans, Weight weight,
- Similarity similarity, byte[] norms) throws IOException {
+ FieldSimilarity similarity, byte[] norms) throws IOException {
super(spans, weight, similarity, norms);
this.spans = spans;
}
@@ -211,7 +210,7 @@
payloadsSeen = 0;
do {
int matchLength = spans.end() - spans.start();
- freq += getSimilarity().sloppyFreq(matchLength);
+ freq += similarity.sloppyFreq(matchLength);
Spans[] spansArr = new Spans[1];
spansArr[0] = spans;
getPayloads(spansArr);
Index: lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java (revision 1059498)
+++ lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java (working copy)
@@ -20,6 +20,7 @@
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.search.FieldSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
@@ -86,7 +87,7 @@
private final TermSpans termSpans;
public PayloadTermSpanScorer(TermSpans spans, Weight weight,
- Similarity similarity, byte[] norms) throws IOException {
+ FieldSimilarity similarity, byte[] norms) throws IOException {
super(spans, weight, similarity, norms);
termSpans = spans;
}
@@ -100,12 +101,11 @@
freq = 0.0f;
payloadScore = 0;
payloadsSeen = 0;
- Similarity similarity1 = getSimilarity();
while (more && doc == spans.doc()) {
int matchLength = spans.end() - spans.start();
- freq += similarity1.sloppyFreq(matchLength);
- processPayload(similarity1);
+ freq += similarity.sloppyFreq(matchLength);
+ processPayload(similarity);
more = spans.next();// this moves positions to the next match in this
// document
@@ -113,7 +113,7 @@
return more || (freq != 0);
}
- protected void processPayload(Similarity similarity) throws IOException {
+ protected void processPayload(FieldSimilarity similarity) throws IOException {
final DocsAndPositionsEnum postings = termSpans.getPostings();
if (postings.hasPayload()) {
payload = postings.getPayload();
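As a usage sketch for the scorePayload hook threaded through these payload queries (ours, not in the patch): a similarity whose scorePayload() interprets each payload as a 4-byte big-endian float boost. It assumes DefaultSimilarity extends FieldSimilarity under this patch, and that the analyzer actually wrote such payloads; all names are illustrative.

import org.apache.lucene.search.DefaultSimilarity;

public class PayloadBoostSimilarity extends DefaultSimilarity {
  @Override
  public float scorePayload(int docId, String fieldName, int start, int end,
                            byte[] payload, int offset, int length) {
    if (payload == null || length < 4)
      return 1; // same neutral factor as the default implementation
    // Reassemble a big-endian float from the payload bytes.
    int bits = ((payload[offset]     & 0xFF) << 24)
             | ((payload[offset + 1] & 0xFF) << 16)
             | ((payload[offset + 2] & 0xFF) <<  8)
             |  (payload[offset + 3] & 0xFF);
    return Float.intBitsToFloat(bits);
  }
}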
Index: lucene/src/java/org/apache/lucene/search/SimilarityProvider.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/SimilarityProvider.java (revision 0)
+++ lucene/src/java/org/apache/lucene/search/SimilarityProvider.java (revision 0)
@@ -0,0 +1,66 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Expert: Scoring API.
+ *
+ * Provides top-level scoring functions that aren't specific to a field,
+ * and work across multi-field queries (such as {@link BooleanQuery}).
+ *
+ * Field-specific scoring is accomplished through {@link FieldSimilarity}.
+ *
+ * @lucene.experimental
+ */
+public interface SimilarityProvider {
+
+ /** Computes a score factor based on the fraction of all query terms that a
+ * document contains. This value is multiplied into scores.
+ *
+ * The presence of a large portion of the query terms indicates a better
+ * match with the query, so implementations of this method usually return
+ * larger values when the ratio between these parameters is large and smaller
+ * values when the ratio between them is small.
+ *
+ * @param overlap the number of query terms matched in the document
+ * @param maxOverlap the total number of terms in the query
+ * @return a score factor based on term overlap with the query
+ */
+ public abstract float coord(int overlap, int maxOverlap);
+
+ /** Computes the normalization value for a query given the sum of the squared
+ * weights of each of the query terms. This value is multiplied into the
+ * weight of each query term. While the classic query normalization factor is
+ * computed as 1/sqrt(sumOfSquaredWeights), other implementations might
+ * completely ignore sumOfSquaredWeights (i.e. return 1).
+ *
+ * This does not affect ranking, but the default implementation does make scores
+ * from different queries more comparable than they would otherwise be, by
+ * eliminating the magnitude of the Query vector as a factor in the score.
+ *
+ * @param sumOfSquaredWeights the sum of the squares of query term weights
+ * @return a normalization factor for query weights
+ */
+ public abstract float queryNorm(float sumOfSquaredWeights);
+
+ /** Returns a {@link FieldSimilarity} for scoring a field
+ * @param field field name.
+ * @return a field-specific Similarity.
+ */
+ public abstract FieldSimilarity get(String field);
+}
Property changes on: lucene\src\java\org\apache\lucene\search\SimilarityProvider.java
___________________________________________________________________
Added: svn:eol-style
+ native
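To make the new seam concrete, here is a minimal sketch (ours, not part of the patch) of a provider that keeps DefaultSimilarity's coord() and queryNorm() but switches off length normalization for one field. It assumes, as this patch arranges, that DefaultSimilarity implements SimilarityProvider and extends FieldSimilarity; the field name "title" is illustrative.

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.FieldSimilarity;
import org.apache.lucene.search.SimilarityProvider;

public class PerFieldProvider implements SimilarityProvider {
  private final DefaultSimilarity base = new DefaultSimilarity();

  // A field similarity that keeps the field boost but ignores field length.
  private final FieldSimilarity noLengthNorm = new DefaultSimilarity() {
    @Override
    public float computeNorm(String field, FieldInvertState state) {
      return state.getBoost();
    }
  };

  public float coord(int overlap, int maxOverlap) {
    return base.coord(overlap, maxOverlap);
  }

  public float queryNorm(float sumOfSquaredWeights) {
    return base.queryNorm(sumOfSquaredWeights);
  }

  public FieldSimilarity get(String field) {
    return "title".equals(field) ? noLengthNorm : base;
  }
}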
Index: lucene/src/java/org/apache/lucene/search/spans/FieldMaskingSpanQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/spans/FieldMaskingSpanQuery.java (revision 1059498)
+++ lucene/src/java/org/apache/lucene/search/spans/FieldMaskingSpanQuery.java (working copy)
@@ -107,11 +107,6 @@
}
@Override
- public Similarity getSimilarity(IndexSearcher searcher) {
- return maskedQuery.getSimilarity(searcher);
- }
-
- @Override
public Query rewrite(IndexReader reader) throws IOException {
FieldMaskingSpanQuery clone = null;
Index: lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java (revision 1059498)
+++ lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java (working copy)
@@ -30,7 +30,7 @@
* Expert-only. Public for use by other weight implementations
*/
public class SpanWeight extends Weight {
- protected Similarity similarity;
+ protected FieldSimilarity similarity;
protected float value;
protected float idf;
protected float queryNorm;
@@ -42,7 +42,7 @@
public SpanWeight(SpanQuery query, IndexSearcher searcher)
throws IOException {
- this.similarity = query.getSimilarity(searcher);
+ this.similarity = searcher.getSimilarity().get(query.getField());
this.query = query;
terms=new HashSet<Term>();
Index: lucene/src/java/org/apache/lucene/search/FieldSimilarity.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/FieldSimilarity.java	(revision 0)
+++ lucene/src/java/org/apache/lucene/search/FieldSimilarity.java	(revision 0)
+ *
+ *                                V(q) · V(d)
+ *    cosine-similarity(q,d)  =  –––––––––––––
+ *                               |V(q)| |V(d)|
+ *
+ * Where V(q) · V(d) is the dot product of the weighted vectors,
+ * and |V(q)| and |V(d)| are their Euclidean norms.
+ *
+ *                                                       V(q) · V(d)
+ *    score(q,d) = coord-factor(q,d) · query-boost(q) · ––––––––––––– · doc-len-norm(d) · doc-boost(d)
+ *                                                         |V(q)|
+ *
+ *    score(q,d) = coord(q,d) · queryNorm(q) ·   ∑   ( tf(t in d) · idf(t)² · t.getBoost() · norm(t,d) )
+ *                                              t in q
+ *
+ * @see org.apache.lucene.index.IndexWriterConfig#setSimilarity(SimilarityProvider)
+ * @see IndexSearcher#setSimilarity(SimilarityProvider)
+ */
+public abstract class FieldSimilarity implements Serializable {
+
+ public static final int NO_DOC_ID_PROVIDED = -1;
+
+ /** Cache of decoded bytes. */
+ private static final float[] NORM_TABLE = new float[256];
+
+ static {
+ for (int i = 0; i < 256; i++)
+ NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
+ }
+
+ /** Decodes a normalization factor stored in an index.
+ * @see #encodeNormValue(float)
+ */
+ public float decodeNormValue(byte b) {
+ return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
+ }
+
+ /**
+ * Computes the normalization value for a field, given the accumulated
+ * state of term processing for this field (see {@link FieldInvertState}).
+ *
+ * For reference, the factors of the practical scoring function are
+ * computed by {@link org.apache.lucene.search.DefaultSimilarity} as follows:
+ *
+ *    {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)}  =  frequency½
+ *
+ *    {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}  =  1 + log ( numDocs / (docFreq+1) )
+ *
+ *    queryNorm(q)  =  {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) queryNorm(sumOfSquaredWeights)}  =  1 / sumOfSquaredWeights½
+ *
+ * The sum of squared weights (of the query terms) is
+ * computed by the query {@link org.apache.lucene.search.Weight} object.
+ * For example, a {@link org.apache.lucene.search.BooleanQuery}
+ * computes this value as:
+ *
+ *    {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights}  =
+ *        {@link org.apache.lucene.search.Query#getBoost() q.getBoost()}²  ·  ∑ over t in q of ( idf(t) · t.getBoost() )²
+ *
+ * The {@link #computeNorm} method is responsible for
+ * combining all of the norm factors into a single float:
+ *
+ *    norm(t,d)  =  {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()}  ·  lengthNorm
+ *                  ·  ∏ over each field f in d named as t of {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
+ *
+ * However the resulting norm value is {@link #encodeNormValue(float) encoded} as a single byte
+ * before being stored.
+ * At search time, the norm byte value is read from the index
+ * {@link org.apache.lucene.store.Directory directory} and
+ * {@link #decodeNormValue(byte) decoded} back to a float norm value.
+ * This encoding/decoding, while reducing index size, comes with the price of
+ * precision loss - it is not guaranteed that decode(encode(x)) = x.
+ * For instance, decode(encode(0.89)) = 0.75.
+ *
+ * Compression of norm values to a single byte saves memory at search time,
+ * because once a field is referenced at search time, its norms - for
+ * all documents - are maintained in memory.
+ *
+ * The rationale supporting such lossy compression of norm values is that
+ * given the difficulty (and inaccuracy) of users to express their true information
+ * need by a query, only big differences matter.
+ *
+ * Last, note that search time is too late to modify this norm part of scoring, e.g. by
+ * using a different {@link FieldSimilarity} for search.
+ *
+ * Matches in longer fields are less precise, so implementations of this
+ * method usually return smaller values when state.getLength() is large,
+ * and larger values when state.getLength() is small.
+ *
+ * @param field field name
+ * @param state current processing state for this field
+ * @return the calculated float norm
+ */
+ public abstract float computeNorm(String field, FieldInvertState state);
+
+ /** Computes a score factor based on a term or phrase's frequency in a
+ * document.
+ *
+ * Terms and phrases repeated in a document indicate the topic of the
+ * document, so implementations of this method usually return larger values
+ * when freq is large, and smaller values when freq is small.
+ *
+ * @param freq the frequency of a term within a document
+ * @return a score factor based on a term's within-document frequency
+ */
+ public abstract float tf(float freq);
+
+ /**
+ * Computes a score factor for a simple term and returns an explanation
+ * for that score factor.
+ *
+ * The default implementation uses:
+ *
+ *    idf(docFreq, searcher.maxDoc());
+ *
+ * Note that {@link IndexSearcher#maxDoc()} is used instead of
+ * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because
+ * {@link IndexSearcher#docFreq(Term)} is also used, and when the latter
+ * is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction.
+ * In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute.
+ *
+ * @param term the term in question
+ * @param searcher the document collection being searched
+ * @param docFreq externally computed docFreq for this term
+ * @return an IDFExplain object that includes both an idf score factor
+ *         and an explanation for the term.
+ * @throws IOException
+ */
+ public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher, int docFreq) throws IOException {
+ final int df = docFreq;
+ final int max = searcher.maxDoc();
+ final float idf = idf(df, max);
+ return new IDFExplanation() {
+ @Override
+ public String explain() {
+ return "idf(docFreq=" + df +
+ ", maxDocs=" + max + ")";
+ }
+ @Override
+ public float getIdf() {
+ return idf;
+ }};
+ }
+
+ /**
+ * This method forwards to {@link
+ * #idfExplain(Term,IndexSearcher,int)} by passing
+ * searcher.docFreq(term) as the docFreq.
+ */
+ public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher) throws IOException {
+ return idfExplain(term, searcher, searcher.docFreq(term));
+ }
+
Index: lucene/src/java/org/apache/lucene/search/Scorer.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/Scorer.java	(revision 1059498)
+++ lucene/src/java/org/apache/lucene/search/Scorer.java	(working copy)
/** Constructs a Scorer.
- * @param similarity The Similarity implementation used by this scorer.
*/
- protected Scorer(Similarity similarity) {
- this(similarity, null);
+ protected Scorer() {
+ this(null);
}
/**
* Constructs a Scorer
- * @param similarity The Similarity implementation used by this scorer.
* @param weight The scorers Weight
*/
- protected Scorer(Similarity similarity, Weight weight) {
- this.similarity = similarity;
+ protected Scorer(Weight weight) {
this.weight = weight;
}
- /** Returns the Similarity implementation used by this scorer. */
- public Similarity getSimilarity() {
- return this.similarity;
- }
-
/** Scores and collects all matching documents.
* @param collector The collector to which all matching documents are passed.
*/
Index: lucene/src/java/org/apache/lucene/search/TermQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/TermQuery.java (revision 1059498)
+++ lucene/src/java/org/apache/lucene/search/TermQuery.java (working copy)
@@ -42,7 +42,7 @@
private transient PerReaderTermState perReaderTermState;
private class TermWeight extends Weight {
- private final Similarity similarity;
+ private final FieldSimilarity similarity;
private float value;
private final float idf;
private float queryNorm;
@@ -54,7 +54,7 @@
throws IOException {
assert termStates != null : "PerReaderTermState must not be null";
this.termStates = termStates;
- this.similarity = getSimilarity(searcher);
+ this.similarity = searcher.getSimilarity().get(term.field());
if (docFreq != -1) {
idfExp = similarity.idfExplain(term, searcher, docFreq);
} else {
Index: lucene/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java (revision 1059498)
+++ lucene/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java (working copy)
@@ -38,7 +38,6 @@
/** Creates a new instance by wrapping the given scorer. */
public ScoreCachingWrappingScorer(Scorer scorer) {
- super(scorer.getSimilarity());
this.scorer = scorer;
}
@@ -46,11 +45,6 @@
public boolean score(Collector collector, int max, int firstDocID) throws IOException {
return scorer.score(collector, max, firstDocID);
}
-
- @Override
- public Similarity getSimilarity() {
- return scorer.getSimilarity();
- }
@Override
public float score() throws IOException {
Index: lucene/src/java/org/apache/lucene/search/IndexSearcher.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/IndexSearcher.java (revision 1059498)
+++ lucene/src/java/org/apache/lucene/search/IndexSearcher.java (working copy)
@@ -70,8 +70,20 @@
private final ExecutorService executor;
protected final IndexSearcher[] subSearchers;
+ // the default SimilarityProvider
+ private static final SimilarityProvider defaultProvider = new DefaultSimilarity();
+
+ /**
+ * Expert: returns a default SimilarityProvider instance.
+ * In general, this should not be used.
+ * @lucene.internal
+ */
+ public static SimilarityProvider getDefaultProvider() {
+ return defaultProvider;
+ }
+
/** The Similarity implementation used by this searcher. */
- private Similarity similarity = Similarity.getDefault();
+ private SimilarityProvider similarity = defaultProvider;
/** Creates a searcher searching the index in the named
* directory, with readOnly=true
@@ -250,13 +262,12 @@
/** Expert: Set the Similarity implementation used by this Searcher.
*
- * @see Similarity#setDefault(Similarity)
*/
- public void setSimilarity(Similarity similarity) {
+ public void setSimilarity(SimilarityProvider similarity) {
this.similarity = similarity;
}
- public Similarity getSimilarity() {
+ public SimilarityProvider getSimilarity() {
return similarity;
}
Index: lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java (revision 1059498)
+++ lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java (working copy)
@@ -60,9 +60,12 @@
private int docID = -1;
private int freq;
+ private final FieldSimilarity similarity;
+
ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
- Similarity similarity, byte[] norms) throws IOException {
- super(similarity, weight);
+ FieldSimilarity similarity, byte[] norms) throws IOException {
+ super(weight);
+ this.similarity = similarity;
this.norms = norms;
this.value = weight.getValue();
@@ -87,7 +90,7 @@
}
for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
- scoreCache[i] = getSimilarity().tf((float) i) * value;
+ scoreCache[i] = similarity.tf((float) i) * value;
}
}
@@ -207,9 +210,9 @@
if (freq < SCORE_CACHE_SIZE) {
raw = scoreCache[freq];
} else {
- raw = getSimilarity().tf((float) freq) * value;
+ raw = similarity.tf((float) freq) * value;
}
- return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize
+ return norms == null ? raw : raw * similarity.decodeNormValue(norms[docID]); // normalize
}
private int phraseFreq() throws IOException {
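The cache manipulated in this hunk follows the same idiom as TermScorer below: tf() is invoked for every hit, so values for small frequencies are precomputed once per scorer. Isolated, with 'sim' and 'weightValue' standing in for the scorer's fields, it amounts to this sketch (32 matches the scorers' SCORE_CACHE_SIZE):

import org.apache.lucene.search.FieldSimilarity;

class ScoreCacheSketch {
  static float[] build(FieldSimilarity sim, float weightValue) {
    float[] cache = new float[32];
    for (int i = 0; i < cache.length; i++)
      cache[i] = sim.tf((float) i) * weightValue; // precompute tf(i)*weight for small freqs
    return cache;
  }
  // at score() time:
  // raw = freq < cache.length ? cache[freq] : sim.tf((float) freq) * weightValue;
}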
Index: lucene/src/java/org/apache/lucene/search/TermScorer.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/TermScorer.java (revision 1059498)
+++ lucene/src/java/org/apache/lucene/search/TermScorer.java (working copy)
@@ -38,7 +38,8 @@
private int[] docs;
private int[] freqs;
private final DocsEnum.BulkReadResult bulkResult;
-
+ private final FieldSimilarity similarity;
+
/**
* Construct a TermScorer.
*
@@ -52,16 +53,16 @@
* @param norms
* The field norms of the document fields for the Term.
*/
- TermScorer(Weight weight, DocsEnum td, Similarity similarity, byte[] norms) {
- super(similarity, weight);
-
+ TermScorer(Weight weight, DocsEnum td, FieldSimilarity similarity, byte[] norms) {
+ super(weight);
+ this.similarity = similarity;
this.docsEnum = td;
this.norms = norms;
this.weightValue = weight.getValue();
bulkResult = td.getBulkResult();
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
- scoreCache[i] = getSimilarity().tf(i) * weightValue;
+ scoreCache[i] = similarity.tf(i) * weightValue;
}
@Override
@@ -136,9 +137,9 @@
float raw = // compute tf(f)*weight
freq < SCORE_CACHE_SIZE // check cache
? scoreCache[freq] // cache hit
- : getSimilarity().tf(freq)*weightValue; // cache miss
+ : similarity.tf(freq)*weightValue; // cache miss
- return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[doc]); // normalize for field
+ return norms == null ? raw : raw * similarity.decodeNormValue(norms[doc]); // normalize for field
}
/**
Index: lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java (revision 1059431)
+++ lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java (working copy)
@@ -17,6 +17,7 @@
* limitations under the License.
*/
+import org.apache.lucene.search.FieldSimilarity;
import org.apache.lucene.util.ArrayUtil;
/** Taps into DocInverter, as an InvertedDocEndConsumer,
@@ -29,7 +30,8 @@
final NormsWriterPerThread perThread;
final FieldInfo fieldInfo;
final DocumentsWriter.DocState docState;
-
+ final FieldSimilarity similarity;
+
// Holds all docID/norm pairs we've seen
int[] docIDs = new int[1];
byte[] norms = new byte[1];
@@ -49,6 +51,7 @@
this.fieldInfo = fieldInfo;
docState = perThread.docState;
fieldState = docInverterPerField.fieldState;
+ similarity = docState.similarity.get(fieldInfo.name);
}
@Override
@@ -71,8 +74,8 @@
assert norms.length == upto;
norms = ArrayUtil.grow(norms, 1+upto);
}
- final float norm = docState.similarity.computeNorm(fieldInfo.name, fieldState);
- norms[upto] = docState.similarity.encodeNormValue(norm);
+ final float norm = similarity.computeNorm(fieldInfo.name, fieldState);
+ norms[upto] = similarity.encodeNormValue(norm);
docIDs[upto] = docState.docID;
upto++;
}
Index: lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (revision 1059498)
+++ lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (working copy)
@@ -31,6 +31,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMFile;
@@ -128,7 +129,7 @@
PrintStream infoStream;
int maxFieldLength = IndexWriterConfig.UNLIMITED_FIELD_LENGTH;
- Similarity similarity;
+ SimilarityProvider similarity;
// max # simultaneous threads; if there are more than
// this, they wait for others to finish first
@@ -142,7 +143,7 @@
Analyzer analyzer;
int maxFieldLength;
PrintStream infoStream;
- Similarity similarity;
+ SimilarityProvider similarity;
int docID;
Document doc;
String maxTermPrefix;
@@ -365,7 +366,7 @@
}
}
- synchronized void setSimilarity(Similarity similarity) {
+ synchronized void setSimilarity(SimilarityProvider similarity) {
this.similarity = similarity;
for(int i=0;i<threadStates.length;i++)
threadStates[i].docState.similarity = similarity;
}
Index: lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java	(revision 1059498)
+++ lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java	(working copy)
/** Expert: set the similarity implementation used by this IndexWriter. If null is passed,
- * the similarity will be set to the default.
- *
- * @see Similarity#setDefault(Similarity)
+ * the similarity will be set to the default implementation (unspecified).
*/
- public IndexWriterConfig setSimilarity(Similarity similarity) {
- this.similarity = similarity == null ? Similarity.getDefault() : similarity;
+ public IndexWriterConfig setSimilarity(SimilarityProvider similarity) {
+ this.similarity = similarity == null ? IndexSearcher.getDefaultProvider() : similarity;
return this;
}
/**
- * Expert: returns the {@link Similarity} implementation used by this
- * IndexWriter. This defaults to the current value of
- * {@link Similarity#getDefault()}.
+ * Expert: returns the {@link SimilarityProvider} implementation used by this
+ * IndexWriter.
*/
- public Similarity getSimilarity() {
+ public SimilarityProvider getSimilarity() {
return similarity;
}
Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
===================================================================
--- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (revision 1059498)
+++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (working copy)
@@ -42,7 +42,9 @@
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CollectionUtil;
@@ -67,7 +69,7 @@
private final InstantiatedIndex index;
private final Analyzer analyzer;
- private Similarity similarity = Similarity.getDefault(); // how to normalize;
+ private SimilarityProvider similarity = IndexSearcher.getDefaultProvider(); // how to normalize;
private transient Set