Index: solr/src/test/org/apache/solr/schema/IndexSchemaTest.java =================================================================== --- solr/src/test/org/apache/solr/schema/IndexSchemaTest.java (revision 1059498) +++ solr/src/test/org/apache/solr/schema/IndexSchemaTest.java (working copy) @@ -28,6 +28,7 @@ import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.request.SolrQueryRequest; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.SimilarityProvider; import org.junit.BeforeClass; import org.junit.Test; @@ -83,7 +84,7 @@ @Test public void testSimilarityFactory() { SolrCore core = h.getCore(); - Similarity similarity = core.getSchema().getSimilarity(); + SimilarityProvider similarity = core.getSchema().getSimilarity(); assertTrue("wrong class", similarity instanceof MockConfigurableSimilarity); assertEquals("is there an echo?", ((MockConfigurableSimilarity)similarity).getPassthrough()); } Index: solr/src/java/org/apache/solr/schema/IndexSchema.java =================================================================== --- solr/src/java/org/apache/solr/schema/IndexSchema.java (revision 1059498) +++ solr/src/java/org/apache/solr/schema/IndexSchema.java (working copy) @@ -20,7 +20,9 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.util.Version; import org.apache.solr.common.ResourceLoader; @@ -192,7 +194,7 @@ /** * Returns the Similarity used for this index */ - public Similarity getSimilarity() { return similarityFactory.getSimilarity(); } + public SimilarityProvider getSimilarity() { return similarityFactory.getSimilarity(); } /** * Returns the SimilarityFactory used for this index @@ -496,8 +498,8 @@ Node node = (Node) xpath.evaluate("/schema/similarity", document, XPathConstants.NODE); if (node==null) { similarityFactory = new SimilarityFactory() { - public Similarity getSimilarity() { - return Similarity.getDefault(); + public SimilarityProvider getSimilarity() { + return IndexSearcher.getDefaultProvider(); } }; log.debug("using default similarity"); @@ -509,10 +511,10 @@ similarityFactory = (SimilarityFactory)obj; similarityFactory.init(params); } else { - // just like always, assume it's a Similarlity and get a ClassCastException - reasonable error handling + // just like always, assume it's a SimilarityProvider and get a ClassCastException - reasonable error handling similarityFactory = new SimilarityFactory() { - public Similarity getSimilarity() { - return (Similarity) obj; + public SimilarityProvider getSimilarity() { + return (SimilarityProvider) obj; } }; } Index: solr/src/java/org/apache/solr/schema/LatLonType.java =================================================================== --- solr/src/java/org/apache/solr/schema/LatLonType.java (revision 1059498) +++ solr/src/java/org/apache/solr/schema/LatLonType.java (working copy) @@ -371,7 +371,7 @@ @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { - return new SpatialScorer(getSimilarity(searcher), context, this); + return new SpatialScorer(context, this); } @Override @@ -404,8 +404,7 @@ int lastDistDoc; double lastDist; - public SpatialScorer(Similarity similarity, AtomicReaderContext readerContext, SpatialWeight w) 
throws IOException { - super(similarity); + public SpatialScorer(AtomicReaderContext readerContext, SpatialWeight w) throws IOException { this.weight = w; this.qWeight = w.getValue(); this.reader = readerContext.reader; Index: solr/src/java/org/apache/solr/schema/SimilarityFactory.java =================================================================== --- solr/src/java/org/apache/solr/schema/SimilarityFactory.java (revision 1059498) +++ solr/src/java/org/apache/solr/schema/SimilarityFactory.java (working copy) @@ -16,7 +16,7 @@ * limitations under the License. */ -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.SimilarityProvider; import org.apache.solr.common.params.SolrParams; public abstract class SimilarityFactory { @@ -25,5 +25,5 @@ public void init(SolrParams params) { this.params = params; } public SolrParams getParams() { return params; } - public abstract Similarity getSimilarity(); + public abstract SimilarityProvider getSimilarity(); } Index: solr/src/java/org/apache/solr/search/function/FunctionQuery.java =================================================================== --- solr/src/java/org/apache/solr/search/function/FunctionQuery.java (revision 1059498) +++ solr/src/java/org/apache/solr/search/function/FunctionQuery.java (working copy) @@ -95,7 +95,7 @@ @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { - return new AllScorer(getSimilarity(searcher), context, this); + return new AllScorer(context, this); } @Override @@ -114,8 +114,7 @@ final boolean hasDeletions; final Bits delDocs; - public AllScorer(Similarity similarity, AtomicReaderContext context, FunctionWeight w) throws IOException { - super(similarity); + public AllScorer(AtomicReaderContext context, FunctionWeight w) throws IOException { this.weight = w; this.qWeight = w.getValue(); this.reader = context.reader; Index: solr/src/java/org/apache/solr/search/function/IDFValueSource.java =================================================================== --- solr/src/java/org/apache/solr/search/function/IDFValueSource.java (revision 1059498) +++ solr/src/java/org/apache/solr/search/function/IDFValueSource.java (working copy) @@ -19,6 +19,7 @@ import org.apache.lucene.index.*; import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Similarity; import org.apache.lucene.util.BytesRef; @@ -41,7 +42,7 @@ @Override public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException { IndexSearcher searcher = (IndexSearcher)context.get("searcher"); - Similarity sim = searcher.getSimilarity(); + FieldSimilarity sim = searcher.getSimilarity().get(field); // todo: we need docFreq that takes a BytesRef String strVal = ByteUtils.UTF8toUTF16(indexedBytes); int docfreq = searcher.docFreq(new Term(indexedField, strVal)); Index: solr/src/java/org/apache/solr/search/function/BoostedQuery.java =================================================================== --- solr/src/java/org/apache/solr/search/function/BoostedQuery.java (revision 1059498) +++ solr/src/java/org/apache/solr/search/function/BoostedQuery.java (working copy) @@ -96,7 +96,7 @@ if(subQueryScorer == null) { return null; } - return new BoostedQuery.CustomScorer(getSimilarity(searcher), context, this, subQueryScorer, boostVal); + return new BoostedQuery.CustomScorer(context, this, subQueryScorer, boostVal); } @Override @@ 
-123,9 +123,8 @@ private final DocValues vals; private final AtomicReaderContext readerContext; - private CustomScorer(Similarity similarity, AtomicReaderContext readerContext, BoostedQuery.BoostedWeight w, + private CustomScorer(AtomicReaderContext readerContext, BoostedQuery.BoostedWeight w, Scorer scorer, ValueSource vs) throws IOException { - super(similarity); this.weight = w; this.qWeight = w.getValue(); this.scorer = scorer; Index: solr/src/java/org/apache/solr/search/function/TFValueSource.java =================================================================== --- solr/src/java/org/apache/solr/search/function/TFValueSource.java (revision 1059498) +++ solr/src/java/org/apache/solr/search/function/TFValueSource.java (working copy) @@ -3,8 +3,8 @@ import org.apache.lucene.index.*; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Similarity; import org.apache.lucene.util.BytesRef; import org.apache.solr.common.SolrException; @@ -25,7 +25,7 @@ public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException { Fields fields = readerContext.reader.fields(); final Terms terms = fields.terms(field); - final Similarity similarity = ((IndexSearcher)context.get("searcher")).getSimilarity(); + final FieldSimilarity similarity = ((IndexSearcher)context.get("searcher")).getSimilarity().get(field); return new FloatDocValues(this) { DocsEnum docs ; Index: solr/src/java/org/apache/solr/search/function/NormValueSource.java =================================================================== --- solr/src/java/org/apache/solr/search/function/NormValueSource.java (revision 1059498) +++ solr/src/java/org/apache/solr/search/function/NormValueSource.java (working copy) @@ -18,6 +18,7 @@ package org.apache.solr.search.function; import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Similarity; import java.io.IOException; @@ -46,7 +47,7 @@ @Override public DocValues getValues(Map context, AtomicReaderContext readerContext) throws IOException { IndexSearcher searcher = (IndexSearcher)context.get("searcher"); - final Similarity similarity = searcher.getSimilarity(); + final FieldSimilarity similarity = searcher.getSimilarity().get(field); final byte[] norms = readerContext.reader.norms(field); if (norms == null) { return new ConstDoubleDocValues(0.0, this); Index: solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java =================================================================== --- solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java (revision 1059498) +++ solr/src/java/org/apache/solr/search/SolrConstantScoreQuery.java (working copy) @@ -55,13 +55,11 @@ } protected class ConstantWeight extends Weight { - private Similarity similarity; private float queryNorm; private float queryWeight; private Map context; public ConstantWeight(IndexSearcher searcher) throws IOException { - this.similarity = getSimilarity(searcher); this.context = ValueSource.newContext(searcher); if (filter instanceof SolrFilter) ((SolrFilter)filter).createWeight(context, searcher); @@ -91,13 +89,13 @@ @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { - return new ConstantScorer(similarity, context, 
this); + return new ConstantScorer(context, this); } @Override public Explanation explain(AtomicReaderContext context, int doc) throws IOException { - ConstantScorer cs = new ConstantScorer(similarity, context, this); + ConstantScorer cs = new ConstantScorer(context, this); boolean exists = cs.docIdSetIterator.advance(doc) == doc; ComplexExplanation result = new ComplexExplanation(); @@ -124,8 +122,7 @@ final float theScore; int doc = -1; - public ConstantScorer(Similarity similarity, AtomicReaderContext context, ConstantWeight w) throws IOException { - super(similarity); + public ConstantScorer(AtomicReaderContext context, ConstantWeight w) throws IOException { theScore = w.getValue(); DocIdSet docIdSet = filter instanceof SolrFilter ? ((SolrFilter)filter).getDocIdSet(w.context, context) : filter.getDocIdSet(context); if (docIdSet == null) { Index: lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/search/TestBooleanScorer.java (working copy) @@ -25,6 +25,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanQuery.BooleanWeight; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -68,8 +69,14 @@ // 'more' variable to work properly, and this test ensures that if the logic // changes, we have a test to back it up. - Similarity sim = Similarity.getDefault(); - Scorer[] scorers = new Scorer[] {new Scorer(sim) { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random, directory); + writer.commit(); + IndexReader ir = writer.getReader(); + writer.close(); + IndexSearcher searcher = new IndexSearcher(ir); + + Scorer[] scorers = new Scorer[] {new Scorer() { private int doc = -1; @Override public float score() throws IOException { return 0; } @Override public int docID() { return doc; } @@ -83,10 +90,15 @@ } }}; - BooleanScorer bs = new BooleanScorer(null, false, sim, 1, Arrays.asList(scorers), null, scorers.length); + BooleanWeight weight = (BooleanWeight) new BooleanQuery().createWeight(searcher); + BooleanScorer bs = new BooleanScorer(weight, false, 1, Arrays.asList(scorers), null, scorers.length); assertEquals("should have received 3000", 3000, bs.nextDoc()); assertEquals("should have received NO_MORE_DOCS", DocIdSetIterator.NO_MORE_DOCS, bs.nextDoc()); + searcher.close(); + ir.close(); + directory.close(); + } } Index: lucene/src/test/org/apache/lucene/search/spans/TestSpans.java =================================================================== --- lucene/src/test/org/apache/lucene/search/spans/TestSpans.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/search/spans/TestSpans.java (working copy) @@ -23,6 +23,7 @@ import org.apache.lucene.search.Similarity; import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Weight.ScorerContext; @@ -410,20 +411,21 @@ } }; - SpanNearQuery snq = new SpanNearQuery( + final SimilarityProvider oldSim = searcher.getSimilarity(); + Scorer spanScorer; + try { + searcher.setSimilarity(sim); + SpanNearQuery snq = new SpanNearQuery( new 
SpanQuery[] { makeSpanTermQuery("t1"), makeSpanTermQuery("t2") }, slop, - ordered) { - @Override - public Similarity getSimilarity(IndexSearcher s) { - return sim; - } - }; + ordered); - Scorer spanScorer = snq.weight(searcher).scorer(new AtomicReaderContext(new SlowMultiReaderWrapper(searcher.getIndexReader())), ScorerContext.def()); - + spanScorer = snq.weight(searcher).scorer(new AtomicReaderContext(new SlowMultiReaderWrapper(searcher.getIndexReader())), ScorerContext.def()); + } finally { + searcher.setSimilarity(oldSim); + } assertTrue("first doc", spanScorer.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); assertEquals("first doc number", spanScorer.docID(), 11); float score = spanScorer.score(); Index: lucene/src/test/org/apache/lucene/search/TestSetNorm.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestSetNorm.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/search/TestSetNorm.java (working copy) @@ -51,10 +51,11 @@ // reset the boost of each instance of this document IndexReader reader = IndexReader.open(store, false); - reader.setNorm(0, "field", Similarity.getDefault().encodeNormValue(1.0f)); - reader.setNorm(1, "field", Similarity.getDefault().encodeNormValue(2.0f)); - reader.setNorm(2, "field", Similarity.getDefault().encodeNormValue(4.0f)); - reader.setNorm(3, "field", Similarity.getDefault().encodeNormValue(16.0f)); + FieldSimilarity similarity = new DefaultSimilarity().get("field"); + reader.setNorm(0, "field", similarity.encodeNormValue(1.0f)); + reader.setNorm(1, "field", similarity.encodeNormValue(2.0f)); + reader.setNorm(2, "field", similarity.encodeNormValue(4.0f)); + reader.setNorm(3, "field", similarity.encodeNormValue(16.0f)); reader.close(); // check that searches are ordered by this boost Index: lucene/src/test/org/apache/lucene/search/TestBoolean2.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestBoolean2.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/search/TestBoolean2.java (working copy) @@ -208,7 +208,7 @@ public void testQueries10() throws Exception { String queryText = "+w3 +xx +w2 zz"; int[] expDocNrs = {2, 3}; - Similarity oldSimilarity = searcher.getSimilarity(); + SimilarityProvider oldSimilarity = searcher.getSimilarity(); try { searcher.setSimilarity(new DefaultSimilarity(){ @Override Index: lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/search/TestMatchAllDocsQuery.java (working copy) @@ -69,7 +69,7 @@ assertEquals("one", ir.document(hits[2].doc).get("key")); // change norm & retest - ir.setNorm(0, "key", Similarity.getDefault().encodeNormValue(400f)); + ir.setNorm(0, "key", is.getSimilarity().get("key").encodeNormValue(400f)); normsQuery = new MatchAllDocsQuery("key"); hits = is.search(normsQuery, null, 1000).scoreDocs; assertEquals(3, hits.length); Index: lucene/src/test/org/apache/lucene/search/JustCompileSearch.java =================================================================== --- lucene/src/test/org/apache/lucene/search/JustCompileSearch.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/search/JustCompileSearch.java (working copy) @@ -188,7 +188,7 @@ static final class JustCompilePhraseScorer extends PhraseScorer { JustCompilePhraseScorer(Weight 
weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity similarity, byte[] norms) { + FieldSimilarity similarity, byte[] norms) { super(weight, postings, similarity, norms); } @@ -210,8 +210,7 @@ static final class JustCompileScorer extends Scorer { - protected JustCompileScorer(Similarity similarity) { - super(similarity); + protected JustCompileScorer() { } @Override Index: lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java (revision 0) +++ lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java (revision 0) @@ -0,0 +1,154 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MultiNorms; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +public class TestSimilarityProvider extends LuceneTestCase { + private Directory directory; + private IndexReader reader; + private IndexSearcher searcher; + + @Override + public void setUp() throws Exception { + super.setUp(); + directory = newDirectory(); + SimilarityProvider sim = new ExampleSimilarityProvider(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer()).setSimilarity(sim); + RandomIndexWriter iw = new RandomIndexWriter(random, directory, iwc); + Document doc = new Document(); + Field field = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field); + Field field2 = newField("bar", "", Field.Store.NO, Field.Index.ANALYZED); + doc.add(field2); + + field.setValue("quick brown fox"); + field2.setValue("quick brown fox"); + iw.addDocument(doc); + field.setValue("jumps over lazy brown dog"); + field2.setValue("jumps over lazy brown dog"); + iw.addDocument(doc); + reader = iw.getReader(); + iw.close(); + searcher = new IndexSearcher(reader); + searcher.setSimilarity(sim); + } + + @Override + public void tearDown() throws Exception { + searcher.close(); + reader.close(); + directory.close(); + super.tearDown(); + } + + public void testBasics() throws Exception { + // sanity check of norms writer + byte fooNorms[] = MultiNorms.norms(reader, "foo"); + byte barNorms[] = MultiNorms.norms(reader, "bar"); + for (int i = 0; i < fooNorms.length; i++) { + assertFalse(fooNorms[i] == barNorms[i]); + } + + 
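For context, a minimal sketch of what the loop above verifies: the provider hands back a different FieldSimilarity per field, so the norms written for "foo" and "bar" diverge. Here sim stands for the ExampleSimilarityProvider instance that setUp registers (its definition follows the test below):

    FieldSimilarity fooSim = sim.get("foo");   // Sim1 below: all factors 1f
    FieldSimilarity barSim = sim.get("bar");   // Sim2 below: all factors 10f
    // norms are computed per field, hence fooNorms[i] != barNorms[i]
    assert fooSim.tf(1f) != barSim.tf(1f);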
// sanity check of searching + TopDocs foodocs = searcher.search(new TermQuery(new Term("foo", "brown")), 10); + assertTrue(foodocs.totalHits > 0); + TopDocs bardocs = searcher.search(new TermQuery(new Term("bar", "brown")), 10); + assertTrue(bardocs.totalHits > 0); + assertTrue(foodocs.scoreDocs[0].score < bardocs.scoreDocs[0].score); + } + + private class ExampleSimilarityProvider implements SimilarityProvider { + private FieldSimilarity sim1 = new Sim1(); + private FieldSimilarity sim2 = new Sim2(); + + @Override + public float coord(int overlap, int maxOverlap) { + return 1f; + } + + @Override + public float queryNorm(float sumOfSquaredWeights) { + return 1f; + } + + @Override + public FieldSimilarity get(String field) { + if (field.equals("foo")) { + return sim1; + } else { + return sim2; + } + } + } + + private class Sim1 extends FieldSimilarity { + @Override + public float computeNorm(String field, FieldInvertState state) { + return 1f; + } + + @Override + public float sloppyFreq(int distance) { + return 1f; + } + + @Override + public float tf(float freq) { + return 1f; + } + + @Override + public float idf(int docFreq, int numDocs) { + return 1f; + } + } + + private class Sim2 extends FieldSimilarity { + @Override + public float computeNorm(String field, FieldInvertState state) { + return 10f; + } + + @Override + public float sloppyFreq(int distance) { + return 10f; + } + + @Override + public float tf(float freq) { + return 10f; + } + + @Override + public float idf(int docFreq, int numDocs) { + return 10f; + } + } +} Property changes on: lucene\src\test\org\apache\lucene\search\TestSimilarityProvider.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/index/TestIndexReaderClone.java (working copy) @@ -18,6 +18,8 @@ */ import org.apache.lucene.index.SegmentReader.Norm; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.Similarity; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; @@ -272,13 +274,14 @@ * @throws Exception */ private void performDefaultTests(IndexReader r1) throws Exception { - float norm1 = Similarity.getDefault().decodeNormValue(MultiNorms.norms(r1, "field1")[4]); + FieldSimilarity sim = new DefaultSimilarity().get("field1"); + float norm1 = sim.decodeNormValue(MultiNorms.norms(r1, "field1")[4]); IndexReader pr1Clone = (IndexReader) r1.clone(); pr1Clone.deleteDocument(10); - pr1Clone.setNorm(4, "field1", Similarity.getDefault().encodeNormValue(0.5f)); - assertTrue(Similarity.getDefault().decodeNormValue(MultiNorms.norms(r1, "field1")[4]) == norm1); - assertTrue(Similarity.getDefault().decodeNormValue(MultiNorms.norms(pr1Clone, "field1")[4]) != norm1); + pr1Clone.setNorm(4, "field1", sim.encodeNormValue(0.5f)); + assertTrue(sim.decodeNormValue(MultiNorms.norms(r1, "field1")[4]) == norm1); + assertTrue(sim.decodeNormValue(MultiNorms.norms(pr1Clone, "field1")[4]) != norm1); final Bits delDocs = MultiFields.getDeletedDocs(r1); assertTrue(delDocs == null || !delDocs.get(10)); @@ -327,7 +330,8 @@ TestIndexReaderReopen.createIndex(random, dir1, false); SegmentReader origSegmentReader = getOnlySegmentReader(IndexReader.open(dir1, false)); 
origSegmentReader.deleteDocument(1); - origSegmentReader.setNorm(4, "field1", Similarity.getDefault().encodeNormValue(0.5f)); + FieldSimilarity sim = new DefaultSimilarity().get("field1"); + origSegmentReader.setNorm(4, "field1", sim.encodeNormValue(0.5f)); SegmentReader clonedSegmentReader = (SegmentReader) origSegmentReader .clone(); @@ -426,8 +430,9 @@ final Directory dir1 = newDirectory(); TestIndexReaderReopen.createIndex(random, dir1, false); IndexReader orig = IndexReader.open(dir1, false); - orig.setNorm(1, "field1", Similarity.getDefault().encodeNormValue(17.0f)); - final byte encoded = Similarity.getDefault().encodeNormValue(17.0f); + FieldSimilarity sim = new DefaultSimilarity().get("field1"); + orig.setNorm(1, "field1", sim.encodeNormValue(17.0f)); + final byte encoded = sim.encodeNormValue(17.0f); assertEquals(encoded, MultiNorms.norms(orig, "field1")[1]); // the cloned segmentreader should have 2 references, 1 to itself, and 1 to Index: lucene/src/test/org/apache/lucene/index/TestIndexReader.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexReader.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/index/TestIndexReader.java (working copy) @@ -39,8 +39,10 @@ import org.apache.lucene.index.IndexReader.FieldOption; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Similarity; @@ -464,8 +466,9 @@ // expected } + FieldSimilarity sim = new DefaultSimilarity().get("aaa"); try { - reader.setNorm(5, "aaa", Similarity.getDefault().encodeNormValue(2.0f)); + reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f)); fail("setNorm after close failed to throw IOException"); } catch (AlreadyClosedException e) { // expected @@ -504,8 +507,9 @@ // expected } + FieldSimilarity sim = new DefaultSimilarity().get("aaa"); try { - reader.setNorm(5, "aaa", Similarity.getDefault().encodeNormValue(2.0f)); + reader.setNorm(5, "aaa", sim.encodeNormValue(2.0f)); fail("setNorm should have hit LockObtainFailedException"); } catch (LockObtainFailedException e) { // expected @@ -535,7 +539,8 @@ // now open reader & set norm for doc 0 IndexReader reader = IndexReader.open(dir, false); - reader.setNorm(0, "content", Similarity.getDefault().encodeNormValue(2.0f)); + FieldSimilarity sim = new DefaultSimilarity().get("content"); + reader.setNorm(0, "content", sim.encodeNormValue(2.0f)); // we should be holding the write lock now: assertTrue("locked", IndexWriter.isLocked(dir)); @@ -549,7 +554,7 @@ IndexReader reader2 = IndexReader.open(dir, false); // set norm again for doc 0 - reader.setNorm(0, "content", Similarity.getDefault().encodeNormValue(3.0f)); + reader.setNorm(0, "content", sim.encodeNormValue(3.0f)); assertTrue("locked", IndexWriter.isLocked(dir)); reader.close(); @@ -579,15 +584,16 @@ addDoc(writer, searchTerm.text()); writer.close(); + FieldSimilarity sim = new DefaultSimilarity().get("content"); // now open reader & set norm for doc 0 (writes to // _0_1.s0) reader = IndexReader.open(dir, false); - reader.setNorm(0, "content", Similarity.getDefault().encodeNormValue(2.0f)); + reader.setNorm(0, "content", sim.encodeNormValue(2.0f)); reader.close(); 
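The recurring pattern in these test hunks is the replacement for the removed static Similarity.getDefault(): norm encoding is now resolved per field through a provider. A minimal sketch, reusing the "content" field name from the hunks above:

    FieldSimilarity sim = new DefaultSimilarity().get("content");
    byte encoded = sim.encodeNormValue(2.0f);          // field-specific norm encoding
    float roundTripped = sim.decodeNormValue(encoded); // and the matching decode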
// now open reader again & set norm for doc 0 (writes to _0_2.s0) reader = IndexReader.open(dir, false); - reader.setNorm(0, "content", Similarity.getDefault().encodeNormValue(2.0f)); + reader.setNorm(0, "content", sim.encodeNormValue(2.0f)); reader.close(); assertFalse("failed to remove first generation norms file on writing second generation", dir.fileExists("_0_1.s0")); @@ -949,13 +955,13 @@ dir.setMaxSizeInBytes(thisDiskFree); dir.setRandomIOExceptionRate(rate); - + FieldSimilarity sim = new DefaultSimilarity().get("content"); try { if (0 == x) { int docId = 12; for(int i=0;i<13;i++) { reader.deleteDocument(docId); - reader.setNorm(docId, "content", Similarity.getDefault().encodeNormValue(2.0f)); + reader.setNorm(docId, "content", sim.encodeNormValue(2.0f)); docId += 12; } } @@ -1113,8 +1119,9 @@ } reader = IndexReader.open(dir, false); + FieldSimilarity sim = new DefaultSimilarity().get("content"); try { - reader.setNorm(1, "content", Similarity.getDefault().encodeNormValue(2.0f)); + reader.setNorm(1, "content", sim.encodeNormValue(2.0f)); fail("did not hit exception when calling setNorm on an invalid doc number"); } catch (ArrayIndexOutOfBoundsException e) { // expected Index: lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/index/TestIndexFileDeleter.java (working copy) @@ -18,6 +18,8 @@ */ import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; @@ -67,9 +69,9 @@ Term searchTerm = new Term("id", "7"); int delCount = reader.deleteDocuments(searchTerm); assertEquals("didn't delete the right number of documents", 1, delCount); - + FieldSimilarity sim = new DefaultSimilarity().get("content"); // Set one norm so we get a .s0 file: - reader.setNorm(21, "content", Similarity.getDefault().encodeNormValue(1.5f)); + reader.setNorm(21, "content", sim.encodeNormValue(1.5f)); reader.close(); // Now, artificially create an extra .del file & extra Index: lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java (working copy) @@ -29,6 +29,7 @@ import org.apache.lucene.index.IndexWriter.IndexReaderWarmer; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Similarity; import org.apache.lucene.util.LuceneTestCase; import org.junit.Test; @@ -67,7 +68,8 @@ assertEquals(IndexWriterConfig.UNLIMITED_FIELD_LENGTH, conf.getMaxFieldLength()); assertEquals(ConcurrentMergeScheduler.class, conf.getMergeScheduler().getClass()); assertEquals(OpenMode.CREATE_OR_APPEND, conf.getOpenMode()); - assertTrue(Similarity.getDefault() == conf.getSimilarity()); + // we don't need to assert this, it should be unspecified + assertTrue(IndexSearcher.getDefaultProvider() == conf.getSimilarity()); assertEquals(IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL, conf.getTermIndexInterval()); 
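As the assertions above illustrate, the global default moves from Similarity.getDefault() to a provider obtained statically from IndexSearcher. A hedged sketch of the new entry points (the "body" field name is illustrative only):

    SimilarityProvider provider = IndexSearcher.getDefaultProvider();
    float queryNorm = provider.queryNorm(1.0f);      // query-level normalization
    float coordFactor = provider.coord(2, 3);        // query-level coordination
    FieldSimilarity bodySim = provider.get("body");  // per-field tf/idf/norms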
assertEquals(IndexWriterConfig.getDefaultWriteLockTimeout(), conf.getWriteLockTimeout()); assertEquals(IndexWriterConfig.WRITE_LOCK_TIMEOUT, IndexWriterConfig.getDefaultWriteLockTimeout()); @@ -186,12 +188,13 @@ conf.setMergeScheduler(null); assertEquals(ConcurrentMergeScheduler.class, conf.getMergeScheduler().getClass()); - // Test Similarity - assertTrue(Similarity.getDefault() == conf.getSimilarity()); + // Test Similarity: + // we shouldnt assert what the default is, just that its not null. + assertTrue(IndexSearcher.getDefaultProvider() == conf.getSimilarity()); conf.setSimilarity(new MySimilarity()); assertEquals(MySimilarity.class, conf.getSimilarity().getClass()); conf.setSimilarity(null); - assertTrue(Similarity.getDefault() == conf.getSimilarity()); + assertTrue(IndexSearcher.getDefaultProvider() == conf.getSimilarity()); // Test IndexingChain assertTrue(DocumentsWriter.defaultIndexingChain == conf.getIndexingChain()); Index: lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/index/TestIndexReaderCloneNorms.java (working copy) @@ -31,7 +31,9 @@ import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.SegmentReader.Norm; import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -203,19 +205,20 @@ IndexReader reader4C = (IndexReader) reader3C.clone(); SegmentReader segmentReader4C = getOnlySegmentReader(reader4C); assertEquals(4, reader3CCNorm.bytesRef().get()); - reader4C.setNorm(5, "field1", Similarity.getDefault().encodeNormValue(0.33f)); + FieldSimilarity sim = new DefaultSimilarity().get("field1"); + reader4C.setNorm(5, "field1", sim.encodeNormValue(0.33f)); // generate a cannot update exception in reader1 try { - reader3C.setNorm(1, "field1", Similarity.getDefault().encodeNormValue(0.99f)); + reader3C.setNorm(1, "field1", sim.encodeNormValue(0.99f)); fail("did not hit expected exception"); } catch (Exception ex) { // expected } // norm values should be different - assertTrue(Similarity.getDefault().decodeNormValue(segmentReader3C.norms("field1")[5]) - != Similarity.getDefault().decodeNormValue(segmentReader4C.norms("field1")[5])); + assertTrue(sim.decodeNormValue(segmentReader3C.norms("field1")[5]) + != sim.decodeNormValue(segmentReader4C.norms("field1")[5])); Norm reader4CCNorm = segmentReader4C.norms.get("field1"); assertEquals(3, reader3CCNorm.bytesRef().get()); assertEquals(1, reader4CCNorm.bytesRef().get()); @@ -223,7 +226,7 @@ IndexReader reader5C = (IndexReader) reader4C.clone(); SegmentReader segmentReader5C = getOnlySegmentReader(reader5C); Norm reader5CCNorm = segmentReader5C.norms.get("field1"); - reader5C.setNorm(5, "field1", Similarity.getDefault().encodeNormValue(0.7f)); + reader5C.setNorm(5, "field1", sim.encodeNormValue(0.7f)); assertEquals(1, reader5CCNorm.bytesRef().get()); reader5C.close(); @@ -256,8 +259,9 @@ // System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm); modifiedNorms.set(i, Float.valueOf(newNorm)); modifiedNorms.set(k, Float.valueOf(origNorm)); - ir.setNorm(i, "f" + 1, Similarity.getDefault().encodeNormValue(newNorm)); - ir.setNorm(k, "f" + 1, 
Similarity.getDefault().encodeNormValue(origNorm)); + FieldSimilarity sim = new DefaultSimilarity().get("f" + 1); + ir.setNorm(i, "f" + 1, sim.encodeNormValue(newNorm)); + ir.setNorm(k, "f" + 1, sim.encodeNormValue(origNorm)); // System.out.println("setNorm i: "+i); // break; } @@ -277,7 +281,8 @@ assertEquals("number of norms mismatches", numDocNorms, b.length); ArrayList storedNorms = (i == 1 ? modifiedNorms : norms); for (int j = 0; j < b.length; j++) { - float norm = Similarity.getDefault().decodeNormValue(b[j]); + FieldSimilarity sim = new DefaultSimilarity().get(field); + float norm = sim.decodeNormValue(b[j]); float norm1 = storedNorms.get(j).floatValue(); assertEquals("stored norm value of " + field + " for doc " + j + " is " + norm + " - a mismatch!", norm, norm1, 0.000001); @@ -316,8 +321,10 @@ private float nextNorm() { float norm = lastNorm + normDelta; do { - float norm1 = Similarity.getDefault().decodeNormValue( - Similarity.getDefault().encodeNormValue(norm)); + // bogus test, where is my field? + FieldSimilarity sim = new DefaultSimilarity().get("TODO: add field to this method"); + float norm1 = sim.decodeNormValue( + sim.encodeNormValue(norm)); if (norm1 > lastNorm) { // System.out.println(norm1+" > "+lastNorm); norm = norm1; Index: lucene/src/test/org/apache/lucene/index/DocHelper.java =================================================================== --- lucene/src/test/org/apache/lucene/index/DocHelper.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/index/DocHelper.java (working copy) @@ -28,7 +28,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; -import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.store.Directory; import static org.apache.lucene.util.LuceneTestCase.TEST_VERSION_CURRENT; @@ -220,7 +220,7 @@ */ public static SegmentInfo writeDoc(Directory dir, Document doc) throws IOException { - return writeDoc(dir, new MockAnalyzer(MockTokenizer.WHITESPACE, false), Similarity.getDefault(), doc); + return writeDoc(dir, new MockAnalyzer(MockTokenizer.WHITESPACE, false), null, doc); } /** @@ -233,7 +233,7 @@ * @param doc * @throws IOException */ - public static SegmentInfo writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, Document doc) throws IOException { + public static SegmentInfo writeDoc(Directory dir, Analyzer analyzer, SimilarityProvider similarity, Document doc) throws IOException { IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig( TEST_VERSION_CURRENT, analyzer).setSimilarity(similarity)); //writer.setUseCompoundFile(false); Index: lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/index/TestIndexReaderReopen.java (working copy) @@ -35,9 +35,12 @@ import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TermQuery; import 
org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; @@ -615,8 +618,9 @@ IndexReader reader2 = reader1.reopen(); modifier = IndexReader.open(dir1, false); - modifier.setNorm(1, "field1", Similarity.getDefault().encodeNormValue(50f)); - modifier.setNorm(1, "field2", Similarity.getDefault().encodeNormValue(50f)); + SimilarityProvider sim = new DefaultSimilarity(); + modifier.setNorm(1, "field1", sim.get("field1").encodeNormValue(50f)); + modifier.setNorm(1, "field2", sim.get("field2").encodeNormValue(50f)); modifier.close(); IndexReader reader3 = reader2.reopen(); @@ -709,7 +713,8 @@ protected void modifyIndex(int i) throws IOException { if (i % 3 == 0) { IndexReader modifier = IndexReader.open(dir, false); - modifier.setNorm(i, "field1", Similarity.getDefault().encodeNormValue(50f)); + FieldSimilarity sim = new DefaultSimilarity().get("field1"); + modifier.setNorm(i, "field1", sim.encodeNormValue(50f)); modifier.close(); } else if (i % 3 == 1) { IndexReader modifier = IndexReader.open(dir, false); @@ -985,9 +990,10 @@ } case 1: { IndexReader reader = IndexReader.open(dir, false); - reader.setNorm(4, "field1", Similarity.getDefault().encodeNormValue(123f)); - reader.setNorm(44, "field2", Similarity.getDefault().encodeNormValue(222f)); - reader.setNorm(44, "field4", Similarity.getDefault().encodeNormValue(22f)); + SimilarityProvider sim = new DefaultSimilarity(); + reader.setNorm(4, "field1", sim.get("field1").encodeNormValue(123f)); + reader.setNorm(44, "field2", sim.get("field2").encodeNormValue(222f)); + reader.setNorm(44, "field4", sim.get("field4").encodeNormValue(22f)); reader.close(); break; } @@ -1008,8 +1014,9 @@ } case 4: { IndexReader reader = IndexReader.open(dir, false); - reader.setNorm(5, "field1", Similarity.getDefault().encodeNormValue(123f)); - reader.setNorm(55, "field2", Similarity.getDefault().encodeNormValue(222f)); + SimilarityProvider sim = new DefaultSimilarity(); + reader.setNorm(5, "field1", sim.get("field1").encodeNormValue(123f)); + reader.setNorm(55, "field2", sim.get("field2").encodeNormValue(222f)); reader.close(); break; } Index: lucene/src/test/org/apache/lucene/index/TestParallelReader.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestParallelReader.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/index/TestParallelReader.java (working copy) @@ -147,7 +147,8 @@ assertTrue(pr.isCurrent()); IndexReader modifier = IndexReader.open(dir1, false); - modifier.setNorm(0, "f1", Similarity.getDefault().encodeNormValue(100f)); + SimilarityProvider sim = new DefaultSimilarity(); + modifier.setNorm(0, "f1", sim.get("f1").encodeNormValue(100f)); modifier.close(); // one of the two IndexReaders which ParallelReader is using @@ -155,7 +156,7 @@ assertFalse(pr.isCurrent()); modifier = IndexReader.open(dir2, false); - modifier.setNorm(0, "f3", Similarity.getDefault().encodeNormValue(100f)); + modifier.setNorm(0, "f3", sim.get("f3").encodeNormValue(100f)); modifier.close(); // now both are not current anymore Index: lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/index/TestDeletionPolicy.java (working copy) @@ -30,7 +30,6 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; 
-import org.apache.lucene.search.Similarity; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -608,7 +607,7 @@ writer.close(); IndexReader reader = IndexReader.open(dir, policy, false); reader.deleteDocument(3*i+1); - reader.setNorm(4*i+1, "content", Similarity.getDefault().encodeNormValue(2.0F)); + reader.setNorm(4*i+1, "content", conf.getSimilarity().get("content").encodeNormValue(2.0F)); IndexSearcher searcher = new IndexSearcher(reader); ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(16*(1+i), hits.length); @@ -716,7 +715,7 @@ writer.close(); IndexReader reader = IndexReader.open(dir, policy, false); reader.deleteDocument(3); - reader.setNorm(5, "content", Similarity.getDefault().encodeNormValue(2.0F)); + reader.setNorm(5, "content", conf.getSimilarity().get("content").encodeNormValue(2.0F)); IndexSearcher searcher = new IndexSearcher(reader); ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(16, hits.length); Index: lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (working copy) @@ -38,12 +38,14 @@ import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.NumericField; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Bits; @@ -424,7 +426,7 @@ Term searchTerm = new Term("id", "6"); int delCount = reader.deleteDocuments(searchTerm); assertEquals("wrong delete count", 1, delCount); - reader.setNorm(searcher.search(new TermQuery(new Term("id", "22")), 10).scoreDocs[0].doc, "content", Similarity.getDefault().encodeNormValue(2.0f)); + reader.setNorm(searcher.search(new TermQuery(new Term("id", "22")), 10).scoreDocs[0].doc, "content", searcher.getSimilarity().get("content").encodeNormValue(2.0f)); reader.close(); searcher.close(); @@ -472,7 +474,7 @@ Term searchTerm = new Term("id", "6"); int delCount = reader.deleteDocuments(searchTerm); assertEquals("wrong delete count", 1, delCount); - reader.setNorm(22, "content", Similarity.getDefault().encodeNormValue(2.0f)); + reader.setNorm(22, "content", searcher.getSimilarity().get("content").encodeNormValue(2.0f)); reader.close(); // make sure they "took": @@ -531,7 +533,7 @@ assertEquals("didn't delete the right number of documents", 1, delCount); // Set one norm so we get a .s0 file: - reader.setNorm(21, "content", Similarity.getDefault().encodeNormValue(1.5f)); + reader.setNorm(21, "content", conf.getSimilarity().get("content").encodeNormValue(1.5f)); reader.close(); } @@ -568,7 +570,8 @@ assertEquals("didn't delete the right number of documents", 1, delCount); // Set one norm so we get a .s0 file: - reader.setNorm(21, "content", Similarity.getDefault().encodeNormValue(1.5f)); + SimilarityProvider sim = new DefaultSimilarity(); + 
reader.setNorm(21, "content", sim.get("content").encodeNormValue(1.5f)); reader.close(); // The numbering of fields can vary depending on which Index: lucene/src/test/org/apache/lucene/index/TestNorms.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestNorms.java (revision 1059498) +++ lucene/src/test/org/apache/lucene/index/TestNorms.java (working copy) @@ -29,6 +29,7 @@ import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -169,8 +170,9 @@ //System.out.println(" and: for "+k+" from "+newNorm+" to "+origNorm); modifiedNorms.set(i, Float.valueOf(newNorm)); modifiedNorms.set(k, Float.valueOf(origNorm)); - ir.setNorm(i, "f"+1, Similarity.getDefault().encodeNormValue(newNorm)); - ir.setNorm(k, "f"+1, Similarity.getDefault().encodeNormValue(origNorm)); + FieldSimilarity sim = new DefaultSimilarity().get("f"+1); + ir.setNorm(i, "f"+1, sim.encodeNormValue(newNorm)); + ir.setNorm(k, "f"+1, sim.encodeNormValue(origNorm)); } ir.close(); } Index: lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/ConstantScoreQuery.java (working copy) @@ -97,12 +97,10 @@ protected class ConstantWeight extends Weight { private final Weight innerWeight; - private final Similarity similarity; private float queryNorm; private float queryWeight; public ConstantWeight(IndexSearcher searcher) throws IOException { - this.similarity = getSimilarity(searcher); this.innerWeight = (query == null) ? 
null : query.createWeight(searcher); } @@ -148,7 +146,7 @@ } if (disi == null) return null; - return new ConstantScorer(similarity, disi, this); + return new ConstantScorer(disi, this); } @Override @@ -181,8 +179,8 @@ final DocIdSetIterator docIdSetIterator; final float theScore; - public ConstantScorer(Similarity similarity, DocIdSetIterator docIdSetIterator, Weight w) throws IOException { - super(similarity,w); + public ConstantScorer(DocIdSetIterator docIdSetIterator, Weight w) throws IOException { + super(w); theScore = w.getValue(); this.docIdSetIterator = docIdSetIterator; } @@ -212,8 +210,7 @@ @Override public void setScorer(Scorer scorer) throws IOException { // we must wrap again here, but using the scorer passed in as parameter: - collector.setScorer(new ConstantScorer(ConstantScorer.this.getSimilarity(), - scorer, ConstantScorer.this.weight)); + collector.setScorer(new ConstantScorer(scorer, ConstantScorer.this.weight)); } @Override Index: lucene/src/java/org/apache/lucene/search/BooleanScorer2.java =================================================================== --- lucene/src/java/org/apache/lucene/search/BooleanScorer2.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/BooleanScorer2.java (working copy) @@ -22,6 +22,7 @@ import java.util.List; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery.BooleanWeight; /* See the description in BooleanScorer.java, comparing * BooleanScorer & BooleanScorer2 */ @@ -42,10 +43,10 @@ int maxCoord = 0; // to be increased for each non prohibited scorer int nrMatchers; // to be increased by score() of match counting scorers. - void init(Similarity sim, boolean disableCoord) { // use after all scorers have been added. + void init(boolean disableCoord) { // use after all scorers have been added. coordFactors = new float[optionalScorers.size() + requiredScorers.size() + 1]; for (int i = 0; i < coordFactors.length; i++) { - coordFactors[i] = disableCoord ? 1.0f : sim.coord(i, maxCoord); + coordFactors[i] = disableCoord ? 1.0f : ((BooleanWeight)weight).coord(i, maxCoord); } } } @@ -80,9 +81,9 @@ * @param optional * the list of optional scorers. */ - public BooleanScorer2(Weight weight, boolean disableCoord, Similarity similarity, int minNrShouldMatch, + public BooleanScorer2(BooleanWeight weight, boolean disableCoord, int minNrShouldMatch, List required, List prohibited, List optional, int maxCoord) throws IOException { - super(null, weight); // Similarity not used + super(weight); if (minNrShouldMatch < 0) { throw new IllegalArgumentException("Minimum number of optional scorers should not be negative"); } @@ -94,8 +95,8 @@ requiredScorers = required; prohibitedScorers = prohibited; - coordinator.init(similarity, disableCoord); - countingSumScorer = makeCountingSumScorer(disableCoord, similarity); + coordinator.init(disableCoord); + countingSumScorer = makeCountingSumScorer(disableCoord); } /** Count a scorer as a single match. */ @@ -107,7 +108,6 @@ private float lastDocScore = Float.NaN; SingleMatchScorer(Scorer scorer) { - super(null); // No similarity used. this.scorer = scorer; } @@ -163,11 +163,10 @@ } private Scorer countingConjunctionSumScorer(boolean disableCoord, - Similarity similarity, List requiredScorers) throws IOException { // each scorer from the list counted as a single matcher final int requiredNrMatchers = requiredScorers.size(); - return new ConjunctionScorer(disableCoord ? 
1.0f : similarity.coord(requiredScorers.size(), requiredScorers.size()), requiredScorers) { + return new ConjunctionScorer(disableCoord ? 1.0f : ((BooleanWeight)weight).coord(requiredScorers.size(), requiredScorers.size()), requiredScorers) { private int lastScoredDoc = -1; // Save the score of lastScoredDoc, so that we don't compute it more than // once in score(). @@ -191,9 +190,8 @@ } private Scorer dualConjunctionSumScorer(boolean disableCoord, - Similarity similarity, Scorer req1, Scorer req2) throws IOException { // non counting. - return new ConjunctionScorer(disableCoord ? 1.0f : similarity.coord(2, 2), req1, req2); + return new ConjunctionScorer(disableCoord ? 1.0f : ((BooleanWeight)weight).coord(2, 2), req1, req2); // All scorers match, so defaultSimilarity always has 1 as // the coordination factor. // Therefore the sum of the scores of two scorers @@ -203,14 +201,13 @@ /** Returns the scorer to be used for match counting and score summing. * Uses requiredScorers, optionalScorers and prohibitedScorers. */ - private Scorer makeCountingSumScorer(boolean disableCoord, - Similarity similarity) throws IOException { // each scorer counted as a single matcher + private Scorer makeCountingSumScorer(boolean disableCoord) throws IOException { // each scorer counted as a single matcher return (requiredScorers.size() == 0) - ? makeCountingSumScorerNoReq(disableCoord, similarity) - : makeCountingSumScorerSomeReq(disableCoord, similarity); + ? makeCountingSumScorerNoReq(disableCoord) + : makeCountingSumScorerSomeReq(disableCoord); } - private Scorer makeCountingSumScorerNoReq(boolean disableCoord, Similarity similarity) throws IOException { // No required scorers + private Scorer makeCountingSumScorerNoReq(boolean disableCoord) throws IOException { // No required scorers // minNrShouldMatch optional scorers are required, but at least 1 int nrOptRequired = (minNrShouldMatch < 1) ? 1 : minNrShouldMatch; Scorer requiredCountingSumScorer; @@ -219,26 +216,25 @@ else if (optionalScorers.size() == 1) requiredCountingSumScorer = new SingleMatchScorer(optionalScorers.get(0)); else { - requiredCountingSumScorer = countingConjunctionSumScorer(disableCoord, similarity, optionalScorers); + requiredCountingSumScorer = countingConjunctionSumScorer(disableCoord, optionalScorers); } return addProhibitedScorers(requiredCountingSumScorer); } - private Scorer makeCountingSumScorerSomeReq(boolean disableCoord, Similarity similarity) throws IOException { // At least one required scorer. + private Scorer makeCountingSumScorerSomeReq(boolean disableCoord) throws IOException { // At least one required scorer. if (optionalScorers.size() == minNrShouldMatch) { // all optional scorers also required. ArrayList allReq = new ArrayList(requiredScorers); allReq.addAll(optionalScorers); - return addProhibitedScorers(countingConjunctionSumScorer(disableCoord, similarity, allReq)); + return addProhibitedScorers(countingConjunctionSumScorer(disableCoord, allReq)); } else { // optionalScorers.size() > minNrShouldMatch, and at least one required scorer Scorer requiredCountingSumScorer = requiredScorers.size() == 1 ? 
new SingleMatchScorer(requiredScorers.get(0)) - : countingConjunctionSumScorer(disableCoord, similarity, requiredScorers); + : countingConjunctionSumScorer(disableCoord, requiredScorers); if (minNrShouldMatch > 0) { // use a required disjunction scorer over the optional scorers return addProhibitedScorers( dualConjunctionSumScorer( // non counting disableCoord, - similarity, requiredCountingSumScorer, countingDisjunctionSumScorer( optionalScorers, Index: lucene/src/java/org/apache/lucene/search/MatchAllDocsQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/MatchAllDocsQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/MatchAllDocsQuery.java (working copy) @@ -51,14 +51,16 @@ private int doc = -1; private final int maxDoc; private final Bits delDocs; + private final FieldSimilarity similarity; - MatchAllScorer(IndexReader reader, Similarity similarity, Weight w, + MatchAllScorer(IndexReader reader, FieldSimilarity similarity, Weight w, byte[] norms) throws IOException { - super(similarity,w); + super(w); delDocs = reader.getDeletedDocs(); score = w.getValue(); maxDoc = reader.maxDoc(); this.norms = norms; + this.similarity = similarity; } @Override @@ -80,7 +82,7 @@ @Override public float score() { - return norms == null ? score : score * getSimilarity().decodeNormValue(norms[docID()]); + return norms == null ? score : score * similarity.decodeNormValue(norms[docID()]); } @Override @@ -91,12 +93,12 @@ } private class MatchAllDocsWeight extends Weight { - private Similarity similarity; + private FieldSimilarity similarity; private float queryWeight; private float queryNorm; public MatchAllDocsWeight(IndexSearcher searcher) { - this.similarity = searcher.getSimilarity(); + this.similarity = normsField == null ? null : searcher.getSimilarity().get(normsField); } @Override Index: lucene/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java (working copy) @@ -95,15 +95,11 @@ * change suddenly in the next release.

*/ protected class DisjunctionMaxWeight extends Weight { - /** The Similarity implementation. */ - protected Similarity similarity; - /** The Weights for our subqueries, in 1-1 correspondence with disjuncts */ protected ArrayList weights = new ArrayList(); // The Weight's for our subqueries, in 1-1 correspondence with disjuncts /* Construct the Weight for this Query searched by searcher. Recursively construct subquery weights. */ public DisjunctionMaxWeight(IndexSearcher searcher) throws IOException { - this.similarity = searcher.getSimilarity(); for (Query disjunctQuery : disjuncts) { weights.add(disjunctQuery.createWeight(searcher)); } @@ -152,7 +148,7 @@ } } if (idx == 0) return null; // all scorers did not have documents - DisjunctionMaxScorer result = new DisjunctionMaxScorer(tieBreakerMultiplier, similarity, scorers, idx); + DisjunctionMaxScorer result = new DisjunctionMaxScorer(tieBreakerMultiplier, scorers, idx); return result; } Index: lucene/src/java/org/apache/lucene/search/Similarity.java =================================================================== --- lucene/src/java/org/apache/lucene/search/Similarity.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/Similarity.java (working copy) @@ -17,840 +17,16 @@ * limitations under the License. */ - -import java.io.IOException; import java.io.Serializable; -import java.util.Collection; -import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.Explanation.IDFExplanation; -import org.apache.lucene.util.SmallFloat; - - /** - * Expert: Scoring API. - * - *

Similarity defines the components of Lucene scoring. - * Overriding computation of these components is a convenient - * way to alter Lucene scoring. - * - *

Suggested reading: - * - * Introduction To Information Retrieval, Chapter 6. - * - *

The following describes how Lucene scoring evolves from - * underlying information retrieval models to (efficient) implementation. - * We first brief on VSM Score, - * then derive from it Lucene's Conceptual Scoring Formula, - * from which, finally, evolves Lucene's Practical Scoring Function - * (the latter is connected directly with Lucene classes and methods). - * - *

Lucene combines - * - * Boolean model (BM) of Information Retrieval - * with - * - * Vector Space Model (VSM) of Information Retrieval - - * documents "approved" by BM are scored by VSM. - * - *

In VSM, documents and queries are represented as - * weighted vectors in a multi-dimensional space, - * where each distinct index term is a dimension, - * and weights are - * Tf-idf values. - * - *

VSM does not require weights to be Tf-idf values, - * but Tf-idf values are believed to produce search results of high quality, - * and so Lucene is using Tf-idf. - * Tf and Idf are described in more detail below, - * but for now, for completion, let's just say that - * for given term t and document (or query) x, - * Tf(t,x) varies with the number of occurrences of term t in x - * (when one increases so does the other) and - * idf(t) similarly varies with the inverse of the - * number of index documents containing term t. - * - *

VSM score of document d for query q is the - * - * Cosine Similarity - * of the weighted query vectors V(q) and V(d): - * - *
 
- *     cosine-similarity(q,d)  =  V(q) · V(d) / (|V(q)| |V(d)|)        (VSM Score)
 
- * - * - * Where V(q) · V(d) is the - * dot product - * of the weighted vectors, - * and |V(q)| and |V(d)| are their - * Euclidean norms. - * - *

Note: the above equation can be viewed as the dot product of - * the normalized weighted vectors, in the sense that dividing - * V(q) by its euclidean norm is normalizing it to a unit vector. - * - *

Lucene refines VSM score for both search quality and usability: - *

    - *
  • Normalizing V(d) to the unit vector is known to be problematic in that - * it removes all document length information. - * For some documents removing this info is probably ok, - * e.g. a document made by duplicating a certain paragraph 10 times, - * especially if that paragraph is made of distinct terms. - * But for a document which contains no duplicated paragraphs, - * this might be wrong. - * To avoid this problem, a different document length normalization - * factor is used, which normalizes to a vector equal to or larger - * than the unit vector: doc-len-norm(d). - *
  • - * - *
  • At indexing, users can specify that certain documents are more - * important than others, by assigning a document boost. - * For this, the score of each document is also multiplied by its boost value - * doc-boost(d). - *
  • - * - *
  • Lucene is field based, hence each query term applies to a single - * field, document length normalization is by the length of the certain field, - * and in addition to document boost there are also document fields boosts. - *
  • - * - *
  • The same field can be added to a document during indexing several times, - * and so the boost of that field is the multiplication of the boosts of - * the separate additions (or parts) of that field within the document. - *
  • - * - *
  • At search time users can specify boosts to each query, sub-query, and - * each query term, hence the contribution of a query term to the score of - * a document is multiplied by the boost of that query term query-boost(q). - *
  • - * - *
  • A document may match a multi term query without containing all - * the terms of that query (this is correct for some of the queries), - * and users can further reward documents matching more query terms - * through a coordination factor, which is usually larger when - * more terms are matched: coord-factor(q,d). - *
  • - *
- * - *

Under the simplifying assumption of a single field in the index, - * we get Lucene's Conceptual scoring formula: - * - *
 
- *     score(q,d)  =  coord-factor(q,d) · query-boost(q) · ( V(q) · V(d) / |V(q)| ) · doc-len-norm(d) · doc-boost(d)        (Lucene Conceptual Scoring Formula)
 
- * - *

The conceptual formula is a simplification in the sense that (1) terms and documents - * are fielded and (2) boosts are usually per query term rather than per query. - * - *

We now describe how Lucene implements this conceptual scoring formula, and - * derive from it Lucene's Practical Scoring Function. - * - *

For efficient score computation some scoring components - * are computed and aggregated in advance: - * - *

    - *
  • Query-boost for the query (actually for each query term) - * is known when search starts. - *
  • - * - *
  • Query Euclidean norm |V(q)| can be computed when search starts, - * as it is independent of the document being scored. - * From search optimization perspective, it is a valid question - * why bother to normalize the query at all, because all - * scored documents will be multiplied by the same |V(q)|, - * and hence documents ranks (their order by score) will not - * be affected by this normalization. - * There are two good reasons to keep this normalization: - *
      - *
    • Recall that - * - * Cosine Similarity can be used find how similar - * two documents are. One can use Lucene for e.g. - * clustering, and use a document as a query to compute - * its similarity to other documents. - * In this use case it is important that the score of document d3 - * for query d1 is comparable to the score of document d3 - * for query d2. In other words, scores of a document for two - * distinct queries should be comparable. - * There are other applications that may require this. - * And this is exactly what normalizing the query vector V(q) - * provides: comparability (to a certain extent) of two or more queries. - *
    • - * - *
    • Applying query normalization on the scores helps to keep the - * scores around the unit vector, hence preventing loss of score data - * because of floating point precision limitations. - *
    • - *
    - *
  • - * - *
  • Document length norm doc-len-norm(d) and document - * boost doc-boost(d) are known at indexing time. - * They are computed in advance and their multiplication - * is saved as a single value in the index: norm(d). - * (In the equations below, norm(t in d) means norm(field(t) in doc d) - * where field(t) is the field associated with term t.) - *
  • - *
- * - *

Lucene's Practical Scoring Function is derived from the above. - * The color codes demonstrate how it relates - * to those of the conceptual formula: - * - *

- *     score(q,d)  =  coord(q,d) · queryNorm(q) · Σ (t in q) [ tf(t in d) · idf(t)² · t.getBoost() · norm(t,d) ]        (Lucene Practical Scoring Function)

where - *

    - *
  1. - * - * tf(t in d) - * correlates to the term's frequency, - * defined as the number of times term t appears in the currently scored document d. - * Documents that have more occurrences of a given term receive a higher score. - * Note that tf(t in q) is assumed to be 1 and therefore it does not appear in this equation, - * However if a query contains twice the same term, there will be - * two term-queries with that same term and hence the computation would still be correct (although - * not very efficient). - * The default computation for tf(t in d) in - * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is: - * - *
     
- *     {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)}  =  frequency½
     
    - *
  2. - * - *
  3. - * - * idf(t) stands for Inverse Document Frequency. This value - * correlates to the inverse of docFreq - * (the number of documents in which the term t appears). - * This means rarer terms give higher contribution to the total score. - * idf(t) appears for t in both the query and the document, - * hence it is squared in the equation. - * The default computation for idf(t) in - * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is: - * - *
     
- *     {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}  =  1 + log( numDocs / (docFreq+1) )
     
    - *
  4. - * - *
  5. - * - * coord(q,d) - * is a score factor based on how many of the query terms are found in the specified document. - * Typically, a document that contains more of the query's terms will receive a higher score - * than another document with fewer query terms. - * This is a search time factor computed in - * {@link #coord(int, int) coord(q,d)} - * by the Similarity in effect at search time. - *
     
    - *
  6. - * - *
  7. - * - * queryNorm(q) - * - * is a normalizing factor used to make scores between queries comparable. - * This factor does not affect document ranking (since all ranked documents are multiplied by the same factor), - * but rather just attempts to make scores from different queries (or even different indexes) comparable. - * This is a search time factor computed by the Similarity in effect at search time. - * - * The default computation in - * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) DefaultSimilarity} - * produces a Euclidean norm: - *
     
- *     queryNorm(q)  =  {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) queryNorm(sumOfSquaredWeights)}  =  1 / sumOfSquaredWeights½
     
    - * - * The sum of squared weights (of the query terms) is - * computed by the query {@link org.apache.lucene.search.Weight} object. - * For example, a {@link org.apache.lucene.search.BooleanQuery} - * computes this value as: - * - *
     
- *     {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights}  =  {@link org.apache.lucene.search.Query#getBoost() q.getBoost()}² · Σ (t in q) [ idf(t) · t.getBoost() ]²
     
    - * - *
  8. - * - *
  9. - * - * t.getBoost() - * is a search time boost of term t in the query q as - * specified in the query text - * (see query syntax), - * or as set by application calls to - * {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}. - * Notice that there is really no direct API for accessing a boost of one term in a multi term query, - * but rather multi terms are represented in a query as multi - * {@link org.apache.lucene.search.TermQuery TermQuery} objects, - * and so the boost of a term in the query is accessible by calling the sub-query - * {@link org.apache.lucene.search.Query#getBoost() getBoost()}. - *
     
    - *
  10. - * - *
  11. - * - * norm(t,d) encapsulates a few (indexing time) boost and length factors: - * - *
      - *
    • Document boost - set by calling - * {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()} - * before adding the document to the index. - *
    • - *
    • Field boost - set by calling - * {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()} - * before adding the field to a document. - *
    • - *
    • lengthNorm - computed - * when the document is added to the index in accordance with the number of tokens - * of this field in the document, so that shorter fields contribute more to the score. - * LengthNorm is computed by the Similarity class in effect at indexing. - *
    • - *
    - * The {@link #computeNorm} method is responsible for - * combining all of these factors into a single float. - * - *

    - * When a document is added to the index, all the above factors are multiplied. - * If the document has multiple fields with the same name, all their boosts are multiplied together: - * - *
     
- *     norm(t,d)  =  {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()} · lengthNorm · Π (field f in d named as t) {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
     
    - * However the resulted norm value is {@link #encodeNormValue(float) encoded} as a single byte - * before being stored. - * At search time, the norm byte value is read from the index - * {@link org.apache.lucene.store.Directory directory} and - * {@link #decodeNormValue(byte) decoded} back to a float norm value. - * This encoding/decoding, while reducing index size, comes with the price of - * precision loss - it is not guaranteed that decode(encode(x)) = x. - * For instance, decode(encode(0.89)) = 0.75. - *
     
    - * Compression of norm values to a single byte saves memory at search time, - * because once a field is referenced at search time, its norms - for - * all documents - are maintained in memory. - *
     
    - * The rationale supporting such lossy compression of norm values is that - * given the difficulty (and inaccuracy) of users to express their true information - * need by a query, only big differences matter. - *
     
    - * Last, note that search time is too late to modify this norm part of scoring, e.g. by - * using a different {@link Similarity} for search. - *
     
    - *

  12. - *
- * - * @see #setDefault(Similarity) - * @see org.apache.lucene.index.IndexWriterConfig#setSimilarity(Similarity) - * @see IndexSearcher#setSimilarity(Similarity) + * @deprecated Use {@link SimilarityProvider} instead */ -public abstract class Similarity implements Serializable { +@Deprecated +public abstract class Similarity extends FieldSimilarity implements SimilarityProvider, Serializable { - /** - * The Similarity implementation used by default. - **/ - private static Similarity defaultImpl = new DefaultSimilarity(); - public static final int NO_DOC_ID_PROVIDED = -1; - - /** Set the default Similarity implementation used by indexing and search - * code. - * - * @see IndexSearcher#setSimilarity(Similarity) - * @see org.apache.lucene.index.IndexWriterConfig#setSimilarity(Similarity) - */ - public static void setDefault(Similarity similarity) { - Similarity.defaultImpl = similarity; + @Override + public FieldSimilarity get(String field) { + return this; /* same for every field */ } - - /** Return the default Similarity implementation used by indexing and search - * code. - * - *
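The hunk above turns the deprecated Similarity into a bridge: it implements SimilarityProvider and hands itself back for every field, so an existing one-size-fits-all similarity keeps working anywhere a provider is now expected. A quick illustration, assuming DefaultSimilarity still extends Similarity as it does elsewhere in this patch:

    SimilarityProvider provider = new DefaultSimilarity(); // legacy Similarity doubles as its own provider
    FieldSimilarity title = provider.get("title");          // same object back...
    FieldSimilarity body = provider.get("body");            // ...for every field
    assert title == body;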

This is initially an instance of {@link DefaultSimilarity}. - * - * @see IndexSearcher#setSimilarity(Similarity) - * @see org.apache.lucene.index.IndexWriterConfig#setSimilarity(Similarity) - */ - public static Similarity getDefault() { - return Similarity.defaultImpl; - } - - /** Cache of decoded bytes. */ - private static final float[] NORM_TABLE = new float[256]; - - static { - for (int i = 0; i < 256; i++) - NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); - } - - /** Decodes a normalization factor stored in an index. - * @see #encodeNormValue(float) - */ - public float decodeNormValue(byte b) { - return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127 - } - - /** - * Computes the normalization value for a field, given the accumulated - * state of term processing for this field (see {@link FieldInvertState}). - * - *

Implementations should calculate a float value based on the field - * state and then return that value. - * - *

Matches in longer fields are less precise, so implementations of this - * method usually return smaller values when state.getLength() is large, - * and larger values when state.getLength() is small. - * - *

Note that the return values are computed under - * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)} - * and then stored using - * {@link #encodeNormValue(float)}. - * Thus they have limited precision, and documents - * must be re-indexed if this method is altered. - * - *

For backward compatibility this method by default calls - * {@link #lengthNorm(String, int)} passing - * {@link FieldInvertState#getLength()} as the second argument, and - * then multiplies this value by {@link FieldInvertState#getBoost()}.

- * - * @lucene.experimental - * - * @param field field name - * @param state current processing state for this field - * @return the calculated float norm - */ - public abstract float computeNorm(String field, FieldInvertState state); - - /** Computes the normalization value for a field given the total number of - * terms contained in a field. These values, together with field boosts, are - * stored in an index and multipled into scores for hits on each field by the - * search code. - * - *

Matches in longer fields are less precise, so implementations of this - * method usually return smaller values when numTokens is large, - * and larger values when numTokens is small. - * - *

Note that the return values are computed under - * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)} - * and then stored using - * {@link #encodeNormValue(float)}. - * Thus they have limited precision, and documents - * must be re-indexed if this method is altered. - * - * @param fieldName the name of the field - * @param numTokens the total number of tokens contained in fields named - * fieldName of doc. - * @return a normalization factor for hits on this field of this document - * - * @see org.apache.lucene.document.Field#setBoost(float) - * - * @deprecated Please override computeNorm instead - */ - @Deprecated - public final float lengthNorm(String fieldName, int numTokens) { - throw new UnsupportedOperationException("please use computeNorm instead"); - } - - /** Computes the normalization value for a query given the sum of the squared - * weights of each of the query terms. This value is multiplied into the - * weight of each query term. While the classic query normalization factor is - * computed as 1/sqrt(sumOfSquaredWeights), other implementations might - * completely ignore sumOfSquaredWeights (ie return 1). - * - *

This does not affect ranking, but the default implementation does make scores - * from different queries more comparable than they would be by eliminating the - * magnitude of the Query vector as a factor in the score. - * - * @param sumOfSquaredWeights the sum of the squares of query term weights - * @return a normalization factor for query weights - */ - public abstract float queryNorm(float sumOfSquaredWeights); - - /** Encodes a normalization factor for storage in an index. - * - *

The encoding uses a three-bit mantissa, a five-bit exponent, and - * the zero-exponent point at 15, thus - * representing values from around 7x10^9 to 2x10^-9 with about one - * significant decimal digit of accuracy. Zero is also represented. - * Negative numbers are rounded up to zero. Values too large to represent - * are rounded down to the largest representable value. Positive values too - * small to represent are rounded up to the smallest positive representable - * value. - * @see org.apache.lucene.document.Field#setBoost(float) - * @see org.apache.lucene.util.SmallFloat - */ - public byte encodeNormValue(float f) { - return SmallFloat.floatToByte315(f); - } - - /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} - * factor for each term in the query and these products are then summed to - * form the initial score for a document. - * - *

Terms and phrases repeated in a document indicate the topic of the - * document, so implementations of this method usually return larger values - * when freq is large, and smaller values when freq - * is small. - * - *

The default implementation calls {@link #tf(float)}. - * - * @param freq the frequency of a term within a document - * @return a score factor based on a term's within-document frequency - */ - public float tf(int freq) { - return tf((float)freq); - } - - /** Computes the amount of a sloppy phrase match, based on an edit distance. - * This value is summed for each sloppy phrase match in a document to form - * the frequency that is passed to {@link #tf(float)}. - * - *

A phrase match with a small edit distance to a document passage more - * closely matches the document, so implementations of this method usually - * return larger values when the edit distance is small and smaller values - * when it is large. - * - * @see PhraseQuery#setSlop(int) - * @param distance the edit distance of this sloppy phrase match - * @return the frequency increment for this match - */ - public abstract float sloppyFreq(int distance); - - /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} - * factor for each term in the query and these products are then summed to - * form the initial score for a document. - * - *

Terms and phrases repeated in a document indicate the topic of the - * document, so implementations of this method usually return larger values - * when freq is large, and smaller values when freq - * is small. - * - * @param freq the frequency of a term within a document - * @return a score factor based on a term's within-document frequency - */ - public abstract float tf(float freq); - - /** - * Computes a score factor for a simple term and returns an explanation - * for that score factor. - * - *

- * The default implementation uses: - * - *

-   * idf(docFreq, searcher.maxDoc());
-   * 
- * - * Note that {@link IndexSearcher#maxDoc()} is used instead of - * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also - * {@link IndexSearcher#docFreq(Term)} is used, and when the latter - * is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction. - * In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute - * - * @param term the term in question - * @param searcher the document collection being searched - * @param docFreq externally computed docFreq for this term - * @return an IDFExplain object that includes both an idf score factor - and an explanation for the term. - * @throws IOException - */ - public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher, int docFreq) throws IOException { - final int df = docFreq; - final int max = searcher.maxDoc(); - final float idf = idf(df, max); - return new IDFExplanation() { - @Override - public String explain() { - return "idf(docFreq=" + df + - ", maxDocs=" + max + ")"; - } - @Override - public float getIdf() { - return idf; - }}; - } - - /** - * This method forwards to {@link - * #idfExplain(Term,IndexSearcher,int)} by passing - * searcher.docFreq(term) as the docFreq. - */ - public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher) throws IOException { - return idfExplain(term, searcher, searcher.docFreq(term)); - } - - /** - * Computes a score factor for a phrase. - * - *

- * The default implementation sums the idf factor for - * each term in the phrase. - * - * @param terms the terms in the phrase - * @param searcher the document collection being searched - * @return an IDFExplain object that includes both an idf - * score factor for the phrase and an explanation - * for each term. - * @throws IOException - */ - public IDFExplanation idfExplain(Collection terms, IndexSearcher searcher) throws IOException { - final int max = searcher.maxDoc(); - float idf = 0.0f; - final StringBuilder exp = new StringBuilder(); - for (final Term term : terms ) { - final int df = searcher.docFreq(term); - idf += idf(df, max); - exp.append(" "); - exp.append(term.text()); - exp.append("="); - exp.append(df); - } - final float fIdf = idf; - return new IDFExplanation() { - @Override - public float getIdf() { - return fIdf; - } - @Override - public String explain() { - return exp.toString(); - } - }; - } - - /** Computes a score factor based on a term's document frequency (the number - * of documents which contain the term). This value is multiplied by the - * {@link #tf(int)} factor for each term in the query and these products are - * then summed to form the initial score for a document. - * - *

Terms that occur in fewer documents are better indicators of topic, so - * implementations of this method usually return larger values for rare terms, - * and smaller values for common terms. - * - * @param docFreq the number of documents which contain the term - * @param numDocs the total number of documents in the collection - * @return a score factor based on the term's document frequency - */ - public abstract float idf(int docFreq, int numDocs); - - /** Computes a score factor based on the fraction of all query terms that a - * document contains. This value is multiplied into scores. - * - *

The presence of a large portion of the query terms indicates a better - * match with the query, so implementations of this method usually return - * larger values when the ratio between these parameters is large and smaller - * values when the ratio between them is small. - * - * @param overlap the number of query terms matched in the document - * @param maxOverlap the total number of terms in the query - * @return a score factor based on term overlap with the query - */ - public abstract float coord(int overlap, int maxOverlap); - - /** - * Calculate a scoring factor based on the data in the payload. Overriding implementations - * are responsible for interpreting what is in the payload. Lucene makes no assumptions about - * what is in the byte array. - *

- * The default implementation returns 1. - * - * @param docId The docId currently being scored. If this value is {@link #NO_DOC_ID_PROVIDED}, then it should be assumed that the PayloadQuery implementation does not provide document information - * @param fieldName The fieldName of the term this payload belongs to - * @param start The start position of the payload - * @param end The end position of the payload - * @param payload The payload byte array to be scored - * @param offset The offset into the payload array - * @param length The length in the array - * @return An implementation dependent float to be used as a scoring factor - * - */ - // TODO: maybe switch this API to BytesRef? - public float scorePayload(int docId, String fieldName, int start, int end, byte [] payload, int offset, int length) - { - return 1; - } - } Index: lucene/src/java/org/apache/lucene/search/FieldSimilarity.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FieldSimilarity.java (revision 0) +++ lucene/src/java/org/apache/lucene/search/FieldSimilarity.java (revision 0) @@ -0,0 +1,801 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.IOException; +import java.io.Serializable; +import java.util.Collection; + +import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.util.SmallFloat; + + +/** + * Expert: Scoring API. + * + *

Similarity defines the components of Lucene scoring. + * Overriding computation of these components is a convenient + * way to alter Lucene scoring. + * + *

Suggested reading: + * + * Introduction To Information Retrieval, Chapter 6. + * + *

The following describes how Lucene scoring evolves from + * underlying information retrieval models to (efficient) implementation. + * We first brief on VSM Score, + * then derive from it Lucene's Conceptual Scoring Formula, + * from which, finally, evolves Lucene's Practical Scoring Function + * (the latter is connected directly with Lucene classes and methods). + * + *

Lucene combines + * + * Boolean model (BM) of Information Retrieval + * with + * + * Vector Space Model (VSM) of Information Retrieval - + * documents "approved" by BM are scored by VSM. + * + *

In VSM, documents and queries are represented as + * weighted vectors in a multi-dimensional space, + * where each distinct index term is a dimension, + * and weights are + * Tf-idf values. + * + *

VSM does not require weights to be Tf-idf values, + * but Tf-idf values are believed to produce search results of high quality, + * and so Lucene is using Tf-idf. + * Tf and Idf are described in more detail below, + * but for now, for completeness, let's just say that + * for a given term t and document (or query) x, + * Tf(t,x) varies with the number of occurrences of term t in x + * (when one increases so does the other) and + * idf(t) similarly varies with the inverse of the + * number of index documents containing term t. + * + *

VSM score of document d for query q is the + * + * Cosine Similarity + * of the weighted query vectors V(q) and V(d): + * + *
 
+ *     cosine-similarity(q,d)  =  V(q) · V(d) / (|V(q)| |V(d)|)        (VSM Score)
 
+ * + * + * Where V(q) · V(d) is the + * dot product + * of the weighted vectors, + * and |V(q)| and |V(d)| are their + * Euclidean norms. + * + *

Note: the above equation can be viewed as the dot product of + * the normalized weighted vectors, in the sense that dividing + * V(q) by its euclidean norm is normalizing it to a unit vector. + * + *

Lucene refines VSM score for both search quality and usability: + *

    + *
  • Normalizing V(d) to the unit vector is known to be problematic in that + * it removes all document length information. + * For some documents removing this info is probably ok, + * e.g. a document made by duplicating a certain paragraph 10 times, + * especially if that paragraph is made of distinct terms. + * But for a document which contains no duplicated paragraphs, + * this might be wrong. + * To avoid this problem, a different document length normalization + * factor is used, which normalizes to a vector equal to or larger + * than the unit vector: doc-len-norm(d). + *
  • + * + *
  • At indexing, users can specify that certain documents are more + * important than others, by assigning a document boost. + * For this, the score of each document is also multiplied by its boost value + * doc-boost(d). + *
  • + * + *
  • Lucene is field based, hence each query term applies to a single + * field, document length normalization is by the length of the certain field, + * and in addition to document boost there are also document fields boosts. + *
  • + * + *
  • The same field can be added to a document during indexing several times, + * and so the boost of that field is the multiplication of the boosts of + * the separate additions (or parts) of that field within the document. + *
  • + * + *
  • At search time users can specify boosts to each query, sub-query, and + * each query term, hence the contribution of a query term to the score of + * a document is multiplied by the boost of that query term query-boost(q). + *
  • + * + *
  • A document may match a multi term query without containing all + * the terms of that query (this is correct for some of the queries), + * and users can further reward documents matching more query terms + * through a coordination factor, which is usually larger when + * more terms are matched: coord-factor(q,d). + *
  • + *
+ * + *

Under the simplifying assumption of a single field in the index, + * we get Lucene's Conceptual scoring formula: + * + *
 
+ *     score(q,d)  =  coord-factor(q,d) · query-boost(q) · ( V(q) · V(d) / |V(q)| ) · doc-len-norm(d) · doc-boost(d)        (Lucene Conceptual Scoring Formula)
 
+ * + *

The conceptual formula is a simplification in the sense that (1) terms and documents + * are fielded and (2) boosts are usually per query term rather than per query. + * + *

We now describe how Lucene implements this conceptual scoring formula, and + * derive from it Lucene's Practical Scoring Function. + * + *

For efficient score computation some scoring components + * are computed and aggregated in advance: + * + *

    + *
  • Query-boost for the query (actually for each query term) + * is known when search starts. + *
  • + * + *
  • Query Euclidean norm |V(q)| can be computed when search starts, + * as it is independent of the document being scored. + * From search optimization perspective, it is a valid question + * why bother to normalize the query at all, because all + * scored documents will be multiplied by the same |V(q)|, + * and hence documents ranks (their order by score) will not + * be affected by this normalization. + * There are two good reasons to keep this normalization: + *
      + *
• Recall that + * + * Cosine Similarity can be used to find how similar + * two documents are. One can use Lucene for e.g. + * clustering, and use a document as a query to compute + * its similarity to other documents. + * In this use case it is important that the score of document d3 + * for query d1 is comparable to the score of document d3 + * for query d2. In other words, scores of a document for two + * distinct queries should be comparable. + * There are other applications that may require this. + * And this is exactly what normalizing the query vector V(q) + * provides: comparability (to a certain extent) of two or more queries. + *
    • + * + *
    • Applying query normalization on the scores helps to keep the + * scores around the unit vector, hence preventing loss of score data + * because of floating point precision limitations. + *
    • + *
    + *
  • + * + *
  • Document length norm doc-len-norm(d) and document + * boost doc-boost(d) are known at indexing time. + * They are computed in advance and their multiplication + * is saved as a single value in the index: norm(d). + * (In the equations below, norm(t in d) means norm(field(t) in doc d) + * where field(t) is the field associated with term t.) + *
  • + *
+ * + *

Lucene's Practical Scoring Function is derived from the above. + * The color codes demonstrate how it relates + * to those of the conceptual formula: + * + *

+ *     score(q,d)  =  coord(q,d) · queryNorm(q) · Σ (t in q) [ tf(t in d) · idf(t)² · t.getBoost() · norm(t,d) ]        (Lucene Practical Scoring Function)

where + *

    + *
  1. + * + * tf(t in d) + * correlates to the term's frequency, + * defined as the number of times term t appears in the currently scored document d. + * Documents that have more occurrences of a given term receive a higher score. + * Note that tf(t in q) is assumed to be 1 and therefore it does not appear in this equation, + * However if a query contains twice the same term, there will be + * two term-queries with that same term and hence the computation would still be correct (although + * not very efficient). + * The default computation for tf(t in d) in + * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is: + * + *
     
+ *     {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)}  =  frequency½
     
    + *
  2. + * + *
  3. + * + * idf(t) stands for Inverse Document Frequency. This value + * correlates to the inverse of docFreq + * (the number of documents in which the term t appears). + * This means rarer terms give higher contribution to the total score. + * idf(t) appears for t in both the query and the document, + * hence it is squared in the equation. + * The default computation for idf(t) in + * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is: + * + *
     
+ *     {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}  =  1 + log( numDocs / (docFreq+1) )
     
    + *
  4. + * + *
  5. + * + * coord(q,d) + * is a score factor based on how many of the query terms are found in the specified document. + * Typically, a document that contains more of the query's terms will receive a higher score + * than another document with fewer query terms. + * This is a search time factor computed in + * {@link #coord(int, int) coord(q,d)} + * by the Similarity in effect at search time. + *
     
    + *
  6. + * + *
  7. + * + * queryNorm(q) + * + * is a normalizing factor used to make scores between queries comparable. + * This factor does not affect document ranking (since all ranked documents are multiplied by the same factor), + * but rather just attempts to make scores from different queries (or even different indexes) comparable. + * This is a search time factor computed by the Similarity in effect at search time. + * + * The default computation in + * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) DefaultSimilarity} + * produces a Euclidean norm: + *
     
+ *     queryNorm(q)  =  {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) queryNorm(sumOfSquaredWeights)}  =  1 / sumOfSquaredWeights½
     
    + * + * The sum of squared weights (of the query terms) is + * computed by the query {@link org.apache.lucene.search.Weight} object. + * For example, a {@link org.apache.lucene.search.BooleanQuery} + * computes this value as: + * + *
     
+ *     {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights}  =  {@link org.apache.lucene.search.Query#getBoost() q.getBoost()}² · Σ (t in q) [ idf(t) · t.getBoost() ]²
     
    + * + *
  8. + * + *
  9. + * + * t.getBoost() + * is a search time boost of term t in the query q as + * specified in the query text + * (see query syntax), + * or as set by application calls to + * {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}. + * Notice that there is really no direct API for accessing a boost of one term in a multi term query, + * but rather multi terms are represented in a query as multi + * {@link org.apache.lucene.search.TermQuery TermQuery} objects, + * and so the boost of a term in the query is accessible by calling the sub-query + * {@link org.apache.lucene.search.Query#getBoost() getBoost()}. + *
     
    + *
  10. + * + *
  11. + * + * norm(t,d) encapsulates a few (indexing time) boost and length factors: + * + *
      + *
    • Document boost - set by calling + * {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()} + * before adding the document to the index. + *
    • + *
    • Field boost - set by calling + * {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()} + * before adding the field to a document. + *
    • + *
    • lengthNorm - computed + * when the document is added to the index in accordance with the number of tokens + * of this field in the document, so that shorter fields contribute more to the score. + * LengthNorm is computed by the Similarity class in effect at indexing. + *
    • + *
    + * The {@link #computeNorm} method is responsible for + * combining all of these factors into a single float. + * + *

    + * When a document is added to the index, all the above factors are multiplied. + * If the document has multiple fields with the same name, all their boosts are multiplied together: + * + *
     
+ *     norm(t,d)  =  {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()} · lengthNorm · Π (field f in d named as t) {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
     
+ * However the resulting norm value is {@link #encodeNormValue(float) encoded} as a single byte + * before being stored. + * At search time, the norm byte value is read from the index + * {@link org.apache.lucene.store.Directory directory} and + * {@link #decodeNormValue(byte) decoded} back to a float norm value. + * This encoding/decoding, while reducing index size, comes with the price of + * precision loss - it is not guaranteed that decode(encode(x)) = x. + * For instance, decode(encode(0.89)) = 0.75. + *
     
    + * Compression of norm values to a single byte saves memory at search time, + * because once a field is referenced at search time, its norms - for + * all documents - are maintained in memory. + *
     
    + * The rationale supporting such lossy compression of norm values is that + * given the difficulty (and inaccuracy) of users to express their true information + * need by a query, only big differences matter. + *
     
    + * Last, note that search time is too late to modify this norm part of scoring, e.g. by + * using a different {@link Similarity} for search. + *
     
    + *

  12. + *
+ * + * @see org.apache.lucene.index.IndexWriterConfig#setSimilarity(Similarity) + * @see IndexSearcher#setSimilarity(Similarity) + */ +public abstract class FieldSimilarity implements Serializable { + + public static final int NO_DOC_ID_PROVIDED = -1; + + /** Cache of decoded bytes. */ + private static final float[] NORM_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) + NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); + } + + /** Decodes a normalization factor stored in an index. + * @see #encodeNormValue(float) + */ + public float decodeNormValue(byte b) { + return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127 + } + + /** + * Computes the normalization value for a field, given the accumulated + * state of term processing for this field (see {@link FieldInvertState}). + * + *

Implementations should calculate a float value based on the field + * state and then return that value. + * + *

Matches in longer fields are less precise, so implementations of this + * method usually return smaller values when state.getLength() is large, + * and larger values when state.getLength() is small. + * + *

Note that the return values are computed under + * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)} + * and then stored using + * {@link #encodeNormValue(float)}. + * Thus they have limited precision, and documents + * must be re-indexed if this method is altered. + * + *

For backward compatibility this method by default calls + * {@link #lengthNorm(String, int)} passing + * {@link FieldInvertState#getLength()} as the second argument, and + * then multiplies this value by {@link FieldInvertState#getBoost()}.

+ * + * @lucene.experimental + * + * @param field field name + * @param state current processing state for this field + * @return the calculated float norm + */ + public abstract float computeNorm(String field, FieldInvertState state); + + /** Computes the normalization value for a field given the total number of + * terms contained in a field. These values, together with field boosts, are + * stored in an index and multiplied into scores for hits on each field by the + * search code. + * + *

Matches in longer fields are less precise, so implementations of this + * method usually return smaller values when numTokens is large, + * and larger values when numTokens is small. + * + *

Note that the return values are computed under + * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)} + * and then stored using + * {@link #encodeNormValue(float)}. + * Thus they have limited precision, and documents + * must be re-indexed if this method is altered. + * + * @param fieldName the name of the field + * @param numTokens the total number of tokens contained in fields named + * fieldName of doc. + * @return a normalization factor for hits on this field of this document + * + * @see org.apache.lucene.document.Field#setBoost(float) + * + * @deprecated Please override computeNorm instead + */ + @Deprecated + public final float lengthNorm(String fieldName, int numTokens) { + throw new UnsupportedOperationException("please use computeNorm instead"); + } + + /** Encodes a normalization factor for storage in an index. + * + *

The encoding uses a three-bit mantissa, a five-bit exponent, and + * the zero-exponent point at 15, thus + * representing values from around 7x10^9 to 2x10^-9 with about one + * significant decimal digit of accuracy. Zero is also represented. + * Negative numbers are rounded up to zero. Values too large to represent + * are rounded down to the largest representable value. Positive values too + * small to represent are rounded up to the smallest positive representable + * value. + * @see org.apache.lucene.document.Field#setBoost(float) + * @see org.apache.lucene.util.SmallFloat + */ + public byte encodeNormValue(float f) { + return SmallFloat.floatToByte315(f); + } + + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(int, int)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + *
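An aside on the encodeNormValue/decodeNormValue pair above: the single-byte encoding is lossy, exactly as the decode(encode(0.89)) = 0.75 example later in this javadoc states. The round trip can be checked directly against SmallFloat, which both methods are built on:

    import org.apache.lucene.util.SmallFloat;

    byte b = SmallFloat.floatToByte315(0.89f);  // what encodeNormValue(0.89f) stores
    float back = SmallFloat.byte315ToFloat(b);  // what decodeNormValue(b) yields: 0.75f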

Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when freq is large, and smaller values when freq + * is small. + * + *

The default implementation calls {@link #tf(float)}. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + public float tf(int freq) { + return tf((float)freq); + } + + /** Computes the amount of a sloppy phrase match, based on an edit distance. + * This value is summed for each sloppy phrase match in a document to form + * the frequency that is passed to {@link #tf(float)}. + * + *

A phrase match with a small edit distance to a document passage more + * closely matches the document, so implementations of this method usually + * return larger values when the edit distance is small and smaller values + * when it is large. + * + * @see PhraseQuery#setSlop(int) + * @param distance the edit distance of this sloppy phrase match + * @return the frequency increment for this match + */ + public abstract float sloppyFreq(int distance); + + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(int, int)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + *

Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when freq is large, and smaller values when freq + * is small. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + public abstract float tf(float freq); + + /** + * Computes a score factor for a simple term and returns an explanation + * for that score factor. + * + *

+ * The default implementation uses: + * + *

+   * idf(docFreq, searcher.maxDoc());
+   * 
+ * + * Note that {@link IndexSearcher#maxDoc()} is used instead of + * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because + * {@link IndexSearcher#docFreq(Term)} is also used, and when the latter + * is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction. + * In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute. + * + * @param term the term in question + * @param searcher the document collection being searched + * @param docFreq externally computed docFreq for this term + * @return an IDFExplain object that includes both an idf score factor + and an explanation for the term. + * @throws IOException + */ + public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher, int docFreq) throws IOException { + final int df = docFreq; + final int max = searcher.maxDoc(); + final float idf = idf(df, max); + return new IDFExplanation() { + @Override + public String explain() { + return "idf(docFreq=" + df + + ", maxDocs=" + max + ")"; + } + @Override + public float getIdf() { + return idf; + }}; + } + + /** + * This method forwards to {@link + * #idfExplain(Term,IndexSearcher,int)} by passing + * searcher.docFreq(term) as the docFreq. + */ + public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher) throws IOException { + return idfExplain(term, searcher, searcher.docFreq(term)); + } + + /** + * Computes a score factor for a phrase. + * + *

+ * The default implementation sums the idf factor for + * each term in the phrase. + * + * @param terms the terms in the phrase + * @param searcher the document collection being searched + * @return an IDFExplain object that includes both an idf + * score factor for the phrase and an explanation + * for each term. + * @throws IOException + */ + public IDFExplanation idfExplain(Collection terms, IndexSearcher searcher) throws IOException { + final int max = searcher.maxDoc(); + float idf = 0.0f; + final StringBuilder exp = new StringBuilder(); + for (final Term term : terms ) { + final int df = searcher.docFreq(term); + idf += idf(df, max); + exp.append(" "); + exp.append(term.text()); + exp.append("="); + exp.append(df); + } + final float fIdf = idf; + return new IDFExplanation() { + @Override + public float getIdf() { + return fIdf; + } + @Override + public String explain() { + return exp.toString(); + } + }; + } + + /** Computes a score factor based on a term's document frequency (the number + * of documents which contain the term). This value is multiplied by the + * {@link #tf(int)} factor for each term in the query and these products are + * then summed to form the initial score for a document. + * + *
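For concreteness, the DefaultSimilarity-style idf formula shown earlier, 1 + log( numDocs / (docFreq+1) ) with a natural log, worked numerically:

    double idf = 1 + Math.log(1000.0 / (9 + 1)); // numDocs=1000, docFreq=9: 1 + ln(100) ≈ 5.61
    double squared = idf * idf;                  // ≈ 31.4, the idf(t)² term in the practical scoring function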

Terms that occur in fewer documents are better indicators of topic, so + * implementations of this method usually return larger values for rare terms, + * and smaller values for common terms. + * + * @param docFreq the number of documents which contain the term + * @param numDocs the total number of documents in the collection + * @return a score factor based on the term's document frequency + */ + public abstract float idf(int docFreq, int numDocs); + + /** + * Calculate a scoring factor based on the data in the payload. Overriding implementations + * are responsible for interpreting what is in the payload. Lucene makes no assumptions about + * what is in the byte array. + *

+ * The default implementation returns 1. + * + * @param docId The docId currently being scored. If this value is {@link #NO_DOC_ID_PROVIDED}, then it should be assumed that the PayloadQuery implementation does not provide document information + * @param fieldName The fieldName of the term this payload belongs to + * @param start The start position of the payload + * @param end The end position of the payload + * @param payload The payload byte array to be scored + * @param offset The offset into the payload array + * @param length The length in the array + * @return An implementation dependent float to be used as a scoring factor + * + */ + // TODO: maybe switch this API to BytesRef? + public float scorePayload(int docId, String fieldName, int start, int end, byte [] payload, int offset, int length) + { + return 1; + } + +} Property changes on: lucene\src\java\org\apache\lucene\search\FieldSimilarity.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java (working copy) @@ -19,9 +19,9 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Weight; import org.apache.lucene.search.spans.NearSpansOrdered; import org.apache.lucene.search.spans.NearSpansUnordered; @@ -153,10 +153,9 @@ Spans spans; protected float payloadScore; private int payloadsSeen; - Similarity similarity = getSimilarity(); protected PayloadNearSpanScorer(Spans spans, Weight weight, - Similarity similarity, byte[] norms) throws IOException { + FieldSimilarity similarity, byte[] norms) throws IOException { super(spans, weight, similarity, norms); this.spans = spans; } @@ -211,7 +210,7 @@ payloadsSeen = 0; do { int matchLength = spans.end() - spans.start(); - freq += getSimilarity().sloppyFreq(matchLength); + freq += similarity.sloppyFreq(matchLength); Spans[] spansArr = new Spans[1]; spansArr[0] = spans; getPayloads(spansArr); Index: lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Weight; @@ -86,7 +87,7 @@ private final TermSpans termSpans; public PayloadTermSpanScorer(TermSpans spans, Weight weight, - Similarity similarity, byte[] norms) throws IOException { + FieldSimilarity similarity, byte[] norms) throws IOException { super(spans, weight, similarity, norms); termSpans = spans; } @@ -100,12 +101,11 @@ freq = 0.0f; payloadScore = 0; payloadsSeen = 0; - Similarity similarity1 = 
getSimilarity(); while (more && doc == spans.doc()) { int matchLength = spans.end() - spans.start(); - freq += similarity1.sloppyFreq(matchLength); - processPayload(similarity1); + freq += similarity.sloppyFreq(matchLength); + processPayload(similarity); more = spans.next();// this moves positions to the next match in this // document @@ -113,7 +113,7 @@ return more || (freq != 0); } - protected void processPayload(Similarity similarity) throws IOException { + protected void processPayload(FieldSimilarity similarity) throws IOException { final DocsAndPositionsEnum postings = termSpans.getPostings(); if (postings.hasPayload()) { payload = postings.getPayload(); Index: lucene/src/java/org/apache/lucene/search/SimilarityProvider.java =================================================================== --- lucene/src/java/org/apache/lucene/search/SimilarityProvider.java (revision 0) +++ lucene/src/java/org/apache/lucene/search/SimilarityProvider.java (revision 0) @@ -0,0 +1,66 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Expert: Scoring API. + * + * Provides top-level scoring functions that aren't specific to a field, + * and work across multi-field queries (such as {@link BooleanQuery}). + * + * Field-specific scoring is accomplished through {@link FieldSimilarity}. + * + * @lucene.experimental + */ +public interface SimilarityProvider { + + /** Computes a score factor based on the fraction of all query terms that a + * document contains. This value is multiplied into scores. + * + *
<p>
The presence of a large portion of the query terms indicates a better + * match with the query, so implementations of this method usually return + * larger values when the ratio between these parameters is large and smaller + * values when the ratio between them is small. + * + * @param overlap the number of query terms matched in the document + * @param maxOverlap the total number of terms in the query + * @return a score factor based on term overlap with the query + */ + public abstract float coord(int overlap, int maxOverlap); + + /** Computes the normalization value for a query given the sum of the squared + * weights of each of the query terms. This value is multiplied into the + * weight of each query term. While the classic query normalization factor is + * computed as 1/sqrt(sumOfSquaredWeights), other implementations might + * completely ignore sumOfSquaredWeights (ie return 1). + * + *
<p>
This does not affect ranking, but the default implementation does make scores + * from different queries more comparable than they would be by eliminating the + * magnitude of the Query vector as a factor in the score. + * + * @param sumOfSquaredWeights the sum of the squares of query term weights + * @return a normalization factor for query weights + */ + public abstract float queryNorm(float sumOfSquaredWeights); + + /** Returns a {@link FieldSimilarity} for scoring a field + * @param field field name. + * @return a field-specific Similarity. + */ + public abstract FieldSimilarity get(String field); +} Property changes on: lucene\src\java\org\apache\lucene\search\SimilarityProvider.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/search/spans/FieldMaskingSpanQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/spans/FieldMaskingSpanQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/spans/FieldMaskingSpanQuery.java (working copy) @@ -107,11 +107,6 @@ } @Override - public Similarity getSimilarity(IndexSearcher searcher) { - return maskedQuery.getSimilarity(searcher); - } - - @Override public Query rewrite(IndexReader reader) throws IOException { FieldMaskingSpanQuery clone = null; Index: lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java =================================================================== --- lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/spans/SpanWeight.java (working copy) @@ -30,7 +30,7 @@ * Expert-only. Public for use by other weight implementations */ public class SpanWeight extends Weight { - protected Similarity similarity; + protected FieldSimilarity similarity; protected float value; protected float idf; protected float queryNorm; @@ -42,7 +42,7 @@ public SpanWeight(SpanQuery query, IndexSearcher searcher) throws IOException { - this.similarity = query.getSimilarity(searcher); + this.similarity = searcher.getSimilarity().get(query.getField()); this.query = query; terms=new HashSet(); Index: lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/spans/SpanScorer.java (working copy) @@ -20,9 +20,9 @@ import java.io.IOException; import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.Weight; import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Similarity; /** * Public for extension only. 
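For illustration (not part of the patch): the SimilarityProvider interface above is the whole top-level contract — coord(), queryNorm(), and the per-field get() — so a provider that turns off coordination and query normalization fits in a few lines. The class below is hypothetical, and it assumes DefaultSimilarity remains a concrete FieldSimilarity in this patch, as its use for IndexSearcher's defaultProvider later in the diff suggests.

    import org.apache.lucene.search.DefaultSimilarity;
    import org.apache.lucene.search.FieldSimilarity;
    import org.apache.lucene.search.SimilarityProvider;

    // Hypothetical provider: no coord factor, no query normalization, and the
    // same field-level similarity for every field name.
    public class FlatSimilarityProvider implements SimilarityProvider {
      // assumption: DefaultSimilarity is (or extends) FieldSimilarity here
      private final FieldSimilarity fieldSim = new DefaultSimilarity();

      public float coord(int overlap, int maxOverlap) {
        return 1.0f; // ignore how many of the query terms matched
      }

      public float queryNorm(float sumOfSquaredWeights) {
        return 1.0f; // leave query weights unnormalized
      }

      public FieldSimilarity get(String field) {
        return fieldSim; // one instance serves every field
      }
    }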
@@ -36,13 +36,15 @@ protected int doc; protected float freq; - - protected SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms) + protected final FieldSimilarity similarity; + + protected SpanScorer(Spans spans, Weight weight, FieldSimilarity similarity, byte[] norms) throws IOException { - super(similarity, weight); + super(weight); this.spans = spans; this.norms = norms; this.value = weight.getValue(); + this.similarity = similarity; if (this.spans.next()) { doc = -1; } else { @@ -81,7 +83,7 @@ freq = 0.0f; do { int matchLength = spans.end() - spans.start(); - freq += getSimilarity().sloppyFreq(matchLength); + freq += similarity.sloppyFreq(matchLength); more = spans.next(); } while (more && (doc == spans.doc())); return true; @@ -92,8 +94,8 @@ @Override public float score() throws IOException { - float raw = getSimilarity().tf(freq) * value; // raw score - return norms == null? raw : raw * getSimilarity().decodeNormValue(norms[doc]); // normalize + float raw = similarity.tf(freq) * value; // raw score + return norms == null? raw : raw * similarity.decodeNormValue(norms[doc]); // normalize } @Override @@ -109,7 +111,7 @@ int expDoc = advance(doc); float phraseFreq = (expDoc == doc) ? freq : 0.0f; - tfExplanation.setValue(getSimilarity().tf(phraseFreq)); + tfExplanation.setValue(similarity.tf(phraseFreq)); tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); return tfExplanation; Index: lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java (working copy) @@ -26,7 +26,7 @@ private PhrasePositions tmpPos[]; // for flipping repeating pps. private boolean checkedRepeats; - SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity, + SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, FieldSimilarity similarity, int slop, byte[] norms) { super(weight, postings, similarity, norms); this.slop = slop; @@ -78,7 +78,7 @@ int matchLength = end - start; if (matchLength <= slop) - freq += getSimilarity().sloppyFreq(matchLength); // score match + freq += similarity.sloppyFreq(matchLength); // score match if (pp.position > end) end = pp.position; Index: lucene/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java (working copy) @@ -53,9 +53,7 @@ * length may be larger than the actual number of scorers. 
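On the field side, the FieldSimilarity methods documented at the top of this section — idf(), and the scorePayload() hook that defaults to 1 — are the intended override points. A sketch of a flatter idf curve, assuming DefaultSimilarity keeps the idf(int, int) signature shown above (hypothetical class, not part of the patch):

    import org.apache.lucene.search.DefaultSimilarity;

    // Sketch: rare terms still score higher, but on a flatter curve than the
    // classic 1 + log(numDocs / (docFreq + 1)).
    public class SqrtIdfSimilarity extends DefaultSimilarity {
      @Override
      public float idf(int docFreq, int numDocs) {
        return (float) Math.sqrt((double) numDocs / (docFreq + 1));
      }
    }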
*/ public DisjunctionMaxScorer(float tieBreakerMultiplier, - Similarity similarity, Scorer[] subScorers, int numScorers) throws IOException { - super(similarity); - + Scorer[] subScorers, int numScorers) throws IOException { this.tieBreakerMultiplier = tieBreakerMultiplier; // The passed subScorers array includes only scorers which have documents // (DisjunctionMaxQuery takes care of that), and their nextDoc() was already Index: lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (working copy) @@ -129,7 +129,7 @@ private class MultiPhraseWeight extends Weight { - private Similarity similarity; + private FieldSimilarity similarity; private float value; private float idf; private float queryNorm; @@ -137,7 +137,7 @@ public MultiPhraseWeight(IndexSearcher searcher) throws IOException { - this.similarity = getSimilarity(searcher); + this.similarity = searcher.getSimilarity().get(field); // compute idf final int maxDoc = searcher.maxDoc(); Index: lucene/src/java/org/apache/lucene/search/FilteredQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FilteredQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/FilteredQuery.java (working copy) @@ -62,7 +62,6 @@ @Override public Weight createWeight(final IndexSearcher searcher) throws IOException { final Weight weight = query.createWeight (searcher); - final Similarity similarity = query.getSimilarity(searcher); return new Weight() { private float value; @@ -127,7 +126,7 @@ return null; } - return new Scorer(similarity, this) { + return new Scorer(this) { private int doc = -1; Index: lucene/src/java/org/apache/lucene/search/PhraseScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/PhraseScorer.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/PhraseScorer.java (working copy) @@ -37,12 +37,13 @@ private boolean more = true; protected PhraseQueue pq; protected PhrasePositions first, last; - + protected final FieldSimilarity similarity; private float freq; //phrase frequency in current doc as computed by phraseFreq(). PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity similarity, byte[] norms) { - super(similarity, weight); + FieldSimilarity similarity, byte[] norms) { + super(weight); + this.similarity = similarity; this.norms = norms; this.value = weight.getValue(); @@ -105,8 +106,8 @@ @Override public float score() throws IOException { //System.out.println("scoring " + first.doc); - float raw = getSimilarity().tf(freq) * value; // raw score - return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[first.doc]); // normalize + float raw = similarity.tf(freq) * value; // raw score + return norms == null ? 
raw : raw * similarity.decodeNormValue(norms[first.doc]); // normalize } @Override Index: lucene/src/java/org/apache/lucene/search/function/ValueSourceQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/function/ValueSourceQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/function/ValueSourceQuery.java (working copy) @@ -64,12 +64,10 @@ } class ValueSourceWeight extends Weight { - Similarity similarity; float queryNorm; float queryWeight; public ValueSourceWeight(IndexSearcher searcher) { - this.similarity = getSimilarity(searcher); } /*(non-Javadoc) @see org.apache.lucene.search.Weight#getQuery() */ @@ -100,7 +98,7 @@ @Override public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException { - return new ValueSourceScorer(similarity, context, this); + return new ValueSourceScorer(context, this); } /*(non-Javadoc) @see org.apache.lucene.search.Weight#explain(org.apache.lucene.index.IndexReader, int) */ @@ -133,8 +131,8 @@ private int doc = -1; // constructor - private ValueSourceScorer(Similarity similarity, AtomicReaderContext context, ValueSourceWeight w) throws IOException { - super(similarity,w); + private ValueSourceScorer(AtomicReaderContext context, ValueSourceWeight w) throws IOException { + super(w); final IndexReader reader = context.reader; qWeight = w.getValue(); // this is when/where the values are first created. Index: lucene/src/java/org/apache/lucene/search/function/CustomScoreQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/function/CustomScoreQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/function/CustomScoreQuery.java (working copy) @@ -183,13 +183,11 @@ //=========================== W E I G H T ============================ private class CustomWeight extends Weight { - Similarity similarity; Weight subQueryWeight; Weight[] valSrcWeights; boolean qStrict; public CustomWeight(IndexSearcher searcher) throws IOException { - this.similarity = getSimilarity(searcher); this.subQueryWeight = subQuery.weight(searcher); this.valSrcWeights = new Weight[valSrcQueries.length]; for(int i = 0; i < valSrcQueries.length; i++) { @@ -254,7 +252,7 @@ for(int i = 0; i < valSrcScorers.length; i++) { valSrcScorers[i] = valSrcWeights[i].scorer(context, scorerContext.scoreDocsInOrder(true)); } - return new CustomScorer(similarity, context.reader, this, subQueryScorer, valSrcScorers); + return new CustomScorer(context.reader, this, subQueryScorer, valSrcScorers); } @Override @@ -303,9 +301,9 @@ private float vScores[]; // reused in score() to avoid allocating this array for each doc // constructor - private CustomScorer(Similarity similarity, IndexReader reader, CustomWeight w, + private CustomScorer(IndexReader reader, CustomWeight w, Scorer subQueryScorer, Scorer[] valSrcScorers) throws IOException { - super(similarity,w); + super(w); this.qWeight = w.getValue(); this.subQueryScorer = subQueryScorer; this.valSrcScorers = valSrcScorers; Index: lucene/src/java/org/apache/lucene/search/BooleanQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/BooleanQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/BooleanQuery.java (working copy) @@ -162,14 +162,14 @@ */ protected class BooleanWeight extends Weight { /** The Similarity implementation. 
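SpanScorer.score() and PhraseScorer.score() above now compose the score identically from the FieldSimilarity captured at construction time. Restated as a free-standing sketch, with DefaultSimilarity standing in for whatever similarity the Weight resolved:

    import org.apache.lucene.search.DefaultSimilarity;

    public class ScoreComposition {
      // freq: (sloppy) phrase frequency; weightValue: idf * boost * queryNorm
      static float score(DefaultSimilarity sim, float freq, float weightValue,
                         byte[] norms, int doc) {
        float raw = sim.tf(freq) * weightValue;       // raw score
        return norms == null
            ? raw
            : raw * sim.decodeNormValue(norms[doc]);  // field-length normalization
      }
    }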
*/ - protected Similarity similarity; + protected SimilarityProvider similarity; protected ArrayList weights; protected int maxCoord; // num optional + num required private final boolean disableCoord; public BooleanWeight(IndexSearcher searcher, boolean disableCoord) throws IOException { - this.similarity = getSimilarity(searcher); + this.similarity = searcher.getSimilarity(); this.disableCoord = disableCoord; weights = new ArrayList(clauses.size()); for (int i = 0 ; i < clauses.size(); i++) { @@ -201,6 +201,9 @@ return sum ; } + public float coord(int overlap, int maxOverlap) { + return similarity.coord(overlap, maxOverlap); + } @Override public void normalize(float norm) { @@ -273,7 +276,7 @@ sumExpl.setMatch(0 < coord ? Boolean.TRUE : Boolean.FALSE); sumExpl.setValue(sum); - final float coordFactor = disableCoord ? 1.0f : similarity.coord(coord, maxCoord); + final float coordFactor = disableCoord ? 1.0f : coord(coord, maxCoord); if (coordFactor == 1.0f) { return sumExpl; // eliminate wrapper } else { @@ -312,7 +315,7 @@ // Check if we can return a BooleanScorer if (!scorerContext.scoreDocsInOrder && scorerContext.topScorer && required.size() == 0 && prohibited.size() < 32) { - return new BooleanScorer(this, disableCoord, similarity, minNrShouldMatch, optional, prohibited, maxCoord); + return new BooleanScorer(this, disableCoord, minNrShouldMatch, optional, prohibited, maxCoord); } if (required.size() == 0 && optional.size() == 0) { @@ -326,7 +329,7 @@ } // Return a BooleanScorer2 - return new BooleanScorer2(this, disableCoord, similarity, minNrShouldMatch, required, prohibited, optional, maxCoord); + return new BooleanScorer2(this, disableCoord, minNrShouldMatch, required, prohibited, optional, maxCoord); } @Override Index: lucene/src/java/org/apache/lucene/search/Query.java =================================================================== --- lucene/src/java/org/apache/lucene/search/Query.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/Query.java (working copy) @@ -98,7 +98,7 @@ Query query = searcher.rewrite(this); Weight weight = query.createWeight(searcher); float sum = weight.sumOfSquaredWeights(); - float norm = getSimilarity(searcher).queryNorm(sum); + float norm = searcher.getSimilarity().queryNorm(sum); if (Float.isInfinite(norm) || Float.isNaN(norm)) norm = 1.0f; weight.normalize(norm); @@ -124,16 +124,7 @@ // needs to be implemented by query subclasses throw new UnsupportedOperationException(); } - - /** Expert: Returns the Similarity implementation to be used for this query. - * Subclasses may override this method to specify their own Similarity - * implementation, perhaps one that delegates through that of the Searcher. - * By default the Searcher's Similarity implementation is returned.*/ - public Similarity getSimilarity(IndexSearcher searcher) { - return searcher.getSimilarity(); - } - /** Returns a clone of this query. 
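Two things are worth calling out in the BooleanQuery and Query hunks above: coord() now reaches the scorers through BooleanWeight rather than through a Query-level Similarity, and queryNorm() is a provider-level hook with no field attached. The normalization handshake from Query.weight(), restated as a sketch (the searcher.rewrite() step is elided here):

    import java.io.IOException;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.Weight;

    public class NormHandshake {
      static Weight weigh(Query query, IndexSearcher searcher) throws IOException {
        Weight weight = query.createWeight(searcher);
        float sum = weight.sumOfSquaredWeights();
        float norm = searcher.getSimilarity().queryNorm(sum); // provider-level, no field
        if (Float.isInfinite(norm) || Float.isNaN(norm)) norm = 1.0f;
        weight.normalize(norm);
        return weight;
      }
    }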
*/ @Override public Object clone() { Index: lucene/src/java/org/apache/lucene/search/PhraseQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/PhraseQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/PhraseQuery.java (working copy) @@ -137,7 +137,7 @@ } private class PhraseWeight extends Weight { - private final Similarity similarity; + private final FieldSimilarity similarity; private float value; private float idf; private float queryNorm; @@ -146,7 +146,7 @@ public PhraseWeight(IndexSearcher searcher) throws IOException { - this.similarity = getSimilarity(searcher); + this.similarity = searcher.getSimilarity().get(field); idfExp = similarity.idfExplain(terms, searcher); idf = idfExp.getIdf(); Index: lucene/src/java/org/apache/lucene/search/BooleanScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/BooleanScorer.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/BooleanScorer.java (working copy) @@ -22,6 +22,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery.BooleanWeight; /* Description from Doug Cutting (excerpted from * LUCENE-1483): @@ -197,9 +198,9 @@ private Bucket current; private int doc = -1; - BooleanScorer(Weight weight, boolean disableCoord, Similarity similarity, int minNrShouldMatch, + BooleanScorer(BooleanWeight weight, boolean disableCoord, int minNrShouldMatch, List optionalScorers, List prohibitedScorers, int maxCoord) throws IOException { - super(null, weight); // Similarity not used + super(weight); this.minNrShouldMatch = minNrShouldMatch; if (optionalScorers != null && optionalScorers.size() > 0) { @@ -223,7 +224,7 @@ coordFactors = new float[optionalScorers.size() + 1]; for (int i = 0; i < coordFactors.length; i++) { - coordFactors[i] = disableCoord ? 1.0f : similarity.coord(i, maxCoord); + coordFactors[i] = disableCoord ? 1.0f : weight.coord(i, maxCoord); } } Index: lucene/src/java/org/apache/lucene/search/Scorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/Scorer.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/Scorer.java (working copy) @@ -40,31 +40,22 @@ * with these scores. */ public abstract class Scorer extends DocIdSetIterator { - private final Similarity similarity; protected final Weight weight; /** Constructs a Scorer. - * @param similarity The Similarity implementation used by this scorer. */ - protected Scorer(Similarity similarity) { - this(similarity, null); + protected Scorer() { + this(null); } /** * Constructs a Scorer - * @param similarity The Similarity implementation used by this scorer. * @param weight The scorers Weight */ - protected Scorer(Similarity similarity, Weight weight) { - this.similarity = similarity; + protected Scorer(Weight weight) { this.weight = weight; } - /** Returns the Similarity implementation used by this scorer. */ - public Similarity getSimilarity() { - return this.similarity; - } - /** Scores and collects all matching documents. * @param collector The collector to which all matching documents are passed. 
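Under the new Scorer constructors above, a custom scorer threads through only its Weight. A degenerate constant-score sketch (hypothetical class, assuming DocIdSetIterator's contract is unchanged):

    import java.io.IOException;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.search.Scorer;
    import org.apache.lucene.search.Weight;

    // Hypothetical: every matching document gets the same score.
    public class ConstantScorer extends Scorer {
      private final DocIdSetIterator it;
      private final float constant;

      public ConstantScorer(Weight weight, DocIdSetIterator it, float constant) {
        super(weight); // no Similarity parameter any more
        this.it = it;
        this.constant = constant;
      }

      @Override public float score() { return constant; }
      @Override public int docID() { return it.docID(); }
      @Override public int nextDoc() throws IOException { return it.nextDoc(); }
      @Override public int advance(int target) throws IOException { return it.advance(target); }
    }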
*/ Index: lucene/src/java/org/apache/lucene/search/TermQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/TermQuery.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/TermQuery.java (working copy) @@ -42,7 +42,7 @@ private transient PerReaderTermState perReaderTermState; private class TermWeight extends Weight { - private final Similarity similarity; + private final FieldSimilarity similarity; private float value; private final float idf; private float queryNorm; @@ -54,7 +54,7 @@ throws IOException { assert termStates != null : "PerReaderTermState must not be null"; this.termStates = termStates; - this.similarity = getSimilarity(searcher); + this.similarity = searcher.getSimilarity().get(term.field()); if (docFreq != -1) { idfExp = similarity.idfExplain(term, searcher, docFreq); } else { Index: lucene/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java (working copy) @@ -38,7 +38,6 @@ /** Creates a new instance by wrapping the given scorer. */ public ScoreCachingWrappingScorer(Scorer scorer) { - super(scorer.getSimilarity()); this.scorer = scorer; } @@ -46,11 +45,6 @@ public boolean score(Collector collector, int max, int firstDocID) throws IOException { return scorer.score(collector, max, firstDocID); } - - @Override - public Similarity getSimilarity() { - return scorer.getSimilarity(); - } @Override public float score() throws IOException { Index: lucene/src/java/org/apache/lucene/search/IndexSearcher.java =================================================================== --- lucene/src/java/org/apache/lucene/search/IndexSearcher.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/IndexSearcher.java (working copy) @@ -70,8 +70,20 @@ private final ExecutorService executor; protected final IndexSearcher[] subSearchers; + // the default SimilarityProvider + private static final SimilarityProvider defaultProvider = new DefaultSimilarity(); + + /** + * Expert: returns a default SimilarityProvider instance. + * In general, this should not be used. + * @lucene.internal + */ + public static SimilarityProvider getDefaultProvider() { + return defaultProvider; + } + /** The Similarity implementation used by this searcher. */ - private Similarity similarity = Similarity.getDefault(); + private SimilarityProvider similarity = defaultProvider; /** Creates a searcher searching the index in the named * directory, with readOnly=true @@ -250,13 +262,12 @@ /** Expert: Set the Similarity implementation used by this Searcher. 
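The resolution step that TermWeight above (and every other rewritten Weight) performs once at construction time, pulled out as a sketch:

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.FieldSimilarity;
    import org.apache.lucene.search.IndexSearcher;

    public class PerFieldLookup {
      static FieldSimilarity forTerm(IndexSearcher searcher, Term term) {
        // the provider is searcher-wide; the similarity is keyed by field name
        return searcher.getSimilarity().get(term.field());
      }
    }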
* - * @see Similarity#setDefault(Similarity) */ - public void setSimilarity(Similarity similarity) { + public void setSimilarity(SimilarityProvider similarity) { this.similarity = similarity; } - public Similarity getSimilarity() { + public SimilarityProvider getSimilarity() { return similarity; } Index: lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java (working copy) @@ -60,9 +60,12 @@ private int docID = -1; private int freq; + private final FieldSimilarity similarity; + ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, - Similarity similarity, byte[] norms) throws IOException { - super(similarity, weight); + FieldSimilarity similarity, byte[] norms) throws IOException { + super(weight); + this.similarity = similarity; this.norms = norms; this.value = weight.getValue(); @@ -87,7 +90,7 @@ } for (int i = 0; i < SCORE_CACHE_SIZE; i++) { - scoreCache[i] = getSimilarity().tf((float) i) * value; + scoreCache[i] = similarity.tf((float) i) * value; } } @@ -207,9 +210,9 @@ if (freq < SCORE_CACHE_SIZE) { raw = scoreCache[freq]; } else { - raw = getSimilarity().tf((float) freq) * value; + raw = similarity.tf((float) freq) * value; } - return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize + return norms == null ? raw : raw * similarity.decodeNormValue(norms[docID]); // normalize } private int phraseFreq() throws IOException { Index: lucene/src/java/org/apache/lucene/search/TermScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/TermScorer.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/search/TermScorer.java (working copy) @@ -38,7 +38,8 @@ private int[] docs; private int[] freqs; private final DocsEnum.BulkReadResult bulkResult; - + private final FieldSimilarity similarity; + /** * Construct a TermScorer. * @@ -52,16 +53,16 @@ * @param norms * The field norms of the document fields for the Term. */ - TermScorer(Weight weight, DocsEnum td, Similarity similarity, byte[] norms) { - super(similarity, weight); - + TermScorer(Weight weight, DocsEnum td, FieldSimilarity similarity, byte[] norms) { + super(weight); + this.similarity = similarity; this.docsEnum = td; this.norms = norms; this.weightValue = weight.getValue(); bulkResult = td.getBulkResult(); for (int i = 0; i < SCORE_CACHE_SIZE; i++) - scoreCache[i] = getSimilarity().tf(i) * weightValue; + scoreCache[i] = similarity.tf(i) * weightValue; } @Override @@ -136,9 +137,9 @@ float raw = // compute tf(f)*weight freq < SCORE_CACHE_SIZE // check cache ? scoreCache[freq] // cache hit - : getSimilarity().tf(freq)*weightValue; // cache miss + : similarity.tf(freq)*weightValue; // cache miss - return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[doc]); // normalize for field + return norms == null ? raw : raw * similarity.decodeNormValue(norms[doc]); // normalize for field } /** Index: lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java =================================================================== --- lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java (revision 1059431) +++ lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java (working copy) @@ -17,6 +17,7 @@ * limitations under the License. 
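Application-side wiring after the IndexSearcher change above: similarity is configured per searcher instead of through the removed global Similarity.setDefault(). A sketch, reusing the hypothetical FlatSimilarityProvider from earlier:

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.FieldSimilarity;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.SimilarityProvider;

    public class SearcherWiring {
      static IndexSearcher open(IndexReader reader) {
        IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setSimilarity(new FlatSimilarityProvider()); // replaces Similarity.setDefault(...)
        SimilarityProvider active = searcher.getSimilarity();
        FieldSimilarity bodySim = active.get("body");         // per-field view
        return searcher;
      }
    }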
*/ +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.util.ArrayUtil; /** Taps into DocInverter, as an InvertedDocEndConsumer, @@ -29,7 +30,8 @@ final NormsWriterPerThread perThread; final FieldInfo fieldInfo; final DocumentsWriter.DocState docState; - + final FieldSimilarity similarity; + // Holds all docID/norm pairs we've seen int[] docIDs = new int[1]; byte[] norms = new byte[1]; @@ -49,6 +51,7 @@ this.fieldInfo = fieldInfo; docState = perThread.docState; fieldState = docInverterPerField.fieldState; + similarity = docState.similarity.get(fieldInfo.name); } @Override @@ -71,8 +74,8 @@ assert norms.length == upto; norms = ArrayUtil.grow(norms, 1+upto); } - final float norm = docState.similarity.computeNorm(fieldInfo.name, fieldState); - norms[upto] = docState.similarity.encodeNormValue(norm); + final float norm = similarity.computeNorm(fieldInfo.name, fieldState); + norms[upto] = similarity.encodeNormValue(norm); docIDs[upto] = docState.docID; upto++; } Index: lucene/src/java/org/apache/lucene/index/DocumentsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (working copy) @@ -31,6 +31,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.search.Query; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMFile; @@ -128,7 +129,7 @@ PrintStream infoStream; int maxFieldLength = IndexWriterConfig.UNLIMITED_FIELD_LENGTH; - Similarity similarity; + SimilarityProvider similarity; // max # simultaneous threads; if there are more than // this, they wait for others to finish first @@ -142,7 +143,7 @@ Analyzer analyzer; int maxFieldLength; PrintStream infoStream; - Similarity similarity; + SimilarityProvider similarity; int docID; Document doc; String maxTermPrefix; @@ -365,7 +366,7 @@ } } - synchronized void setSimilarity(Similarity similarity) { + synchronized void setSimilarity(SimilarityProvider similarity) { this.similarity = similarity; for(int i=0;i<threadStates.length;i++) { threadStates[i].docState.similarity = similarity; } } Index: lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java (revision 1059498) +++ lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java (working copy) /** * Expert: set the {@link SimilarityProvider} implementation used by this IndexWriter. * <p> * NOTE: the similarity cannot be null. If null is passed, - * the similarity will be set to the default. - * - * @see Similarity#setDefault(Similarity) + * the similarity will be set to the default implementation (unspecified). */ - public IndexWriterConfig setSimilarity(Similarity similarity) { - this.similarity = similarity == null ? Similarity.getDefault() : similarity; + public IndexWriterConfig setSimilarity(SimilarityProvider similarity) { + this.similarity = similarity == null ? IndexSearcher.getDefaultProvider() : similarity; return this; } /** - * Expert: returns the {@link Similarity} implementation used by this - * IndexWriter. This defaults to the current value of - * {@link Similarity#getDefault()}. + * Expert: returns the {@link SimilarityProvider} implementation used by this + * IndexWriter.
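The indexing side mirrors the search side: NormsWriterPerField above resolves the field's similarity once and uses it for both computing and encoding the norm. As a sketch, assuming computeNorm(String, FieldInvertState) keeps the signature used in this patch:

    import org.apache.lucene.index.FieldInvertState;
    import org.apache.lucene.search.FieldSimilarity;
    import org.apache.lucene.search.SimilarityProvider;

    public class NormSketch {
      static byte norm(SimilarityProvider provider, String field, FieldInvertState state) {
        FieldSimilarity sim = provider.get(field);   // resolved once per field
        float norm = sim.computeNorm(field, state);  // index-time length/boost factor
        return sim.encodeNormValue(norm);            // packed into the norms array
      }
    }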
*/ - public Similarity getSimilarity() { + public SimilarityProvider getSimilarity() { return similarity; } Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java =================================================================== --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (revision 1059498) +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (working copy) @@ -42,7 +42,9 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermVectorOffsetInfo; +import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CollectionUtil; @@ -67,7 +69,7 @@ private final InstantiatedIndex index; private final Analyzer analyzer; - private Similarity similarity = Similarity.getDefault(); // how to normalize; + private SimilarityProvider similarity = IndexSearcher.getDefaultProvider(); // how to normalize; private transient Set fieldNameBuffer; /** @@ -236,11 +238,12 @@ termsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size(); if (eFieldTermDocInfoFactoriesByTermText.getKey().indexed && !eFieldTermDocInfoFactoriesByTermText.getKey().omitNorms) { + final String fieldName = eFieldTermDocInfoFactoriesByTermText.getKey().fieldName; final FieldInvertState invertState = new FieldInvertState(); invertState.setBoost(eFieldTermDocInfoFactoriesByTermText.getKey().boost * document.getDocument().getBoost()); invertState.setLength(eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength); - final float norm = similarity.computeNorm(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName, invertState); - normsByFieldNameAndDocumentNumber.get(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName)[document.getDocumentNumber()] = similarity.encodeNormValue(norm); + final float norm = similarity.get(fieldName).computeNorm(fieldName, invertState); + normsByFieldNameAndDocumentNumber.get(fieldName)[document.getDocumentNumber()] = similarity.get(fieldName).encodeNormValue(norm); } else { System.currentTimeMillis(); } @@ -659,11 +662,11 @@ addDocument(doc, analyzer); } - public Similarity getSimilarity() { + public SimilarityProvider getSimilarity() { return similarity; } - public void setSimilarity(Similarity similarity) { + public void setSimilarity(SimilarityProvider similarity) { this.similarity = similarity; } Index: lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java =================================================================== --- lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 1059498) +++ lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (working copy) @@ -53,10 +53,12 @@ import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader.ReaderContext; import org.apache.lucene.search.Collector; +import org.apache.lucene.search.FieldSimilarity; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.store.RAMDirectory; // for javadocs import 
org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -1169,9 +1171,9 @@ }; } - private Similarity getSimilarity() { + private SimilarityProvider getSimilarity() { if (searcher != null) return searcher.getSimilarity(); - return Similarity.getDefault(); + return IndexSearcher.getDefaultProvider(); } private void setSearcher(IndexSearcher searcher) { @@ -1181,20 +1183,21 @@ /** performance hack: cache norms to avoid repeated expensive calculations */ private byte[] cachedNorms; private String cachedFieldName; - private Similarity cachedSimilarity; + private SimilarityProvider cachedSimilarity; @Override public byte[] norms(String fieldName) { byte[] norms = cachedNorms; - Similarity sim = getSimilarity(); + SimilarityProvider sim = getSimilarity(); if (fieldName != cachedFieldName || sim != cachedSimilarity) { // not cached? Info info = getInfo(fieldName); + FieldSimilarity fieldSim = sim.get(fieldName); int numTokens = info != null ? info.numTokens : 0; int numOverlapTokens = info != null ? info.numOverlapTokens : 0; float boost = info != null ? info.getBoost() : 1.0f; FieldInvertState invertState = new FieldInvertState(0, numTokens, numOverlapTokens, 0, boost); - float n = sim.computeNorm(fieldName, invertState); - byte norm = sim.encodeNormValue(n); + float n = fieldSim.computeNorm(fieldName, invertState); + byte norm = fieldSim.encodeNormValue(n); norms = new byte[] {norm}; // cache it for future reuse Index: lucene/contrib/queries/src/java/org/apache/lucene/search/BoostingQuery.java =================================================================== --- lucene/contrib/queries/src/java/org/apache/lucene/search/BoostingQuery.java (revision 1059498) +++ lucene/contrib/queries/src/java/org/apache/lucene/search/BoostingQuery.java (working copy) @@ -21,10 +21,9 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.Query; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Similarity; + /** * The BoostingQuery class can be used to effectively demote results that match a given query. * Unlike the "NOT" clause, this still selects documents that contain undesirable terms, @@ -56,10 +55,9 @@ @Override public Query rewrite(IndexReader reader) throws IOException { BooleanQuery result = new BooleanQuery() { - @Override - public Similarity getSimilarity(IndexSearcher searcher) { - return new DefaultSimilarity() { + public Weight createWeight(IndexSearcher searcher) throws IOException { + return new BooleanWeight(searcher, false) { @Override public float coord(int overlap, int max) {
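The BoostingQuery hunk is cut off above. The pattern it is visibly switching to — overriding BooleanWeight.coord() from createWeight() instead of supplying a custom Similarity through the removed Query.getSimilarity() — looks roughly like the sketch below; the coord() body here is illustrative, not the patch's literal text:

    import java.io.IOException;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Weight;

    public class CoordOverride {
      static BooleanQuery noCoordQuery() {
        return new BooleanQuery() {
          @Override
          public Weight createWeight(IndexSearcher searcher) throws IOException {
            return new BooleanWeight(searcher, false) {
              @Override
              public float coord(int overlap, int max) {
                return 1.0f; // illustrative body; the truncated hunk's real body differs
              }
            };
          }
        };
      }
    }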