Index: lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java (revision 1496446) +++ lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java (working copy) @@ -19,10 +19,40 @@ import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.SmallFloat; -/** Expert: Default scoring implementation. */ +/** + * Expert: Default scoring implementation which {@link #encodeNormValue(float) + * encodes} norm values as a single byte before being stored. At search time, + * the norm byte value is read from the index + * {@link org.apache.lucene.store.Directory directory} and + * {@link #decodeNormValue(long) decoded} back to a float norm value. + * This encoding/decoding, while reducing index size, comes with the price of + * precision loss - it is not guaranteed that decode(encode(x)) = x. For + * instance, decode(encode(0.89)) = 0.75. + *
+ * Compression of norm values to a single byte saves memory at search time,
+ * because once a field is referenced at search time, its norms - for all
+ * documents - are maintained in memory.
+ * <p>
+ * The rationale supporting such lossy compression of norm values is that, since
+ * users can express their true information need only imprecisely in a query,
+ * only big differences matter.
+ * <p>
+ * Last, note that search time is too late to modify this norm part of
+ * scoring, e.g. by using a different {@link Similarity} for search.
+ */
public class DefaultSimilarity extends TFIDFSimilarity {
+ /** Cache of decoded bytes: entry {@code b & 0xFF} holds {@code SmallFloat.byte315ToFloat(b)} for every byte {@code b}. */
+ private static final float[] NORM_TABLE = new float[256];
+
+ static {
+ for (int i = 0; i < 256; i++) { // one entry per possible byte value, filled once when the class is initialized
+ NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); // the cast maps 128..255 onto the negative bytes -128..-1
+ }
+ }
+
/** Sole constructor: parameter-free */
public DefaultSimilarity() {}
@@ -38,6 +68,30 @@
return (float)(1.0 / Math.sqrt(sumOfSquaredWeights));
}
+ /**
+ * Encodes a normalization factor for storage in an index.
+ * <p>
+ * The encoding uses a three-bit mantissa, a five-bit exponent, and the
+ * zero-exponent point at 15, thus representing values from around 7x10^9 to
+ * 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
+ * represented. Negative numbers are rounded up to zero. Values too large to
+ * represent are rounded down to the largest representable value. Positive
+ * values too small to represent are rounded up to the smallest positive
+ * representable value.
+ *
+ * @see org.apache.lucene.document.Field#setBoost(float)
+ * @see org.apache.lucene.util.SmallFloat
+ */
+ @Override
+ public long encodeNormValue(float f) {
+ return SmallFloat.floatToByte315(f); // the byte result is widened (sign-extended) to long on return
+ }
+
+ @Override
+ public float decodeNormValue(long norm) { // looks up the float for a stored norm byte in the precomputed NORM_TABLE
+ return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF keeps only the low byte, so negative bytes map to indexes above 127
+ }
+
/** Implemented as
* state.getBoost()*lengthNorm(numTerms), where
* numTerms is {@link FieldInvertState#getLength()} if {@link
Index: lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (revision 1496446)
+++ lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (working copy)
@@ -28,7 +28,6 @@
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.SmallFloat;
/**
@@ -496,27 +495,8 @@
*
The encoding uses a three-bit mantissa, a five-bit exponent, and - * the zero-exponent point at 15, thus - * representing values from around 7x10^9 to 2x10^-9 with about one - * significant decimal digit of accuracy. Zero is also represented. - * Negative numbers are rounded up to zero. Values too large to represent - * are rounded down to the largest representable value. Positive values too - * small to represent are rounded up to the smallest positive representable - * value. - * @see org.apache.lucene.document.Field#setBoost(float) - * @see org.apache.lucene.util.SmallFloat - */ - public byte encodeNormValue(float f) { - return SmallFloat.floatToByte315(f); - } + /** Encodes a normalization factor for storage in an index. */ + public abstract long encodeNormValue(float f); /** Computes the amount of a sloppy phrase match, based on an edit distance. * This value is summed for each sloppy phrase match in a document to form @@ -756,7 +713,7 @@ public float score(int doc, float freq) { final float raw = tf(freq) * weightValue; // compute tf(f)*weight - return norms == null ? raw : raw * decodeNormValue((byte)norms.get(doc)); // normalize for field + return norms == null ? raw : raw * decodeNormValue(norms.get(doc)); // normalize for field } @Override @@ -843,8 +800,7 @@ fieldExpl.addDetail(stats.idf); Explanation fieldNormExpl = new Explanation(); - float fieldNorm = - norms!=null ? decodeNormValue((byte) norms.get(doc)) : 1.0f; + float fieldNorm = norms != null ? 
decodeNormValue(norms.get(doc)) : 1.0f; fieldNormExpl.setValue(fieldNorm); fieldNormExpl.setDescription("fieldNorm(doc="+doc+")"); fieldExpl.addDetail(fieldNormExpl); Index: lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java (revision 1496446) +++ lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java (working copy) @@ -100,13 +100,13 @@ class TestSimilarity extends DefaultSimilarity { @Override - public byte encodeNormValue(float f) { - return (byte) f; + public long encodeNormValue(float f) { + return (long) f; } @Override - public float decodeNormValue(byte b) { - return (float) b; + public float decodeNormValue(long norm) { + return norm; } @Override Index: lucene/core/src/test/org/apache/lucene/index/TestNorms.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestNorms.java (revision 1496446) +++ lucene/core/src/test/org/apache/lucene/index/TestNorms.java (working copy) @@ -47,13 +47,13 @@ class CustomNormEncodingSimilarity extends DefaultSimilarity { @Override - public byte encodeNormValue(float f) { - return (byte) f; + public long encodeNormValue(float f) { + return (long) f; } @Override - public float decodeNormValue(byte b) { - return (float) b; + public float decodeNormValue(long norm) { + return norm; } @Override Index: lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java (revision 1496446) +++ lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java (working copy) @@ -18,26 +18,35 @@ */ import java.io.IOException; -import java.util.concurrent.ExecutionException; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.LuceneTestCase; import 
org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; -import org.apache.lucene.search.*; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.CollectionStatistics; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; public class TestOmitTf extends LuceneTestCase { public static class SimpleSimilarity extends TFIDFSimilarity { + @Override public float decodeNormValue(long norm) { return norm; } + @Override public long encodeNormValue(float f) { return (long) f; } @Override public float queryNorm(float sumOfSquaredWeights) { return 1.0f; } @Override Index: lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java =================================================================== --- lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java (revision 1496446) +++ lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java (working copy) @@ -108,6 +108,16 @@ private class Sim1 extends TFIDFSimilarity { @Override + public long encodeNormValue(float f) { + return (long) f; + } + + @Override + public float decodeNormValue(long norm) { + return norm; + } + + @Override public float 
coord(int overlap, int maxOverlap) { return 1f; } @@ -146,6 +156,16 @@ private class Sim2 extends TFIDFSimilarity { @Override + public long encodeNormValue(float f) { + return (long) f; + } + + @Override + public float decodeNormValue(long norm) { + return norm; + } + + @Override public float coord(int overlap, int maxOverlap) { return 1f; } Index: lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java =================================================================== --- lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java (revision 1496446) +++ lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java (working copy) @@ -29,7 +29,7 @@ import java.util.Map; /** - * Function that returns {@link TFIDFSimilarity#decodeNormValue(byte)} + * Function that returns {@link TFIDFSimilarity#decodeNormValue(long)} * for every document. *
* Note that the configured Similarity for the field must be Index: solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java =================================================================== --- solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java (revision 1496446) +++ solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java (working copy) @@ -357,9 +357,7 @@ */ private static byte expectedNorm(final DefaultSimilarity sim, final int length, final float boost) { - - return sim.encodeNormValue(boost / ((float) Math.sqrt(length))); - + return (byte) sim.encodeNormValue(boost / ((float) Math.sqrt(length))); }