Index: lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java (revision 1496446) +++ lucene/core/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java (working copy) @@ -19,10 +19,40 @@ import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.SmallFloat; -/** Expert: Default scoring implementation. */ +/** + * Expert: Default scoring implementation which {@link #encodeNormValue(float) + * encodes} norm values as a single byte before being stored. At search time, + * the norm byte value is read from the index + * {@link org.apache.lucene.store.Directory directory} and + * {@link #decodeNormValue(long) decoded} back to a float norm value. + * This encoding/decoding, while reducing index size, comes with the price of + * precision loss - it is not guaranteed that decode(encode(x)) = x. For + * instance, decode(encode(0.89)) = 0.75. + *
<p>
+ * Compression of norm values to a single byte saves memory at search time, + * because once a field is referenced at search time, its norms - for all + * documents - are maintained in memory. + *
<p>
+ * The rationale supporting such lossy compression of norm values is that, given + * how difficult (and imprecise) it is for users to express their true information + * need in a query, only big differences matter.
+ *  
+ * Last, note that search time is too late to modify this norm part of + * scoring, e.g. by using a different {@link Similarity} for search. + */ public class DefaultSimilarity extends TFIDFSimilarity { + /** Cache of decoded bytes. */ + private static final float[] NORM_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) { + NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); + } + } + /** Sole constructor: parameter-free */ public DefaultSimilarity() {} @@ -38,6 +68,30 @@ return (float)(1.0 / Math.sqrt(sumOfSquaredWeights)); } + /** + * Encodes a normalization factor for storage in an index. + *
<p>
+ * The encoding uses a three-bit mantissa, a five-bit exponent, and the + * zero-exponent point at 15, thus representing values from around 7x10^9 to + * 2x10^-9 with about one significant decimal digit of accuracy. Zero is also + * represented. Negative numbers are rounded up to zero. Values too large to + * represent are rounded down to the largest representable value. Positive + * values too small to represent are rounded up to the smallest positive + * representable value. + * + * @see org.apache.lucene.document.Field#setBoost(float) + * @see org.apache.lucene.util.SmallFloat + */ + @Override + public long encodeNormValue(float f) { + return SmallFloat.floatToByte315(f); + } + + @Override + public float decodeNormValue(long norm) { + return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127 + } + /** Implemented as * state.getBoost()*lengthNorm(numTerms), where * numTerms is {@link FieldInvertState#getLength()} if {@link Index: lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (revision 1496446) +++ lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (working copy) @@ -28,7 +28,6 @@ import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.TermStatistics; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.SmallFloat; /** @@ -496,27 +495,8 @@ * * * - *
 
- * However the resulted norm value is {@link #encodeNormValue(float) encoded} as a single byte - * before being stored. - * At search time, the norm byte value is read from the index - * {@link org.apache.lucene.store.Directory directory} and - * {@link #decodeNormValue(byte) decoded} back to a float norm value. - * This encoding/decoding, while reducing index size, comes with the price of - * precision loss - it is not guaranteed that decode(encode(x)) = x. - * For instance, decode(encode(0.89)) = 0.75. - *
 
- * Compression of norm values to a single byte saves memory at search time, - * because once a field is referenced at search time, its norms - for - * all documents - are maintained in memory. - *
 
- * The rationale supporting such lossy compression of norm values is that - * given the difficulty (and inaccuracy) of users to express their true information - * need by a query, only big differences matter. - *
 
- * Last, note that search time is too late to modify this norm part of scoring, e.g. by - * using a different {@link Similarity} for search. - *
 
+ * Note that search time is too late to modify this norm part of scoring, + * e.g. by using a different {@link Similarity} for search. * * * @@ -666,38 +646,15 @@ return encodeNormValue(normValue); } - /** Cache of decoded bytes. */ - private static final float[] NORM_TABLE = new float[256]; - - static { - for (int i = 0; i < 256; i++) { - NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); - } - } - - /** Decodes a normalization factor stored in an index. + /** + * Decodes a normalization factor stored in an index. + * * @see #encodeNormValue(float) */ - public float decodeNormValue(byte b) { - return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127 - } + public abstract float decodeNormValue(long norm); - /** Encodes a normalization factor for storage in an index. - * - *

The encoding uses a three-bit mantissa, a five-bit exponent, and - * the zero-exponent point at 15, thus - * representing values from around 7x10^9 to 2x10^-9 with about one - * significant decimal digit of accuracy. Zero is also represented. - * Negative numbers are rounded up to zero. Values too large to represent - * are rounded down to the largest representable value. Positive values too - * small to represent are rounded up to the smallest positive representable - * value. - * @see org.apache.lucene.document.Field#setBoost(float) - * @see org.apache.lucene.util.SmallFloat - */ - public byte encodeNormValue(float f) { - return SmallFloat.floatToByte315(f); - } + /** Encodes a normalization factor for storage in an index. */ + public abstract long encodeNormValue(float f); /** Computes the amount of a sloppy phrase match, based on an edit distance. * This value is summed for each sloppy phrase match in a document to form @@ -756,7 +713,7 @@ public float score(int doc, float freq) { final float raw = tf(freq) * weightValue; // compute tf(f)*weight - return norms == null ? raw : raw * decodeNormValue((byte)norms.get(doc)); // normalize for field + return norms == null ? raw : raw * decodeNormValue(norms.get(doc)); // normalize for field } @Override @@ -843,8 +800,7 @@ fieldExpl.addDetail(stats.idf); Explanation fieldNormExpl = new Explanation(); - float fieldNorm = - norms!=null ? decodeNormValue((byte) norms.get(doc)) : 1.0f; + float fieldNorm = norms != null ? 
decodeNormValue(norms.get(doc)) : 1.0f; fieldNormExpl.setValue(fieldNorm); fieldNormExpl.setDescription("fieldNorm(doc="+doc+")"); fieldExpl.addDetail(fieldNormExpl); Index: lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java (revision 1496446) +++ lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java (working copy) @@ -100,13 +100,13 @@ class TestSimilarity extends DefaultSimilarity { @Override - public byte encodeNormValue(float f) { - return (byte) f; + public long encodeNormValue(float f) { + return (long) f; } @Override - public float decodeNormValue(byte b) { - return (float) b; + public float decodeNormValue(long norm) { + return norm; } @Override Index: lucene/core/src/test/org/apache/lucene/index/TestNorms.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestNorms.java (revision 1496446) +++ lucene/core/src/test/org/apache/lucene/index/TestNorms.java (working copy) @@ -47,13 +47,13 @@ class CustomNormEncodingSimilarity extends DefaultSimilarity { @Override - public byte encodeNormValue(float f) { - return (byte) f; + public long encodeNormValue(float f) { + return (long) f; } @Override - public float decodeNormValue(byte b) { - return (float) b; + public float decodeNormValue(long norm) { + return norm; } @Override Index: lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java =================================================================== --- lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java (revision 1496446) +++ lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java (working copy) @@ -18,26 +18,35 @@ */ import java.io.IOException; -import java.util.concurrent.ExecutionException; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.LuceneTestCase; import 
org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; -import org.apache.lucene.search.*; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.CollectionStatistics; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TermStatistics; import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; public class TestOmitTf extends LuceneTestCase { public static class SimpleSimilarity extends TFIDFSimilarity { + @Override public float decodeNormValue(long norm) { return norm; } + @Override public long encodeNormValue(float f) { return (long) f; } @Override public float queryNorm(float sumOfSquaredWeights) { return 1.0f; } @Override Index: lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java =================================================================== --- lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java (revision 1496446) +++ lucene/core/src/test/org/apache/lucene/search/TestSimilarityProvider.java (working copy) @@ -108,6 +108,16 @@ private class Sim1 extends TFIDFSimilarity { @Override + public long encodeNormValue(float f) { + return (long) f; + } + + @Override + public float decodeNormValue(long norm) { + return norm; + } + + @Override public float 
coord(int overlap, int maxOverlap) { return 1f; } @@ -146,6 +156,16 @@ private class Sim2 extends TFIDFSimilarity { @Override + public long encodeNormValue(float f) { + return (long) f; + } + + @Override + public float decodeNormValue(long norm) { + return norm; + } + + @Override public float coord(int overlap, int maxOverlap) { return 1f; } Index: lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java =================================================================== --- lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java (revision 1496446) +++ lucene/queries/src/java/org/apache/lucene/queries/function/valuesource/NormValueSource.java (working copy) @@ -29,7 +29,7 @@ import java.util.Map; /** - * Function that returns {@link TFIDFSimilarity#decodeNormValue(byte)} + * Function that returns {@link TFIDFSimilarity#decodeNormValue(long)} * for every document. *

* Note that the configured Similarity for the field must be Index: solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java =================================================================== --- solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java (revision 1496446) +++ solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java (working copy) @@ -357,9 +357,7 @@ */ private static byte expectedNorm(final DefaultSimilarity sim, final int length, final float boost) { - - return sim.encodeNormValue(boost / ((float) Math.sqrt(length))); - + return (byte) sim.encodeNormValue(boost / ((float) Math.sqrt(length))); }