Index: src/test/org/apache/lucene/search/TestSimilarity.java =================================================================== --- src/test/org/apache/lucene/search/TestSimilarity.java (revision 651373) +++ src/test/org/apache/lucene/search/TestSimilarity.java (working copy) @@ -17,18 +17,18 @@ * limitations under the License. */ +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; import java.util.Collection; +import java.io.IOException; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.analysis.SimpleAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - /** Similarity unit test. 
* * @@ -38,29 +38,49 @@ public TestSimilarity(String name) { super(name); } - + public static class SimpleSimilarity extends Similarity { - public float lengthNorm(String field, int numTerms) { return 1.0f; } - public float queryNorm(float sumOfSquaredWeights) { return 1.0f; } - public float tf(float freq) { return freq; } - public float sloppyFreq(int distance) { return 2.0f; } - public float idf(Collection terms, Searcher searcher) { return 1.0f; } - public float idf(int docFreq, int numDocs) { return 1.0f; } - public float coord(int overlap, int maxOverlap) { return 1.0f; } + public float lengthNorm(String field, int numTerms) { + return 1.0f; + } + + public float queryNorm(float sumOfSquaredWeights) { + return 1.0f; + } + + public float tf(float freq) { + return freq; + } + + public float sloppyFreq(int distance) { + return 2.0f; + } + + public float idf(Collection terms, Searcher searcher) { + return 1.0f; + } + + public float idf(int docFreq, int numDocs) { + return 1.0f; + } + + public float coord(int overlap, int maxOverlap) { + return 1.0f; + } } public void testSimilarity() throws Exception { RAMDirectory store = new RAMDirectory(); - IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true, - IndexWriter.MaxFieldLength.LIMITED); + IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true, + IndexWriter.MaxFieldLength.LIMITED); writer.setSimilarity(new SimpleSimilarity()); - + Document d1 = new Document(); d1.add(new Field("field", "a c", Field.Store.YES, Field.Index.TOKENIZED)); Document d2 = new Document(); d2.add(new Field("field", "a b c", Field.Store.YES, Field.Index.TOKENIZED)); - + writer.addDocument(d1); writer.addDocument(d2); writer.optimize(); @@ -74,48 +94,106 @@ Term c = new Term("field", "c"); searcher.search - (new TermQuery(b), - new HitCollector() { - public final void collect(int doc, float score) { - assertTrue(score == 1.0f); - } - }); + (new TermQuery(b), + new HitCollector() { + public final void 
collect(int doc, float score) { + assertTrue(score == 1.0f); + } + }); BooleanQuery bq = new BooleanQuery(); bq.add(new TermQuery(a), BooleanClause.Occur.SHOULD); bq.add(new TermQuery(b), BooleanClause.Occur.SHOULD); //System.out.println(bq.toString("field")); searcher.search - (bq, - new HitCollector() { - public final void collect(int doc, float score) { - //System.out.println("Doc=" + doc + " score=" + score); - assertTrue(score == (float)doc+1); - } - }); + (bq, + new HitCollector() { + public final void collect(int doc, float score) { + //System.out.println("Doc=" + doc + " score=" + score); + assertTrue(score == (float) doc + 1); + } + }); PhraseQuery pq = new PhraseQuery(); pq.add(a); pq.add(c); //System.out.println(pq.toString("field")); searcher.search - (pq, - new HitCollector() { - public final void collect(int doc, float score) { - //System.out.println("Doc=" + doc + " score=" + score); - assertTrue(score == 1.0f); - } - }); + (pq, + new HitCollector() { + public final void collect(int doc, float score) { + //System.out.println("Doc=" + doc + " score=" + score); + assertTrue(score == 1.0f); + } + }); pq.setSlop(2); //System.out.println(pq.toString("field")); searcher.search - (pq, - new HitCollector() { - public final void collect(int doc, float score) { - //System.out.println("Doc=" + doc + " score=" + score); - assertTrue(score == 2.0f); - } - }); + (pq, + new HitCollector() { + public final void collect(int doc, float score) { + //System.out.println("Doc=" + doc + " score=" + score); + assertTrue(score == 2.0f); + } + }); } + + public void testNormCodec() throws IOException { + + Similarity.NormCodec normCodec; + + normCodec = new Similarity.DefaultNormCodec(); + assertEquals(10f, normCodec.decodeNorm(normCodec.encodeNorm(10f))); + assertEquals(10f, normCodec.decodeNorm(normCodec.encodeNorm(11f))); + assertEquals(12f, normCodec.decodeNorm(normCodec.encodeNorm(12f))); + assertEquals(12f, normCodec.decodeNorm(normCodec.encodeNorm(13f))); + 
assertEquals(14f, normCodec.decodeNorm(normCodec.encodeNorm(14f))); + + + new Similarity.SimpleNormCodec(new float[255]); + new Similarity.SimpleNormCodec(new float[256]); + + try { + new Similarity.SimpleNormCodec(new float[257]); + fail("Supposed to throw ArrayOutOfBoundsException"); + } catch (Exception e) { + // all good + } + + try { + new Similarity.SimpleNormCodec(new float[]{9f, 10f, 11, 13f, 12f}); + fail("Supposed to throw IllegalArgumentException due to unsorted array"); + } catch (IllegalArgumentException e) { + // all good + } + + + try { + new Similarity.SimpleNormCodec(null); + fail("Supposed to throw NullPointerException"); + } catch (NullPointerException e) { + // all good + } + + normCodec = new Similarity.SimpleNormCodec(new float[]{9f, 10f, 11, 12f, 13f}); + assertEquals(9f, normCodec.decodeNorm(normCodec.encodeNorm(9f))); + assertEquals(10f, normCodec.decodeNorm(normCodec.encodeNorm(10f))); + assertEquals(11f, normCodec.decodeNorm(normCodec.encodeNorm(11f))); + assertEquals(12f, normCodec.decodeNorm(normCodec.encodeNorm(12f))); + assertEquals(13f, normCodec.decodeNorm(normCodec.encodeNorm(13f))); + + Directory dir = new RAMDirectory(); + + assertEquals(Similarity.DefaultNormCodec.class, Similarity.readNormCodec(dir).getClass()); + Similarity.writeNormCodec(dir, normCodec); + normCodec = Similarity.readNormCodec(dir); + assertEquals(9f, normCodec.decodeNorm(normCodec.encodeNorm(9f))); + assertEquals(10f, normCodec.decodeNorm(normCodec.encodeNorm(10f))); + assertEquals(11f, normCodec.decodeNorm(normCodec.encodeNorm(11f))); + assertEquals(12f, normCodec.decodeNorm(normCodec.encodeNorm(12f))); + assertEquals(13f, normCodec.decodeNorm(normCodec.encodeNorm(13f))); + + + } } Index: src/java/org/apache/lucene/search/Similarity.java =================================================================== --- src/java/org/apache/lucene/search/Similarity.java (revision 651373) +++ src/java/org/apache/lucene/search/Similarity.java (working copy) @@ -19,9 +19,13 
@@ import org.apache.lucene.index.Term; import org.apache.lucene.util.SmallFloat; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.Directory; import java.io.IOException; import java.io.Serializable; +import java.util.Arrays; import java.util.Collection; import java.util.Iterator; @@ -99,13 +103,13 @@ * (the number of documents in which the term t appears). * This means rarer terms give higher contribution to the total score. * The default computation for idf(t) in - * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is: + * {@link org.apache.lucene.search.DefaultSimilarity#idf(int,int) DefaultSimilarity} is: * *
 
* * * * *
- * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}  =   + * {@link org.apache.lucene.search.DefaultSimilarity#idf(int,int) idf(t)}  =   * * 1 + log ( @@ -132,7 +136,7 @@ * Typically, a document that contains more of the query's terms will receive a higher score * than another document with fewer query terms. * This is a search time factor computed in - * {@link #coord(int, int) coord(q,d)} + * {@link #coord(int,int) coord(q,d)} * by the Similarity in effect at search time. *
 
* @@ -232,7 +236,7 @@ * {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()} * before adding the field to a document. * - *
  • {@link #lengthNorm(String, int) lengthNorm(field)} - computed + *
  • {@link #lengthNorm(String,int) lengthNorm(field)} - computed * when the document is added to the index in accordance with the number of tokens * of this field in the document, so that shorter fields contribute more to the score. * LengthNorm is computed by the Similarity class in effect at indexing. @@ -250,7 +254,7 @@ * norm(t,d)   =   * {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()} *  ·  - * {@link #lengthNorm(String, int) lengthNorm(field)} + * {@link #lengthNorm(String,int) lengthNorm(field)} *  ·  *
  • @@ -311,26 +315,19 @@ return Similarity.defaultImpl; } - /** Cache of decoded bytes. */ - private static final float[] NORM_TABLE = new float[256]; - - static { - for (int i = 0; i < 256; i++) - NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); - } - /** Decodes a normalization factor stored in an index. * @see #encodeNorm(float) */ public static float decodeNorm(byte b) { - return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127 + return normCodec.decodeNorm(b); } /** Returns a table for decoding normalization bytes. + * @deprecated access norm decoder using {@link Similarity#getNormCodec()} * @see #encodeNorm(float) */ public static float[] getNormDecoder() { - return NORM_TABLE; + return normCodec.getNormsTable(); } /** Computes the normalization value for a field given the total number of @@ -385,12 +382,12 @@ * @see org.apache.lucene.util.SmallFloat */ public static byte encodeNorm(float f) { - return SmallFloat.floatToByte315(f); + return normCodec.encodeNorm(f); } /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(Term, Searcher)} + * document. This value is multiplied by the {@link #idf(Term,Searcher)} * factor for each term in the query and these products are then summed to * form the initial score for a document. * @@ -405,7 +402,7 @@ * @return a score factor based on a term's within-document frequency */ public float tf(int freq) { - return tf((float)freq); + return tf((float) freq); } /** Computes the amount of a sloppy phrase match, based on an edit distance. @@ -424,7 +421,7 @@ public abstract float sloppyFreq(int distance); /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(Term, Searcher)} + * document. 
This value is multiplied by the {@link #idf(Term,Searcher)} * factor for each term in the query and these products are then summed to * form the initial score for a document. * @@ -470,7 +467,7 @@ float idf = 0.0f; Iterator i = terms.iterator(); while (i.hasNext()) { - idf += idf((Term)i.next(), searcher); + idf += idf((Term) i.next(), searcher); } return idf; } @@ -518,9 +515,269 @@ * @param length The length in the array * @return An implementation dependent float to be used as a scoring factor */ - public float scorePayload(String fieldName, byte [] payload, int offset, int length) - { + public float scorePayload(String fieldName, byte[] payload, int offset, int length) { //Do nothing return 1; } + + + private static final String NORM_CODEC_FILE = "codec.nrm"; + private static NormCodec normCodec = new DefaultNormCodec(); + + /** + * @return Strategy used to encode and decode field norm values. + */ + public static NormCodec getNormCodec() { + return normCodec; + } + + /** + * EXPERT: + * + *

    + * Replaces the {@link org.apache.lucene.search.Similarity.NormCodec} used to read and write field norms in any + * index that is using the same class loader as this class was created in. + *

    + * + *

    + * The new codec will not automatically be written to or read from a directory, + * see {@link #writeNormCodec(org.apache.lucene.store.Directory,org.apache.lucene.search.Similarity.NormCodec)} + * and {@link #readNormCodec(org.apache.lucene.store.Directory)}. + *

    + * + *

+ * Changing the codec is usually not possible once an index has been created. Some implementations do, however,
+ * allow for reservation of space in the 8 bit value that can be used when sweet spots occur. It would also be
+ * possible to analyze stored norms to see if there are unused slots in the bitspace one can use.
+ 

    + * + * @param normCodec Strategy used to encode and decode field norm values. + */ + public static void setNormCodec(NormCodec normCodec) throws IOException { + Similarity.normCodec = normCodec; + } + + + /** + *

    + * Returns the {@link org.apache.lucene.search.Similarity.NormCodec} associated with a {@link Directory}. + * A new instance of {@link org.apache.lucene.search.Similarity.DefaultNormCodec} + * will be returned if no known norm codec is available. + *

    + * + *

This codec will not automatically be used by the directory; it has to be set manually.

    + * + * @see #setNormCodec(org.apache.lucene.search.Similarity.NormCodec) + * @see #writeNormCodec(org.apache.lucene.store.Directory,org.apache.lucene.search.Similarity.NormCodec) + */ + public static NormCodec readNormCodec(Directory directory) throws IOException { + if (!directory.fileExists(NORM_CODEC_FILE)) { + return new DefaultNormCodec(); + } + + IndexInput in = directory.openInput(NORM_CODEC_FILE); + + String codecClassName = in.readString(); + NormCodec normCodec; + try { + normCodec = (NormCodec) Class.forName(codecClassName).newInstance(); + } catch (Exception e) { + throw new RuntimeException(e); + } + normCodec.readFile(in); + in.close(); + return normCodec; + } + + /** + *

+ * Associates a {@link org.apache.lucene.search.Similarity.NormCodec}
+ * with a {@link org.apache.lucene.store.Directory}
+ * by serializing the codec as a file
+ * retrievable using {@link #readNormCodec(org.apache.lucene.store.Directory)}.
+ 

    + * + *

    + * The directory will not automatically load a codec associated with it, + * see {@link #setNormCodec(org.apache.lucene.search.Similarity.NormCodec)}. + *

    + * + * @see #setNormCodec(org.apache.lucene.search.Similarity.NormCodec) + */ + public static void writeNormCodec(Directory directory, NormCodec normCodec) throws IOException { + IndexOutput out = directory.createOutput(NORM_CODEC_FILE); + out.writeString(normCodec.getClass().getName()); + normCodec.writeFile(out); + out.close(); + } + + /** + *

+ * Strategy used to translate the persistent field norm values (stored lengthNorm and boost)
+ * from a byte to a float and vice versa.
+ 

    + * + *

+ * All NormCodec implementations must have a default constructor for deserialization purposes.
+ 

    + * + * @see org.apache.lucene.search.Similarity#setNormCodec(org.apache.lucene.search.Similarity.NormCodec) + */ + public static abstract class NormCodec { + + /** Encodes a normalization factor for storage in an index. + * @see org.apache.lucene.document.Field#setBoost(float) + */ + public abstract byte encodeNorm(float f); + + /** Decodes a normalization factor stored in an index. + * @see #encodeNorm(float) + */ + public abstract float decodeNorm(byte b); + + /** + * @deprecated 2.3.1 backwards compatibility, use org.apache.lucene.search.Similarity#getNormDecoder() + * @throws RuntimeException if no backwards compatibile norms table + * @see org.apache.lucene.search.Similarity#getNormDecoder() + */ + public abstract float[] getNormsTable(); + + public abstract void readFile(IndexInput in) throws IOException; + + public abstract void writeFile(IndexOutput out) throws IOException; + + } + + + /** + * Uses {@link org.apache.lucene.util.SmallFloat} to encode and decode norm values. + */ + public static class DefaultNormCodec extends NormCodec { + + /** Cache of decoded bytes. */ + private float[] normsTable = new float[256]; + + public DefaultNormCodec() { + for (int i = 0; i < 256; i++) { + normsTable[i] = SmallFloat.byte315ToFloat((byte) i); + } + } + + /** + * @see org.apache.lucene.util.SmallFloat#floatToByte315(float) + */ + public byte encodeNorm(float f) { + return SmallFloat.floatToByte315(f); + } + + /** + * @see org.apache.lucene.util.SmallFloat#byte315ToFloat(byte) + */ + public float decodeNorm(byte b) { + return normsTable[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127 + } + + public float[] getNormsTable() { + return normsTable; + } + + + public void readFile(IndexInput in) throws IOException { + // do nothing + } + + public void writeFile(IndexOutput out) throws IOException { + // do nothing + } + } + + /** + *

    Uses binary search over a sorted float[1..256] to encode and decode norm values.

    + * + *

+ * It is the consumer of this codec that has to sort the array. This is to ensure that
+ * the order is not tampered with by the codec at any time. It will, however, make sure
+ * that the array indeed is ordered and throw an exception if that is not the case.
+ 

    + */ + public static class SimpleNormCodec extends NormCodec { + + public SimpleNormCodec() { + } + + private float[] normsTable; + + /** + * @param normsTable ordered float[] containg supported float values for normalization. + * @throws RuntimeException if paremter normsTable is of any other length than 256. + * @throws NullPointerException if parameter normsTable is null + * @throws IllegalArgumentException if parameter normsTable is not sorted. + */ + public SimpleNormCodec(float[] normsTable) { + setNormsTable(normsTable); + } + + public float decodeNorm(byte b) { + return normsTable[b]; + } + + public byte encodeNorm(float f) { + int pos = Arrays.binarySearch(normsTable, f); + if (pos < 0) { + pos *= -1; + pos--; + } + return (byte) pos; + } + + + /** + * Unsupported depricated method that always throws a {@link RuntimeException}. + * @throws RuntimeException always + */ + public float[] getNormsTable() { + throw new RuntimeException("Depricated method not supported"); + } + + /** + * @param normsTable sorted float[] containg supported float values for normalization. + * @throws RuntimeException if paremter normsTable is of any other length than 256. + * @throws NullPointerException if parameter normsTable is null + * @throws IllegalArgumentException if parameter normsTable is not sorted. 
+ */ + public void setNormsTable(float[] normsTable) { + if (normsTable == null) { + throw new NullPointerException("Parameter normsTable is null."); + } else if (normsTable.length > 256) { + throw new ArrayIndexOutOfBoundsException("Parameter normsTable must not exceed length 256 (" + normsTable.length + ")."); + } + + // ensure table is ordered + float[] clone = normsTable.clone(); + Arrays.sort(clone); + if (!Arrays.equals(normsTable, clone)) { + throw new IllegalArgumentException("Parameter normsTable is not sorted."); + } + + this.normsTable = normsTable; + } + + + public void readFile(IndexInput in) throws IOException { + float[] normsTable = new float[in.readInt()]; + for (int i = 0; i < normsTable.length; i++) { + normsTable[i] = Float.intBitsToFloat(in.readInt()); + } + this.normsTable = normsTable; + } + + public void writeFile(IndexOutput out) throws IOException { + out.writeInt(normsTable.length); + for (int i = 0; i < normsTable.length; i++) { + out.writeInt(Float.floatToRawIntBits(normsTable[i])); // do we really need NaN? + } + } + } + + } Index: src/java/org/apache/lucene/search/TermScorer.java =================================================================== --- src/java/org/apache/lucene/search/TermScorer.java (revision 651373) +++ src/java/org/apache/lucene/search/TermScorer.java (working copy) @@ -63,7 +63,6 @@ protected boolean score(HitCollector c, int end) throws IOException { Similarity similarity = getSimilarity(); // cache sim in local - float[] normDecoder = Similarity.getNormDecoder(); while (doc < end) { // for docs in window int f = freqs[pointer]; float score = // compute tf(f)*weight @@ -71,7 +70,7 @@ ? scoreCache[f] // cache hit : similarity.tf(f)*weightValue; // cache miss - score *= normDecoder[norms[doc] & 0xFF]; // normalize for field + score *= Similarity.decodeNorm(norms[doc]); // normalize for field c.collect(doc, score); // collect score