Index: src/test/org/apache/lucene/search/TestSimilarity.java
===================================================================
--- src/test/org/apache/lucene/search/TestSimilarity.java (revision 651373)
+++ src/test/org/apache/lucene/search/TestSimilarity.java (working copy)
@@ -17,18 +17,18 @@
* limitations under the License.
*/
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import java.util.Collection;
+import java.io.IOException;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.analysis.SimpleAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-
/** Similarity unit test.
*
*
@@ -38,29 +38,49 @@
public TestSimilarity(String name) {
super(name);
}
-
+
public static class SimpleSimilarity extends Similarity {
- public float lengthNorm(String field, int numTerms) { return 1.0f; }
- public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
- public float tf(float freq) { return freq; }
- public float sloppyFreq(int distance) { return 2.0f; }
- public float idf(Collection terms, Searcher searcher) { return 1.0f; }
- public float idf(int docFreq, int numDocs) { return 1.0f; }
- public float coord(int overlap, int maxOverlap) { return 1.0f; }
+ public float lengthNorm(String field, int numTerms) {
+ return 1.0f;
+ }
+
+ public float queryNorm(float sumOfSquaredWeights) {
+ return 1.0f;
+ }
+
+ public float tf(float freq) {
+ return freq;
+ }
+
+ public float sloppyFreq(int distance) {
+ return 2.0f;
+ }
+
+ public float idf(Collection terms, Searcher searcher) {
+ return 1.0f;
+ }
+
+ public float idf(int docFreq, int numDocs) {
+ return 1.0f;
+ }
+
+ public float coord(int overlap, int maxOverlap) {
+ return 1.0f;
+ }
}
public void testSimilarity() throws Exception {
RAMDirectory store = new RAMDirectory();
- IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true,
- IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true,
+ IndexWriter.MaxFieldLength.LIMITED);
writer.setSimilarity(new SimpleSimilarity());
-
+
Document d1 = new Document();
d1.add(new Field("field", "a c", Field.Store.YES, Field.Index.TOKENIZED));
Document d2 = new Document();
d2.add(new Field("field", "a b c", Field.Store.YES, Field.Index.TOKENIZED));
-
+
writer.addDocument(d1);
writer.addDocument(d2);
writer.optimize();
@@ -74,48 +94,106 @@
Term c = new Term("field", "c");
searcher.search
- (new TermQuery(b),
- new HitCollector() {
- public final void collect(int doc, float score) {
- assertTrue(score == 1.0f);
- }
- });
+ (new TermQuery(b),
+ new HitCollector() {
+ public final void collect(int doc, float score) {
+ assertTrue(score == 1.0f);
+ }
+ });
BooleanQuery bq = new BooleanQuery();
bq.add(new TermQuery(a), BooleanClause.Occur.SHOULD);
bq.add(new TermQuery(b), BooleanClause.Occur.SHOULD);
//System.out.println(bq.toString("field"));
searcher.search
- (bq,
- new HitCollector() {
- public final void collect(int doc, float score) {
- //System.out.println("Doc=" + doc + " score=" + score);
- assertTrue(score == (float)doc+1);
- }
- });
+ (bq,
+ new HitCollector() {
+ public final void collect(int doc, float score) {
+ //System.out.println("Doc=" + doc + " score=" + score);
+ assertTrue(score == (float) doc + 1);
+ }
+ });
PhraseQuery pq = new PhraseQuery();
pq.add(a);
pq.add(c);
//System.out.println(pq.toString("field"));
searcher.search
- (pq,
- new HitCollector() {
- public final void collect(int doc, float score) {
- //System.out.println("Doc=" + doc + " score=" + score);
- assertTrue(score == 1.0f);
- }
- });
+ (pq,
+ new HitCollector() {
+ public final void collect(int doc, float score) {
+ //System.out.println("Doc=" + doc + " score=" + score);
+ assertTrue(score == 1.0f);
+ }
+ });
pq.setSlop(2);
//System.out.println(pq.toString("field"));
searcher.search
- (pq,
- new HitCollector() {
- public final void collect(int doc, float score) {
- //System.out.println("Doc=" + doc + " score=" + score);
- assertTrue(score == 2.0f);
- }
- });
+ (pq,
+ new HitCollector() {
+ public final void collect(int doc, float score) {
+ //System.out.println("Doc=" + doc + " score=" + score);
+ assertTrue(score == 2.0f);
+ }
+ });
}
+
+ public void testNormCodec() throws IOException {
+
+ Similarity.NormCodec normCodec;
+
+ normCodec = new Similarity.DefaultNormCodec();
+ assertEquals(10f, normCodec.decodeNorm(normCodec.encodeNorm(10f)));
+ assertEquals(10f, normCodec.decodeNorm(normCodec.encodeNorm(11f)));
+ assertEquals(12f, normCodec.decodeNorm(normCodec.encodeNorm(12f)));
+ assertEquals(12f, normCodec.decodeNorm(normCodec.encodeNorm(13f)));
+ assertEquals(14f, normCodec.decodeNorm(normCodec.encodeNorm(14f)));
+
+
+ new Similarity.SimpleNormCodec(new float[255]);
+ new Similarity.SimpleNormCodec(new float[256]);
+
+ try {
+ new Similarity.SimpleNormCodec(new float[257]);
+      fail("Supposed to throw ArrayIndexOutOfBoundsException");
+ } catch (Exception e) {
+ // all good
+ }
+
+ try {
+ new Similarity.SimpleNormCodec(new float[]{9f, 10f, 11, 13f, 12f});
+ fail("Supposed to throw IllegalArgumentException due to unsorted array");
+ } catch (IllegalArgumentException e) {
+ // all good
+ }
+
+
+ try {
+ new Similarity.SimpleNormCodec(null);
+ fail("Supposed to throw NullPointerException");
+ } catch (NullPointerException e) {
+ // all good
+ }
+
+ normCodec = new Similarity.SimpleNormCodec(new float[]{9f, 10f, 11, 12f, 13f});
+ assertEquals(9f, normCodec.decodeNorm(normCodec.encodeNorm(9f)));
+ assertEquals(10f, normCodec.decodeNorm(normCodec.encodeNorm(10f)));
+ assertEquals(11f, normCodec.decodeNorm(normCodec.encodeNorm(11f)));
+ assertEquals(12f, normCodec.decodeNorm(normCodec.encodeNorm(12f)));
+ assertEquals(13f, normCodec.decodeNorm(normCodec.encodeNorm(13f)));
+
+ Directory dir = new RAMDirectory();
+
+ assertEquals(Similarity.DefaultNormCodec.class, Similarity.readNormCodec(dir).getClass());
+ Similarity.writeNormCodec(dir, normCodec);
+ normCodec = Similarity.readNormCodec(dir);
+ assertEquals(9f, normCodec.decodeNorm(normCodec.encodeNorm(9f)));
+ assertEquals(10f, normCodec.decodeNorm(normCodec.encodeNorm(10f)));
+ assertEquals(11f, normCodec.decodeNorm(normCodec.encodeNorm(11f)));
+ assertEquals(12f, normCodec.decodeNorm(normCodec.encodeNorm(12f)));
+ assertEquals(13f, normCodec.decodeNorm(normCodec.encodeNorm(13f)));
+
+
+ }
}
Index: src/java/org/apache/lucene/search/Similarity.java
===================================================================
--- src/java/org/apache/lucene/search/Similarity.java (revision 651373)
+++ src/java/org/apache/lucene/search/Similarity.java (working copy)
@@ -19,9 +19,13 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.util.SmallFloat;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.Directory;
import java.io.IOException;
import java.io.Serializable;
+import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
@@ -99,13 +103,13 @@
* (the number of documents in which the term t appears).
* This means rarer terms give higher contribution to the total score.
* The default computation for idf(t) in
- * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
+ * {@link org.apache.lucene.search.DefaultSimilarity#idf(int,int) DefaultSimilarity} is:
*
*
*
| - * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)} = + * {@link org.apache.lucene.search.DefaultSimilarity#idf(int,int) idf(t)} = * | *
* 1 + log (
@@ -132,7 +136,7 @@
* Typically, a document that contains more of the query's terms will receive a higher score
* than another document with fewer query terms.
* This is a search time factor computed in
- * {@link #coord(int, int) coord(q,d)}
+ * {@link #coord(int,int) coord(q,d)}
* by the Similarity in effect at search time.
* * @@ -232,7 +236,7 @@ * {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()} * before adding the field to a document. * - * |
*
@@ -311,26 +315,19 @@
return Similarity.defaultImpl;
}
- /** Cache of decoded bytes. */
- private static final float[] NORM_TABLE = new float[256];
-
- static {
- for (int i = 0; i < 256; i++)
- NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
- }
-
/** Decodes a normalization factor stored in an index.
* @see #encodeNorm(float)
*/
public static float decodeNorm(byte b) {
- return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
+ return normCodec.decodeNorm(b);
}
/** Returns a table for decoding normalization bytes.
+ * @deprecated access norm decoder using {@link Similarity#getNormCodec()}
* @see #encodeNorm(float)
*/
public static float[] getNormDecoder() {
- return NORM_TABLE;
+ return normCodec.getNormsTable();
}
/** Computes the normalization value for a field given the total number of
@@ -385,12 +382,12 @@
* @see org.apache.lucene.util.SmallFloat
*/
public static byte encodeNorm(float f) {
- return SmallFloat.floatToByte315(f);
+ return normCodec.encodeNorm(f);
}
/** Computes a score factor based on a term or phrase's frequency in a
- * document. This value is multiplied by the {@link #idf(Term, Searcher)}
+ * document. This value is multiplied by the {@link #idf(Term,Searcher)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
@@ -405,7 +402,7 @@
* @return a score factor based on a term's within-document frequency
*/
public float tf(int freq) {
- return tf((float)freq);
+ return tf((float) freq);
}
/** Computes the amount of a sloppy phrase match, based on an edit distance.
@@ -424,7 +421,7 @@
public abstract float sloppyFreq(int distance);
/** Computes a score factor based on a term or phrase's frequency in a
- * document. This value is multiplied by the {@link #idf(Term, Searcher)}
+ * document. This value is multiplied by the {@link #idf(Term,Searcher)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
@@ -470,7 +467,7 @@
float idf = 0.0f;
Iterator i = terms.iterator();
while (i.hasNext()) {
- idf += idf((Term)i.next(), searcher);
+ idf += idf((Term) i.next(), searcher);
}
return idf;
}
@@ -518,9 +515,269 @@
* @param length The length in the array
* @return An implementation dependent float to be used as a scoring factor
*/
- public float scorePayload(String fieldName, byte [] payload, int offset, int length)
- {
+ public float scorePayload(String fieldName, byte[] payload, int offset, int length) {
//Do nothing
return 1;
}
+
+
+ private static final String NORM_CODEC_FILE = "codec.nrm";
+ private static NormCodec normCodec = new DefaultNormCodec();
+
+ /**
+ * @return Strategy used to encode and decode field norm values.
+ */
+ public static NormCodec getNormCodec() {
+ return normCodec;
+ }
+
+ /**
+ * EXPERT:
+ *
+   * Replaces the {@link org.apache.lucene.search.Similarity.NormCodec} used to read and write field norms in any
+   * index that is using the same class loader as this class was created in.
+   * </p>
+   *
+   * <p>
+   * The new codec will not automatically be written to or read from a directory,
+   * see {@link #writeNormCodec(org.apache.lucene.store.Directory,org.apache.lucene.search.Similarity.NormCodec)}
+   * and {@link #readNormCodec(org.apache.lucene.store.Directory)}.
+   * </p>
+   *
+   * <p>
+   * Changing the codec is usually not possible once an index has been created. Some implementations do however
+   * allow for reservation of space in the 8 bit value that can be used when sweet spots occur. It would also be
+   * possible to analyze stored norms to see if there are unused slots in the bitspace one can use.
+   * </p>
+   *
+   * @param normCodec Strategy used to encode and decode field norm values.
+   */
+  public static void setNormCodec(NormCodec normCodec) throws IOException {
+    Similarity.normCodec = normCodec;
+  }
+
+
+  /**
+   * <p>
+   * Returns the {@link org.apache.lucene.search.Similarity.NormCodec} associated with a {@link Directory}.
+   * A new instance of {@link org.apache.lucene.search.Similarity.DefaultNormCodec}
+   * will be returned if no known norm codec is available.
+   * </p>
+   *
+   * <p>This codec will not automatically be used by the directory, it has to be set manually.</p>
+   *
+   * @see #setNormCodec(org.apache.lucene.search.Similarity.NormCodec)
+   * @see #writeNormCodec(org.apache.lucene.store.Directory,org.apache.lucene.search.Similarity.NormCodec)
+   */
+  public static NormCodec readNormCodec(Directory directory) throws IOException {
+    if (!directory.fileExists(NORM_CODEC_FILE)) {
+      return new DefaultNormCodec();
+    }
+
+    IndexInput in = directory.openInput(NORM_CODEC_FILE);
+
+    String codecClassName = in.readString();
+    NormCodec normCodec;
+    try {
+      normCodec = (NormCodec) Class.forName(codecClassName).newInstance();
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+    normCodec.readFile(in);
+    in.close();
+    return normCodec;
+  }
+
+  /**
+   * <p>
+   * Associates a {@link org.apache.lucene.search.Similarity.NormCodec}
+   * with a {@link org.apache.lucene.store.Directory}
+   * by serializing the codec as a file
+   * retrievable using {@link #readNormCodec(org.apache.lucene.store.Directory)}.
+   * </p>
+   *
+   * <p>
+   * The directory will not automatically load a codec associated with it,
+   * see {@link #setNormCodec(org.apache.lucene.search.Similarity.NormCodec)}.
+   * </p>
+   *
+   * @see #setNormCodec(org.apache.lucene.search.Similarity.NormCodec)
+   */
+  public static void writeNormCodec(Directory directory, NormCodec normCodec) throws IOException {
+    IndexOutput out = directory.createOutput(NORM_CODEC_FILE);
+    out.writeString(normCodec.getClass().getName());
+    normCodec.writeFile(out);
+    out.close();
+  }
+
+  /**
+   * <p>
+   * Strategy used to translate the persistent field norm values (stored lengthNorm and boost)
+   * from a byte to a float and vice versa.
+   * </p>
+   *
+   * <p>
+   * All NormCodec implementations must have a default constructor for deserialization purposes.
+   * </p>
+   *
+   * @see org.apache.lucene.search.Similarity#setNormCodec(org.apache.lucene.search.Similarity.NormCodec)
+   */
+  public static abstract class NormCodec {
+
+    /** Encodes a normalization factor for storage in an index.
+     * @see org.apache.lucene.document.Field#setBoost(float)
+     */
+    public abstract byte encodeNorm(float f);
+
+    /** Decodes a normalization factor stored in an index.
+     * @see #encodeNorm(float)
+     */
+    public abstract float decodeNorm(byte b);
+
+    /**
+     * @deprecated 2.3.1 backwards compatibility, use org.apache.lucene.search.Similarity#getNormDecoder()
+     * @throws RuntimeException if no backwards compatible norms table
+     * @see org.apache.lucene.search.Similarity#getNormDecoder()
+     */
+    public abstract float[] getNormsTable();
+
+    public abstract void readFile(IndexInput in) throws IOException;
+
+    public abstract void writeFile(IndexOutput out) throws IOException;
+
+  }
+
+
+  /**
+   * Uses {@link org.apache.lucene.util.SmallFloat} to encode and decode norm values.
+   */
+  public static class DefaultNormCodec extends NormCodec {
+
+    /** Cache of decoded bytes. */
+    private float[] normsTable = new float[256];
+
+    public DefaultNormCodec() {
+      for (int i = 0; i < 256; i++) {
+        normsTable[i] = SmallFloat.byte315ToFloat((byte) i);
+      }
+    }
+
+    /**
+     * @see org.apache.lucene.util.SmallFloat#floatToByte315(float)
+     */
+    public byte encodeNorm(float f) {
+      return SmallFloat.floatToByte315(f);
+    }
+
+    /**
+     * @see org.apache.lucene.util.SmallFloat#byte315ToFloat(byte)
+     */
+    public float decodeNorm(byte b) {
+      return normsTable[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
+    }
+
+    public float[] getNormsTable() {
+      return normsTable;
+    }
+
+
+    public void readFile(IndexInput in) throws IOException {
+      // do nothing
+    }
+
+    public void writeFile(IndexOutput out) throws IOException {
+      // do nothing
+    }
+  }
+
+  /**
+   * <p>Uses binary search over a sorted float[1..256] to encode and decode norm values.</p>
+   *
+   * <p>
+   * It is the consumer of this codec that has to sort the array. This is to ensure that
+   * the order is not tampered with by the codec at any time. It will however make sure
+   * that the array indeed is ordered and throw an exception if that is not the case.
+   * </p>
+   */
+  public static class SimpleNormCodec extends NormCodec {
+
+    public SimpleNormCodec() {
+    }
+
+    private float[] normsTable;
+
+    /**
+     * @param normsTable ordered float[] containing supported float values for normalization.
+     * @throws ArrayIndexOutOfBoundsException if parameter normsTable exceeds length 256.
+     * @throws NullPointerException if parameter normsTable is null
+     * @throws IllegalArgumentException if parameter normsTable is not sorted.
+     */
+    public SimpleNormCodec(float[] normsTable) {
+      setNormsTable(normsTable);
+    }
+
+    public float decodeNorm(byte b) {
+      return normsTable[b];
+    }
+
+    public byte encodeNorm(float f) {
+      int pos = Arrays.binarySearch(normsTable, f);
+      if (pos < 0) {
+        pos *= -1;
+        pos--;
+      }
+      return (byte) pos;
+    }
+
+
+    /**
+     * Unsupported deprecated method that always throws a {@link RuntimeException}.
+     * @throws RuntimeException always
+     */
+    public float[] getNormsTable() {
+      throw new RuntimeException("Deprecated method not supported");
+    }
+
+    /**
+     * @param normsTable sorted float[] containing supported float values for normalization.
+     * @throws ArrayIndexOutOfBoundsException if parameter normsTable exceeds length 256.
+     * @throws NullPointerException if parameter normsTable is null
+     * @throws IllegalArgumentException if parameter normsTable is not sorted.
+     */
+    public void setNormsTable(float[] normsTable) {
+      if (normsTable == null) {
+        throw new NullPointerException("Parameter normsTable is null.");
+      } else if (normsTable.length > 256) {
+        throw new ArrayIndexOutOfBoundsException("Parameter normsTable must not exceed length 256 (" + normsTable.length + ").");
+      }
+
+      // ensure table is ordered
+      float[] clone = normsTable.clone();
+      Arrays.sort(clone);
+      if (!Arrays.equals(normsTable, clone)) {
+        throw new IllegalArgumentException("Parameter normsTable is not sorted.");
+      }
+
+      this.normsTable = normsTable;
+    }
+
+
+    public void readFile(IndexInput in) throws IOException {
+      float[] normsTable = new float[in.readInt()];
+      for (int i = 0; i < normsTable.length; i++) {
+        normsTable[i] = Float.intBitsToFloat(in.readInt());
+      }
+      this.normsTable = normsTable;
+    }
+
+    public void writeFile(IndexOutput out) throws IOException {
+      out.writeInt(normsTable.length);
+      for (int i = 0; i < normsTable.length; i++) {
+        out.writeInt(Float.floatToRawIntBits(normsTable[i])); // do we really need NaN?
+      }
+    }
+  }
+
 }
Index: src/java/org/apache/lucene/search/TermScorer.java
===================================================================
--- src/java/org/apache/lucene/search/TermScorer.java	(revision 651373)
+++ src/java/org/apache/lucene/search/TermScorer.java	(working copy)
@@ -63,7 +63,6 @@
   protected boolean score(HitCollector c, int end) throws IOException {
     Similarity similarity = getSimilarity();    // cache sim in local
-    float[] normDecoder = Similarity.getNormDecoder();
 
     while (doc < end) {                         // for docs in window
       int f = freqs[pointer];
       float score =                             // compute tf(f)*weight
@@ -71,7 +70,7 @@
         ? scoreCache[f]                         // cache hit
        : similarity.tf(f)*weightValue;          // cache miss
 
-      score *= normDecoder[norms[doc] & 0xFF];  // normalize for field
+      score *= Similarity.decodeNorm(norms[doc]); // normalize for field
 
      c.collect(doc, score);                     // collect score