Index: src/test/org/apache/lucene/index/TestFieldNormModifier.java
===================================================================
--- src/test/org/apache/lucene/index/TestFieldNormModifier.java (revision 0)
+++ src/test/org/apache/lucene/index/TestFieldNormModifier.java (revision 0)
@@ -0,0 +1,186 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.HitCollector;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+/**
+ * Tests changing of field norms with a custom similarity and with fake norms.
+ *
+ * @version $Id$
+ */
+public class TestFieldNormModifier extends TestCase {
+  public TestFieldNormModifier(String name) {
+    super(name);
+  }
+
+  public static byte DEFAULT_NORM = Similarity.encodeNorm(1.0f);
+
+  public static int NUM_DOCS = 5;
+
+  public Directory store = new RAMDirectory();
+
+  /** inverts the normal notion of lengthNorm */
+  public static Similarity s = new DefaultSimilarity() {
+    public float lengthNorm(String fieldName, int numTokens) {
+      return (float)numTokens;
+    }
+  };
+
+  /**
+   * Indexes NUM_DOCS documents. Document i adds "word" once and "crap" i times
+   * to "field", so the amount of text in "field" grows with the document number.
+   */
+  public void setUp() throws Exception {
+    IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);
+
+    for (int i = 0; i < NUM_DOCS; i++) {
+      Document d = new Document();
+      d.add(new Field("field", "word", Field.Store.YES, Field.Index.TOKENIZED));
+      d.add(new Field("nonorm", "word", Field.Store.YES, Field.Index.NO_NORMS));
+      d.add(new Field("untokfield", "20061212 20071212", Field.Store.YES, Field.Index.TOKENIZED));
+
+      for (int j = 1; j <= i; j++) {
+        d.add(new Field("field", "crap", Field.Store.YES, Field.Index.TOKENIZED));
+        d.add(new Field("nonorm", "more words", Field.Store.YES, Field.Index.NO_NORMS));
+      }
+      writer.addDocument(d);
+    }
+    writer.close();
+  }
+
+  /** Resetting the norms of a field that is not in the index must not throw. */
+  public void testMissingField() throws Exception {
+    FieldNormModifier fnm = new FieldNormModifier(store, s);
+    // no try/catch: an exception fails the test and JUnit reports its stack trace
+    fnm.reSetNorms("nobodyherebutuschickens");
+  }
+
+  /** Resetting the norms of a NO_NORMS field must leave it without norms. */
+  public void testFieldWithNoNorm() throws Exception {
+    // sanity check, norms should all be 1
+    assertNoNorms("nonorm");
+
+    FieldNormModifier fnm = new FieldNormModifier(store, s);
+    fnm.reSetNorms("nonorm");
+
+    // nothing should have changed
+    assertNoNorms("nonorm");
+  }
+
+  /** Scores must be non-increasing by doc id before the reset and non-decreasing after it. */
+  public void testGoodCases() throws Exception {
+    // default similarity should put docs with shorter length first
+    float[] scores = collectScores("field", "word");
+    float lastScore = Float.MAX_VALUE;
+    for (int i = 0; i < NUM_DOCS; i++) {
+      String msg = "i=" + i + ", " + scores[i] + " <= " + lastScore;
+      assertTrue(msg, scores[i] <= lastScore);
+      lastScore = scores[i];
+    }
+
+    FieldNormModifier fnm = new FieldNormModifier(store, s);
+    fnm.reSetNorms("field");
+
+    // new norm (with default similarity) should put longer docs first
+    scores = collectScores("field", "word");
+    lastScore = 0.0f;
+    for (int i = 0; i < NUM_DOCS; i++) {
+      String msg = "i=" + i + ", " + scores[i] + " >= " + lastScore;
+      assertTrue(msg, scores[i] >= lastScore);
+      lastScore = scores[i];
+    }
+  }
+
+  /** Resetting must change the stored norms of "untokfield" while all scores stay equal. */
+  public void testNormKiller() throws IOException {
+    byte[] oldNorms = readNorms("untokfield");
+
+    // NOTE(review): despite the test name this passes the custom Similarity, so the
+    // null-Similarity (fake norms) path of FieldNormModifier is not exercised here.
+    FieldNormModifier fnm = new FieldNormModifier(store, s);
+    fnm.reSetNorms("untokfield");
+
+    byte[] newNorms = readNorms("untokfield");
+    assertFalse("norms should have changed", Arrays.equals(oldNorms, newNorms));
+
+    // default similarity should return the same score for all documents for this query
+    float[] scores = collectScores("untokfield", "20061212");
+    for (int i = 1; i < NUM_DOCS; i++) {
+      String msg = "i=" + i + ", " + scores[i] + " == " + scores[0];
+      assertTrue(msg, scores[i] == scores[0]);
+    }
+  }
+
+  /** Returns the norms array that a fresh IndexReader on the store reports for the field. */
+  private byte[] readNorms(String field) throws IOException {
+    IndexReader r = IndexReader.open(store);
+    try {
+      return r.norms(field);
+    } finally {
+      r.close();
+    }
+  }
+
+  /** Asserts that the field reports no norms and that every norm byte equals DEFAULT_NORM. */
+  private void assertNoNorms(String field) throws IOException {
+    IndexReader r = IndexReader.open(store);
+    try {
+      byte[] norms = r.norms(field);
+      assertTrue("Whoops we have norms?", !r.hasNorms(field));
+      for (int i = 0; i < norms.length; i++) {
+        assertEquals("" + i, DEFAULT_NORM, norms[i]);
+      }
+    } finally {
+      r.close();
+    }
+  }
+
+  /** Runs a TermQuery for the given field and text and returns each hit's score by doc id. */
+  private float[] collectScores(String field, String text) throws IOException {
+    final float[] scores = new float[NUM_DOCS];
+    IndexSearcher searcher = new IndexSearcher(store);
+    try {
+      searcher.search(new TermQuery(new Term(field, text)), new HitCollector() {
+        public final void collect(int doc, float score) {
+          scores[doc] = score;
+        }
+      });
+    } finally {
+      searcher.close();
+    }
+    return scores;
+  }
+}
Index: src/java/org/apache/lucene/index/FieldNormModifier.java
===================================================================
--- src/java/org/apache/lucene/index/FieldNormModifier.java (revision 0)
+++ src/java/org/apache/lucene/index/FieldNormModifier.java (revision 0)
@@ -0,0 +1,158 @@
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Date;
+
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+/**
+ * Given a directory and a list of fields, updates the fieldNorms in place for every document.
+ * If Similarity class is specified, uses its lengthNorm method to set norms.
+ * If -n command line argument is used, removes field norms, as if {@link org.apache.lucene.document.Field.Index#NO_NORMS} was used.
+ *
+ * <p>
+ * NOTE: This will overwrite any length normalization or field/document boosts.
+ * </p>
+ *
+ * @author Chris Hostetter
+ * @author Otis Gospodnetic
+ */
+public class FieldNormModifier {
+
+  /**
+   * Command Line Execution method
+   *
+   * <pre>
+   * Usage: FieldNormModifier /path/index &lt;SimilarityClassName | -n&gt; field1 field2 ...
+   * </pre>
+   */
+  public static void main(String[] args) throws IOException {
+    if (args.length < 3) {
+      System.err.println("Usage: FieldNormModifier <index> <package.SimilarityClassName | -n> <field1> [field2] ...");
+      System.exit(1);
+    }
+
+    Similarity s = null;
+    if (!args[1].equals("-n")) {
+      try {
+        Class simClass = Class.forName(args[1]);
+        s = (Similarity)simClass.newInstance();
+      } catch (Exception e) {
+        System.err.println("Couldn't instantiate similarity with empty constructor: " + args[1]);
+        e.printStackTrace(System.err);
+        System.exit(1);
+      }
+    }
+
+    Directory d = FSDirectory.getDirectory(args[0], false);
+    try {
+      FieldNormModifier fnm = new FieldNormModifier(d, s);
+      for (int i = 2; i < args.length; i++) {
+        System.out.print("Updating field: " + args[i] + " " + (new Date()).toString() + " ... ");
+        fnm.reSetNorms(args[i]);
+        System.out.println(new Date().toString());
+      }
+    } finally {
+      d.close(); // release the directory even when a field fails part-way
+    }
+  }
+
+  private final Directory dir;
+  private final Similarity sim;
+
+  /**
+   * Constructor for code that wishes to use this class programmatically.
+   * If Similarity is null, kill the field norms.
+   *
+   * @param d the Directory to modify
+   * @param s the Similarity to use (can be null)
+   */
+  public FieldNormModifier(Directory d, Similarity s) {
+    dir = d;
+    sim = s;
+  }
+
+  /**
+   * Resets the norms for the specified field.
+   *
+   * <p>
+   * Opens a new IndexReader on the Directory given to this instance,
+   * modifies the norms (either using the Similarity given to this instance, or by using fake norms),
+   * and closes the IndexReader.
+   * </p>
+   *
+   * @param field the field whose norms should be reset
+   * @throws IOException if the index cannot be opened, read or updated
+   */
+  public void reSetNorms(String field) throws IOException {
+    String fieldName = field.intern();
+    int[] termCounts;
+    byte[] fakeNorms = new byte[0];
+
+    // pass 1: sum the term frequencies of the field for every document
+    IndexReader reader = IndexReader.open(dir);
+    try {
+      termCounts = new int[reader.maxDoc()];
+      // if we are killing norms, get fake ones
+      if (sim == null)
+        fakeNorms = SegmentReader.createFakeNorms(reader.maxDoc());
+      TermEnum termEnum = reader.terms(new Term(field, ""));
+      try {
+        TermDocs termDocs = reader.termDocs();
+        try {
+          do {
+            Term term = termEnum.term();
+            // terms are enumerated in field order, so the first foreign field ends the scan
+            if (term == null || !term.field().equals(fieldName)) break;
+            termDocs.seek(term);
+            while (termDocs.next()) {
+              termCounts[termDocs.doc()] += termDocs.freq();
+            }
+          } while (termEnum.next());
+        } finally {
+          termDocs.close();
+        }
+      } finally {
+        termEnum.close();
+      }
+    } finally {
+      reader.close();
+    }
+
+    // pass 2: write the new norms through a fresh reader; it is opened outside the
+    // try block so that a failed open can never re-close the reader from pass 1
+    reader = IndexReader.open(dir);
+    try {
+      for (int d = 0; d < termCounts.length; d++) {
+        if (!reader.isDeleted(d)) {
+          if (sim == null) {
+            reader.setNorm(d, fieldName, fakeNorms[0]);
+          } else {
+            reader.setNorm(d, fieldName, Similarity.encodeNorm(sim.lengthNorm(fieldName, termCounts[d])));
+          }
+        }
+      }
+    } finally {
+      reader.close();
+    }
+  }
+
+}