Index: src/test/org/apache/lucene/index/TestNorms.java
===================================================================
--- src/test/org/apache/lucene/index/TestNorms.java	(revision 0)
+++ src/test/org/apache/lucene/index/TestNorms.java	(revision 0)
@@ -0,0 +1,224 @@
+package org.apache.lucene.index;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+/**
+ * Test that norms info is preserved during index life - including
+ * separate norms, addDocument, addIndexes, optimize.
+ */
+public class TestNorms extends TestCase {
+
+  private class SimilarityOne extends DefaultSimilarity {
+    public float lengthNorm(String fieldName, int numTerms) {
+      return 1;
+    }
+  }
+
+  private static final int NUM_FIELDS = 10;
+
+  private Similarity similarityOne;
+  private Analyzer anlzr;
+  private int numDocNorms;
+  private ArrayList norms;
+  private ArrayList modifiedNorms;
+  private float lastNorm = 0;
+  private float normDelta = (float) 0.001;
+
+  public TestNorms(String s) {
+    super(s);
+  }
+
+  protected void setUp() throws IOException {
+    similarityOne = new SimilarityOne();
+    anlzr = new StandardAnalyzer();
+  }
+
+  protected void tearDown() throws IOException {
+  }
+
+  /**
+   * Test that norms values are preserved as the index is maintained.
+   * Including separate norms.
+   * Including merging indexes with separate norms.
+   * Including optimize.
+   */
+  public void testNorms() throws IOException {
+    // tmp dir
+    String tempDir = System.getProperty("java.io.tmpdir");
+    if (tempDir == null) {
+      throw new IOException("java.io.tmpdir undefined, cannot run test");
+    }
+
+    // test with a single index: index1
+    File indexDir1 = new File(tempDir, "lucenetestindex1");
+    Directory dir1 = FSDirectory.getDirectory(indexDir1, true);
+
+    norms = new ArrayList();
+    modifiedNorms = new ArrayList();
+
+    createIndex(dir1);
+    doTestNorms(dir1);
+
+    // test with a single index: index2
+    ArrayList norms1 = norms;
+    ArrayList modifiedNorms1 = modifiedNorms;
+    int numDocNorms1 = numDocNorms;
+
+    norms = new ArrayList();
+    modifiedNorms = new ArrayList();
+    numDocNorms = 0;
+
+    File indexDir2 = new File(tempDir, "lucenetestindex2");
+    Directory dir2 = FSDirectory.getDirectory(indexDir2, true);
+
+    createIndex(dir2);
+    doTestNorms(dir2);
+
+    // add index1 and index2 to a third index: index3
+    File indexDir3 = new File(tempDir, "lucenetestindex3");
+    Directory dir3 = FSDirectory.getDirectory(indexDir3, true);
+
+    createIndex(dir3);
+    IndexWriter iw = new IndexWriter(dir3, anlzr, false);
+    iw.setMaxBufferedDocs(5);
+    iw.setMergeFactor(3);
+    iw.addIndexes(new Directory[]{dir1, dir2});
+    iw.close();
+
+    norms1.addAll(norms);
+    norms = norms1;
+    modifiedNorms1.addAll(modifiedNorms);
+    modifiedNorms = modifiedNorms1;
+    numDocNorms += numDocNorms1;
+
+    // test with index3
+    verifyIndex(dir3);
+    doTestNorms(dir3);
+
+    // now with optimize
+    iw = new IndexWriter(dir3, anlzr, false);
+    iw.setMaxBufferedDocs(5);
+    iw.setMergeFactor(3);
+    iw.optimize();
+    iw.close();
+    verifyIndex(dir3);
+
+    dir1.close();
+    dir2.close();
+    dir3.close();
+  }
+
+  private void doTestNorms(Directory dir) throws IOException {
+    for (int i = 0; i < 5; i++) {
+      addDocs(dir, 12, true);
+      verifyIndex(dir);
+      modifyNormsForF1(dir);
+      verifyIndex(dir);
+      addDocs(dir, 12, false);
+      verifyIndex(dir);
+      modifyNormsForF1(dir);
+      verifyIndex(dir);
+    }
+  }
+
+  private void createIndex(Directory dir) throws IOException {
+    IndexWriter iw = new IndexWriter(dir, anlzr, true);
+    iw.setMaxBufferedDocs(5);
+    iw.setMergeFactor(3);
+    iw.setSimilarity(similarityOne);
+    iw.setUseCompoundFile(true);
+    iw.close();
+  }
+
+  private void modifyNormsForF1(Directory dir) throws IOException {
+    IndexReader ir = IndexReader.open(dir);
+    int n = ir.maxDoc();
+    for (int i = 0; i < n; i += 3) { // modify for every third doc
+      int k = (i * 3) % modifiedNorms.size();
+      float origNorm = ((Float) modifiedNorms.get(i)).floatValue();
+      float newNorm = ((Float) modifiedNorms.get(k)).floatValue();
+      //System.out.println("Modifying: for " + i + " from " + origNorm + " to " + newNorm);
+      //System.out.println("      and: for " + k + " from " + newNorm + " to " + origNorm);
+      modifiedNorms.set(i, new Float(newNorm));
+      modifiedNorms.set(k, new Float(origNorm));
+      ir.setNorm(i, "f" + 1, newNorm);
+      ir.setNorm(k, "f" + 1, origNorm);
+    }
+    ir.close();
+  }
+
+
+  private void verifyIndex(Directory dir) throws IOException {
+    IndexReader ir = IndexReader.open(dir);
+    for (int i = 0; i < NUM_FIELDS; i++) {
+      String field = "f" + i;
+      byte b[] = ir.norms(field);
+      assertEquals("number of norms mismatches", numDocNorms, b.length);
+      ArrayList storedNorms = (i == 1 ? modifiedNorms : norms);
+      for (int j = 0; j < b.length; j++) {
+        float norm = Similarity.decodeNorm(b[j]);
+        float norm1 = ((Float) storedNorms.get(j)).floatValue();
+        assertEquals("stored norm value of " + field + " for doc " + j + " is " + norm + " - a mismatch!", norm, norm1, 0.000001);
+      }
+    }
+  }
+
+  private void addDocs(Directory dir, int ndocs, boolean compound) throws IOException {
+    IndexWriter iw = new IndexWriter(dir, anlzr, false);
+    iw.setMaxBufferedDocs(5);
+    iw.setMergeFactor(3);
+    iw.setSimilarity(similarityOne);
+    iw.setUseCompoundFile(compound);
+    for (int i = 0; i < ndocs; i++) {
+      iw.addDocument(newDoc());
+    }
+    iw.close();
+  }
+
+  // create the next document
+  private Document newDoc() {
+    Document d = new Document();
+    float boost = nextNorm();
+    for (int i = 0; i < 10; i++) {
+      Field f = new Field("f" + i, "v" + i, Store.NO, Index.UN_TOKENIZED);
+      f.setBoost(boost);
+      d.add(f);
+    }
+    return d;
+  }
+
+  // return unique norm values that are unchanged by encoding/decoding
+  private float nextNorm() {
+    float norm = lastNorm + normDelta;
+    do {
+      float norm1 = Similarity.decodeNorm(Similarity.encodeNorm(norm));
+      if (norm1 > lastNorm) {
+        //System.out.println(norm1 + " > " + lastNorm);
+        norm = norm1;
+        break;
+      }
+      norm += normDelta;
+    } while (true);
+    norms.add(numDocNorms, new Float(norm));
+    modifiedNorms.add(numDocNorms, new Float(norm));
+    //System.out.println("creating norm(" + numDocNorms + "): " + norm);
+    numDocNorms++;
+    lastNorm = (norm > 10 ? 0 : norm); // there's a limit to how many distinct values can be stored in a single byte
+    return norm;
+  }
+
+}

Property changes on: src/test/org/apache/lucene/index/TestNorms.java
___________________________________________________________________
Name: svn:executable
   + *

Index: src/java/org/apache/lucene/index/IndexFileNames.java
===================================================================
--- src/java/org/apache/lucene/index/IndexFileNames.java	(revision 492322)
+++ src/java/org/apache/lucene/index/IndexFileNames.java	(working copy)
@@ -35,6 +35,9 @@
    * pre-lockless indices) */
   static final String DELETABLE = "deletable";
 
+  /** Extension of norms file */
+  static final String NORMS_EXTENSION = "nrm";
+
   /**
    * This array contains all filename extensions used by
    * Lucene's index files, with two exceptions, namely the
@@ -45,7 +48,8 @@
    */
   static final String INDEX_EXTENSIONS[] = new String[] {
     "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
-    "tvx", "tvd", "tvf", "tvp", "gen"};
+    "tvx", "tvd", "tvf", "tvp", "gen", "nrm"
+  };
 
   /** File extensions of old-style index files */
   static final String COMPOUND_EXTENSIONS[] = new String[] {

Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java	(revision 492322)
+++ src/java/org/apache/lucene/index/IndexWriter.java	(working copy)
@@ -639,7 +639,7 @@
       String segmentName = newRAMSegmentName();
       dw.addDocument(segmentName, doc);
       synchronized (this) {
-        ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false));
+        ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false, false));
         maybeFlushRamSegments();
       }
     }
@@ -772,10 +772,10 @@
     while (segmentInfos.size() > 1 ||
            (segmentInfos.size() == 1 &&
             (SegmentReader.hasDeletions(segmentInfos.info(0)) ||
+             SegmentReader.hasSeparateNorms(segmentInfos.info(0)) ||
              segmentInfos.info(0).dir != directory ||
              (useCompoundFile &&
-              (!SegmentReader.usesCompoundFile(segmentInfos.info(0)) ||
-               SegmentReader.hasSeparateNorms(segmentInfos.info(0))))))) {
+              (!SegmentReader.usesCompoundFile(segmentInfos.info(0))))))) {
       int minSegment = segmentInfos.size() - mergeFactor;
       mergeSegments(segmentInfos, minSegment < 0 ? 0 : minSegment, segmentInfos.size());
     }
@@ -1127,7 +1127,7 @@
       int docCount = merger.merge();                // merge 'em
       segmentInfos.setSize(0);                      // pop old infos & add new
-      info = new SegmentInfo(mergedName, docCount, directory, false);
+      info = new SegmentInfo(mergedName, docCount, directory, false, true);
       segmentInfos.addElement(info);
       commitPending = true;
@@ -1347,7 +1347,7 @@
     }
 
     newSegment = new SegmentInfo(mergedName, mergedDocCount,
-                                 directory, false);
+                                 directory, false, true);
 
     if (sourceSegments == ramSegmentInfos) {

Index: src/java/org/apache/lucene/index/SegmentMerger.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentMerger.java	(revision 492322)
+++ src/java/org/apache/lucene/index/SegmentMerger.java	(working copy)
@@ -40,6 +40,10 @@
  * @see #add
 */
 final class SegmentMerger {
+
+  /** norms header placeholder */
+  static final byte[] NORMS_HEADER = new byte[]{'N','R','M',-1};
+
   private Directory directory;
   private String segment;
   private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
@@ -116,7 +120,7 @@
       new CompoundFileWriter(directory, fileName);
 
     Vector files =
-      new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + fieldInfos.size());
+      new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + 1);
 
     // Basic files
     for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
@@ -127,7 +131,8 @@
     for (int i = 0; i < fieldInfos.size(); i++) {
       FieldInfo fi = fieldInfos.fieldInfo(i);
       if (fi.isIndexed && !fi.omitNorms) {
-        files.add(segment + ".f" + i);
+        files.add(segment + "." + IndexFileNames.NORMS_EXTENSION);
+        break;
       }
     }
 
@@ -408,11 +413,15 @@
 
   private void mergeNorms() throws IOException {
     byte[] normBuffer = null;
-    for (int i = 0; i < fieldInfos.size(); i++) {
-      FieldInfo fi = fieldInfos.fieldInfo(i);
-      if (fi.isIndexed && !fi.omitNorms) {
-        IndexOutput output = directory.createOutput(segment + ".f" + i);
-        try {
+    IndexOutput output = null;
+    try {
+      for (int i = 0; i < fieldInfos.size(); i++) {
+        FieldInfo fi = fieldInfos.fieldInfo(i);
+        if (fi.isIndexed && !fi.omitNorms) {
+          if (output == null) {
+            output = directory.createOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
+            output.writeBytes(NORMS_HEADER, NORMS_HEADER.length);
+          }
           for (int j = 0; j < readers.size(); j++) {
             IndexReader reader = (IndexReader) readers.elementAt(j);
             int maxDoc = reader.maxDoc();
@@ -434,10 +443,12 @@
               }
             }
           }
-        } finally {
-          output.close();
         }
       }
+    } finally {
+      if (output != null) {
+        output.close();
+      }
     }
   }

Index: src/java/org/apache/lucene/index/SegmentInfo.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentInfo.java	(revision 492322)
+++ src/java/org/apache/lucene/index/SegmentInfo.java	(working copy)
@@ -42,8 +42,13 @@
 
   private byte isCompoundFile;                    // -1 if it is not; 1 if it is; 0 if it's
                                                   // pre-2.1 (ie, must check file system to see
-                                                  // if .cfs exists)
+                                                  // if .cfs and .nrm exist)
 
+  private byte withNrm;                           // 1 if this segment maintains norms in a single file;
+                                                  // -1 if not; 0 if a file check is required to tell.
+                                                  // would be -1 for segments populated by DocumentWriter.
+                                                  // would be 1 for (newly created) segments resulting
+                                                  // from a merge (both compound and non-compound).
+
   public SegmentInfo(String name, int docCount, Directory dir) {
     this.name = name;
     this.docCount = docCount;
@@ -51,14 +56,13 @@
     delGen = -1;
     isCompoundFile = 0;
     preLockless = true;
+    withNrm = 0;
   }
-  public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile) {
+
+  public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean withNrm) {
     this(name, docCount, dir);
-    if (isCompoundFile) {
-      this.isCompoundFile = 1;
-    } else {
-      this.isCompoundFile = -1;
-    }
+    this.isCompoundFile = (byte) (isCompoundFile ? 1 : -1);
+    this.withNrm = (byte) (withNrm ? 1 : -1);
     preLockless = false;
   }
 
@@ -78,6 +82,7 @@
       System.arraycopy(src.normGen, 0, normGen, 0, src.normGen.length);
     }
     isCompoundFile = src.isCompoundFile;
+    withNrm = src.withNrm;
   }
 
   /**
@@ -111,19 +116,20 @@
       isCompoundFile = 0;
       preLockless = true;
     }
+    withNrm = 0;
   }
 
-  void setNumField(int numField) {
+  void setNumFields(int numFields) {
     if (normGen == null) {
       // normGen is null if we loaded a pre-2.1 segment
       // file, or, if this segments file hasn't had any
       // norms set against it yet:
-      normGen = new long[numField];
+      normGen = new long[numFields];
 
       if (!preLockless) {
         // This is a FORMAT_LOCKLESS segment, which means
         // there are no norms:
-        for(int i=0;i
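With this change, a merge writes all of a segment's norms into one file, segment.nrm: the 4-byte NORMS_HEADER {'N','R','M', version} (version is -1 in this patch), followed by maxDoc() norm bytes for each indexed field that has norms, in field-number order. As a rough picture of that layout, here is a minimal reader sketch; it is not part of the patch, the class and helper names are hypothetical, and it assumes the caller already knows the segment's maxDoc and its ordered list of fields-with-norms (which Lucene itself gets from the .fnm file).

import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

// Hypothetical sketch of reading back the post-patch single norms file.
class NrmLayoutSketch {

  // Returns a map of field name -> maxDoc norm bytes, read in field-number order.
  static Map readNorms(String nrmPath, String[] fieldsWithNorms, int maxDoc)
      throws IOException {
    DataInputStream in = new DataInputStream(new FileInputStream(nrmPath));
    try {
      byte[] header = new byte[4];
      in.readFully(header);                      // {'N','R','M', version}
      if (header[0] != 'N' || header[1] != 'R' || header[2] != 'M') {
        throw new IOException("not a norms file: " + nrmPath);
      }
      byte version = header[3];                  // -1 in this patch; unused here
      Map norms = new HashMap();
      for (int i = 0; i < fieldsWithNorms.length; i++) {
        byte[] b = new byte[maxDoc];             // one norm byte per document
        in.readFully(b);
        norms.put(fieldsWithNorms[i], b);
      }
      return norms;
    } finally {
      in.close();
    }
  }
}

Note the design choice visible in the SegmentMerger diff: the compound-file list gets a single .nrm entry (hence the break after the first field with norms), and mergeNorms creates its output lazily, so a merge opens one norms stream no matter how many fields carry norms, instead of one .fN file per field.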
 Normalization Factors
 
-There's a norm file for each indexed field with a byte for
+
+    Pre-2.1:
+    There's a norm file for each indexed field with a byte for
 each document. The .f[0-9]* file contains, for each document, a byte
 that encodes a value that is multiplied into the score for hits on
 that field:
@@ -1406,6 +1408,27 @@
 (.f[0-9]*) --> <Byte> SegSize
+
+    2.1 and above:
+    There's a single .nrm file containing all norms:
+
+    AllNorms (.nrm) --> NormsHeader,<Norms> NumFieldsWithNorms
+
+    Norms --> <Byte> SegSize
+
+    NormsHeader --> 'N','R','M',Version
+
+    Version --> Byte
+
+    NormsHeader has 4 bytes, last of which is the format version for
+    this file, currently -1.
+
 Each byte encodes a floating point value. Bits 0-2 contain the 3-bit
 mantissa, and bits 3-7 contain the 5-bit exponent.
@@ -1441,6 +1464,18 @@
+
 A separate norm file is created when the norm values of an existing
 segment are modified.
+When field N is modified, a separate norm file .sN
+is created, to maintain the norm values for that field.
+
+    Pre-2.1:
+    Separate norm files are created only for compound segments.
+
+    2.1 and above:
+    Separate norm files are created (when needed) for both compound
+    and non-compound segments.
+
 
 Term Vectors
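Since the file-formats text above leans on the one-byte float encoding (3 mantissa bits in bits 0-2, 5 exponent bits in bits 3-7), here is a self-contained sketch of that scheme, the same small-float conversion behind the Similarity.encodeNorm/decodeNorm calls that TestNorms exercises. The constants follow Lucene's "315" small-float layout; treat the class as an illustration, not the shipped code.

// Illustrative one-byte norm codec: 3-bit mantissa (bits 0-2), 5-bit exponent (bits 3-7).
public class NormCodecSketch {

  // Encode a float into a single norm byte (lossy: only 256 distinct values exist).
  static byte encodeNorm(float f) {
    int bits = Float.floatToIntBits(f);
    int smallfloat = bits >> (24 - 3);              // keep top 3 mantissa bits + exponent
    if (smallfloat < (63 - 15) << 3) {
      return (bits <= 0) ? (byte) 0 : (byte) 1;     // zero or underflow
    }
    if (smallfloat >= ((63 - 15) << 3) + 0x100) {
      return -1;                                    // overflow: largest representable value
    }
    return (byte) (smallfloat - ((63 - 15) << 3));  // re-bias exponent into 5 bits
  }

  // Decode the norm byte back to a float.
  static float decodeNorm(byte b) {
    if (b == 0) return 0.0f;                        // 0 is a special case
    int bits = (b & 0xff) << (24 - 3);              // restore mantissa/exponent position
    bits += (63 - 15) << 24;                        // undo the exponent re-bias
    return Float.intBitsToFloat(bits);
  }

  public static void main(String[] args) {
    // Round-tripping quantizes: nearby boosts collapse onto the same byte, which is
    // exactly why TestNorms.nextNorm() feeds each candidate through encode/decode
    // before storing it as an expected value.
    System.out.println(decodeNorm(encodeNorm(1.0f)));   // prints 1.0
    System.out.println(decodeNorm(encodeNorm(1.01f)));  // also prints 1.0
  }
}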