Index: src/test/org/apache/lucene/index/TestOmitTf.java
===================================================================
--- src/test/org/apache/lucene/index/TestOmitTf.java	(revision 0)
+++ src/test/org/apache/lucene/index/TestOmitTf.java	(revision 0)
@@ -0,0 +1,318 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collection;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.HitCollector;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MockRAMDirectory;
+
+
+public class TestOmitTf extends LuceneTestCase {
+
+  public static class SimpleSimilarity extends Similarity {
+    public float lengthNorm(String field, int numTerms) { return 1.0f; }
+    public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
+
+    public float tf(float freq) { return freq; }
+
+    public float sloppyFreq(int distance) { return 2.0f; }
+    public float idf(Collection terms, Searcher searcher) { return 1.0f; }
+    public float idf(int docFreq, int numDocs) { return 1.0f; }
+    public float coord(int overlap, int maxOverlap) { return 1.0f; }
+  }
+
+
+  // Tests whether the DocumentWriter correctly enables the
+  // omitTf bit in the FieldInfo
+  public void testOmitTf() throws Exception {
+    Directory ram = new MockRAMDirectory();
+    Analyzer analyzer = new StandardAnalyzer();
+    IndexWriter writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+    Document d = new Document();
+
+    // this field will have Tf
+    Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.TOKENIZED);
+    d.add(f1);
+
+    // this field will NOT have Tf
+    Field f2 = new Field("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.TOKENIZED);
+    f2.setOmitTf(true);
+    d.add(f2);
+
+    writer.addDocument(d);
+    writer.optimize();
+    // now we add another document which has term freq for field f2 and not for f1,
+    // and verify that the SegmentMerger keeps things consistent
+    d = new Document();
+
+    // Reverse
+    f1.setOmitTf(true);
+    d.add(f1);
+
+    f2.setOmitTf(false);
+    d.add(f2);
+
+    writer.addDocument(d);
+    // force merge
+    writer.optimize();
+    // flush
+    writer.close();
+    _TestUtil.checkIndex(ram);
+
+    // only one segment in the index, so we can cast to SegmentReader
+    SegmentReader reader = (SegmentReader) IndexReader.open(ram);
+    FieldInfos fi = reader.fieldInfos();
+    assertTrue("OmitTf field bit should be set.", fi.fieldInfo("f1").omitTf);
+    assertTrue("OmitTf field bit should be set.", fi.fieldInfo("f2").omitTf);
+
+    reader.close();
+    ram.close();
+  }
+
+  // Tests whether merging of docs that have different
+  // omitTf for the same field works
+  public void testMixedMerge() throws Exception {
+    Directory ram = new MockRAMDirectory();
+    Analyzer analyzer = new StandardAnalyzer();
+    IndexWriter writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+    writer.setMaxBufferedDocs(3);
+    writer.setMergeFactor(2);
+    Document d = new Document();
+
+    // this field will have Tf
+    Field f1 = new Field("f1", "This field has term freqs", Field.Store.NO, Field.Index.TOKENIZED);
+    d.add(f1);
+
+    // this field will NOT have Tf
+    Field f2 = new Field("f2", "This field has NO Tf in all docs", Field.Store.NO, Field.Index.TOKENIZED);
+    f2.setOmitTf(true);
+    d.add(f2);
+
+    for(int i=0;i<30;i++)
+      writer.addDocument(d);
+
+    // now we add another document which has term freq for field f2 and not for f1,
+    // and verify that the SegmentMerger keeps things consistent
+    d = new Document();
+
+    // Reverse
+    f1.setOmitTf(true);
+    d.add(f1);
+
+    f2.setOmitTf(false);
+    d.add(f2);
+
+    for(int i=0;i<30;i++)
+      writer.addDocument(d);
+
+    // force merge
+    writer.optimize();
+    // flush
+    writer.close();
+
+    _TestUtil.checkIndex(ram);
+
+    // only one segment in the index, so we can cast to SegmentReader
+    SegmentReader reader = (SegmentReader) IndexReader.open(ram);
+    FieldInfos fi = reader.fieldInfos();
+    assertTrue("OmitTf field bit should be set.", fi.fieldInfo("f1").omitTf);
+    assertTrue("OmitTf field bit should be set.", fi.fieldInfo("f2").omitTf);
+
+    reader.close();
+    ram.close();
+  }
+
+  private void assertNoPrx(Directory dir) throws Throwable {
+    final String[] files = dir.list();
+    for(int i=0;i<files.length;i++)
+      assertFalse(files[i].endsWith(".prx"));
+  }
Index: src/java/org/apache/lucene/index/SegmentTermPositions.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentTermPositions.java	(revision 678362)
+++ src/java/org/apache/lucene/index/SegmentTermPositions.java	(working copy)
     for (int f = freq; f > 0; f--) {     // skip unread positions
       readDeltaPosition();
       skipPayload();
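For context, this is roughly how an application opts a field out of term frequencies once this patch is applied; a minimal sketch against the 2.4-era API used by the tests above (directory and field names are made up):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class OmitTfUsage {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true,
                                         IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();

    // A pure "filter" field: we only care whether a doc matches, so
    // term freq, positions and payloads are dead weight in the index.
    Field category = new Field("category", "sports",
                               Field.Store.NO, Field.Index.UN_TOKENIZED);
    category.setOmitTf(true);   // drops tf/prox for this field
    doc.add(category);

    writer.addDocument(doc);
    writer.close();
    dir.close();
  }
}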
Index: src/java/org/apache/lucene/index/SegmentMerger.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentMerger.java	(revision 678362)
+++ src/java/org/apache/lucene/index/SegmentMerger.java	(working copy)
@@ -83,6 +83,10 @@
       checkAbort = new CheckAbort(merge, directory);
     termIndexInterval = writer.getTermIndexInterval();
   }
+
+  boolean hasProx() {
+    return fieldInfos.hasProx();
+  }
 
   /**
    * Add an IndexReader to the collection of readers that are to be merged
@@ -164,6 +168,10 @@
     // Basic files
     for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
       String ext = IndexFileNames.COMPOUND_EXTENSIONS[i];
+
+      if (ext.equals(IndexFileNames.PROX_EXTENSION) && !hasProx())
+        continue;
+
       if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) &&
                              !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION)))
         files.add(segment + "." + ext);
@@ -198,11 +206,11 @@
   }
 
   private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
-                          boolean storeOffsetWithTermVector, boolean storePayloads) throws IOException {
+                          boolean storeOffsetWithTermVector, boolean storePayloads, boolean omitTf) throws IOException {
     Iterator i = names.iterator();
     while (i.hasNext()) {
       String field = (String)i.next();
-      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads);
+      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads, omitTf);
     }
   }
@@ -265,15 +273,16 @@
         SegmentReader segmentReader = (SegmentReader) reader;
         for (int j = 0; j < segmentReader.getFieldInfos().size(); j++) {
           FieldInfo fi = segmentReader.getFieldInfos().fieldInfo(j);
-          fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.hasNorms(fi.name), fi.storePayloads);
+          fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.hasNorms(fi.name), fi.storePayloads, fi.omitTf);
         }
       } else {
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
-        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.OMIT_TF), false, false, false, false, true);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true, false);
+        addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false, false);
         fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
       }
     }
@@ -477,7 +486,8 @@
   private final void mergeTerms() throws CorruptIndexException, IOException {
     try {
       freqOutput = directory.createOutput(segment + ".frq");
-      proxOutput = directory.createOutput(segment + ".prx");
+      if (hasProx())
+        proxOutput = directory.createOutput(segment + ".prx");
       termInfosWriter =
               new TermInfosWriter(directory, segment, fieldInfos,
                                   termIndexInterval);
@@ -561,11 +571,20 @@
    */
   private final int mergeTermInfo(SegmentMergeInfo[] smis, int n)
         throws CorruptIndexException, IOException {
-    long freqPointer = freqOutput.getFilePointer();
-    long proxPointer = proxOutput.getFilePointer();
+    final long freqPointer = freqOutput.getFilePointer();
+    final long proxPointer;
+    if (proxOutput != null)
+      proxPointer = proxOutput.getFilePointer();
+    else
+      proxPointer = 0;
 
-    int df = appendPostings(smis, n);		  // append posting data
-
+    int df;
+    if (fieldInfos.fieldInfo(smis[0].term.field).omitTf) {   // append posting data
+      df = appendPostingsNoTf(smis, n);
+    } else {
+      df = appendPostings(smis, n);
+    }
+
     long skipPointer = skipListWriter.writeSkip(freqOutput);
 
     if (df > 0) {
@@ -672,6 +691,53 @@
     return df;
   }
 
+  /** Process postings from multiple segments without tf, all positioned on the
+   *  same term. Writes the merged entries only to freqOutput; proxOut is not
+   *  written to.
+   *
+   * @param smis array of segments
+   * @param n number of cells in the array actually occupied
+   * @return number of documents across all segments where this term was found
+   * @throws CorruptIndexException if the index is corrupt
+   * @throws IOException if there is a low-level IO error
+   */
+  private final int appendPostingsNoTf(SegmentMergeInfo[] smis, int n)
+        throws CorruptIndexException, IOException {
+    int lastDoc = 0;
+    int df = 0;                                   // number of docs w/ term
+    skipListWriter.resetSkip();
+    int lastPayloadLength = -1;                   // ensures that we write the first length
+    for (int i = 0; i < n; i++) {
+      SegmentMergeInfo smi = smis[i];
+      TermPositions postings = smi.getPositions();
+      assert postings != null;
+      int base = smi.base;
+      int[] docMap = smi.getDocMap();
+      postings.seek(smi.termEnum);
+      while (postings.next()) {
+        int doc = postings.doc();
+        if (docMap != null)
+          doc = docMap[doc];                      // map around deletions
+        doc += base;                              // convert to merged space
+
+        if (doc < 0 || (df > 0 && doc <= lastDoc))
+          throw new CorruptIndexException("docs out of order (" + doc +
+              " <= " + lastDoc + " )");
+
+        df++;
+
+        if ((df % skipInterval) == 0) {
+          skipListWriter.setSkipData(lastDoc, false, lastPayloadLength);
+          skipListWriter.bufferSkip(df);
+        }
+
+        int docCode = (doc - lastDoc);
+        lastDoc = doc;
+        freqOutput.writeVInt(docCode);            // write doc & freq=1
+      }
+    }
+    return df;
+  }
+
   private void mergeNorms() throws IOException {
     byte[] normBuffer = null;
     IndexOutput output = null;
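For readers new to the postings format, the essential difference the merger (and the flush path below) must honor is the .frq encoding: with tf, each entry is a shifted doc delta whose low bit flags freq==1, optionally followed by the freq; with omitTf it is just the raw doc delta. A self-contained sketch of both encodings, using plain ints to stand in for Lucene's VInts rather than the IndexOutput API:

import java.util.ArrayList;
import java.util.List;

public class FreqStreamSketch {
  // With tf: delta<<1 | 1 means freq==1; otherwise delta<<1 followed by freq.
  static List<Integer> encodeWithTf(int[] docs, int[] freqs) {
    List<Integer> out = new ArrayList<Integer>();
    int lastDoc = 0;
    for (int i = 0; i < docs.length; i++) {
      int delta = docs[i] - lastDoc;
      lastDoc = docs[i];
      if (freqs[i] == 1) {
        out.add((delta << 1) | 1);       // low bit set: freq is one
      } else {
        out.add(delta << 1);             // low bit clear: freq follows
        out.add(freqs[i]);
      }
    }
    return out;
  }

  // With omitTf: just the plain doc delta; freq is implicitly 1.
  static List<Integer> encodeNoTf(int[] docs) {
    List<Integer> out = new ArrayList<Integer>();
    int lastDoc = 0;
    for (int i = 0; i < docs.length; i++) {
      out.add(docs[i] - lastDoc);
      lastDoc = docs[i];
    }
    return out;
  }

  public static void main(String[] args) {
    int[] docs = {3, 7, 8};
    int[] freqs = {1, 4, 1};
    System.out.println(encodeWithTf(docs, freqs)); // [7, 8, 4, 3]
    System.out.println(encodeNoTf(docs));          // [3, 4, 1]
  }
}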
Index: src/java/org/apache/lucene/index/TermsHashPerField.java
===================================================================
--- src/java/org/apache/lucene/index/TermsHashPerField.java	(revision 678362)
+++ src/java/org/apache/lucene/index/TermsHashPerField.java	(working copy)
@@ -57,9 +57,9 @@
     bytePool = perThread.bytePool;
     docState = perThread.docState;
     fieldState = docInverterPerField.fieldState;
-    streamCount = perThread.termsHash.streamCount;
+    this.consumer = perThread.consumer.addField(this, fieldInfo);
+    streamCount = consumer.getStreamCount();
     numPostingInt = 2*streamCount;
-    this.consumer = perThread.consumer.addField(this, fieldInfo);
     this.fieldInfo = fieldInfo;
     if (nextPerThread != null)
       nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
Index: src/java/org/apache/lucene/index/SegmentInfo.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentInfo.java	(revision 678362)
+++ src/java/org/apache/lucene/index/SegmentInfo.java	(working copy)
@@ -77,6 +77,8 @@
   private int delCount;                           // How many deleted docs in this segment, or -1 if not yet known
                                                   // (if it's an older index)
 
+  private boolean hasProx;                        // True if this segment has any fields with omitTf==false
+
   public SegmentInfo(String name, int docCount, Directory dir) {
     this.name = name;
     this.docCount = docCount;
@@ -89,14 +91,15 @@
     docStoreSegment = name;
     docStoreIsCompoundFile = false;
     delCount = 0;
+    hasProx = true;
   }
 
   public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) {
-    this(name, docCount, dir, isCompoundFile, hasSingleNormFile, -1, null, false);
+    this(name, docCount, dir, isCompoundFile, hasSingleNormFile, -1, null, false, true);
   }
 
   public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile,
-                     int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile) {
+                     int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx) {
     this(name, docCount, dir);
     this.isCompoundFile = (byte) (isCompoundFile ? YES : NO);
     this.hasSingleNormFile = hasSingleNormFile;
@@ -104,6 +107,7 @@
     this.docStoreOffset = docStoreOffset;
     this.docStoreSegment = docStoreSegment;
     this.docStoreIsCompoundFile = docStoreIsCompoundFile;
+    this.hasProx = hasProx;
     delCount = 0;
     assert docStoreOffset == -1 || docStoreSegment != null;
   }
@@ -180,6 +184,10 @@
         assert delCount <= docCount;
       } else
         delCount = -1;
+      if (format <= SegmentInfos.FORMAT_HAS_PROX)
+        hasProx = input.readByte() == 1;
+      else
+        hasProx = true;
     } else {
       delGen = CHECK_DIR;
       normGen = null;
@@ -190,6 +198,7 @@
       docStoreIsCompoundFile = false;
       docStoreSegment = null;
       delCount = -1;
+      hasProx = true;
     }
   }
 
@@ -507,8 +516,18 @@
     }
     output.writeByte(isCompoundFile);
     output.writeInt(delCount);
+    output.writeByte((byte) (hasProx ? 1:0));
   }
 
+  void setHasProx(boolean hasProx) {
+    this.hasProx = hasProx;
+    clearFiles();
+  }
+
+  boolean getHasProx() {
+    return hasProx;
+  }
+
   private void addIfExists(List files, String fileName) throws IOException {
     if (dir.fileExists(fileName))
       files.add(fileName);
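Because newer segments-file formats use more negative constants, pre-FORMAT_HAS_PROX segments simply lack the new byte and must default to hasProx=true. A standalone sketch of that read path, using java.io.DataInput in place of Lucene's IndexInput:

import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;

public class HasProxReadSketch {
  static final int FORMAT_DEL_COUNT = -6;
  static final int FORMAT_HAS_PROX = -7;   // newer formats are more negative

  // Returns the hasProx flag for a segment written with the given format.
  static boolean readHasProx(int format, DataInput in) throws IOException {
    if (format <= FORMAT_HAS_PROX)          // format is new enough: byte is present
      return in.readByte() == 1;
    return true;                            // older segment: assume prox exists
  }

  public static void main(String[] args) throws IOException {
    DataInput newSeg = new DataInputStream(new ByteArrayInputStream(new byte[]{0}));
    System.out.println(readHasProx(FORMAT_HAS_PROX, newSeg));  // false: flag read
    DataInput oldSeg = new DataInputStream(new ByteArrayInputStream(new byte[0]));
    System.out.println(readHasProx(FORMAT_DEL_COUNT, oldSeg)); // true: defaulted
  }
}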
Index: src/java/org/apache/lucene/index/FreqProxTermsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/FreqProxTermsWriter.java	(revision 678362)
+++ src/java/org/apache/lucene/index/FreqProxTermsWriter.java	(working copy)
@@ -31,9 +31,10 @@
 
 final class FreqProxTermsWriter extends TermsHashConsumer {
 
+  /*
   FreqProxTermsWriter() {
-    streamCount = 2;
   }
+  */
 
   public TermsHashConsumerPerThread addThread(TermsHashPerThread perThread) {
     return new FreqProxTermsWriterPerThread(perThread);
@@ -102,8 +103,13 @@
                                  state.docWriter.writer.getTermIndexInterval());
 
     final IndexOutput freqOut = state.directory.createOutput(state.segmentFileName(IndexFileNames.FREQ_EXTENSION));
-    final IndexOutput proxOut = state.directory.createOutput(state.segmentFileName(IndexFileNames.PROX_EXTENSION));
+    final IndexOutput proxOut;
 
+    if (fieldInfos.hasProx())
+      proxOut = state.directory.createOutput(state.segmentFileName(IndexFileNames.PROX_EXTENSION));
+    else
+      proxOut = null;
+
     final DefaultSkipListWriter skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval,
                                                                            termsOut.maxSkipLevels,
                                                                            state.numDocsInRAM, freqOut, proxOut);
@@ -148,13 +154,15 @@
     }
 
     freqOut.close();
-    proxOut.close();
+    if (proxOut != null) {
+      state.flushedFiles.add(state.segmentFileName(IndexFileNames.PROX_EXTENSION));
+      proxOut.close();
+    }
     termsOut.close();
 
     // Record all files we have flushed
     state.flushedFiles.add(state.segmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION));
     state.flushedFiles.add(state.segmentFileName(IndexFileNames.FREQ_EXTENSION));
-    state.flushedFiles.add(state.segmentFileName(IndexFileNames.PROX_EXTENSION));
     state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION));
     state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));
   }
@@ -205,8 +213,12 @@
     }
 
     final int skipInterval = termsOut.skipInterval;
-    final boolean currentFieldStorePayloads = fields[0].fieldInfo.storePayloads;
+    final boolean currentFieldOmitTf = fields[0].fieldInfo.omitTf;
 
+    // If current field omits tf then it cannot store
+    // payloads.  We silently drop the payloads in this case:
+    final boolean currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;
+
     FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];
 
     while(numFields > 0) {
@@ -235,8 +247,12 @@
       final char[] text = termStates[0].text;
       final int start = termStates[0].textOffset;
 
-      long freqPointer = freqOut.getFilePointer();
-      long proxPointer = proxOut.getFilePointer();
+      final long freqPointer = freqOut.getFilePointer();
+      final long proxPointer;
+      if (proxOut != null)
+        proxPointer = proxOut.getFilePointer();
+      else
+        proxPointer = 0;
 
       skipListWriter.resetSkip();
 
@@ -261,45 +277,53 @@
         assert doc < flushState.numDocsInRAM;
         assert doc > lastDoc || df == 1;
 
-        final int newDocCode = (doc-lastDoc)<<1;
-
-        lastDoc = doc;
-
         final ByteSliceReader prox = minState.prox;
 
         // Carefully copy over the prox + payload info,
         // changing the format to match Lucene's segment
        // format.
-        for(int j=0;j<termDocFreq;j++) {
-          final int code = prox.readVInt();
-          if (currentFieldStorePayloads) {
-            final int payloadLength;
-            if ((code & 1) != 0)
-              // This position has a payload
-              payloadLength = prox.readVInt();
-            else
-              payloadLength = 0;
-            if (payloadLength != lastPayloadLength) {
-              proxOut.writeVInt(code|1);
-              proxOut.writeVInt(payloadLength);
-              lastPayloadLength = payloadLength;
-            } else
-              proxOut.writeVInt(code & (~1));
-            if (payloadLength > 0)
-              copyBytes(prox, proxOut, payloadLength);
-          } else {
-            assert 0 == (code & 1);
-            proxOut.writeVInt(code>>1);
+        if (!currentFieldOmitTf) {
+          // omitTf == false so we do write positions & payload
+          assert proxOut != null;
+          for(int j=0;j<termDocFreq;j++) {
+            final int code = prox.readVInt();
+            if (currentFieldStorePayloads) {
+              final int payloadLength;
+              if ((code & 1) != 0)
+                // This position has a payload
+                payloadLength = prox.readVInt();
+              else
+                payloadLength = 0;
+              if (payloadLength != lastPayloadLength) {
+                proxOut.writeVInt(code|1);
+                proxOut.writeVInt(payloadLength);
+                lastPayloadLength = payloadLength;
+              } else
+                proxOut.writeVInt(code & (~1));
+              if (payloadLength > 0)
+                copyBytes(prox, proxOut, payloadLength);
+            } else {
+              assert 0 == (code & 1);
+              proxOut.writeVInt(code>>1);
+            }
+          } //End for
+
+          final int newDocCode = (doc-lastDoc)<<1;
+
+          if (1 == termDocFreq) {
+            freqOut.writeVInt(newDocCode|1);
+          } else {
+            freqOut.writeVInt(newDocCode);
+            freqOut.writeVInt(termDocFreq);
           }
-        }
-
-        if (1 == termDocFreq) {
-          freqOut.writeVInt(newDocCode|1);
         } else {
-          freqOut.writeVInt(newDocCode);
-          freqOut.writeVInt(termDocFreq);
+          // omitTf==true: we store only the docs, without
+          // term freq, positions, payloads
+          freqOut.writeVInt(doc-lastDoc);
         }
 
+        lastDoc = doc;
+
         if (!minState.nextDoc()) {
 
           // Remove from termStates
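One subtlety in the hunk above: omitTf takes precedence over storePayloads, so payloads indexed into an omitTf field are silently discarded rather than rejected. A two-line sketch of that precedence rule (hypothetical helper, not a Lucene API):

public class PayloadPrecedenceSketch {
  // Mirrors: currentFieldStorePayloads = currentFieldOmitTf ? false : storePayloads
  static boolean effectiveStorePayloads(boolean omitTf, boolean storePayloads) {
    return omitTf ? false : storePayloads;   // omitTf silently drops payloads
  }

  public static void main(String[] args) {
    System.out.println(effectiveStorePayloads(true, true));   // false: dropped
    System.out.println(effectiveStorePayloads(false, true));  // true
  }
}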
Index: src/java/org/apache/lucene/index/SegmentReader.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentReader.java	(revision 678362)
+++ src/java/org/apache/lucene/index/SegmentReader.java	(working copy)
@@ -298,6 +298,12 @@
 
       fieldInfos = new FieldInfos(cfsDir, segment + ".fnm");
 
+      boolean anyProx = false;
+      final int numFields = fieldInfos.size();
+      for(int i=0;!anyProx && i<numFields;i++)
+        if (!fieldInfos.fieldInfo(i).omitTf)
+          anyProx = true;
+
Index: src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
===================================================================
--- src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java	(revision 678362)
+++ src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java	(working copy)
+    if (payload != null && payload.length > 0) {
+      termsHashPerField.writeVInt(1, (proxCode<<1)|1);
Index: src/java/org/apache/lucene/index/FieldInfo.java
===================================================================
--- src/java/org/apache/lucene/index/FieldInfo.java	(revision 678362)
+++ src/java/org/apache/lucene/index/FieldInfo.java	(working copy)
@@ -27,13 +27,14 @@
   boolean storeOffsetWithTermVector;
   boolean storePositionWithTermVector;
 
-  boolean omitNorms; // omit norms associated with indexed fields
+  boolean omitNorms; // omit norms associated with indexed fields
+  boolean omitTf;    // omit tf
 
   boolean storePayloads; // whether this field stores payloads together with term positions
 
   FieldInfo(String na, boolean tk, int nu, boolean storeTermVector,
             boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
-            boolean omitNorms, boolean storePayloads) {
+            boolean omitNorms, boolean storePayloads, boolean omitTf) {
     name = na;
     isIndexed = tk;
     number = nu;
@@ -42,15 +43,16 @@
     this.storePositionWithTermVector = storePositionWithTermVector;
     this.omitNorms = omitNorms;
     this.storePayloads = storePayloads;
+    this.omitTf = omitTf;
   }
 
   public Object clone() {
     return new FieldInfo(name, isIndexed, number, storeTermVector, storePositionWithTermVector,
-                         storeOffsetWithTermVector, omitNorms, storePayloads);
+                         storeOffsetWithTermVector, omitNorms, storePayloads, omitTf);
   }
 
   void update(boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector,
-              boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads) {
+              boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTf) {
     if (this.isIndexed != isIndexed) {
       this.isIndexed = true;                      // once indexed, always index
     }
@@ -66,6 +68,9 @@
     if (this.omitNorms != omitNorms) {
       this.omitNorms = false;                     // once norms are stored, always store
     }
+    if (this.omitTf != omitTf) {
+      this.omitTf = true;                         // if any instance requires omitTf, tf stays off for life
+    }
     if (this.storePayloads != storePayloads) {
       this.storePayloads = true;
     }
@@ -87,6 +92,9 @@
     if (omitNorms != other.omitNorms) {
       omitNorms = false;                          // once norms are stored, always store
     }
+    if (this.omitTf != other.omitTf) {
+      this.omitTf = true;                         // if any instance requires omitTf, tf stays off for life
+    }
     if (storePayloads != other.storePayloads) {
       storePayloads = true;
     }
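The merge rules above are deliberately asymmetric: on disagreement, omitNorms degrades to false (norms win) while omitTf escalates to true (omission wins), and both transitions are one-way. A tiny sketch of that lattice, using a hypothetical Flags holder rather than FieldInfo itself:

public class FieldFlagsMergeSketch {
  static class Flags {
    boolean omitNorms;
    boolean omitTf;
    Flags(boolean omitNorms, boolean omitTf) {
      this.omitNorms = omitNorms;
      this.omitTf = omitTf;
    }
    // Mirrors FieldInfo.update(): disagreements resolve one way only.
    void update(Flags other) {
      if (omitNorms != other.omitNorms)
        omitNorms = false;   // once any doc stores norms, norms are kept
      if (omitTf != other.omitTf)
        omitTf = true;       // once any doc omits tf, tf is gone for good
    }
  }

  public static void main(String[] args) {
    Flags f = new Flags(true, false);
    f.update(new Flags(false, true));
    // norms survive (omitNorms=false), tf does not (omitTf=true)
    System.out.println(f.omitNorms + " " + f.omitTf);   // false true
  }
}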
Index: src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java
===================================================================
--- src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java	(revision 678362)
+++ src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java	(working copy)
@@ -183,7 +183,7 @@
         // easily add it
         FieldInfo fi = fieldInfos.add(fieldName, field.isIndexed(), field.isTermVectorStored(),
                                       field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(),
-                                      field.getOmitNorms(), false);
+                                      field.getOmitNorms(), false, field.getOmitTf());
 
         fp = new DocFieldProcessorPerField(this, fi);
         fp.next = fieldHash[hashPos];
@@ -195,7 +195,7 @@
       } else
         fp.fieldInfo.update(field.isIndexed(), field.isTermVectorStored(),
                             field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(),
-                            field.getOmitNorms(), false);
+                            field.getOmitNorms(), false, field.getOmitTf());
 
       if (thisFieldGen != fp.lastGen) {
Index: src/java/org/apache/lucene/index/TermsHashConsumer.java
===================================================================
--- src/java/org/apache/lucene/index/TermsHashConsumer.java	(revision 678362)
+++ src/java/org/apache/lucene/index/TermsHashConsumer.java	(working copy)
@@ -28,8 +28,6 @@
   abstract void abort();
   abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException;
 
-  int streamCount;
-
   FieldInfos fieldInfos;
 
   void setFieldInfos(FieldInfos fieldInfos) {
Index: src/java/org/apache/lucene/index/TermVectorsTermsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsTermsWriter.java	(revision 678362)
+++ src/java/org/apache/lucene/index/TermVectorsTermsWriter.java	(working copy)
@@ -39,7 +39,6 @@
 
   public TermVectorsTermsWriter(DocumentsWriter docWriter) {
     this.docWriter = docWriter;
-    streamCount = 2;
   }
 
   public TermsHashConsumerPerThread addThread(TermsHashPerThread termsHashPerThread) {
Index: src/java/org/apache/lucene/index/TermsHashConsumerPerField.java
===================================================================
--- src/java/org/apache/lucene/index/TermsHashConsumerPerField.java	(revision 678362)
+++ src/java/org/apache/lucene/index/TermsHashConsumerPerField.java	(working copy)
@@ -32,4 +32,5 @@
   abstract void skippingLongTerm(Token t) throws IOException;
   abstract void newTerm(Token t, RawPostingList p) throws IOException;
   abstract void addTerm(Token t, RawPostingList p) throws IOException;
+  abstract int getStreamCount();
 }
Index: src/java/org/apache/lucene/index/IndexReader.java
===================================================================
--- src/java/org/apache/lucene/index/IndexReader.java	(revision 678362)
+++ src/java/org/apache/lucene/index/IndexReader.java	(working copy)
@@ -75,6 +75,8 @@
     public static final FieldOption INDEXED = new FieldOption ("INDEXED");
     /** All fields that store payloads */
     public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS");
+    /** All fields that omit tf */
+    public static final FieldOption OMIT_TF = new FieldOption ("OMIT_TF");
     /** All fields which are not indexed */
     public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED");
     /** All fields which are indexed with termvectors enabled */
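With the new FieldOption, callers can discover which fields dropped tf, e.g. to decide whether freq-based scoring on a field is meaningful. A small sketch (the index path is a placeholder):

import java.util.Collection;
import java.util.Iterator;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class ListOmitTfFields {
  public static void main(String[] args) throws Exception {
    // "/tmp/index" is a placeholder path for an existing index
    IndexReader reader = IndexReader.open(FSDirectory.getDirectory("/tmp/index"));
    Collection fields = reader.getFieldNames(IndexReader.FieldOption.OMIT_TF);
    for (Iterator it = fields.iterator(); it.hasNext();)
      System.out.println("tf omitted for field: " + it.next());
    reader.close();
  }
}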
Index: src/java/org/apache/lucene/index/DocumentsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentsWriter.java	(revision 678362)
+++ src/java/org/apache/lucene/index/DocumentsWriter.java	(working copy)
@@ -132,6 +132,8 @@
   boolean bufferIsFull;                   // True when it's time to write segment
   private boolean aborting;               // True if an abort is pending
 
+  private DocFieldProcessor docFieldProcessor;
+
   PrintStream infoStream;
   int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;
   Similarity similarity;
@@ -261,9 +263,15 @@
     final DocInverter docInverter = new DocInverter(termsHash, normsWriter);
     final StoredFieldsWriter fieldsWriter = new StoredFieldsWriter(this);
     final DocFieldConsumers docFieldConsumers = new DocFieldConsumers(docInverter, fieldsWriter);
-    consumer = new DocFieldProcessor(this, docFieldConsumers);
+    consumer = docFieldProcessor = new DocFieldProcessor(this, docFieldConsumers);
   }
 
+  /** Returns true if any of the fields in the current
+   *  buffered docs have omitTf==false */
+  boolean hasProx() {
+    return docFieldProcessor.fieldInfos.hasProx();
+  }
+
   /** If non-null, various details of indexing are printed
    *  here. */
   synchronized void setInfoStream(PrintStream infoStream) {
Index: src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java	(revision 678362)
+++ src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java	(working copy)
@@ -47,6 +47,10 @@
     fieldState = termsHashPerField.fieldState;
   }
 
+  int getStreamCount() {
+    return 2;
+  }
+
   boolean start(Fieldable[] fields, int count) {
     doVectors = false;
     doVectorPositions = false;
Index: src/java/org/apache/lucene/index/SegmentInfos.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentInfos.java	(revision 678362)
+++ src/java/org/apache/lucene/index/SegmentInfos.java	(working copy)
@@ -65,8 +65,13 @@
    * This way IndexWriter can efficiently report numDocs(). */
   public static final int FORMAT_DEL_COUNT = -6;
 
+  /** This format adds the boolean hasProx to record if any
+   *  fields in the segment store prox information (ie, have
+   *  omitTf==false) */
+  public static final int FORMAT_HAS_PROX = -7;
+
   /* This must always point to the most recent file format. */
-  static final int CURRENT_FORMAT = FORMAT_DEL_COUNT;
+  static final int CURRENT_FORMAT = FORMAT_HAS_PROX;
 
   public int counter = 0;    // used to name new segments
   /**
Index: src/java/org/apache/lucene/index/SegmentTermDocs.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentTermDocs.java	(revision 678362)
+++ src/java/org/apache/lucene/index/SegmentTermDocs.java	(working copy)
@@ -41,7 +41,8 @@
   private boolean haveSkipped;
 
   protected boolean currentFieldStoresPayloads;
-
+  protected boolean currentFieldOmitTf;
+
   protected SegmentTermDocs(SegmentReader parent) {
     this.parent = parent;
     this.freqStream = (IndexInput) parent.freqStream.clone();
@@ -75,6 +76,7 @@
   void seek(TermInfo ti, Term term) throws IOException {
     count = 0;
     FieldInfo fi = parent.fieldInfos.fieldInfo(term.field);
+    currentFieldOmitTf = (fi != null) ? fi.omitTf : false;
    currentFieldStoresPayloads = (fi != null) ? fi.storePayloads : false;
     if (ti == null) {
       df = 0;
@@ -105,14 +107,19 @@
     while (true) {
       if (count == df)
         return false;
-
-      int docCode = freqStream.readVInt();
-      doc += docCode >>> 1;       // shift off low bit
-      if ((docCode & 1) != 0)     // if low bit is set
-        freq = 1;                 // freq is one
-      else
-        freq = freqStream.readVInt();     // else read freq
-
+      final int docCode = freqStream.readVInt();
+
+      if (currentFieldOmitTf) {
+        doc += docCode;
+        freq = 1;
+      } else {
+        doc += docCode >>> 1;       // shift off low bit
+        if ((docCode & 1) != 0)     // if low bit is set
+          freq = 1;                 // freq is one
+        else
+          freq = freqStream.readVInt();     // else read freq
+      }
+
       count++;
 
       if (deletedDocs == null || !deletedDocs.get(doc))
@@ -126,27 +133,49 @@
   public int read(final int[] docs, final int[] freqs)
           throws IOException {
     final int length = docs.length;
+    if (currentFieldOmitTf) {
+      return readNoTf(docs, freqs, length);
+    } else {
+      int i = 0;
+      while (i < length && count < df) {
+        // manually inlined call to next() for speed
+        final int docCode = freqStream.readVInt();
+        doc += docCode >>> 1;       // shift off low bit
+        if ((docCode & 1) != 0)     // if low bit is set
+          freq = 1;                 // freq is one
+        else
+          freq = freqStream.readVInt();     // else read freq
+        count++;
+
+        if (deletedDocs == null || !deletedDocs.get(doc)) {
+          docs[i] = doc;
+          freqs[i] = freq;
+          ++i;
+        }
+      }
+      return i;
+    }
+  }
+
+  private final int readNoTf(final int[] docs, final int[] freqs, final int length) throws IOException {
     int i = 0;
     while (i < length && count < df) {
-      // manually inlined call to next() for speed
-      final int docCode = freqStream.readVInt();
-      doc += docCode >>> 1;       // shift off low bit
-      if ((docCode & 1) != 0)     // if low bit is set
-        freq = 1;                 // freq is one
-      else
-        freq = freqStream.readVInt();     // else read freq
+      doc += freqStream.readVInt();
       count++;
 
       if (deletedDocs == null || !deletedDocs.get(doc)) {
        docs[i] = doc;
-        freqs[i] = freq;
+        // Hardwire freq to 1 when term freqs were not
+        // stored in the index
+        freqs[i] = 1;
         ++i;
       }
     }
     return i;
   }
 
   /** Overridden by SegmentTermPositions to skip in prox stream. */
   protected void skipProx(long proxPointer, int payloadLength) throws IOException {}
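For callers nothing changes: TermDocs.read() still fills parallel doc/freq arrays, and omitTf fields simply report freq=1 everywhere. A short consumer-side sketch (the index path, field, and term are placeholders):

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.store.FSDirectory;

public class BulkTermDocsSketch {
  public static void main(String[] args) throws Exception {
    // "/tmp/index" and the term are placeholders
    IndexReader reader = IndexReader.open(FSDirectory.getDirectory("/tmp/index"));
    TermDocs td = reader.termDocs(new Term("category", "sports"));
    int[] docs = new int[32];
    int[] freqs = new int[32];
    int n;
    while ((n = td.read(docs, freqs)) > 0) {
      for (int i = 0; i < n; i++) {
        // For an omitTf field, freqs[i] is always 1 here.
        System.out.println("doc=" + docs[i] + " freq=" + freqs[i]);
      }
    }
    td.close();
    reader.close();
  }
}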
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java	(revision 678362)
+++ src/java/org/apache/lucene/index/IndexWriter.java	(working copy)
@@ -3054,7 +3054,7 @@
       synchronized(this) {
         segmentInfos.setSize(0);                      // pop old infos & add new
         info = new SegmentInfo(mergedName, docCount, directory, false, true,
-                               -1, null, false);
+                               -1, null, false, merger.hasProx());
         segmentInfos.addElement(info);
       }
@@ -3362,7 +3362,8 @@
                                       flushedDocCount,
                                       directory, false, true,
                                       docStoreOffset, docStoreSegment,
-                                      docStoreIsCompoundFile);
+                                      docStoreIsCompoundFile,
+                                      docWriter.hasProx());
       }
 
       docWriter.pushDeletes();
@@ -3600,6 +3601,8 @@
       }
     }
 
+    merge.info.setHasProx(merger.hasProx());
+
     segmentInfos.subList(start, start + merge.segments.size()).clear();
     segmentInfos.add(start, merge.info);
@@ -3890,7 +3893,8 @@
                                  directory, false, true,
                                  docStoreOffset,
                                  docStoreSegment,
-                                 docStoreIsCompoundFile);
+                                 docStoreIsCompoundFile,
+                                 false);
 
       // Also enroll the merged segment into mergingSegments;
       // this prevents it from getting selected for a merge
Index: src/java/org/apache/lucene/index/DefaultSkipListWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DefaultSkipListWriter.java	(revision 678362)
+++ src/java/org/apache/lucene/index/DefaultSkipListWriter.java	(working copy)
@@ -62,7 +62,8 @@
     this.curStorePayloads = storePayloads;
     this.curPayloadLength = payloadLength;
     this.curFreqPointer = freqOutput.getFilePointer();
-    this.curProxPointer = proxOutput.getFilePointer();
+    if (proxOutput != null)
+      this.curProxPointer = proxOutput.getFilePointer();
   }
 
   protected void resetSkip() {
@@ -70,7 +71,8 @@
     Arrays.fill(lastSkipDoc, 0);
     Arrays.fill(lastSkipPayloadLength, -1);  // we don't have to write the first length in the skip list
     Arrays.fill(lastSkipFreqPointer, freqOutput.getFilePointer());
-    Arrays.fill(lastSkipProxPointer, proxOutput.getFilePointer());
+    if (proxOutput != null)
+      Arrays.fill(lastSkipProxPointer, proxOutput.getFilePointer());
   }
 
   protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
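Why these null guards suffice: writeSkipData still emits a prox-pointer delta for every skip entry, but with no prox stream both curProxPointer and the lastSkipProxPointer baseline stay at 0, so the delta degenerates to 0. A toy model of one skip entry (the exact on-disk layout, e.g. the payload-length bit, is simplified away):

import java.util.Arrays;
import java.util.List;

public class SkipEntrySketch {
  // One skip entry as [docDelta, freqPointerDelta, proxPointerDelta].
  static List<Long> skipEntry(int docDelta, long curFreq, long lastFreq,
                              long curProx, long lastProx) {
    return Arrays.asList((long) docDelta, curFreq - lastFreq, curProx - lastProx);
  }

  public static void main(String[] args) {
    // Normal field: prox pointer advanced between skip points.
    System.out.println(skipEntry(16, 220L, 100L, 344L, 300L)); // [16, 120, 44]
    // omitTf segment: prox pointers pinned at 0, delta degenerates to 0.
    System.out.println(skipEntry(16, 196L, 100L, 0L, 0L));     // [16, 96, 0]
  }
}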
Index: src/java/org/apache/lucene/index/CheckIndex.java
===================================================================
--- src/java/org/apache/lucene/index/CheckIndex.java	(revision 678362)
+++ src/java/org/apache/lucene/index/CheckIndex.java	(working copy)
@@ -114,13 +114,12 @@
     else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE)
       sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]";
     else {
-      // LUCENE-1255: All versions before 2.3.2/2.4 were
-      // able to create position=-1 when the very first
-      // Token has positionIncrement 0
       if (format == SegmentInfos.FORMAT_CHECKSUM)
         sFormat = "FORMAT_CHECKSUM [Lucene 2.4]";
       else if (format == SegmentInfos.FORMAT_DEL_COUNT)
-        sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]";
+        sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]";
+      else if (format == SegmentInfos.FORMAT_HAS_PROX)
+        sFormat = "FORMAT_HAS_PROX [Lucene 2.4]";
       else if (format < SegmentInfos.CURRENT_FORMAT) {
         sFormat = "int=" + format + " [newer version of Lucene than this tool]";
         skip = true;
@@ -161,6 +160,7 @@
 
       try {
         out.println("    compound=" + info.getUseCompoundFile());
+        out.println("    hasProx=" + info.getHasProx());
         out.println("    numFiles=" + info.files().size());
         out.println("    size (MB)=" + nf.format(info.sizeInBytes()/(1024.*1024.)));
         final int docStoreOffset = info.getDocStoreOffset();
@@ -224,7 +224,7 @@
             final int doc = termPositions.doc();
             final int freq = termPositions.freq();
             if (doc <= lastDoc)
-              throw new RuntimeException("term " + term + ": doc " + doc + " < lastDoc " + lastDoc);
+              throw new RuntimeException("term " + term + ": doc " + doc + " <= lastDoc " + lastDoc);
             lastDoc = doc;
             if (freq <= 0)
               throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds");
Index: src/java/org/apache/lucene/document/AbstractField.java
===================================================================
--- src/java/org/apache/lucene/document/AbstractField.java	(revision 678362)
+++ src/java/org/apache/lucene/document/AbstractField.java	(working copy)
@@ -33,6 +33,7 @@
   protected boolean isBinary = false;
   protected boolean isCompressed = false;
   protected boolean lazy = false;
+  protected boolean omitTf = false;
   protected float boost = 1.0f;
   // the one and only data object for all different kind of field values
   protected Object fieldsData = null;
@@ -203,6 +204,9 @@
   /** True if norms are omitted for this indexed field */
   public boolean getOmitNorms() { return omitNorms; }
 
+  /** True if tf is omitted for this indexed field */
+  public boolean getOmitTf() { return omitTf; }
+
   /** Expert:
    *
    * If set, omit normalization factors associated with this indexed field.
@@ -210,6 +214,12 @@
    */
   public void setOmitNorms(boolean omitNorms) { this.omitNorms=omitNorms; }
 
+  /** Expert:
+   *
+   * If set, omit tf from postings of this indexed field.
+   */
+  public void setOmitTf(boolean omitTf) { this.omitTf=omitTf; }
+
   public boolean isLazy() {
     return lazy;
   }
@@ -257,6 +267,9 @@
     if (omitNorms) {
       result.append(",omitNorms");
     }
+    if (omitTf) {
+      result.append(",omitTf");
+    }
     if (lazy){
       result.append(",lazy");
     }
Index: src/java/org/apache/lucene/document/Fieldable.java
===================================================================
--- src/java/org/apache/lucene/document/Fieldable.java	(revision 678362)
+++ src/java/org/apache/lucene/document/Fieldable.java	(working copy)
@@ -133,6 +133,15 @@
    */
   void setOmitNorms(boolean omitNorms);
 
+  /** Expert:
+   *
+   * If set, omit term freq, positions and payloads from postings for this field.
+   */
+  void setOmitTf(boolean omitTf);
+
+  /** True if tf is omitted for this indexed field */
+  boolean getOmitTf();
+
   /**
   * Indicates whether a Field is Lazy or not.  The semantics of Lazy loading are such that if a Field is lazily loaded, retrieving
   * it's values via {@link #stringValue()} or {@link #binaryValue()} is only valid as long as the {@link org.apache.lucene.index.IndexReader} that