Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/TestExternalCodecs.java (revision 932780) +++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java (working copy) @@ -183,7 +183,7 @@ } @Override - public void finishTerm(BytesRef text, int numDocs) { + public void finishTerm(BytesRef text, int numDocs, long totalFreq) { assert numDocs > 0; assert numDocs == current.docs.size(); field.termToDocs.put(current.term, current); @@ -320,6 +320,11 @@ } @Override + public long totalFreq() { + throw new UnsupportedOperationException(); + } + + @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) { return new RAMDocsEnum(ramField.termToDocs.get(current), skipDocs); } Index: lucene/src/test/org/apache/lucene/index/TestCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestCodecs.java (revision 932780) +++ lucene/src/test/org/apache/lucene/index/TestCodecs.java (working copy) @@ -175,6 +175,7 @@ public void write(final TermsConsumer termsConsumer) throws Throwable { final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text); + long totalFreq = 0; for(int i=0;i subReaders = new ArrayList(); + ReaderUtil.gatherSubReaders(subReaders, topReader); + + for(IndexReader r : subReaders) { + Stats.Reader statsReader = r.getStatsReader(); + if (statsReader != null) { + Stats.FieldReader fieldStatsReader = statsReader.getField(field); + if (fieldStatsReader != null) { + final Stats.DocFieldStats stats = fieldStatsReader.getDocFieldStats(); + final int maxDoc = r.maxDoc(); + final Bits skipDocs = r.getDeletedDocs(); + for(int i=0;i byNumber = new ArrayList(); private final HashMap byName = new HashMap(); @@ -116,7 +117,8 @@ List fields = doc.getFields(); for (Fieldable field : fields) { add(field.name(), field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), - field.isStoreOffsetWithTermVector(), field.getOmitNorms(), false, field.getOmitTermFreqAndPositions()); + field.isStoreOffsetWithTermVector(), field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), + field.getIndexStats()); } } @@ -215,7 +217,7 @@ synchronized public void add(String name, boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) { add(name, isIndexed, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, false, false); + storeOffsetWithTermVector, omitNorms, false, false, false); } /** If the field is not yet known, adds it. If it is known, checks to make @@ -233,23 +235,25 @@ * @param omitTermFreqAndPositions true if term freqs should be omitted for this field */ synchronized public FieldInfo add(String name, boolean isIndexed, boolean storeTermVector, - boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, + boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, + boolean storeIndexStats) { FieldInfo fi = fieldInfo(name); if (fi == null) { - return addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + return addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, storeIndexStats); } else { - fi.update(isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + fi.update(isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, storeIndexStats); } return fi; } private FieldInfo addInternal(String name, boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, - boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, + boolean storeIndexStats) { name = StringHelper.intern(name); FieldInfo fi = new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, storeIndexStats); byNumber.add(fi); byName.put(name, fi); return fi; @@ -301,6 +305,17 @@ return hasVectors; } + public boolean hasStats() { + final int fieldCount = size(); + for(int i=0;i(si.files()); } + public Stats.Reader getStatsReader() throws IOException { + if (core.statsReader != null) { + return core.statsReader; + } else { + return null; + } + } + @Override public TermEnum terms() throws IOException { ensureOpen(); Index: lucene/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (working copy) @@ -194,7 +194,8 @@ // easily add it FieldInfo fi = fieldInfos.add(fieldName, field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), - field.getOmitNorms(), false, field.getOmitTermFreqAndPositions()); + field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), + field.getIndexStats()); fp = new DocFieldProcessorPerField(this, fi); fp.next = fieldHash[hashPos]; @@ -206,7 +207,8 @@ } else fp.fieldInfo.update(field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), - field.getOmitNorms(), false, field.getOmitTermFreqAndPositions()); + field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), + field.getIndexStats()); if (thisFieldGen != fp.lastGen) { Index: lucene/src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- lucene/src/java/org/apache/lucene/index/SegmentMerger.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -148,6 +148,7 @@ // threads. mergedDocs = mergeFields(); + mergeStats(); mergeTerms(); mergeNorms(); @@ -176,6 +177,9 @@ // Basic files for (String ext : IndexFileNames.COMPOUND_EXTENSIONS_NOT_CODEC) { + if (ext.equals(Stats.EXTENSION) && !fieldInfos.hasStats()) { + continue; + } if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) && !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION))) fileSet.add(IndexFileNames.segmentFileName(segment, ext)); @@ -212,14 +216,14 @@ } private void addIndexed(IndexReader reader, FieldInfos fInfos, - Collection names, boolean storeTermVectors, - boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean storePayloads, boolean omitTFAndPositions) + Collection names, boolean storeTermVectors, + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, + boolean storePayloads, boolean omitTFAndPositions, boolean storeIndexStats) throws IOException { for (String field : names) { fInfos.add(field, true, storeTermVectors, - storePositionWithTermVector, storeOffsetWithTermVector, !reader - .hasNorms(field), storePayloads, omitTFAndPositions); + storePositionWithTermVector, storeOffsetWithTermVector, !reader + .hasNorms(field), storePayloads, omitTFAndPositions, storeIndexStats); } } @@ -286,18 +290,18 @@ for (int j = 0; j < numReaderFieldInfos; j++) { FieldInfo fi = readerFieldInfos.fieldInfo(j); fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector, - fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, - !reader.hasNorms(fi.name), fi.storePayloads, - fi.omitTermFreqAndPositions); + fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, + !reader.hasNorms(fi.name), fi.storePayloads, + fi.omitTermFreqAndPositions, fi.storeIndexStats); } } else { - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false, false); fieldInfos.add(reader.getFieldNames(FieldOption.UNINDEXED), false); } } @@ -433,6 +437,66 @@ return docCount; } + private void mergeStats() throws IOException { + // naive -- would be better to be sparse wrt fieldCount + if (fieldInfos.hasStats()) { + + final int fieldCount = fieldInfos.size(); + Stats.Writer w = null; + + for(int field=0;field 0) { FreqProxFieldMergeState minState = termStates[0]; @@ -213,6 +213,7 @@ final int termDocFreq = minState.termFreq; numDocs++; + totalFreq += termDocFreq; assert minState.docID < flushedDocCount: "doc=" + minState.docID + " maxDoc=" + flushedDocCount; @@ -291,7 +292,7 @@ } assert numDocs > 0; - termsConsumer.finishTerm(text, numDocs); + termsConsumer.finishTerm(text, numDocs, totalFreq); } termsConsumer.finish(); Index: lucene/src/java/org/apache/lucene/index/TermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/TermsEnum.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/TermsEnum.java (working copy) @@ -99,6 +99,12 @@ * {@link SeekStatus#END}.*/ public abstract int docFreq(); + /** Optional: returns the total count of how many times + * this term occurs. While docFreq increments by 1 for + * each document the term occurs in, this count + * increments by the term's frequency in the document. */ + public abstract long totalFreq(); + /** Get {@link DocsEnum} for the current term. Do not * call this before calling {@link #next} or {@link * #seek} for the first time. This method will not @@ -154,6 +160,11 @@ } @Override + public long totalFreq() { + throw new IllegalStateException("this method should never be called"); + } + + @Override public long ord() { throw new IllegalStateException("this method should never be called"); } Index: lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (working copy) @@ -124,6 +124,7 @@ postings.docFreqs[termID] = 1; writeProx(termID, fieldState.position); } + fieldState.uniqueTermCount++; } @Override @@ -141,6 +142,7 @@ termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]); postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID]; postings.lastDocIDs[termID] = docState.docID; + fieldState.uniqueTermCount++; } } else { if (docState.docID != postings.lastDocIDs[termID]) { @@ -160,6 +162,7 @@ postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1; postings.lastDocIDs[termID] = docState.docID; writeProx(termID, fieldState.position); + fieldState.uniqueTermCount++; } else { postings.docFreqs[termID]++; writeProx(termID, fieldState.position-postings.lastPositions[termID]); Index: lucene/src/java/org/apache/lucene/index/Stats.java =================================================================== --- lucene/src/java/org/apache/lucene/index/Stats.java (revision 0) +++ lucene/src/java/org/apache/lucene/index/Stats.java (revision 0) @@ -0,0 +1,301 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Closeable; +import java.util.Map; +import java.util.HashMap; +import java.util.Arrays; + +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; + +/** + @lucene.experimental + */ + +public final class Stats { + + public static final String EXTENSION = "sts"; + public static final String CODEC_NAME = "Stats"; + + public static final int VERSION_START = 0; + public static final int VERSION_CURRENT = VERSION_START; + + public static class DocFieldStats { + public int termCount; // length in terms + public int overlapTermCount; // number of terms with 0 posIncr + public int uniqueTermCount; // number of unique terms + public float boost; // field's boost + } + + /** Used to read a segment */ + public static class FieldReader { + private final IndexInput in; + private final DocFieldStats docFieldStats = new DocFieldStats(); + + FieldReader(IndexInput in) { + this.in = in; + } + + /** Shared instance used to return results from {@link #next}. */ + public DocFieldStats getDocFieldStats() { + return docFieldStats; + } + + public void next() throws IOException { + docFieldStats.boost = in.readFloat(); + docFieldStats.uniqueTermCount = in.readVInt(); + docFieldStats.overlapTermCount = in.readVInt(); + docFieldStats.termCount = in.readVInt(); + } + } + + /** Used to write a new segment */ + public static class FieldWriter { + private final IndexOutput out; + private int upto; + + FieldWriter(IndexOutput out) { + this.out = out; + } + + public void write(int docID, DocFieldStats stats) throws IOException { + fill(docID-upto); + out.writeFloat(stats.boost); + out.writeVInt(stats.uniqueTermCount); + out.writeVInt(stats.overlapTermCount); + out.writeVInt(stats.termCount); + upto++; + } + + public void finish(int docCount) throws IOException { + if (docCount > upto) { + fill(docCount-upto); + } + } + + private void fill(int count) throws IOException { + for(int i=0;i= boosts.length) { + grow(docID); + } + boosts[docID] = state.boost; + uniqueTermCounts[docID] = state.uniqueTermCount; + overlapTermCounts[docID] = state.numOverlap; + totalTermCounts[docID] = state.length; + } + + public long ramBytesUsed() { + return (3*RamUsageEstimator.NUM_BYTES_INT + RamUsageEstimator.NUM_BYTES_FLOAT) * boosts.length; + } + + public void flush(FieldWriter w, int maxDocID) throws IOException { + final int limit = Math.min(maxDocID, boosts.length); + int upto = 0; + final DocFieldStats stats = new Stats.DocFieldStats(); + + while(upto < limit) { + stats.boost = boosts[upto]; + stats.uniqueTermCount = uniqueTermCounts[upto]; + stats.overlapTermCount = overlapTermCounts[upto]; + stats.termCount = totalTermCounts[upto]; + w.write(upto++, stats); + } + w.finish(maxDocID); + reset(); + } + + public int compareTo(Object other) { + return fieldInfo.number - ((BufferedField) other).fieldInfo.number; + } + } + + public static class Reader implements Closeable { + + private final IndexInput in; + private final Map fields = new HashMap(); + + public Reader(FieldInfos fieldInfos, Directory dir, String segment) throws IOException { + final String fileName = IndexFileNames.segmentFileName(segment, EXTENSION); + in = dir.openInput(fileName); + CodecUtil.checkHeader(in, CODEC_NAME, VERSION_CURRENT); + final long index = in.readLong(); + in.seek(index); + + final int fieldCount = in.readVInt(); + for(int i=0;i map = new HashMap(); + + private static class PerField implements Comparable { + final long fileOffset; + final int fieldNumber; + + public PerField(int fieldNumber, long fileOffset) { + this.fieldNumber = fieldNumber; + this.fileOffset = fileOffset; + } + + public int compareTo(Object other) { + return fieldNumber - ((PerField) other).fieldNumber; + } + } + + public Writer(SegmentWriteState state) throws IOException { + final String fileName = IndexFileNames.segmentFileName(state.segmentName, EXTENSION); + state.flushedFiles.add(fileName); + + out = state.directory.createOutput(fileName); + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + + // Placeholder for index + out.writeLong(0); + + fieldWriter = new FieldWriter(out); + } + + public FieldWriter addField(FieldInfo fieldInfo) throws IOException { + map.put(fieldInfo.name, new PerField(fieldInfo.number, out.getFilePointer())); + + // reuse + return fieldWriter; + } + + public void close() throws IOException { + final long indexFP = out.getFilePointer(); + + final PerField[] fields = map.values().toArray(new PerField[map.size()]); + Arrays.sort(fields); + + // Write index: + out.writeVInt(fields.length); + for(PerField field : fields) { + out.writeVInt(field.fieldNumber); + out.writeVLong(field.fileOffset); + } + out.seek(CodecUtil.headerLength(CODEC_NAME)); + out.writeLong(indexFP); + out.close(); + } + } + + /** Buffers stats in RAM, flushing them in the end using + * {@link Writer}. */ + public static class Buffer { + + private final Map map = new HashMap(); + + public synchronized BufferedField getField(FieldInfo fieldInfo) { + BufferedField result = map.get(fieldInfo.name); + if (result == null) { + result = new BufferedField(fieldInfo); + map.put(fieldInfo.name, result); + } + return result; + } + + public synchronized void flush(SegmentWriteState state) throws IOException { + if (map.size() > 0) { + + final Writer w = new Writer(state); + + final BufferedField[] fields = map.values().toArray(new BufferedField[map.size()]); + Arrays.sort(fields); + + for(BufferedField field : fields) { + field.flush(w.addField(field.fieldInfo), state.numDocs); + } + + w.close(); + } + } + } +} \ No newline at end of file Property changes on: lucene/src/java/org/apache/lucene/index/Stats.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/index/FieldInfo.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldInfo.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/FieldInfo.java (working copy) @@ -27,6 +27,7 @@ boolean storeTermVector; boolean storeOffsetWithTermVector; boolean storePositionWithTermVector; + public boolean storeIndexStats; public boolean omitNorms; // omit norms associated with indexed fields public boolean omitTermFreqAndPositions; @@ -35,7 +36,8 @@ FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, + boolean storeIndexStats) { name = na; isIndexed = tk; number = nu; @@ -46,6 +48,7 @@ this.storePayloads = storePayloads; this.omitNorms = omitNorms; this.omitTermFreqAndPositions = omitTermFreqAndPositions; + this.storeIndexStats = storeIndexStats; } else { // for non-indexed fields, leave defaults this.storeTermVector = false; this.storeOffsetWithTermVector = false; @@ -53,17 +56,19 @@ this.storePayloads = false; this.omitNorms = true; this.omitTermFreqAndPositions = false; + this.storeIndexStats = false; } } @Override public Object clone() { return new FieldInfo(name, isIndexed, number, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, storeIndexStats); } void update(boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, - boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, + boolean storeIndexStats) { if (this.isIndexed != isIndexed) { this.isIndexed = true; // once indexed, always index } @@ -86,6 +91,10 @@ if (this.omitTermFreqAndPositions != omitTermFreqAndPositions) { this.omitTermFreqAndPositions = true; // if one require omitTermFreqAndPositions at least once, it remains off for life } + if (this.storeIndexStats != storeIndexStats) { + // nocommit -- true? + this.storeIndexStats = false; + } } } } Index: lucene/src/java/org/apache/lucene/index/FieldInvertState.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldInvertState.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/FieldInvertState.java (working copy) @@ -19,18 +19,23 @@ import org.apache.lucene.util.AttributeSource; /** - * This class tracks the number and position / offset parameters of terms - * being added to the index. The information collected in this class is - * also used to calculate the normalization factor for a field. + * This class gathers statistics during inversion of a + * field's value for a single document. This information is + * also used to calculate the normalization factor for a + * field, or to store index statistics. * * @lucene.experimental */ + public final class FieldInvertState { + // nocommit -- need binary length? int position; - int length; - int numOverlap; + int length; // number of tokens in this field + int numOverlap; // number of tokens w/ posIncr==0 int offset; - float boost; + int uniqueTermCount; // number of unique tokens in this field + float boost; // net boost (product of per-Fieldable boosts, + // for multi-valued fields) AttributeSource attributeSource; public FieldInvertState() { @@ -51,6 +56,7 @@ void reset(float docBoost) { position = 0; length = 0; + uniqueTermCount = 0; numOverlap = 0; offset = 0; boost = docBoost; Index: lucene/src/java/org/apache/lucene/index/IndexReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexReader.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/IndexReader.java (working copy) @@ -1429,4 +1429,10 @@ Bits retrieveDelDocs() { return storedDelDocs; } + + /** @lucene.experimental */ + public Stats.Reader getStatsReader() throws IOException { + // nocommit -- throw UOE? + return null; + } } Index: lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (working copy) @@ -229,6 +229,15 @@ } @Override + public long totalFreq() { + long sum = 0; + for(int i=0;i 0. */ - public abstract void finishTerm(BytesRef text, int numDocs) throws IOException; + public abstract void finishTerm(BytesRef text, int numDocs, long totalCount) throws IOException; /** Called when we are done adding terms to this field */ public abstract void finish() throws IOException; @@ -69,9 +69,9 @@ if (docsEnumIn != null) { docsEnum.reset(docsEnumIn); final PostingsConsumer postingsConsumer = startTerm(term); - final int numDocs = postingsConsumer.merge(mergeState, docsEnum); - if (numDocs > 0) { - finishTerm(term, numDocs); + final PostingsConsumer.MergeResult result = postingsConsumer.merge(mergeState, docsEnum); + if (result.numDocs > 0) { + finishTerm(term, result.numDocs, result.totalFreq); } } } @@ -86,9 +86,9 @@ if (postingsEnumIn != null) { postingsEnum.reset(postingsEnumIn); final PostingsConsumer postingsConsumer = startTerm(term); - final int numDocs = postingsConsumer.merge(mergeState, postingsEnum); - if (numDocs > 0) { - finishTerm(term, numDocs); + final PostingsConsumer.MergeResult result = postingsConsumer.merge(mergeState, postingsEnum); + if (result.numDocs > 0) { + finishTerm(term, result.numDocs, result.totalFreq); } } } Index: lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java (working copy) @@ -439,6 +439,11 @@ bytesReader.read(); state.docFreq = in.readVInt(); + if (fieldInfo.storeIndexStats) { + state.totalFreq = in.readVLong(); + } + + // TODO: would be cleaner, but space-wasting, to // simply record a bit into each index entry as to // whether it's an index entry or not, rather than @@ -461,6 +466,11 @@ } @Override + public long totalFreq() { + return state.totalFreq; + } + + @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse); assert docsEnum != null; Index: lucene/src/java/org/apache/lucene/index/codecs/standard/TermState.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/standard/TermState.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/codecs/standard/TermState.java (working copy) @@ -31,10 +31,14 @@ public long filePointer; // fp into the terms dict primary file (_X.tis) public int docFreq; // how many docs have this term + // nocommit -- can we do this in a subclass...? + public long totalFreq; // total # times this term occurs + public void copy(TermState other) { ord = other.ord; filePointer = other.filePointer; docFreq = other.docFreq; + totalFreq = other.totalFreq; } @Override @@ -49,6 +53,6 @@ @Override public String toString() { - return "tis.fp=" + filePointer + " docFreq=" + docFreq + " ord=" + ord; + return "tis.fp=" + filePointer + " docFreq=" + docFreq + " totalFreq=" + totalFreq + " ord=" + ord; } } Index: lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java (working copy) @@ -154,7 +154,7 @@ } @Override - public void finishTerm(BytesRef text, int numDocs) throws IOException { + public void finishTerm(BytesRef text, int numDocs, long totalFreq) throws IOException { assert numDocs > 0; @@ -162,6 +162,9 @@ termWriter.write(text); out.writeVInt(numDocs); + if (fieldInfo.storeIndexStats) { + out.writeVLong(totalFreq); + } postingsWriter.finishTerm(numDocs, isIndexTerm); numTerms++; Index: lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java (revision 932780) +++ lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java (working copy) @@ -52,11 +52,20 @@ * for each doc */ public abstract void finishDoc() throws IOException; + /** @lucene.experimental */ + public static class MergeResult { + int numDocs; + int totalFreq; + } + + private MergeResult mergeResult; + /** Default merge impl: append documents, mapping around * deletes */ - public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException { + public MergeResult merge(final MergeState mergeState, final DocsEnum postings) throws IOException { int df = 0; + int totalFreq = 0; if (mergeState.fieldInfo.omitTermFreqAndPositions) { while(true) { @@ -76,6 +85,7 @@ break; } final int freq = postingsEnum.freq(); + totalFreq += freq; this.startDoc(doc, freq); for(int i=0;i readStringStringMap() throws IOException { final Map map = new HashMap(); final int count = readInt(); Index: lucene/src/java/org/apache/lucene/store/DataOutput.java =================================================================== --- lucene/src/java/org/apache/lucene/store/DataOutput.java (revision 932780) +++ lucene/src/java/org/apache/lucene/store/DataOutput.java (working copy) @@ -97,6 +97,11 @@ writeByte((byte)i); } + public void writeFloat(float f) throws IOException { + // nocommit -- byte order? + writeInt(Float.floatToRawIntBits(f)); + } + /** Writes a string. * @see DataInput#readString() */ Index: lucene/src/java/org/apache/lucene/util/ArrayUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/ArrayUtil.java (revision 932780) +++ lucene/src/java/org/apache/lucene/util/ArrayUtil.java (working copy) @@ -347,6 +347,29 @@ return array; } + public static float[] grow(float[] array, int minSize) { + if (array.length < minSize) { + float[] newArray = new float[oversize(minSize, RamUsageEstimator.NUM_BYTES_FLOAT)]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else + return array; + } + + public static float[] grow(float[] array) { + return grow(array, 1 + array.length); + } + + public static float[] shrink(float[] array, int targetSize) { + final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_FLOAT); + if (newSize != array.length) { + float[] newArray = new float[newSize]; + System.arraycopy(array, 0, newArray, 0, newSize); + return newArray; + } else + return array; + } + /** * Returns hash of chars in range start (inclusive) to * end (inclusive) Index: lucene/src/java/org/apache/lucene/document/Fieldable.java =================================================================== --- lucene/src/java/org/apache/lucene/document/Fieldable.java (revision 932780) +++ lucene/src/java/org/apache/lucene/document/Fieldable.java (working copy) @@ -209,4 +209,10 @@ * silently fail to find results. */ void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions); + + // nocommit jdocs + boolean getIndexStats(); + + // nocommit jdocs + void setIndexStats(boolean doStoreIndexStats); } Index: lucene/src/java/org/apache/lucene/document/AbstractField.java =================================================================== --- lucene/src/java/org/apache/lucene/document/AbstractField.java (revision 932780) +++ lucene/src/java/org/apache/lucene/document/AbstractField.java (working copy) @@ -33,6 +33,7 @@ protected boolean storeOffsetWithTermVector = false; protected boolean storePositionWithTermVector = false; protected boolean omitNorms = false; + protected boolean storeIndexStats = false; protected boolean isStored = false; protected boolean isIndexed = true; protected boolean isTokenized = true; @@ -231,6 +232,14 @@ */ public void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions) { this.omitTermFreqAndPositions=omitTermFreqAndPositions; } + public boolean getIndexStats() { + return storeIndexStats; + } + + public void setIndexStats(boolean v) { + storeIndexStats = v; + } + public boolean isLazy() { return lazy; } Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java =================================================================== --- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (revision 932780) +++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (working copy) @@ -214,6 +214,8 @@ } Field bodyField = ds.getField(BODY_FIELD, bodyStoreVal, bodyIndexVal, termVecVal); bodyField.setValue(bdy); + // nocommit -- alg should decide + bodyField.setIndexStats(true); doc.add(bodyField); if (storeBytes) {