Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/TestExternalCodecs.java (revision 1023783) +++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java (working copy) @@ -185,7 +185,7 @@ } @Override - public void finishTerm(BytesRef text, int numDocs) { + public void finishTerm(BytesRef text, int numDocs, long totalFreq) { assert numDocs > 0; assert numDocs == current.docs.size(); field.termToDocs.put(current.term, current); @@ -327,6 +327,11 @@ } @Override + public long totalFreq() { + throw new UnsupportedOperationException(); + } + + @Override public void cacheCurrentTerm() { } Index: lucene/src/test/org/apache/lucene/search/TestCustomSimProvider.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestCustomSimProvider.java (revision 0) +++ lucene/src/test/org/apache/lucene/search/TestCustomSimProvider.java (revision 0) @@ -0,0 +1,73 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.TrueFloatDefaultSimilarityProvider; +import org.apache.lucene.search.similarity.AggregatesProvider; +import org.apache.lucene.search.similarity.DFRSimilarityProvider; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; + +public class TestCustomSimProvider extends LuceneTestCase { + IndexSearcher searcher; + + @Override + public void setUp() throws Exception { + super.setUp(); + RAMDirectory ramdir = new RAMDirectory(); + IndexWriter iw = new IndexWriter(ramdir, new MockAnalyzer(), MaxFieldLength.UNLIMITED); + iw.setUseCompoundFile(false); + Document d = new Document(); + Field docID = new Field("docname", "foobar", Field.Store.YES, Field.Index.NOT_ANALYZED); + Field docText = new Field("body", "this is an example just for testing similarity", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES); + d.add(docID); + d.add(docText); + iw.addDocument(d); + iw.optimize(); + iw.close(); + searcher = new IndexSearcher(ramdir); + } + + public void assertQuery() throws Exception { + QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "body", new MockAnalyzer()); + Query q = qp.parse("example testing similarity"); + TopDocs td = searcher.search(q, 10); + assertEquals(1, td.totalHits); + } + + public void testDefault() throws Exception { + searcher.setSimilarityProvider(new DefaultSimilarityProvider()); + assertQuery(); + } + + public void testDFR() throws Exception { + searcher.setSimilarityProvider(new DFRSimilarityProvider(new AggregatesProvider(), Math.E, 0.25D)); + assertQuery(); + } + + public void testTrueFloats() throws Exception { + searcher.setSimilarityProvider(new 
TrueFloatDefaultSimilarityProvider()); + assertQuery(); + } +} Property changes on: lucene\src\test\org\apache\lucene\search\TestCustomSimProvider.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/test/org/apache/lucene/index/TestCodecs.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestCodecs.java (revision 1023783) +++ lucene/src/test/org/apache/lucene/index/TestCodecs.java (working copy) @@ -171,6 +171,7 @@ public void write(final TermsConsumer termsConsumer) throws Throwable { final PostingsConsumer postingsConsumer = termsConsumer.startTerm(text); + long totalFreq = 0; for(int i=0;i 0) { long skipPointer = skipListWriter.writeSkip(freqOut); termInfo.docFreq = numDocs; Index: lucene/src/test/org/apache/lucene/index/TestStats.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestStats.java (revision 0) +++ lucene/src/test/org/apache/lucene/index/TestStats.java (revision 0) @@ -0,0 +1,160 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.search.SimilarityProvider; +//import org.apache.lucene.search.similarity.PivotedLogSimilarityProvider; +import org.apache.lucene.search.similarity.AggregatesProvider; + +public class TestStats extends LuceneTestCase { + + private void add(IndexWriter w) throws Throwable { + Document d = new Document(); + Field f = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); + f.setIndexStats(true); + d.add(f); + + f.setValue("a"); + w.addDocument(d); + + f.setValue("a b"); + f.setBoost(2.0f); + w.addDocument(d); + + f.setValue("a b a"); + f.setBoost(3.0f); + w.addDocument(d); + + f.setValue("a b a c"); + f.setBoost(10.0f); + w.addDocument(d); + } + + private void verify(Stats.FieldReader reader) throws Throwable { + Stats.DocFieldStats stats = reader.getDocFieldStats(); + reader.next(); + // a + assertEquals(1, stats.termCount); + assertEquals(1, stats.uniqueTermCount); + assertEquals(1.0, stats.boost, 0.00001); + + // a b + reader.next(); + assertEquals(2, stats.termCount); + assertEquals(2, stats.uniqueTermCount); + assertEquals(2.0, stats.boost, 0.00001); + + // a b a + reader.next(); + assertEquals(3, stats.termCount); + assertEquals(2, stats.uniqueTermCount); + assertEquals(3.0, stats.boost, 0.00001); + + // a b a c + reader.next(); + assertEquals(4, stats.termCount); + assertEquals(3, stats.uniqueTermCount); + assertEquals(10.0, stats.boost, 0.00001); + } + + // nocommit -- need test w/ overlap terms + + private void verifyTerms(IndexReader reader, int mul) throws Throwable { + final Terms terms = MultiFields.getFields(reader).terms("field"); + assertEquals(10 * mul, terms.getTotalTermCount()); + + final TermsEnum te = terms.iterator(); + + 
assertEquals(te.seek(new BytesRef("a")), TermsEnum.SeekStatus.FOUND); + assertEquals(te.totalFreq(), 6*mul); + + assertEquals(te.seek(new BytesRef("b")), TermsEnum.SeekStatus.FOUND); + assertEquals(te.totalFreq(), 3*mul); + + assertEquals(te.seek(new BytesRef("c")), TermsEnum.SeekStatus.FOUND); + assertEquals(te.totalFreq(), mul); + + } + + public void testBasic() throws Throwable { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + add(w); + + IndexReader r = w.getReader(); + IndexReader sub = r.getSequentialSubReaders()[0]; + Stats.Reader stats = sub.getStatsReader(); + assertNotNull(stats); + + Stats.FieldReader fieldStats = stats.getField("field"); + assertNotNull(fieldStats); + + verify(fieldStats); + verifyTerms(sub, 1); + r.close(); + + add(w); + + // test multireader + r = w.getReader(); + verifyTerms(r, 2); + + // nocommit -- reenable + //SimilarityProvider p = new PivotedLogSimilarityProvider(new AggregatesProvider()); + // nocommit -- check that the boosts are right! 
+ // nocommit -- also test DefaultFieldSimProvider + //p.create(r.getSequentialSubReaders()[0], "field"); + //p.create(r.getSequentialSubReaders()[1], "field"); + r.close(); + + w.optimize(); + + r = w.getReader(); + sub = r.getSequentialSubReaders()[0]; + stats = sub.getStatsReader(); + assertNotNull(stats); + fieldStats = stats.getField("field"); + verify(fieldStats); + verify(fieldStats); + verifyTerms(sub, 2); + + r.close(); + w.close(); + + dir.close(); + } + + public void testAggregates() throws Throwable { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + add(w); + IndexReader r = w.getReader(); + assertEquals(2.5, new AggregatesProvider().getAvgTermLength(r, "field"), 0.00000001); + r.close(); + w.rollback(); + w.close(); + dir.close(); + } +} Property changes on: lucene\src\test\org\apache\lucene\index\TestStats.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/search/DefaultSimilarityProvider.java =================================================================== --- lucene/src/java/org/apache/lucene/search/DefaultSimilarityProvider.java (revision 0) +++ lucene/src/java/org/apache/lucene/search/DefaultSimilarityProvider.java (revision 0) @@ -0,0 +1,93 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Stats.DocFieldStats; +import org.apache.lucene.util.BytesRef; + +/** Matches current DefaultSimilarity, except, the + * computation of boost bytes (norms) are done via index + * stats. */ + +public class DefaultSimilarityProvider extends SimilarityProvider { + + private final static class DefaultFieldDocScorer extends SimpleFieldDocScorer { + + public DefaultFieldDocScorer(IndexReader segment, float queryWeight, TermAndPostings termAndPostings, FieldSimilarity fieldSim) throws IOException { + super(segment, queryWeight, termAndPostings, fieldSim, + new ComputesLengthNorm() { + @Override + protected float lengthNorm(int docID, DocFieldStats stats) { + return (float) (stats.boost / Math.sqrt(stats.termCount)); + } + }); + } + } + + public class DefaultFieldSimilarity extends FieldSimilarity { + + // nocommit -- had to add transient to fake that this is + // serializable -- but this means it's not!! 
if you + // deserialize it on the other end, you cannot use it + private transient final IndexReader topReader; + + public DefaultFieldSimilarity(IndexReader topReader, String field) throws IOException { + super(field); + this.topReader = topReader; + } + + @Override + public float tf(float freq) { + return (float)Math.sqrt(freq); + } + + @Override + public float sloppyFreq(int distance) { + return 1.0f / (distance + 1); + } + + @Override + public float idf(BytesRef term) throws IOException { + return (float)(Math.log(topReader.maxDoc()/(double)(topReader.docFreq(field, term)+1)) + 1.0); + } + + @Override + public FieldDocScorer getTermScorer(float queryWeight, TermAndPostings termsAndPostings, IndexReader segment) throws IOException { + return new DefaultFieldDocScorer(segment, queryWeight, termsAndPostings, this); + } + } + + @Override + public FieldSimilarity getField(IndexReader topReader, String field) throws IOException { + return new DefaultFieldSimilarity(topReader, field); + } + + @Override + public float coord(int overlap, int maxOverlap) { + return overlap / (float)maxOverlap; + } + + @Override + public float queryNorm(float sumOfSquaredWeights) { + return (float)(1.0 / Math.sqrt(sumOfSquaredWeights)); + } +} + Property changes on: lucene\src\java\org\apache\lucene\search\DefaultSimilarityProvider.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java =================================================================== --- lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/search/cache/DocTermsIndexCreator.java (working copy) @@ -318,6 +318,11 @@ public Comparator getComparator() throws IOException { throw new UnsupportedOperationException(); } + + @Override + public long totalFreq() { + throw new UnsupportedOperationException(); + } } } } Index: 
lucene/src/java/org/apache/lucene/search/Searcher.java =================================================================== --- lucene/src/java/org/apache/lucene/search/Searcher.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/search/Searcher.java (working copy) @@ -136,6 +136,8 @@ /** The Similarity implementation used by this searcher. */ private Similarity similarity = Similarity.getDefault(); + private SimilarityProvider simProvider = new DefaultSimilarityProvider(); + /** Expert: Set the Similarity implementation used by this Searcher. * * @see Similarity#setDefault(Similarity) @@ -144,6 +146,11 @@ this.similarity = similarity; } + // nocommit + public void setSimilarityProvider(SimilarityProvider sim) { + this.simProvider = sim; + } + /** Expert: Return the Similarity implementation used by this Searcher. * *

This defaults to the current value of {@link Similarity#getDefault()}. @@ -152,6 +159,10 @@ return this.similarity; } + public SimilarityProvider getSimilarityProvider() { + return simProvider; + } + /** * creates a weight for query * @return new weight Index: lucene/src/java/org/apache/lucene/search/TrueFloatDefaultSimilarityProvider.java =================================================================== --- lucene/src/java/org/apache/lucene/search/TrueFloatDefaultSimilarityProvider.java (revision 0) +++ lucene/src/java/org/apache/lucene/search/TrueFloatDefaultSimilarityProvider.java (revision 0) @@ -0,0 +1,150 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Stats.DocFieldStats; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.Stats; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Bits; + +/** Matches current DefaultSimilarity, except, the + * computation of boost bytes (norms) are done via index + * stats. 
*/ + +public class TrueFloatDefaultSimilarityProvider extends SimilarityProvider { + + private final static class FieldDocScorer extends SimilarityProvider.FieldDocScorer { + + private final float[] docBoost; + private final FieldSimilarity fieldSim; + + private static final int SCORE_CACHE_SIZE = 32; + private final float[] scoreCache = new float[SCORE_CACHE_SIZE]; + private final float queryWeight; + private final DocsEnum docsEnum; + + public FieldDocScorer(IndexReader segment, float queryWeight, TermAndPostings termAndPostings, FieldSimilarity fieldSim) throws IOException { + // nocommit -- cache needs another part of the key, so + // if I use 2 sims on same reader they don't stomp on + // each other + this.queryWeight = queryWeight; + docsEnum = termAndPostings.docsEnum; + this.fieldSim = fieldSim; + + for (int i = 0; i < SCORE_CACHE_SIZE; i++) { + scoreCache[i] = fieldSim.tf(i) * queryWeight; + } + final String field = fieldSim.getField(); + + float[] floats = (float[]) segment.getSimDataCache(field); + if (floats == null) { + Stats.Reader statsReader = segment.getStatsReader(); + if (statsReader == null) { + throw new IllegalStateException("reader " + segment + " has no stats"); + } + + Stats.FieldReader fieldStatsReader = statsReader.getField(field); + if (fieldStatsReader == null) { + throw new IllegalStateException("reader " + segment + " has no stats for field " + field); + } + + final int maxDoc = segment.maxDoc(); + + floats = new float[maxDoc]; + final Stats.DocFieldStats stats = fieldStatsReader.getDocFieldStats(); + final Bits skipDocs = segment.getDeletedDocs(); + + for(int doc=0;doc for PQ? + // nocommit -- how to pass DocsAndPositionsEnum? so + // custom scorers can eg score payloads or other attrs... + public abstract FieldDocScorer getTermScorer(float queryWeight, TermAndPostings termAndPostings, IndexReader segment) throws IOException; + } + + public abstract static class FieldDocScorer { + // nocommit -- hmm also a scorePosition? 
+ public abstract float score(); + } + + protected static final float[] NORM_TABLE = new float[256]; + + static { + for (int i = 0; i < 256; i++) + NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); + } + + /** For sims that compile each doc's boost to a boost + * byte using {@link SmallFloat} + * + */ + protected static abstract class BoostBytesFieldDocScorer extends FieldDocScorer { + private final byte[] boostBytes; + protected final FieldSimilarity fieldSim; + + protected static abstract class ComputesLengthNorm { + protected abstract float lengthNorm(int docID, Stats.DocFieldStats stats); + } + + protected float getDocBoost(int docID) { + return NORM_TABLE[boostBytes[docID]&0xFF]; + } + + public BoostBytesFieldDocScorer(IndexReader segment, /* nocommit */float queryWeight, TermAndPostings termAndPostings, FieldSimilarity fieldSim, ComputesLengthNorm computesLengthNorm) throws IOException { + this.fieldSim = fieldSim; + final String field = fieldSim.getField(); + + byte[] bb = (byte[]) segment.getSimDataCache(field); + if (bb == null) { + bb = computeBoostBytes(segment, field, computesLengthNorm); + segment.putSimDataCache(field, bb); + } + boostBytes = bb; + } + + protected byte[] computeBoostBytes(IndexReader segment, String field, ComputesLengthNorm computesLengthNorm) throws IOException { + Stats.Reader statsReader = segment.getStatsReader(); + if (statsReader == null) { + throw new IllegalStateException("reader " + segment + " has no stats"); + } + + Stats.FieldReader fieldStatsReader = statsReader.getField(field); + if (fieldStatsReader == null) { + throw new IllegalStateException("reader " + segment + " has no stats for field " + field); + } + + final int maxDoc = segment.maxDoc(); + + byte[] bytes = new byte[maxDoc]; + final Stats.DocFieldStats stats = fieldStatsReader.getDocFieldStats(); + final Bits skipDocs = segment.getDeletedDocs(); + + for(int doc=0;doc + * These sims compile each doc's boost to a boost + * byte using {@link SmallFloat}, and 
also compute + * combined score as: + *

queryWeight * tf(tf) * docBoost[docID]
+ *

+ * Extending this model provides the best performance, as this class will provide + * caching based on these assumptions of independence. + */ + protected static abstract class SimpleFieldDocScorer extends BoostBytesFieldDocScorer { + + /** Cache of scores for tf=0..31. */ + private static final int SCORE_CACHE_SIZE = 32; + private final float[] scoreCache = new float[SCORE_CACHE_SIZE]; + private final float queryWeight; + private final DocsEnum docsEnum; + + public SimpleFieldDocScorer(IndexReader segment, float queryWeight, TermAndPostings termAndPostings, FieldSimilarity fieldSim, ComputesLengthNorm computesLengthNorm) + throws IOException { + super(segment,queryWeight,termAndPostings,fieldSim, computesLengthNorm); + this.queryWeight = queryWeight; + docsEnum = termAndPostings.docsEnum; + + for (int i = 0; i < SCORE_CACHE_SIZE; i++) { + scoreCache[i] = fieldSim.tf(i) * queryWeight; + } + } + + // nocommit -- how to support bulk read? how to + // support source code spec...? + public float score() { + final int tf = docsEnum.freq(); + final float raw = tf < SCORE_CACHE_SIZE ? 
scoreCache[tf] : fieldSim.tf(tf) * queryWeight; + return raw * getDocBoost(docsEnum.docID()); + } + } + + public abstract FieldSimilarity getField(IndexReader topReader, String field) throws IOException; + + public abstract float coord(int overlap, int maxOverlap); + + public abstract float queryNorm(float sumOfSquaredWeights); +} + Property changes on: lucene\src\java\org\apache\lucene\search\SimilarityProvider.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/search/SimProviderTermScorer.java =================================================================== --- lucene/src/java/org/apache/lucene/search/SimProviderTermScorer.java (revision 0) +++ lucene/src/java/org/apache/lucene/search/SimProviderTermScorer.java (revision 0) @@ -0,0 +1,112 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.BytesRef; + +/** Expert: A Scorer for documents matching a Term. 
+ */ +final class SimProviderTermScorer extends Scorer { + private Weight weight; + private DocsEnum docsEnum; + private int doc = -1; + + private final SimilarityProvider.FieldDocScorer simScorer; + + /** + * Construct a SimProviderTermScorer. + * + * @param term the term being scored; wrapped with td into a TermAndPostings. + * @param weight + * The weight of the Term in the query. + * @param td + * An iterator over the documents matching the Term. + * @param fieldSim + * The per-field similarity whose getTermScorer supplies the doc scorer. + * @param segment + * The segment reader passed through to getTermScorer. + */ + SimProviderTermScorer(BytesRef term, Weight weight, DocsEnum td, SimilarityProvider.FieldSimilarity fieldSim, IndexReader segment) throws IOException { + super(null); + this.weight = weight; + this.docsEnum = td; + simScorer = fieldSim.getTermScorer(weight.getValue(), new SimilarityProvider.TermAndPostings(term, docsEnum), segment); + } + + @Override + public void score(Collector c) throws IOException { + score(c, Integer.MAX_VALUE, nextDoc()); + } + + // firstDocID is ignored since nextDoc() sets 'doc' + @Override + protected boolean score(Collector c, int end, int firstDocID) throws IOException { + c.setScorer(this); + while (doc < end) { // for docs in window + c.collect(doc); // collect score + doc = docsEnum.nextDoc(); + } + return true; + } + + @Override + public int docID() { + return doc; + } + + /** + * Advances to the next document matching the query.
+ * The iterator over the matching documents is buffered using + * {@link TermDocs#read(int[],int[])}. + * + * @return the document matching the query or NO_MORE_DOCS if there are no more documents. + */ + @Override + public int nextDoc() throws IOException { + return doc = docsEnum.nextDoc(); + } + + @Override + public float score() { + // nocommit -- silly to have to delegate here -- + // SimScorer should directly subclass Scorer??? + return simScorer.score(); + } + + /** + * Advances to the first match beyond the current whose document number is + * greater than or equal to a given target.
+ * The implementation uses {@link DocsEnum#advance(int)}. + * + * @param target + * The target document number. + * @return the matching document or NO_MORE_DOCS if none exist. + */ + @Override + public int advance(int target) throws IOException { + return doc = docsEnum.advance(target); + } + + /** Returns a string representation of this TermScorer. */ + @Override + public String toString() { return "scorer(" + weight + ")"; } +} Property changes on: lucene\src\java\org\apache\lucene\search\SimProviderTermScorer.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/search/similarity/LnbLtcSimilarityProvider.java =================================================================== --- lucene/src/java/org/apache/lucene/search/similarity/LnbLtcSimilarityProvider.java (revision 0) +++ lucene/src/java/org/apache/lucene/search/similarity/LnbLtcSimilarityProvider.java (revision 0) @@ -0,0 +1,64 @@ +package org.apache.lucene.search.similarity; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Stats.DocFieldStats; +import org.apache.lucene.search.DefaultSimilarityProvider; + +/** + * LUCENE-2187: crappy name... really the 'b' should be bytes. + * Its like vector space with BM25's length norm and log TF right now. 
+ */ +public class LnbLtcSimilarityProvider extends DefaultSimilarityProvider { + private final float slope; + private final AggregatesProvider aggs; + + public LnbLtcSimilarityProvider(AggregatesProvider aggs, float slope) { + this.aggs = aggs; + this.slope = slope; + } + + public LnbLtcSimilarityProvider(AggregatesProvider aggs) { + this(aggs, 0.25F); + } + + private class LnbLtcTermScorer extends SimpleFieldDocScorer { + public LnbLtcTermScorer(IndexReader segment, float queryWeight, + TermAndPostings termAndPostings, final LnbLtcFieldSimilarity fieldSim) throws IOException { + super(segment, queryWeight, termAndPostings, fieldSim, + new ComputesLengthNorm() { + @Override + protected float lengthNorm(int docID, DocFieldStats stats) { + return (float) (stats.boost / ((1 - slope) * fieldSim.avgTermLength + slope * stats.termCount)); + } + }); + } + } + + public class LnbLtcFieldSimilarity extends DefaultFieldSimilarity { + final IndexReader topReader; + final float avgTermLength; + + public LnbLtcFieldSimilarity(IndexReader topReader, String field, double avgTermLength) throws IOException { + super(topReader, field); + this.topReader = topReader; + this.avgTermLength = (float) avgTermLength; + } + + @Override + public FieldDocScorer getTermScorer(float queryWeight, TermAndPostings termsAndPostings, IndexReader segment) + throws IOException { + return new LnbLtcTermScorer(segment, queryWeight, termsAndPostings, this); + } + + @Override + public float tf(float freq) { + return 1 + (float) Math.log(freq); + } + } + @Override + public FieldSimilarity getField(IndexReader topReader, String field) throws IOException { + return new LnbLtcFieldSimilarity(topReader, field, aggs.getAvgTermLength(topReader, field)); + } +} Property changes on: lucene\src\java\org\apache\lucene\search\similarity\LnbLtcSimilarityProvider.java ___________________________________________________________________ Added: svn:eol-style + native Index: 
lucene/src/java/org/apache/lucene/search/similarity/AggregatesProvider.java =================================================================== --- lucene/src/java/org/apache/lucene/search/similarity/AggregatesProvider.java (revision 0) +++ lucene/src/java/org/apache/lucene/search/similarity/AggregatesProvider.java (revision 0) @@ -0,0 +1,69 @@ +package org.apache.lucene.search.similarity; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; +import java.util.HashMap; +import java.util.List; +import java.util.ArrayList; +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Stats; +import org.apache.lucene.util.ReaderUtil; +import org.apache.lucene.util.Bits; + +public final class AggregatesProvider { + + // nocommit -- cache never clears... 
though may be minor + private final Map cache = new HashMap(); + + public double getAvgTermLength(IndexReader topReader, String field) throws IOException { + + Double ret = cache.get(field); + if (ret == null) { + long sumTermLength = 0; + int count = 0; + + List subReaders = new ArrayList(); + ReaderUtil.gatherSubReaders(subReaders, topReader); + + for(IndexReader r : subReaders) { + Stats.Reader statsReader = r.getStatsReader(); + if (statsReader != null) { + Stats.FieldReader fieldStatsReader = statsReader.getField(field); + if (fieldStatsReader != null) { + final Stats.DocFieldStats stats = fieldStatsReader.getDocFieldStats(); + final int maxDoc = r.maxDoc(); + final Bits skipDocs = r.getDeletedDocs(); + for(int i=0;i + + + + + + +nocommit: some alternate sim impls for playing with flex scoring + + \ No newline at end of file Property changes on: lucene\src\java\org\apache\lucene\search\similarity\package.html ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/search/TermQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/TermQuery.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/search/TermQuery.java (working copy) @@ -24,6 +24,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.ToStringUtils; /** A Query that matches documents containing a term. 
@@ -35,21 +36,43 @@ private class TermWeight extends Weight { private final Similarity similarity; + private final SimilarityProvider simProvider; + private final SimilarityProvider.FieldSimilarity fieldSim; private float value; - private float idf; + private final float idf; private float queryNorm; private float queryWeight; private IDFExplanation idfExp; public TermWeight(Searcher searcher) throws IOException { - this.similarity = getSimilarity(searcher); - if (docFreq != -1) { - idfExp = similarity.idfExplain(term, searcher, docFreq); + SimilarityProvider sp = searcher.getSimilarityProvider(); + similarity = getSimilarity(searcher); + // nocommit -- hmmm -- what if it's not an + // IndexSearcher??? eg MultiSearcher??? + // nocommit: fieldSim.idf needs to be able to take a docFreq + if (sp != null && searcher instanceof IndexSearcher) { + simProvider = sp; + fieldSim = simProvider.getField(((IndexSearcher) searcher).getIndexReader(), term.field()); + idf = fieldSim.idf(new BytesRef(term.text())); + + // nocommit -- hack until we can get explain working: + if (docFreq != -1) { + idfExp = similarity.idfExplain(term, searcher, docFreq); + } else { + idfExp = similarity.idfExplain(term, searcher); + } + } else { - idfExp = similarity.idfExplain(term, searcher); + simProvider = null; + fieldSim = null; + if (docFreq != -1) { + idfExp = similarity.idfExplain(term, searcher, docFreq); + } else { + idfExp = similarity.idfExplain(term, searcher); + } + idf = idfExp.getIdf(); } - idf = idfExp.getIdf(); } @Override @@ -83,8 +106,13 @@ if (docs == null) { return null; } - - return new TermScorer(this, docs, similarity, reader.norms(term.field())); + if (simProvider != null && reader.hasIndexStats(term.field())) { + // nocommit -- need native BytesRef passed to + // TermQuery! 
+ return new SimProviderTermScorer(new BytesRef(term.text()), this, docs, fieldSim, reader); + } else { + return new TermScorer(this, docs, similarity, reader.norms(term.field())); + } } @Override Index: lucene/src/java/org/apache/lucene/search/IndexSearcher.java =================================================================== --- lucene/src/java/org/apache/lucene/search/IndexSearcher.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/search/IndexSearcher.java (working copy) @@ -229,7 +229,7 @@ } int docID = scorer.docID(); - assert docID == -1 || docID == DocIdSetIterator.NO_MORE_DOCS; + assert docID == -1 || docID == DocIdSetIterator.NO_MORE_DOCS: "docID=" + docID + " scorer=" + scorer; // CHECKME: use ConjunctionScorer here? DocIdSet filterDocIdSet = filter.getDocIdSet(reader); Index: lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/search/FuzzyTermsEnum.java (working copy) @@ -241,6 +241,11 @@ } @Override + public long totalFreq() { + return actualEnum.totalFreq(); + } + + @Override public void cacheCurrentTerm() throws IOException { actualEnum.cacheCurrentTerm(); } Index: lucene/src/java/org/apache/lucene/index/DocInverterPerField.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocInverterPerField.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/DocInverterPerField.java (working copy) @@ -41,6 +41,7 @@ final InvertedDocEndConsumerPerField endConsumer; final DocumentsWriter.DocState docState; final FieldInvertState fieldState; + private Stats.BufferedField statsWriter; public DocInverterPerField(DocInverterPerThread perThread, FieldInfo fieldInfo) { this.perThread = perThread; @@ -156,8 +157,12 @@ fieldState.position--; } - if (posIncr == 0) + if (posIncr == 0) { 
+ fieldState.currentTokenOverlaps = true; fieldState.numOverlap++; + } else { + fieldState.currentTokenOverlaps = false; + } boolean success = false; try { @@ -200,6 +205,13 @@ fields[i] = null; } + if (fieldInfo.storeIndexStats) { + if (statsWriter == null) { + statsWriter = perThread.docInverter.statsBuffer.getField(fieldInfo); + } + statsWriter.save(docState.docID, fieldState); + } + consumer.finish(); endConsumer.finish(); } Index: lucene/src/java/org/apache/lucene/index/FieldInfos.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldInfos.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/FieldInfos.java (working copy) @@ -51,6 +51,7 @@ static final byte OMIT_NORMS = 0x10; static final byte STORE_PAYLOADS = 0x20; static final byte OMIT_TERM_FREQ_AND_POSITIONS = 0x40; + static final byte STORE_INDEX_STATS = (byte) 0x80; private final ArrayList byNumber = new ArrayList(); private final HashMap byName = new HashMap(); @@ -94,7 +95,8 @@ List fields = doc.getFields(); for (Fieldable field : fields) { add(field.name(), field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), - field.isStoreOffsetWithTermVector(), field.getOmitNorms(), false, field.getOmitTermFreqAndPositions()); + field.isStoreOffsetWithTermVector(), field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), + field.getIndexStats()); } } @@ -193,7 +195,7 @@ synchronized public void add(String name, boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) { add(name, isIndexed, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, false, false); + storeOffsetWithTermVector, omitNorms, false, false, false); } /** If the field is not yet known, adds it. 
If it is known, checks to make @@ -211,23 +213,25 @@ * @param omitTermFreqAndPositions true if term freqs should be omitted for this field */ synchronized public FieldInfo add(String name, boolean isIndexed, boolean storeTermVector, - boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, + boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, + boolean storeIndexStats) { FieldInfo fi = fieldInfo(name); if (fi == null) { - return addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + return addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, storeIndexStats); } else { - fi.update(isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + fi.update(isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, storeIndexStats); } return fi; } private FieldInfo addInternal(String name, boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, - boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, + boolean storeIndexStats) { name = StringHelper.intern(name); FieldInfo fi = new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, 
storeIndexStats); byNumber.add(fi); byName.put(name, fi); return fi; @@ -279,6 +283,17 @@ return hasVectors; } + public boolean hasStats() { + final int fieldCount = size(); + for(int i=0;i getComparator() throws IOException { return in.getComparator(); } + + @Override + public long totalFreq() { + return in.totalFreq(); + } } /** Base class for filtering {@link DocsEnum} implementations. */ Index: lucene/src/java/org/apache/lucene/index/SegmentMerger.java =================================================================== --- lucene/src/java/org/apache/lucene/index/SegmentMerger.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/SegmentMerger.java (working copy) @@ -151,6 +151,7 @@ // threads. mergedDocs = mergeFields(); + mergeStats(); mergeTerms(); mergeNorms(); @@ -179,6 +180,9 @@ // Basic files for (String ext : IndexFileNames.COMPOUND_EXTENSIONS_NOT_CODEC) { + if (ext.equals(Stats.EXTENSION) && !fieldInfos.hasStats()) { + continue; + } if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) && !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION))) fileSet.add(IndexFileNames.segmentFileName(segment, "", ext)); @@ -215,14 +219,14 @@ } private void addIndexed(IndexReader reader, FieldInfos fInfos, - Collection names, boolean storeTermVectors, - boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean storePayloads, boolean omitTFAndPositions) + Collection names, boolean storeTermVectors, + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, + boolean storePayloads, boolean omitTFAndPositions, boolean storeIndexStats) throws IOException { for (String field : names) { fInfos.add(field, true, storeTermVectors, - storePositionWithTermVector, storeOffsetWithTermVector, !reader - .hasNorms(field), storePayloads, omitTFAndPositions); + storePositionWithTermVector, storeOffsetWithTermVector, !reader + .hasNorms(field), storePayloads, omitTFAndPositions, storeIndexStats); } } @@ -289,18 +293,18 @@ 
for (int j = 0; j < numReaderFieldInfos; j++) { FieldInfo fi = readerFieldInfos.fieldInfo(j); fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector, - fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, - !reader.hasNorms(fi.name), fi.storePayloads, - fi.omitTermFreqAndPositions); + fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, + !reader.hasNorms(fi.name), fi.storePayloads, + fi.omitTermFreqAndPositions, fi.storeIndexStats); } } else { - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true, false); + 
addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false, false); fieldInfos.add(reader.getFieldNames(FieldOption.UNINDEXED), false); } } @@ -437,6 +441,66 @@ return docCount; } + private void mergeStats() throws IOException { + if (fieldInfos.hasStats()) { + + final int fieldCount = fieldInfos.size(); + Stats.Writer w = null; + + // nocommit: naive -- would be better to be sparse wrt fieldCount + for(int field=0;field getComparator() { return termComp; } + + @Override + public long getTotalTermCount() throws IOException { + long sum = 0; + for(Terms terms : subs) { + sum += terms.getTotalTermCount(); + } + return sum; + } } Index: lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (working copy) @@ -21,7 +21,6 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Comparator; @@ -202,6 +201,7 @@ // which all share the same term. Now we must // interleave the docID streams. 
int numDocs = 0; + long totalFreq = 0; while(numToMerge > 0) { FreqProxFieldMergeState minState = termStates[0]; @@ -213,6 +213,7 @@ final int termDocFreq = minState.termFreq; numDocs++; + totalFreq += termDocFreq; assert minState.docID < flushedDocCount: "doc=" + minState.docID + " maxDoc=" + flushedDocCount; @@ -291,7 +292,7 @@ } assert numDocs > 0; - termsConsumer.finishTerm(text, numDocs); + termsConsumer.finishTerm(text, numDocs, totalFreq); } termsConsumer.finish(); Index: lucene/src/java/org/apache/lucene/index/SegmentInfo.java =================================================================== --- lucene/src/java/org/apache/lucene/index/SegmentInfo.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/SegmentInfo.java (working copy) @@ -427,8 +427,9 @@ } private void addIfExists(Set files, String fileName) throws IOException { - if (dir.fileExists(fileName)) + if (dir.fileExists(fileName)) { files.add(fileName); + } } /* @@ -477,6 +478,9 @@ fileSet.add(delFileName); } + addIfExists(fileSet, IndexFileNames.segmentFileName(name, "", Stats.EXTENSION)); + + // Careful logic for norms files if (normGen != null) { for (int i = 0; i < normGen.length; i++) { long gen = normGen[i]; Index: lucene/src/java/org/apache/lucene/index/TermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/TermsEnum.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/TermsEnum.java (working copy) @@ -99,6 +99,12 @@ * {@link SeekStatus#END}.*/ public abstract int docFreq(); + /** Optional: returns the total count of how many times + * this term occurs. While docFreq increments by 1 for + * each document the term occurs in, this count + * increments by the term's frequency in the document. */ + public abstract long totalFreq(); + /** Get {@link DocsEnum} for the current term. Do not * call this before calling {@link #next} or {@link * #seek} for the first time. 
This method will not @@ -160,6 +166,11 @@ } @Override + public long totalFreq() { + throw new IllegalStateException("this method should never be called"); + } + + @Override public long ord() { throw new IllegalStateException("this method should never be called"); } Index: lucene/src/java/org/apache/lucene/index/SegmentReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/SegmentReader.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/SegmentReader.java (working copy) @@ -97,6 +97,8 @@ final int readBufferSize; final int termsIndexDivisor; + final Stats.Reader statsReader; + private final SegmentReader origInstance; FieldsReader fieldsReaderOrig; @@ -130,6 +132,12 @@ fieldInfos = new FieldInfos(cfsDir, IndexFileNames.segmentFileName(segment, "", IndexFileNames.FIELD_INFOS_EXTENSION)); + if (fieldInfos.hasStats()) { + statsReader = new Stats.Reader(fieldInfos, cfsDir, segment); + } else { + statsReader = null; + } + this.termsIndexDivisor = termsIndexDivisor; // Ask codec for its Fields @@ -190,6 +198,10 @@ storeCFSReader.close(); } + if (statsReader != null) { + statsReader.close(); + } + // Force FieldCache to evict our entries at this // point. 
If the exception occurred while // initializing the core readers, then @@ -874,6 +886,14 @@ return new ArrayList(si.files()); } + public Stats.Reader getStatsReader() throws IOException { + if (core.statsReader != null) { + return core.statsReader; + } else { + return null; + } + } + FieldInfos fieldInfos() { return core.fieldInfos; } Index: lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (working copy) @@ -124,6 +124,12 @@ postings.docFreqs[termID] = 1; writeProx(termID, fieldState.position); } + fieldState.uniqueTermCount++; + // nocommit: only works if we can guarantee + // analyzers always "inject" terms after the real + // term, ie, it's always the fake terms that have + // posIncr==0 + fieldState.uniqueOverlapTermCount += fieldState.currentTokenOverlaps ? 1 : 0; } @Override @@ -141,6 +147,12 @@ termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]); postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID]; postings.lastDocIDs[termID] = docState.docID; + fieldState.uniqueTermCount++; + // nocommit: only works if we can guarantee + // analyzers always "inject" terms after the real + // term, ie, it's always the fake terms that have + // posIncr==0 + fieldState.uniqueOverlapTermCount += fieldState.currentTokenOverlaps ? 
1 : 0; } } else { if (docState.docID != postings.lastDocIDs[termID]) { @@ -160,6 +172,12 @@ postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1; postings.lastDocIDs[termID] = docState.docID; writeProx(termID, fieldState.position); + fieldState.uniqueTermCount++; + // nocommit: only works if we can guarantee + // analyzers always "inject" terms after the real + // term, ie, it's always the fake terms that have + // posIncr==0 + fieldState.uniqueOverlapTermCount += fieldState.currentTokenOverlaps ? 1 : 0; } else { postings.docFreqs[termID]++; writeProx(termID, fieldState.position-postings.lastPositions[termID]); Index: lucene/src/java/org/apache/lucene/index/Stats.java =================================================================== --- lucene/src/java/org/apache/lucene/index/Stats.java (revision 0) +++ lucene/src/java/org/apache/lucene/index/Stats.java (revision 0) @@ -0,0 +1,309 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Closeable; +import java.util.Map; +import java.util.HashMap; +import java.util.Arrays; + +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; + +/** + @lucene.experimental + */ + +public final class Stats { + + public static final String EXTENSION = "sts"; + public static final String CODEC_NAME = "Stats"; + + public static final int VERSION_START = 0; + public static final int VERSION_CURRENT = VERSION_START; + + public static class DocFieldStats { + public int termCount; // length in terms + public int uniqueTermCount; // number of unique terms + public float boost; // field's boost + } + + /** Used to read a segment */ + public static class FieldReader { + private final IndexInput in; + private final DocFieldStats docFieldStats = new DocFieldStats(); + + FieldReader(IndexInput in) { + this.in = in; + } + + /** Shared instance used to return results from {@link #next}. 
*/ + public DocFieldStats getDocFieldStats() { + return docFieldStats; + } + + public void next() throws IOException { + docFieldStats.boost = in.readFloat(); + docFieldStats.uniqueTermCount = in.readVInt(); + docFieldStats.termCount = in.readVInt(); + } + } + + /** Used to write a new segment */ + public static class FieldWriter { + private final IndexOutput out; + private int upto; + + FieldWriter(IndexOutput out) { + this.out = out; + } + + public void write(int docID, DocFieldStats stats) throws IOException { + fill(docID-upto); + out.writeFloat(stats.boost); + out.writeVInt(stats.uniqueTermCount); + out.writeVInt(stats.termCount); + upto++; + } + + public void finish(int docCount) throws IOException { + if (docCount > upto) { + fill(docCount-upto); + } + } + + public void reset() { + upto = 0; + } + + private void fill(int count) throws IOException { + for(int i=0;i= boosts.length) { + grow(docID); + } + boosts[docID] = state.boost; + + // nocommit -- nees to be configurable via Field/FieldType/SimAtIndexTime/Something: + boolean IGNORE_OVERLAPS = true; + if (IGNORE_OVERLAPS) { + uniqueTermCounts[docID] = state.uniqueTermCount - state.uniqueOverlapTermCount; + totalTermCounts[docID] = state.length - state.numOverlap; + } else { + uniqueTermCounts[docID] = state.uniqueTermCount; + totalTermCounts[docID] = state.length; + } + } + + public long ramBytesUsed() { + return (3*RamUsageEstimator.NUM_BYTES_INT + RamUsageEstimator.NUM_BYTES_FLOAT) * boosts.length; + } + + public void flush(FieldWriter w, int maxDocID) throws IOException { + final int limit = Math.min(maxDocID, boosts.length); + int upto = 0; + final DocFieldStats stats = new Stats.DocFieldStats(); + + while(upto < limit) { + stats.boost = boosts[upto]; + stats.uniqueTermCount = uniqueTermCounts[upto]; + stats.termCount = totalTermCounts[upto]; + w.write(upto++, stats); + } + w.finish(maxDocID); + reset(); + } + + public int compareTo(Object other) { + return fieldInfo.number - ((BufferedField) 
other).fieldInfo.number; + } + } + + public static class Reader implements Closeable { + + private final IndexInput in; + private final Map fields = new HashMap(); + + public Reader(FieldInfos fieldInfos, Directory dir, String segment) throws IOException { + final String fileName = IndexFileNames.segmentFileName(segment, "", EXTENSION); + in = dir.openInput(fileName); + CodecUtil.checkHeader(in, CODEC_NAME, VERSION_CURRENT, VERSION_CURRENT); + final long index = in.readLong(); + in.seek(index); + + final int fieldCount = in.readVInt(); + for(int i=0;i map = new HashMap(); + + private static class PerField implements Comparable { + final long fileOffset; + final int fieldNumber; + + public PerField(int fieldNumber, long fileOffset) { + this.fieldNumber = fieldNumber; + this.fileOffset = fileOffset; + } + + public int compareTo(Object other) { + return fieldNumber - ((PerField) other).fieldNumber; + } + } + + public Writer(SegmentWriteState state) throws IOException { + final String fileName = IndexFileNames.segmentFileName(state.segmentName, "", EXTENSION); + state.flushedFiles.add(fileName); + + out = state.directory.createOutput(fileName); + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + + // Placeholder for index + out.writeLong(0); + + fieldWriter = new FieldWriter(out); + } + + public FieldWriter addField(FieldInfo fieldInfo) throws IOException { + map.put(fieldInfo.name, new PerField(fieldInfo.number, out.getFilePointer())); + + // reuse + fieldWriter.reset(); + return fieldWriter; + } + + public void close() throws IOException { + final long indexFP = out.getFilePointer(); + + final PerField[] fields = map.values().toArray(new PerField[map.size()]); + Arrays.sort(fields); + + // Write index: + out.writeVInt(fields.length); + for(PerField field : fields) { + out.writeVInt(field.fieldNumber); + out.writeVLong(field.fileOffset); + } + out.seek(CodecUtil.headerLength(CODEC_NAME)); + out.writeLong(indexFP); + out.close(); + } + } + + /** Buffers stats 
in RAM, flushing them in the end using + * {@link Writer}. */ + public static class Buffer { + + private final Map map = new HashMap(); + + public synchronized BufferedField getField(FieldInfo fieldInfo) { + BufferedField result = map.get(fieldInfo.name); + if (result == null) { + result = new BufferedField(fieldInfo); + map.put(fieldInfo.name, result); + } + return result; + } + + public synchronized void flush(SegmentWriteState state) throws IOException { + if (map.size() > 0) { + + final Writer w = new Writer(state); + + final BufferedField[] fields = map.values().toArray(new BufferedField[map.size()]); + Arrays.sort(fields); + + for(BufferedField field : fields) { + field.flush(w.addField(field.fieldInfo), state.numDocs); + } + + w.close(); + } + } + } +} \ No newline at end of file Property changes on: lucene\src\java\org\apache\lucene\index\Stats.java ___________________________________________________________________ Added: svn:eol-style + native Index: lucene/src/java/org/apache/lucene/index/FieldInfo.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldInfo.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/FieldInfo.java (working copy) @@ -27,6 +27,7 @@ boolean storeTermVector; boolean storeOffsetWithTermVector; boolean storePositionWithTermVector; + public boolean storeIndexStats; public boolean omitNorms; // omit norms associated with indexed fields public boolean omitTermFreqAndPositions; @@ -35,7 +36,8 @@ FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, + boolean storeIndexStats) { name = na; isIndexed = tk; number = nu; @@ -46,6 +48,7 @@ this.storePayloads = storePayloads; this.omitNorms = omitNorms; 
this.omitTermFreqAndPositions = omitTermFreqAndPositions; + this.storeIndexStats = storeIndexStats; } else { // for non-indexed fields, leave defaults this.storeTermVector = false; this.storeOffsetWithTermVector = false; @@ -53,17 +56,19 @@ this.storePayloads = false; this.omitNorms = true; this.omitTermFreqAndPositions = false; + this.storeIndexStats = false; } } @Override public Object clone() { return new FieldInfo(name, isIndexed, number, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, storeIndexStats); } void update(boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, - boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, + boolean storeIndexStats) { if (this.isIndexed != isIndexed) { this.isIndexed = true; // once indexed, always index } @@ -86,6 +91,10 @@ if (this.omitTermFreqAndPositions != omitTermFreqAndPositions) { this.omitTermFreqAndPositions = true; // if one require omitTermFreqAndPositions at least once, it remains off for life } + if (this.storeIndexStats != storeIndexStats) { + // nocommit -- true? 
+ this.storeIndexStats = false; + } } } } Index: lucene/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (working copy) @@ -194,7 +194,8 @@ // easily add it FieldInfo fi = fieldInfos.add(fieldName, field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), - field.getOmitNorms(), false, field.getOmitTermFreqAndPositions()); + field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), + field.getIndexStats()); fp = new DocFieldProcessorPerField(this, fi); fp.next = fieldHash[hashPos]; @@ -206,7 +207,8 @@ } else fp.fieldInfo.update(field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), - field.getOmitNorms(), false, field.getOmitTermFreqAndPositions()); + field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), + field.getIndexStats()); if (thisFieldGen != fp.lastGen) { Index: lucene/src/java/org/apache/lucene/index/FieldInvertState.java =================================================================== --- lucene/src/java/org/apache/lucene/index/FieldInvertState.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/FieldInvertState.java (working copy) @@ -19,20 +19,29 @@ import org.apache.lucene.util.AttributeSource; /** - * This class tracks the number and position / offset parameters of terms - * being added to the index. The information collected in this class is - * also used to calculate the normalization factor for a field. + * This class gathers statistics during inversion of a + * field's value for a single document. This information is + * also used to calculate the normalization factor for a + * field, or to store index statistics. 
* * @lucene.experimental */ + public final class FieldInvertState { + // nocommit -- need binary length? int position; - int length; - int numOverlap; + int length; // number of tokens in this field + int numOverlap; // number of tokens w/ posIncr==0 int offset; - float boost; + int uniqueTermCount; // number of unique tokens in this field + int uniqueOverlapTermCount; // number of unique posIncr==0 tokens in this field + float boost; // net boost (product of per-Fieldable boosts, + // for multi-valued fields) AttributeSource attributeSource; + // nocommit -- not great that this is here? + boolean currentTokenOverlaps; + public FieldInvertState() { } @@ -51,6 +60,8 @@ void reset(float docBoost) { position = 0; length = 0; + uniqueTermCount = 0; + uniqueOverlapTermCount = 0; // per-document count: must restart with each document, like uniqueTermCount numOverlap = 0; offset = 0; boost = docBoost; Index: lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java =================================================================== --- lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (working copy) @@ -273,6 +273,15 @@ } @Override + public long totalFreq() { + long sum = 0; + for(int i=0;it. This method returns 0 if the term or - * field does not exists. This method does not take into - * account deleted documents that have not yet been merged - * away. */ + * t. This method returns 0 if the term or + * field does not exist. This method does not take into + * account deleted documents that have not yet been + * merged away. */ public int docFreq(String field, BytesRef term) throws IOException { final Fields fields = fields(); if (fields == null) { @@ -984,6 +985,23 @@ return terms.docFreq(term); } + /** Returns the total number of occurrences of this term, + * across all docs. This method returns 0 if the term or + * field does not exist. This method does not take into + * account deleted documents that have not yet been + * merged away. 
*/ + public long totalFreq(String field, BytesRef term) throws IOException { + final Fields fields = fields(); + if (fields == null) { + return 0; + } + final Terms terms = fields.terms(field); + if (terms == null) { + return 0; + } + return terms.totalFreq(term); + } + /** This may return null if the field does not exist.*/ public Terms terms(String field) throws IOException { final Fields fields = fields(); @@ -1385,4 +1403,32 @@ Fields retrieveFields() { return fields; } + + /** @lucene.experimental */ + public Stats.Reader getStatsReader() throws IOException { + // nocommit -- throw UOE? + return null; + } + + public boolean hasIndexStats(String field) throws IOException { + final Stats.Reader r = getStatsReader(); + if (r != null) { + return r.hasField(field); + } else { + return false; + } + } + + // nocommit -- not right -- eg SegmentReader.clone should + // make forward shallow copy of this + + private final Map simDataCache = new HashMap(); + + public Object getSimDataCache(String field) { + return simDataCache.get(field); + } + + public void putSimDataCache(String field, Object v) { + simDataCache.put(field, v); + } } Index: lucene/src/java/org/apache/lucene/index/IndexWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/IndexWriter.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/IndexWriter.java (working copy) @@ -3594,6 +3594,7 @@ if (infoStream != null) { message("handleMergeException: merge=" + merge.segString(directory) + " exc=" + t); + t.printStackTrace(infoStream); } // Set the exception on the merge, so if @@ -3651,7 +3652,7 @@ } finally { synchronized(this) { mergeFinish(merge); - + if (!success) { if (infoStream != null) message("hit exception during merge"); Index: lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java =================================================================== --- 
lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java (working copy) @@ -113,9 +113,13 @@ out.writeInt(fieldCount); for(int i=0;i 0; @@ -178,9 +183,13 @@ termWriter.write(text); out.writeVInt(numDocs); - + if (fieldInfo.storeIndexStats) { + out.writeVLong(totalFreq); + } + postingsWriter.finishTerm(numDocs, isIndexTerm); numTerms++; + fieldTotalFreq += totalFreq; } // Finishes all terms in this field Index: lucene/src/java/org/apache/lucene/index/codecs/TermState.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/TermState.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/codecs/TermState.java (working copy) @@ -33,10 +33,14 @@ public long filePointer; // fp into the terms dict primary file (_X.tis) public int docFreq; // how many docs have this term + // nocommit -- can we do this in a subclass...? 
+ public long totalFreq; // total # times this term occurs + public void copy(TermState other) { ord = other.ord; filePointer = other.filePointer; docFreq = other.docFreq; + totalFreq = other.totalFreq; } @Override @@ -51,6 +55,6 @@ @Override public String toString() { - return "tis.fp=" + filePointer + " docFreq=" + docFreq + " ord=" + ord; + return "tis.fp=" + filePointer + " docFreq=" + docFreq + " totalFreq=" + totalFreq + " ord=" + ord; } } Index: lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsWriter.java (working copy) @@ -85,7 +85,7 @@ } @Override - public void finishTerm(BytesRef term, int numDocs) throws IOException { + public void finishTerm(BytesRef term, int numDocs, long totalFreq) throws IOException { } @Override Index: lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/codecs/simpletext/SimpleTextFieldsReader.java (working copy) @@ -244,6 +244,11 @@ public Comparator getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); } + + @Override + public long totalFreq() { + throw new UnsupportedOperationException("SimpleTextCodec doesn't support flex scoring yet"); + } } private class SimpleTextDocsEnum extends DocsEnum { Index: lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java (revision 1023783) +++ 
lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java (working copy) @@ -52,11 +52,20 @@ * for each doc */ public abstract void finishDoc() throws IOException; + /** @lucene.experimental */ + public static class MergeResult { + int numDocs; + int totalFreq; + } + + private MergeResult mergeResult; + /** Default merge impl: append documents, mapping around * deletes */ - public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException { + public MergeResult merge(final MergeState mergeState, final DocsEnum postings) throws IOException { int df = 0; + int totalFreq = 0; if (mergeState.fieldInfo.omitTermFreqAndPositions) { while(true) { @@ -76,6 +85,7 @@ break; } final int freq = postingsEnum.freq(); + totalFreq += freq; this.startDoc(doc, freq); for(int i=0;i 0. */ - public abstract void finishTerm(BytesRef text, int numDocs) throws IOException; + public abstract void finishTerm(BytesRef text, int numDocs, long totalCount) throws IOException; /** Called when we are done adding terms to this field */ public abstract void finish() throws IOException; @@ -69,9 +69,9 @@ if (docsEnumIn != null) { docsEnum.reset(docsEnumIn); final PostingsConsumer postingsConsumer = startTerm(term); - final int numDocs = postingsConsumer.merge(mergeState, docsEnum); - if (numDocs > 0) { - finishTerm(term, numDocs); + final PostingsConsumer.MergeResult result = postingsConsumer.merge(mergeState, docsEnum); + if (result.numDocs > 0) { + finishTerm(term, result.numDocs, result.totalFreq); } } } @@ -94,9 +94,9 @@ } } final PostingsConsumer postingsConsumer = startTerm(term); - final int numDocs = postingsConsumer.merge(mergeState, postingsEnum); - if (numDocs > 0) { - finishTerm(term, numDocs); + final PostingsConsumer.MergeResult result = postingsConsumer.merge(mergeState, postingsEnum); + if (result.numDocs > 0) { + finishTerm(term, result.numDocs, result.totalFreq); } } } Index: 
lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java (working copy) @@ -141,11 +141,17 @@ assert numTerms >= 0; final long termsStartPointer = in.readLong(); final TermsIndexReaderBase.FieldReader fieldIndexReader; + final long totalTermCount; final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + if (fieldInfo.storeIndexStats) { + totalTermCount = in.readVLong(); + } else { + totalTermCount = 0; + } fieldIndexReader = indexReader.getField(fieldInfo); if (numTerms > 0) { assert !fields.containsKey(fieldInfo.name); - fields.put(fieldInfo.name, new FieldReader(fieldIndexReader, fieldInfo, numTerms, termsStartPointer)); + fields.put(fieldInfo.name, new FieldReader(fieldIndexReader, fieldInfo, numTerms, termsStartPointer, totalTermCount)); } } success = true; @@ -249,16 +255,18 @@ private class FieldReader extends Terms implements Closeable { final long numTerms; + final long totalTermCount; final FieldInfo fieldInfo; final long termsStartPointer; final TermsIndexReaderBase.FieldReader fieldIndexReader; - FieldReader(TermsIndexReaderBase.FieldReader fieldIndexReader, FieldInfo fieldInfo, long numTerms, long termsStartPointer) { + FieldReader(TermsIndexReaderBase.FieldReader fieldIndexReader, FieldInfo fieldInfo, long numTerms, long termsStartPointer, long totalTermCount) { assert numTerms > 0; this.fieldInfo = fieldInfo; this.numTerms = numTerms; this.termsStartPointer = termsStartPointer; this.fieldIndexReader = fieldIndexReader; + this.totalTermCount = totalTermCount; } @Override @@ -281,6 +289,11 @@ return numTerms; } + @Override + public long getTotalTermCount() throws IOException { + return totalTermCount; + } + // Iterates through terms in this field private class SegmentTermsEnum extends 
TermsEnum { private final IndexInput in; @@ -471,6 +484,10 @@ bytesReader.read(); state.docFreq = in.readVInt(); + if (fieldInfo.storeIndexStats) { + state.totalFreq = in.readVLong(); + } + // TODO: would be cleaner, but space-wasting, to // simply record a bit into each index entry as to // whether it's an index entry or not, rather than @@ -491,6 +508,11 @@ public int docFreq() { return state.docFreq; } + + @Override + public long totalFreq() { + return state.totalFreq; + } @Override public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException { Index: lucene/src/java/org/apache/lucene/index/MultiFields.java =================================================================== --- lucene/src/java/org/apache/lucene/index/MultiFields.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/MultiFields.java (working copy) @@ -177,6 +177,15 @@ return result; } + // nocommit jdocs + public static long totalFreq(IndexReader reader, String field, BytesRef term) throws IOException { + final Terms terms = getTerms(reader, field); + if (terms == null) { + return 0; + } + return terms.totalFreq(term); + } + /** This method may return null if the field does not exist.*/ public static Terms getTerms(IndexReader r, String field) throws IOException { final Fields fields = getFields(r); Index: lucene/src/java/org/apache/lucene/index/DocInverter.java =================================================================== --- lucene/src/java/org/apache/lucene/index/DocInverter.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/DocInverter.java (working copy) @@ -33,6 +33,7 @@ final InvertedDocConsumer consumer; final InvertedDocEndConsumer endConsumer; + Stats.Buffer statsBuffer; public DocInverter(InvertedDocConsumer consumer, InvertedDocEndConsumer endConsumer) { this.consumer = consumer; @@ -42,6 +43,9 @@ @Override void setFieldInfos(FieldInfos fieldInfos) { super.setFieldInfos(fieldInfos); + if (statsBuffer == null) { + statsBuffer = new 
Stats.Buffer(); + } consumer.setFieldInfos(fieldInfos); endConsumer.setFieldInfos(fieldInfos); } @@ -71,6 +75,7 @@ consumer.flush(childThreadsAndFields, state); endConsumer.flush(endChildThreadsAndFields, state); + statsBuffer.flush(state); } @Override Index: lucene/src/java/org/apache/lucene/index/Terms.java =================================================================== --- lucene/src/java/org/apache/lucene/index/Terms.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/index/Terms.java (working copy) @@ -57,6 +57,18 @@ } } + /** Returns the total number of occurrences of this term + * in the field (see {@link TermsEnum#totalFreq}). + * Returns 0 if the term does not exist. */ + public long totalFreq(BytesRef text) throws IOException { + final TermsEnum termsEnum = getThreadTermsEnum(); + if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) { + return termsEnum.totalFreq(); + } else { + return 0; + } + } + /** Get {@link DocsEnum} for the specified term. This * method may return null if the term does not exist. */ public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException { @@ -80,10 +92,18 @@ } } + /** Optional: returns total unique number of terms. */ public long getUniqueTermCount() throws IOException { throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()"); } + /** Optional: returns total number of term occurrences (sum + * of length of all documents across this field) in this + * field.
*/ + public long getTotalTermCount() throws IOException { + throw new UnsupportedOperationException("this reader does not implement getTotalTermCount()"); + } + protected TermsEnum getThreadTermsEnum() throws IOException { TermsEnum termsEnum = threadEnums.get(); if (termsEnum == null) { @@ -97,5 +117,6 @@ protected void close() { threadEnums.close(); } + public final static Terms[] EMPTY_ARRAY = new Terms[0]; } Index: lucene/src/java/org/apache/lucene/store/DataInput.java =================================================================== --- lucene/src/java/org/apache/lucene/store/DataInput.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/store/DataInput.java (working copy) @@ -141,6 +141,11 @@ return clone; } + public float readFloat() throws IOException { + // nocommit -- byte order? + return Float.intBitsToFloat(readInt()); + } + public Map readStringStringMap() throws IOException { final Map map = new HashMap(); final int count = readInt(); Index: lucene/src/java/org/apache/lucene/store/DataOutput.java =================================================================== --- lucene/src/java/org/apache/lucene/store/DataOutput.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/store/DataOutput.java (working copy) @@ -95,6 +95,11 @@ writeByte((byte)i); } + public void writeFloat(float f) throws IOException { + // nocommit -- byte order? + writeInt(Float.floatToRawIntBits(f)); + } + /** Writes a string. 
* @see DataInput#readString() */ Index: lucene/src/java/org/apache/lucene/util/ArrayUtil.java =================================================================== --- lucene/src/java/org/apache/lucene/util/ArrayUtil.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/util/ArrayUtil.java (working copy) @@ -254,6 +254,16 @@ return grow(array, 1 + array.length); } + public static float[] shrink(float[] array, int targetSize) { + final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_FLOAT); + if (newSize != array.length) { + float[] newArray = new float[newSize]; + System.arraycopy(array, 0, newArray, 0, newSize); + return newArray; + } else + return array; + } + public static short[] shrink(short[] array, int targetSize) { final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_SHORT); if (newSize != array.length) { Index: lucene/src/java/org/apache/lucene/document/Fieldable.java =================================================================== --- lucene/src/java/org/apache/lucene/document/Fieldable.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/document/Fieldable.java (working copy) @@ -209,4 +209,10 @@ * silently fail to find results. 
*/ void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions); + + // nocommit jdocs + boolean getIndexStats(); + + // nocommit jdocs + void setIndexStats(boolean doStoreIndexStats); } Index: lucene/src/java/org/apache/lucene/document/AbstractField.java =================================================================== --- lucene/src/java/org/apache/lucene/document/AbstractField.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/document/AbstractField.java (working copy) @@ -39,6 +39,8 @@ protected boolean isBinary = false; protected boolean lazy = false; protected boolean omitTermFreqAndPositions = false; + // nocommit -- change back to false + protected boolean storeIndexStats = true; protected float boost = 1.0f; // the data object for all different kind of field values protected Object fieldsData = null; @@ -229,8 +231,19 @@ * PhraseQuery} or {@link SpanQuery} subclasses will * silently fail to find results. */ - public void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions) { this.omitTermFreqAndPositions=omitTermFreqAndPositions; } + public void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions) { + this.omitTermFreqAndPositions=omitTermFreqAndPositions; + storeIndexStats = false; + } + public boolean getIndexStats() { + return storeIndexStats; + } + + public void setIndexStats(boolean v) { + storeIndexStats = v; + } + public boolean isLazy() { return lazy; } Index: lucene/src/java/org/apache/lucene/document/Field.java =================================================================== --- lucene/src/java/org/apache/lucene/document/Field.java (revision 1023783) +++ lucene/src/java/org/apache/lucene/document/Field.java (working copy) @@ -417,6 +417,7 @@ this.omitNorms = index.omitNorms(); if (index == Index.NO) { this.omitTermFreqAndPositions = false; + storeIndexStats = false; } this.isBinary = false; @@ -586,6 +587,7 @@ isIndexed = false; isTokenized = false; omitTermFreqAndPositions = false; + storeIndexStats = 
false; omitNorms = true; isBinary = true;