Index: lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java (revision 1235565) +++ lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java (working copy) @@ -336,7 +336,7 @@ } @Override - public float idf(int docFreq, int numDocs) { + public float idf(long docFreq, long numDocs) { return 1; } Index: lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java (revision 1235565) +++ lucene/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java (working copy) @@ -73,7 +73,7 @@ } @Override - public float idf(int docFreq, int numDocs) { + public float idf(long docFreq, long numDocs) { return 1.0f; } } Index: lucene/src/test/org/apache/lucene/search/TestSimilarity.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestSimilarity.java (revision 1235565) +++ lucene/src/test/org/apache/lucene/search/TestSimilarity.java (working copy) @@ -49,7 +49,7 @@ @Override public void computeNorm(FieldInvertState state, Norm norm) { norm.setByte(encodeNormValue(state.getBoost())); } @Override public float tf(float freq) { return freq; } @Override public float sloppyFreq(int distance) { return 2.0f; } - @Override public float idf(int docFreq, int numDocs) { return 1.0f; } + @Override public float idf(long docFreq, long numDocs) { return 1.0f; } @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] stats) { return new Explanation(1.0f, "Inexplicable"); } Index: lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java =================================================================== --- lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java (revision 1235565) +++ lucene/src/test/org/apache/lucene/search/TestSimilarityProvider.java (working copy) @@ -129,7 +129,7 @@ } @Override - public float idf(int docFreq, int numDocs) { + public float idf(long docFreq, long numDocs) { return 1f; } @@ -157,7 +157,7 @@ } @Override - public float idf(int docFreq, int numDocs) { + public float idf(long docFreq, long numDocs) { return 10f; } Index: lucene/src/test/org/apache/lucene/index/TestOmitTf.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestOmitTf.java (revision 1235565) +++ lucene/src/test/org/apache/lucene/index/TestOmitTf.java (working copy) @@ -47,7 +47,7 @@ @Override public void computeNorm(FieldInvertState state, Norm norm) { norm.setByte(encodeNormValue(state.getBoost())); } @Override public float tf(float freq) { return freq; } @Override public float sloppyFreq(int distance) { return 2.0f; } - @Override public float idf(int docFreq, int numDocs) { return 1.0f; } + @Override public float idf(long docFreq, long numDocs) { return 1.0f; } @Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) { return new Explanation(1.0f, "Inexplicable"); } Index: lucene/src/java/org/apache/lucene/search/TermStatistics.java =================================================================== --- lucene/src/java/org/apache/lucene/search/TermStatistics.java (revision 1235565) +++ lucene/src/java/org/apache/lucene/search/TermStatistics.java (working copy) @@ -25,10 +25,10 @@ */ public class TermStatistics { private final BytesRef term; - private final int docFreq; + private final long docFreq; private final long totalTermFreq; - public TermStatistics(BytesRef term, int docFreq, long totalTermFreq) { + public TermStatistics(BytesRef term, long docFreq, long totalTermFreq) { this.term = term; this.docFreq = docFreq; this.totalTermFreq = totalTermFreq; @@ -41,7 +41,7 @@ /** returns the number of documents this term occurs in * @see IndexReader#docFreq(String, BytesRef) */ - public final int docFreq() { + public final long docFreq() { return docFreq; } @@ -50,4 +50,22 @@ public final long totalTermFreq() { return totalTermFreq; } + + /** sugar method: returns the summation with another statistics */ + public TermStatistics add(TermStatistics other) { + assert term().equals(other.term()); + return new TermStatistics(term(), docFreq()+other.docFreq(), + add(totalTermFreq(),other.totalTermFreq())); + } + + // TODO: can't we put this somewhere better? TermContext has this logic too + // note: messy because we have to preserve -1 if there are any + // 3.x segments that don't support these stats: remove this in 5.0 + private long add(long l1, long l2) { + if (l1 >= 0 && l2 >= 0) { + return l1 + l2; + } else { + return -1; + } + } } Index: lucene/src/java/org/apache/lucene/search/CollectionStatistics.java =================================================================== --- lucene/src/java/org/apache/lucene/search/CollectionStatistics.java (revision 1235565) +++ lucene/src/java/org/apache/lucene/search/CollectionStatistics.java (working copy) @@ -26,12 +26,12 @@ */ public class CollectionStatistics { private final String field; - private final int maxDoc; - private final int docCount; + private final long maxDoc; + private final long docCount; private final long sumTotalTermFreq; private final long sumDocFreq; - public CollectionStatistics(String field, int maxDoc, int docCount, long sumTotalTermFreq, long sumDocFreq) { + public CollectionStatistics(String field, long maxDoc, long docCount, long sumTotalTermFreq, long sumDocFreq) { this.field = field; this.maxDoc = maxDoc; this.docCount = docCount; @@ -47,14 +47,14 @@ /** returns the total number of documents, regardless of * whether they all contain values for this field. * @see IndexReader#maxDoc() */ - public final int maxDoc() { + public final long maxDoc() { return maxDoc; } /** returns the total number of documents that * have at least one term for this field. * @see Terms#getDocCount() */ - public final int docCount() { + public final long docCount() { return docCount; } @@ -69,4 +69,23 @@ public final long sumDocFreq() { return sumDocFreq; } + + /** sugar method: returns the summation with another statistics */ + public CollectionStatistics add(CollectionStatistics other) { + assert field().equals(other.field()); + return new CollectionStatistics(field(), + maxDoc()+other.maxDoc(), add(docCount(),other.docCount()), + add(sumTotalTermFreq(),other.sumTotalTermFreq()), add(sumDocFreq(),other.sumDocFreq())); + } + + // TODO: can't we put this somewhere better? TermContext has this logic too + // note: messy because we have to preserve -1 if there are any + // 3.x segments that don't support these stats: remove this in 5.0 + private long add(long l1, long l2) { + if (l1 >= 0 && l2 >= 0) { + return l1 + l2; + } else { + return -1; + } + } } Index: lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java =================================================================== --- lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java (revision 1235565) +++ lucene/src/java/org/apache/lucene/search/similarities/BM25Similarity.java (working copy) @@ -58,7 +58,7 @@ } /** Implemented as log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5)). */ - protected float idf(int docFreq, int numDocs) { + protected float idf(long docFreq, long numDocs) { return (float) Math.log(1 + (numDocs - docFreq + 0.5D)/(docFreq + 0.5D)); } @@ -131,19 +131,19 @@ } public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { - final int df = termStats.docFreq(); - final int max = collectionStats.maxDoc(); + final long df = termStats.docFreq(); + final long max = collectionStats.maxDoc(); final float idf = idf(df, max); return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); } public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { - final int max = collectionStats.maxDoc(); + final long max = collectionStats.maxDoc(); float idf = 0.0f; final Explanation exp = new Explanation(); exp.setDescription("idf(), sum of:"); for (final TermStatistics stat : termStats ) { - final int df = stat.docFreq(); + final long df = stat.docFreq(); final float termIdf = idf(df, max); exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); idf += termIdf; Index: lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java =================================================================== --- lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java (revision 1235565) +++ lucene/src/java/org/apache/lucene/search/similarities/BasicModelIF.java (working copy) @@ -26,7 +26,7 @@ public class BasicModelIF extends BasicModel { @Override public final float score(BasicStats stats, float tfn) { - int N = stats.getNumberOfDocuments(); + long N = stats.getNumberOfDocuments(); long F = stats.getTotalTermFreq(); return tfn * (float)(log2(1 + (N + 1) / (F + 0.5))); } Index: lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java =================================================================== --- lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java (revision 1235565) +++ lucene/src/java/org/apache/lucene/search/similarities/SimilarityBase.java (working copy) @@ -87,9 +87,9 @@ /** Fills all member fields defined in {@code BasicStats} in {@code stats}. * Subclasses can override this method to fill additional stats. */ protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) { - int numberOfDocuments = collectionStats.maxDoc(); + long numberOfDocuments = collectionStats.maxDoc(); - int docFreq = termStats.docFreq(); + long docFreq = termStats.docFreq(); long totalTermFreq = termStats.totalTermFreq(); // codec does not supply totalTermFreq: substitute docFreq Index: lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java =================================================================== --- lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java (revision 1235565) +++ lucene/src/java/org/apache/lucene/search/similarities/BasicStats.java (working copy) @@ -25,13 +25,13 @@ */ public class BasicStats extends Similarity.Stats { /** The number of documents. */ - protected int numberOfDocuments; + protected long numberOfDocuments; /** The total number of tokens in the field. */ protected long numberOfFieldTokens; /** The average field length. */ protected float avgFieldLength; /** The document frequency. */ - protected int docFreq; + protected long docFreq; /** The total number of occurrences of this term across all documents. */ protected long totalTermFreq; @@ -55,12 +55,12 @@ // ------------------------- Getter/setter methods ------------------------- /** Returns the number of documents. */ - public int getNumberOfDocuments() { + public long getNumberOfDocuments() { return numberOfDocuments; } /** Sets the number of documents. */ - public void setNumberOfDocuments(int numberOfDocuments) { + public void setNumberOfDocuments(long numberOfDocuments) { this.numberOfDocuments = numberOfDocuments; } @@ -91,12 +91,12 @@ } /** Returns the document frequency. */ - public int getDocFreq() { + public long getDocFreq() { return docFreq; } /** Sets the document frequency. */ - public void setDocFreq(int docFreq) { + public void setDocFreq(long docFreq) { this.docFreq = docFreq; } Index: lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java =================================================================== --- lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (revision 1235565) +++ lucene/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java (working copy) @@ -333,13 +333,13 @@ * idf(t) appears for t in both the query and the document, * hence it is squared in the equation. * The default computation for idf(t) in - * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) DefaultSimilarity} is: + * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(long, long) DefaultSimilarity} is: * *
 
* * * *
- * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) idf(t)}  =   + * {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(long, long) idf(t)}  =   * * 1 + log ( @@ -526,7 +526,7 @@ public abstract class TFIDFSimilarity extends Similarity { /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} + * document. This value is multiplied by the {@link #idf(long, long)} * factor for each term in the query and these products are then summed to * form the initial score for a document. * @@ -545,7 +545,7 @@ } /** Computes a score factor based on a term or phrase's frequency in a - * document. This value is multiplied by the {@link #idf(int, int)} + * document. This value is multiplied by the {@link #idf(long, long)} * factor for each term in the query and these products are then summed to * form the initial score for a document. * @@ -583,8 +583,8 @@ * @throws IOException */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { - final int df = termStats.docFreq(); - final int max = collectionStats.maxDoc(); + final long df = termStats.docFreq(); + final long max = collectionStats.maxDoc(); final float idf = idf(df, max); return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"); } @@ -604,12 +604,12 @@ * @throws IOException */ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { - final int max = collectionStats.maxDoc(); + final long max = collectionStats.maxDoc(); float idf = 0.0f; final Explanation exp = new Explanation(); exp.setDescription("idf(), sum of:"); for (final TermStatistics stat : termStats ) { - final int df = stat.docFreq(); + final long df = stat.docFreq(); final float termIdf = idf(df, max); exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")")); idf += termIdf; @@ -631,7 +631,7 @@ * @param numDocs the total number of documents in the collection * @return a score factor based on the term's document frequency */ - public abstract float idf(int docFreq, int numDocs); + public abstract float idf(long docFreq, long numDocs); /** Cache of decoded bytes. */ private static final float[] NORM_TABLE = new float[256]; Index: lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java =================================================================== --- lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java (revision 1235565) +++ lucene/src/java/org/apache/lucene/search/similarities/AfterEffectB.java (working copy) @@ -27,7 +27,7 @@ @Override public final float score(BasicStats stats, float tfn) { long F = stats.getTotalTermFreq()+1; - int n = stats.getDocFreq()+1; + long n = stats.getDocFreq()+1; return (F + 1) / (n * (tfn + 1)); } Index: lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java =================================================================== --- lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java (revision 1235565) +++ lucene/src/java/org/apache/lucene/search/similarities/BasicModelIn.java (working copy) @@ -27,8 +27,8 @@ public class BasicModelIn extends BasicModel { @Override public final float score(BasicStats stats, float tfn) { - int N = stats.getNumberOfDocuments(); - int n = stats.getDocFreq(); + long N = stats.getNumberOfDocuments(); + long n = stats.getDocFreq(); return tfn * (float)(log2((N + 1) / (n + 0.5))); } Index: lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java =================================================================== --- lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java (revision 1235565) +++ lucene/src/java/org/apache/lucene/search/similarities/DefaultSimilarity.java (working copy) @@ -62,7 +62,7 @@ /** Implemented as log(numDocs/(docFreq+1)) + 1. */ @Override - public float idf(int docFreq, int numDocs) { + public float idf(long docFreq, long numDocs) { return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); } Index: lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java =================================================================== --- lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java (revision 1235565) +++ lucene/src/java/org/apache/lucene/search/similarities/BasicModelIne.java (working copy) @@ -27,7 +27,7 @@ public class BasicModelIne extends BasicModel { @Override public final float score(BasicStats stats, float tfn) { - int N = stats.getNumberOfDocuments(); + long N = stats.getNumberOfDocuments(); long F = stats.getTotalTermFreq(); double ne = N * (1 - Math.pow((N - 1) / (double)N, F)); return tfn * (float)(log2((N + 1) / (ne + 0.5)));