Index: lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java =================================================================== --- lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java (revision 1306869) +++ lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java (working copy) @@ -27,6 +27,7 @@ import org.apache.lucene.index.PruningReader; import org.apache.lucene.index.pruning.CarmelTopKTermPruningPolicy; import org.apache.lucene.index.pruning.PruningPolicy; +import org.apache.lucene.index.pruning.RIDFTermPruningPolicy; import org.apache.lucene.index.pruning.StorePruningPolicy; import org.apache.lucene.index.pruning.TFTermPruningPolicy; import org.apache.lucene.search.IndexSearcher; @@ -55,7 +56,8 @@ try { int i = 0; while(td.next()) { - assertEquals(t + ", i=" + i, ids[i], td.doc()); + int doc = td.doc(); + assertEquals(t + ", i=" + i, ids[i], doc); i++; } assertEquals(ids.length, i); @@ -208,6 +210,20 @@ ir.deleteDocument(5); ir.close(); } + + public void testRIDFPruning() throws Exception { + RAMDirectory targetDir = new RAMDirectory(); + IndexReader in = IndexReader.open(sourceDir, true); + // remove only very popular terms + RIDFTermPruningPolicy ridf = new RIDFTermPruningPolicy(in, null, null, -0.12); + PruningReader tfr = new PruningReader(in, null, ridf); + assertTDCount(tfr, new Term("body", "one"), 0); + assertTD(tfr, new Term("body", "two"), new int[]{0, 1, 2, 4}); + assertTD(tfr, new Term("body", "three"), new int[]{0, 1, 3}); + assertTD(tfr, new Term("test", "one"), new int[]{4}); + assertTD(tfr, new Term("body", "four"), new int[]{0}); + assertTD(tfr, new Term("test", "four"), new int[]{4}); + } public void testTfPruning() throws Exception { RAMDirectory targetDir = new RAMDirectory(); Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java (revision 1306869) +++ lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java (working copy) @@ -29,15 +29,27 @@ /** * Implementation of {@link TermPruningPolicy} that uses "residual IDF" - * metric to determine the postings of terms to keep/remove. Residual - * IDF is a difference between a collection-wide IDF of a term and the - * observed in-document frequency of the term. + * metric to determine the postings of terms to keep/remove, as defined in + * http://www.dc.fi.udc.es/~barreiro/publications/blanco_barreiro_ecir2007.pdf. + *
Residual IDF measures a difference between a collection-wide IDF of a term + * (which assumes a uniform distribution of occurrences) and the actual + * observed total number of occurrences of a term in all documents. Positive + * values indicate that a term is informative (e.g. for rare terms), negative + * values indicate that a term is not informative (e.g. too popular to offer + * good selectivity). + *
This metric produces small values close to [-1, 1], so useful ranges for
+ * thresholds under this metrics are somewhere between [0, 1]. The higher the
+ * threshold the more informative (and more rare) terms will be retained. For
+ * filtering of common words a value of close to or slightly below 0 (e.g. -0.1)
+ * should be a good starting point.
+ *
*/
public class RIDFTermPruningPolicy extends TermPruningPolicy {
double defThreshold;
Map