Index: lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java =================================================================== --- lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java (revision 1306869) +++ lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java (working copy) @@ -27,6 +27,7 @@ import org.apache.lucene.index.PruningReader; import org.apache.lucene.index.pruning.CarmelTopKTermPruningPolicy; import org.apache.lucene.index.pruning.PruningPolicy; +import org.apache.lucene.index.pruning.RIDFTermPruningPolicy; import org.apache.lucene.index.pruning.StorePruningPolicy; import org.apache.lucene.index.pruning.TFTermPruningPolicy; import org.apache.lucene.search.IndexSearcher; @@ -55,7 +56,8 @@ try { int i = 0; while(td.next()) { - assertEquals(t + ", i=" + i, ids[i], td.doc()); + int doc = td.doc(); + assertEquals(t + ", i=" + i, ids[i], doc); i++; } assertEquals(ids.length, i); @@ -208,6 +210,20 @@ ir.deleteDocument(5); ir.close(); } + + public void testRIDFPruning() throws Exception { + RAMDirectory targetDir = new RAMDirectory(); + IndexReader in = IndexReader.open(sourceDir, true); + // remove only very popular terms + RIDFTermPruningPolicy ridf = new RIDFTermPruningPolicy(in, null, null, -0.12); + PruningReader tfr = new PruningReader(in, null, ridf); + assertTDCount(tfr, new Term("body", "one"), 0); + assertTD(tfr, new Term("body", "two"), new int[]{0, 1, 2, 4}); + assertTD(tfr, new Term("body", "three"), new int[]{0, 1, 3}); + assertTD(tfr, new Term("test", "one"), new int[]{4}); + assertTD(tfr, new Term("body", "four"), new int[]{0}); + assertTD(tfr, new Term("test", "four"), new int[]{4}); + } public void testTfPruning() throws Exception { RAMDirectory targetDir = new RAMDirectory(); Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java (revision 1306869) +++ lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java (working copy) @@ -29,15 +29,27 @@ /** * Implementation of {@link TermPruningPolicy} that uses "residual IDF" - * metric to determine the postings of terms to keep/remove. Residual - * IDF is a difference between a collection-wide IDF of a term and the - * observed in-document frequency of the term. + * metric to determine the postings of terms to keep/remove, as defined in + * http://www.dc.fi.udc.es/~barreiro/publications/blanco_barreiro_ecir2007.pdf. + *

Residual IDF measures a difference between a collection-wide IDF of a term + * (which assumes a uniform distribution of occurrences) and the actual + * observed total number of occurrences of a term in all documents. Positive + * values indicate that a term is informative (e.g. for rare terms), negative + * values indicate that a term is not informative (e.g. too popular to offer + * good selectivity). + *

This metric produces small values close to [-1, 1], so useful ranges for + * thresholds under this metrics are somewhere between [0, 1]. The higher the + * threshold the more informative (and more rare) terms will be retained. For + * filtering of common words a value of close to or slightly below 0 (e.g. -0.1) + * should be a good starting point. + * */ public class RIDFTermPruningPolicy extends TermPruningPolicy { double defThreshold; Map thresholds; - double df; + double idf; double maxDoc; + double ridf; public RIDFTermPruningPolicy(IndexReader in, Map fieldFlags, Map thresholds, @@ -54,7 +66,18 @@ @Override public void initPositionsTerm(TermPositions tp, Term t) throws IOException { - df = Math.log(in.docFreq(t) / maxDoc); + // from formula [2], not the formula [1] + // + idf = - Math.log((double)in.docFreq(t) / maxDoc); + // calculate total number of occurrences + int totalFreq = 0; + while (tp.next()) { + totalFreq += tp.freq(); + } + // reposition the enum + tp.seek(t); + // rest of the formula [2] in the paper + ridf = idf + Math.log(1 - Math.pow(Math.E, - totalFreq / maxDoc)); } @Override @@ -65,7 +88,6 @@ @Override public boolean pruneAllPositions(TermPositions termPositions, Term t) throws IOException { - double ridf = Math.log(1 - Math.pow(Math.E, termPositions.freq() / maxDoc)) - df; double thr = defThreshold; String key = t.field() + ":" + t.text(); if (thresholds.containsKey(key)) {