Index: build.xml
===================================================================
--- build.xml	(revision 831898)
+++ build.xml	(working copy)
@@ -312,6 +312,7 @@
           <packageset dir="contrib/lucli/src/java"/>
           <packageset dir="contrib/memory/src/java"/>
           <packageset dir="contrib/misc/src/java"/>
+          <packageset dir="contrib/pruning/src/java"/>
           <packageset dir="contrib/queries/src/java"/>
           <packageset dir="contrib/regex/src/java"/>
           <packageset dir="contrib/remote/src/java"/>
Index: contrib/pruning/pom.xml.template
===================================================================
--- contrib/pruning/pom.xml.template	(revision 0)
+++ contrib/pruning/pom.xml.template	(revision 0)
@@ -0,0 +1,38 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+    
+    http://www.apache.org/licenses/LICENSE-2.0
+    
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+  -->
+
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.lucene</groupId>
+    <artifactId>lucene-contrib</artifactId>
+    <version>@version@</version>
+  </parent>
+  <groupId>org.apache.lucene</groupId>
+  <artifactId>lucene-pruning</artifactId>
+  <name>Lucene Pruning</name>
+  <version>@version@</version>
+  <description>
+    Pruning Lucene indexes by various criteria.
+  </description>
+  <packaging>jar</packaging>
+</project>
Index: contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java
===================================================================
--- contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java	(revision 0)
+++ contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java	(revision 0)
@@ -0,0 +1,165 @@
+package org.apache.lucene.index;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.PruningPolicy;
+import org.apache.lucene.index.PruningReader;
+import org.apache.lucene.index.StorePruningPolicy;
+import org.apache.lucene.index.TFTermPruningPolicy;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.RAMDirectory;
+
+import junit.framework.TestCase;
+
+public class TestPruningReader extends TestCase {
+  RAMDirectory sourceDir = new RAMDirectory();
+  
+  private void assertTD(IndexReader ir, Term t, int[] ids) throws Exception {
+    TermPositions td = ir.termPositions(t);
+    assertNotNull(td);
+    try {
+      int i = 0;
+      while(td.next()) {
+        assertEquals(t + ", i=" + i, ids[i], td.doc());
+        i++;
+      }
+      assertEquals(ids.length, i);
+    } finally {
+      td.close();
+    }
+  }
+  
+  private void assertTDCount(IndexReader ir, Term t, int count) throws Exception {
+    TermPositions td = ir.termPositions(t);
+    assertNotNull(td);
+    try {
+      int i = 0;
+      while (td.next()) i++;
+      assertEquals(t.toString(), count, i);
+    } finally {
+      td.close();
+    }
+  }
+  
+  public void setUp() throws Exception {
+    IndexWriter iw = new IndexWriter(sourceDir, new WhitespaceAnalyzer(),MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    doc.add(new Field("body", "one two three four", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "0", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one two three one two three", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one two one two one two", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one three one three one three", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one one one one two", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("test", "one two one two three three three four", Field.Store.YES, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    // to be deleted
+    doc = new Document();
+    doc.add(new Field("body", "one three one three one three five five five", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    iw.close();
+    IndexReader ir = IndexReader.open(sourceDir, false);
+    ir.deleteDocument(5);
+    ir.close();
+  }
+
+  public void testTfPruning() throws Exception {
+    RAMDirectory targetDir = new RAMDirectory();
+    IndexReader in = IndexReader.open(sourceDir, true);
+    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, null, null, 2);
+    PruningReader tfr = new PruningReader(in, null, tfp);
+    // verify
+    assertTD(tfr, new Term("body", "one"), new int[]{1, 2, 3, 4});
+    assertTD(tfr, new Term("body", "two"), new int[]{1, 2});
+    assertTD(tfr, new Term("body", "three"), new int[]{1, 3});
+    assertTD(tfr, new Term("test", "one"), new int[]{4});
+    assertTDCount(tfr, new Term("body", "four"), 0);
+    assertTDCount(tfr, new Term("test", "four"), 0);
+    // verify new reader
+    IndexWriter iw = new IndexWriter(targetDir, new WhitespaceAnalyzer(), MaxFieldLength.LIMITED);
+    iw.addIndexes(new IndexReader[]{tfr});
+    iw.close();
+    IndexReader ir = IndexReader.open(targetDir, true);
+    assertTD(ir, new Term("body", "one"), new int[]{1, 2, 3, 4});
+    assertTD(ir, new Term("body", "two"), new int[]{1, 2});
+    assertTD(ir, new Term("body", "three"), new int[]{1, 3});
+    assertTD(ir, new Term("test", "one"), new int[]{4});
+    tfr.close();
+    ir.close();
+  }
+  
+  public void testThresholds() throws Exception {
+    Map<String, Integer> thresholds = new HashMap<String, Integer>();
+    thresholds.put("test", 3);
+    IndexReader in = IndexReader.open(sourceDir, true);
+    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, null, thresholds, 2);
+    PruningReader tfr = new PruningReader(in, null, tfp);
+    assertTDCount(tfr, new Term("test", "one"), 0);
+    assertTDCount(tfr, new Term("test", "two"), 0);
+    assertTD(tfr, new Term("test", "three"), new int[]{4});
+    assertTDCount(tfr, new Term("test", "four"), 0);
+  }
+  
+  public void testRemoveFields() throws Exception {
+    RAMDirectory targetDir = new RAMDirectory();
+    Map<String, Integer> removeFields = new HashMap<String, Integer>();
+    removeFields.put("test", PruningPolicy.DEL_POSTINGS | PruningPolicy.DEL_STORED);
+    IndexReader in = IndexReader.open(sourceDir, true);
+    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, removeFields, null, 2);
+    StorePruningPolicy stp = new StorePruningPolicy(in, removeFields);
+    PruningReader tfr = new PruningReader(in, stp, tfp);
+    Document doc = tfr.document(4);
+    // removed stored values?
+    assertNull(doc.get("test"));
+    // removed postings ?
+    TermEnum te = tfr.terms();
+    while (te.next()) {
+      assertFalse("test".equals(te.term().field()));
+    }
+    // but vectors should be present !
+    TermFreqVector tv = tfr.getTermFreqVector(4, "test");
+    assertNotNull(tv);
+    assertEquals(4, tv.getTerms().length); // term "four" not deleted yet from TermEnum
+    // verify new reader
+    IndexWriter iw = new IndexWriter(targetDir, new WhitespaceAnalyzer(), MaxFieldLength.LIMITED);
+    iw.addIndexes(new IndexReader[]{tfr});
+    iw.close();
+    IndexReader ir = IndexReader.open(targetDir, true);
+    tv = ir.getTermFreqVector(4, "test");
+    assertNotNull(tv);
+    assertEquals(3, tv.getTerms().length); // term "four" was deleted from TermEnum
+  }
+}

Property changes on: contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/pruning/src/java/org/apache/lucene/index/StorePruningPolicy.java
===================================================================
--- contrib/pruning/src/java/org/apache/lucene/index/StorePruningPolicy.java	(revision 0)
+++ contrib/pruning/src/java/org/apache/lucene/index/StorePruningPolicy.java	(revision 0)
@@ -0,0 +1,124 @@
+package org.apache.lucene.index;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+import java.util.logging.Logger;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
+import org.apache.lucene.index.IndexReader;
+
+/**
+ * This class implements rules for removing stored fields from documents.
+ */
+public class StorePruningPolicy extends PruningPolicy {
+  private static final Logger LOG = Logger.getLogger(StorePruningPolicy.class.getName());
+  protected Map<String, Integer> fieldFlags;
+  protected Set<String> deleteAll;
+  protected DelFieldSelector fs;
+  protected IndexReader in;
+  protected int delFields;
+  
+  /**
+   * Constructs a policy.
+   * @param in input reader.
+   * @param fieldFlags a map where keys are field names, and flags are
+   * bitwise-OR values of flags defined in {@link PruningPolicy}.
+   */
+  @SuppressWarnings("unchecked")
+  public StorePruningPolicy(IndexReader in, Map<String, Integer> fieldFlags) {
+    if (fieldFlags != null) {
+      this.fieldFlags = fieldFlags;
+      deleteAll = new HashSet<String>();
+      for (Entry<String, Integer> e : fieldFlags.entrySet()) {
+        if (e.getValue() == PruningPolicy.DEL_ALL) {
+          deleteAll.add(e.getKey());
+        }
+      }
+    } else {
+      this.fieldFlags = Collections.EMPTY_MAP;
+      deleteAll = Collections.EMPTY_SET;
+    }
+    fs = new DelFieldSelector(fieldFlags);
+    this.in = in;
+  }
+  
+  @SuppressWarnings("unchecked")
+  public Collection getFieldNames(Collection allFields) {
+    // for simplicity remove only fields with DEL_ALL
+    if (!deleteAll.isEmpty()) {
+      allFields.removeAll(deleteAll);
+    }
+    return allFields;    
+  }
+  
+  /**
+   * Prune stored fields of a document. Note that you can also arbitrarily
+   * change values of the retrieved fields, so long as the field names belong
+   * to a list of fields returned from {@link #getFieldNames(Collection)}.
+   * @param doc document number
+   * @param parent original field selector that limits what fields will be
+   * retrieved.
+   * @return a pruned instance of a Document.
+   * @throws IOException
+   */
+  public Document pruneDocument(int doc, FieldSelector parent) throws IOException {
+    if (fieldFlags.isEmpty()) {
+      return in.document(doc, parent);
+    } else {
+      fs.setParent(parent);
+      return in.document(doc, fs);
+    }    
+  }
+  
+  @SuppressWarnings("serial")
+  class DelFieldSelector implements FieldSelector {    
+    private FieldSelector parent;
+    private Map<String, Integer> remove;
+    
+    public DelFieldSelector(Map<String, Integer> remove) {
+      this.remove = remove;
+    }
+    
+    public void setParent(FieldSelector parent) {
+      this.parent = parent;
+    }
+    
+    @Override
+    public FieldSelectorResult accept(String fieldName) {
+      if (!remove.isEmpty() && remove.containsKey(fieldName) &&
+              ((remove.get(fieldName) & DEL_STORED) > 0)) {
+        delFields++;
+        if (delFields % 10000 == 0) {
+          LOG.info(" - stored fields: removed " + delFields + " fields.");
+        }
+        return FieldSelectorResult.NO_LOAD;
+      } else if (parent != null) {
+        return parent.accept(fieldName);
+      } else return FieldSelectorResult.LOAD;
+    }
+  };
+
+}

Property changes on: contrib/pruning/src/java/org/apache/lucene/index/StorePruningPolicy.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java
===================================================================
--- contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java	(revision 0)
+++ contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java	(revision 0)
@@ -0,0 +1,154 @@
+package org.apache.lucene.index;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+/**
+ * A command-line tool to configure and run a {@link PruningReader} on an input
+ * index and produce a pruned output index using
+ * {@link IndexWriter#addIndexes(IndexReader...)}.
+ */
+public class PruningTool {
+
+  public static void main(String[] args) throws Exception {
+    int res = run(args);
+    System.exit(res);
+  }
+  
+  public static int run(String[] args) throws Exception {
+    if (args.length < 5) {
+      System.err.println("Usage: PruningTool -impl (tf | carmel) (-in <path1> [-in <path2> ...]) -out <outPath> -t <NN> [-del f1,f2,..] [-conf <file>]");
+      System.err.println("\t-impl (tf | carmel)\timplementation name: TFPruningReader or CarmelPruningReader");
+      System.err.println("\t-in path\tpath to the input index. Can specify multiple input indexes.");
+      System.err.println("\t-out path\toutput path where the output index will be stored.");
+      System.err.println("\t-t NN\tdefault threshold value (minimum in-document frequency) for all terms");
+      System.err.println("\t-del f1,f2,..\tcomma-separated list of field specs to delete (postings, vectors & stored):");
+      System.err.println("\t\tfield spec : fieldName ( ':' [pPsv] )");
+      System.err.println("\t\twhere: p - postings, P - payloads, s - stored value, v - vectors");
+      System.err.println("\t-conf file\tpath to config file with per-term thresholds");
+      return -1;
+    }
+    ArrayList<IndexReader> inputs = new ArrayList<IndexReader>();
+    Directory out = null;
+    float thr = -1;
+    Map<String, Integer> delFields = new HashMap<String, Integer>();
+    String impl = null;
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-in")) {
+        Directory d = FSDirectory.open(new File(args[++i]));
+        if (!IndexReader.indexExists(d)) {
+          System.err.println("WARN: no index in " + args[i] + ", skipping ...");
+          // actually skip: do not attempt to open a non-existent index
+        } else inputs.add(IndexReader.open(d, true));
+      } else if (args[i].equals("-out")) {
+        File outFile = new File(args[++i]);
+        if (outFile.exists()) {
+          throw new Exception("Output " + outFile + " already exists.");
+        }
+        outFile.mkdirs();
+        out = FSDirectory.open(outFile);
+      } else if (args[i].equals("-impl")) {
+        impl = args[++i];
+      } else if (args[i].equals("-t")) {
+        thr = Float.parseFloat(args[++i]);
+      } else if (args[i].equals("-del")) {
+        String[] fields = args[++i].split(",");
+        for (String f : fields) {
+          // parse field spec
+          String[] spec = f.split(":");
+          int opts = PruningPolicy.DEL_ALL;
+          if (spec.length > 1) {
+            opts = 0;
+            if (spec[1].indexOf('p') != -1) {
+              opts |= PruningPolicy.DEL_POSTINGS;
+            }
+            if (spec[1].indexOf('P') != -1) {
+              opts |= PruningPolicy.DEL_PAYLOADS;
+            }
+            if (spec[1].indexOf('s') != -1) {
+              opts |= PruningPolicy.DEL_STORED;
+            }
+            if (spec[1].indexOf('v') != -1) {
+              opts |= PruningPolicy.DEL_VECTOR;
+            }
+          }
+          delFields.put(spec[0], opts);
+        }
+      } else if (args[i].equals("-conf")) {
+        ++i;
+        System.err.println("WARN: -conf option not implemented yet.");
+      } else {
+        throw new Exception("Invalid argument: '" + args[i] + "'");
+      }
+    }
+    if (impl == null) {
+      throw new Exception("Must select algorithm implementation");
+    }
+    if (inputs.size() == 0) {
+      throw new Exception("At least one input index is required.");
+    }
+    if (out == null) {
+      throw new Exception("Output path is not set.");
+    }
+    if (thr == -1) {
+      throw new Exception("Threshold value is not set.");
+    }
+    IndexReader in;
+    if (inputs.size() == 1) {
+      in = inputs.get(0);
+    } else {
+      in = new MultiReader((IndexReader[])inputs
+              .toArray(new IndexReader[inputs.size()]), true);
+    }
+    if (in.hasDeletions()) {
+      System.err.println("WARN: input index(es) with deletions - document ID-s will NOT be preserved!");
+    }
+    IndexReader pruning = null;
+    StorePruningPolicy stp = null;
+    if (delFields.size() > 0) {
+      stp = new StorePruningPolicy(in, delFields);
+    }
+    TermPruningPolicy tpp = null;
+    if (impl.equals("tf")) {
+      tpp = new TFTermPruningPolicy(in, delFields, null, (int)thr);
+    } else if (impl.equals("carmel")) {
+      tpp = new CarmelTermPruningPolicy(in, delFields, null, thr, null);      
+    } else {
+      throw new Exception("Unknown algorithm: '" + impl + "'");
+    }
+    pruning = new PruningReader(in, stp, tpp);
+    IndexWriter iw = new IndexWriter(out, new WhitespaceAnalyzer(), MaxFieldLength.UNLIMITED);
+    iw.setUseCompoundFile(false);
+    iw.addIndexes(new IndexReader[]{pruning});
+    iw.close();
+    System.err.println("DONE.");
+    return 0;
+  }
+}

Property changes on: contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/pruning/src/java/org/apache/lucene/index/PruningPolicy.java
===================================================================
--- contrib/pruning/src/java/org/apache/lucene/index/PruningPolicy.java	(revision 0)
+++ contrib/pruning/src/java/org/apache/lucene/index/PruningPolicy.java	(revision 0)
@@ -0,0 +1,31 @@
+package org.apache.lucene.index;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Useful constants that define operations to be performed on field data.*/
+public class PruningPolicy {
+  /** Delete (some or all) postings for this field. */
+  public static final int DEL_POSTINGS  = 0x01;
+  /** Delete (some or all) stored values for this field. */
+  public static final int DEL_STORED    = 0x02;
+  /** Delete term frequency vectors for this field (whole vectors or individual terms). */
+  public static final int DEL_VECTOR    = 0x04;
+  /** Delete (some or all) payloads in these fields. */
+  public static final int DEL_PAYLOADS  = 0x08;
+  /** Delete all data for this field. */
+  public static final int DEL_ALL       = 0xff;
+}

Property changes on: contrib/pruning/src/java/org/apache/lucene/index/PruningPolicy.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/pruning/src/java/org/apache/lucene/index/TFTermPruningPolicy.java
===================================================================
--- contrib/pruning/src/java/org/apache/lucene/index/TFTermPruningPolicy.java	(revision 0)
+++ contrib/pruning/src/java/org/apache/lucene/index/TFTermPruningPolicy.java	(revision 0)
@@ -0,0 +1,127 @@
+package org.apache.lucene.index;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+
+/**
+ * This class produces a subset of the input index, by removing postings data
+ * for those terms where their in-document frequency is below a specified
+ * threshold. The net effect of this processing is a much smaller index that for
+ * many types of queries returns nearly identical top-N results as compared with
+ * the original index, but with increased performance.
+ * <p>See the comments in {@link CarmelTermPruningPolicy} for more details. This
+ * implementation uses simple term frequency thresholds to remove all postings
+ * from documents where a given term occurs rarely (i.e. its TF in a document
+ * is smaller than the threshold).
+ * <p>Threshold values in this method are expressed as absolute term frequencies.
+ */
+public class TFTermPruningPolicy extends TermPruningPolicy {
+  protected Map<String, Integer> thresholds;
+  protected int defThreshold;
+  protected int curThr;
+
+  @SuppressWarnings("unchecked")
+  protected TFTermPruningPolicy(IndexReader in, Map<String, Integer> fieldFlags,
+          Map<String, Integer> thresholds, int defThreshold) {
+    super(in, fieldFlags);
+    this.defThreshold = defThreshold;
+    if (thresholds != null) {
+      this.thresholds = thresholds;
+    } else {
+      this.thresholds = Collections.EMPTY_MAP;
+    }
+  }
+
+  @Override
+  public boolean pruneTermEnum(TermEnum te) throws IOException {
+    // check that at least one doc exceeds threshold
+    int thr = defThreshold;
+    String termKey = te.term().field() + ":" + te.term().text();
+    if (thresholds.containsKey(termKey)) {
+      thr = thresholds.get(termKey);
+    } else if (thresholds.containsKey(te.term().field())) {
+      thr = thresholds.get(te.term().field());
+    }
+    TermDocs td = in.termDocs(te.term());
+    boolean pass = false;
+    while (td.next()) {
+      if (td.freq() >= thr) {
+        pass = true;
+        break;
+      }
+    }
+    td.close();
+    return !pass;
+  }
+
+  @Override
+  public void initTermPositions(TermPositions in, Term t) throws IOException {
+    // set threshold for this field
+    curThr = defThreshold;
+    String termKey = t.field() + ":" + t.text();
+    if (thresholds.containsKey(termKey)) {
+      curThr = thresholds.get(termKey);
+    } else if (thresholds.containsKey(t.field())) {
+      curThr = thresholds.get(t.field());
+    }
+  }
+
+  @Override
+  public boolean pruneTermPositions(TermPositions termPositions, Term t)
+          throws IOException {
+    if (termPositions.freq() < curThr) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  @Override
+  public int pruneTermVectorTerms(int docNumber, String field, String[] terms,
+          int[] freqs, TermFreqVector tfv)
+          throws IOException {
+    int thr = defThreshold;
+    if (thresholds.containsKey(field)) {
+      thr = thresholds.get(field);
+    }
+    int removed = 0;
+    for (int i = 0; i < terms.length; i++) {
+      // check per-term thresholds
+      int termThr = thr;
+      String t = field + ":" + terms[i];
+      if (thresholds.containsKey(t)) {
+        termThr = thresholds.get(t);
+      }
+      if (freqs[i] < termThr) {
+        terms[i] = null;
+        removed++;
+      }      
+    }
+    return removed;
+  }
+
+}

Property changes on: contrib/pruning/src/java/org/apache/lucene/index/TFTermPruningPolicy.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/pruning/src/java/org/apache/lucene/index/PruningReader.java
===================================================================
--- contrib/pruning/src/java/org/apache/lucene/index/PruningReader.java	(revision 0)
+++ contrib/pruning/src/java/org/apache/lucene/index/PruningReader.java	(revision 0)
@@ -0,0 +1,293 @@
+package org.apache.lucene.index;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.logging.Logger;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FilterIndexReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.SegmentTermPositionVector;
+import org.apache.lucene.index.SegmentTermVector;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+
+/**
+ * This class produces a subset of the input index, by removing some
+ * postings data according to rules implemented in a
+ * {@link TermPruningPolicy}, and optionally it can also remove
+ * stored fields of documents according to rules implemented in a
+ * {@link StorePruningPolicy}.
+ */
+public class PruningReader extends FilterIndexReader {
+  private static final Logger LOG = Logger.getLogger(PruningReader.class.getName());
+  
+  /** Number of documents whose stored fields have been retrieved. */
+  protected int docCount;
+  /** Number of documents whose term vectors have been retrieved. */
+  protected int vecCount;
+  /** Counts of terms passed through / deleted by the term enum. */
+  protected int termCount, delTermCount;
+  /** Counts of vectors pruned to empty / deleted wholesale. */
+  protected int prunedVecCount, delVecCount;
+  
+  protected TermPruningPolicy termPolicy;
+  protected StorePruningPolicy storePolicy;
+
+  /**
+   * Constructor.
+   * @param in input reader
+   * @param storePolicy implementation of {@link StorePruningPolicy} - if null
+   * then stored values will be retained as is.
+   * @param termPolicy implementation of {@link TermPruningPolicy}, must not
+   * be null.
+   */
+  public PruningReader(IndexReader in, StorePruningPolicy storePolicy,
+          TermPruningPolicy termPolicy) {
+    super(in);
+    assert termPolicy != null;
+    this.termPolicy = termPolicy;
+    this.storePolicy = storePolicy;
+  }
+
+  /**
+   * Applies a {@link StorePruningPolicy} to stored fields of a document.
+   */
+  @Override
+  public Document document(final int n, FieldSelector fieldSelector)
+          throws CorruptIndexException, IOException {
+    docCount++;
+    if ((docCount % 10000) == 0) {
+      LOG.info(" - stored fields: " + docCount + " docs.");
+    }
+    if (storePolicy != null) {
+      return storePolicy.pruneDocument(n, fieldSelector);
+    } else {
+      return in.document(n, fieldSelector);
+    }
+  }
+
+  /**
+   * Applies a {@link StorePruningPolicy} to the list of available field names.
+   */
+  @SuppressWarnings("unchecked")
+  @Override
+  public Collection getFieldNames(FieldOption fieldNames) {
+    Collection res = super.getFieldNames(fieldNames);
+    if (storePolicy == null) {
+      return res;
+    }
+    return storePolicy.getFieldNames(res);
+  }
+
+
+  /**
+   * Applies {@link TermPruningPolicy} to terms inside term vectors.
+   * Vectors the policy leaves untouched are returned unchanged, vectors
+   * with some terms pruned are rebuilt, and vectors with all terms pruned
+   * (or rejected wholesale by the policy) are dropped.
+   */
+  @Override
+  public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
+    TermFreqVector[] vectors = super.getTermFreqVectors(docNumber);
+    if (vectors == null) {
+      return null;
+    }
+    ArrayList<TermFreqVector> newVectors = new ArrayList<TermFreqVector>();
+    for (TermFreqVector v : vectors) {
+      if (v == null) {
+        continue;
+      }
+      if (termPolicy.pruneWholeTermVector(docNumber, v.getField())) {
+        delVecCount++;
+        if ((delVecCount % 10000) == 0) {
+          LOG.info(" - deleted vectors: " + delVecCount);
+        }
+        continue;
+      }
+      if (v.size() == 0) {
+        continue;
+      }
+      String[] terms = v.getTerms();
+      int[] freqs = v.getTermFrequencies();
+      
+      int removed = termPolicy.pruneTermVectorTerms(docNumber, v.getField(), terms, freqs, v);
+      if (removed >= terms.length) {
+        // all terms pruned - drop this vector completely
+        continue;
+      }
+      if (removed > 0) {
+        // some terms pruned - compact the parallel arrays, skipping the
+        // entries the policy nulled out
+        String[] newTerms = new String[terms.length - removed];
+        int[] newFreqs = new int[terms.length - removed];
+        int j = 0;
+        for (int i = 0; i < terms.length; i++) {
+          if (terms[i] != null) {
+            newTerms[j] = terms[i];
+            newFreqs[j] = freqs[i];
+            j++;
+          }
+        }
+        // create a modified vector
+        if (v instanceof TermPositionVector) {
+          TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[terms.length - removed][];
+          boolean withOffsets = false;
+          j = 0;
+          for (int i = 0; i < terms.length; i++) {
+            if (terms[i] == null) {
+              continue;
+            }
+            offsets[j] = ((TermPositionVector)v).getOffsets(i);
+            if (offsets[j] != null && offsets[j] != TermVectorOffsetInfo.EMPTY_OFFSET_INFO) {
+              withOffsets = true;
+            }
+            j++;
+          }
+          j = 0;
+          int[][] positions = new int[terms.length - removed][];
+          boolean withPositions = false;
+          for (int i = 0; i < terms.length; i++) {
+            if (terms[i] == null) {
+              continue;
+            }
+            positions[j] = ((TermPositionVector)v).getTermPositions(i);
+            if (positions[j] != null && positions[j].length > 0) {
+              withPositions = true;
+            }
+            j++;
+          }
+          v = new SegmentTermPositionVector(v.getField(), newTerms, newFreqs,
+                  withPositions ? positions : null,
+                  withOffsets ? offsets : null);
+        } else {
+          v = new SegmentTermVector(v.getField(), newTerms, newFreqs);
+        }
+      }
+      // BUG fix: previously the add was reachable only when 0 < removed <
+      // terms.length, so vectors with no terms removed were silently lost;
+      // retain them (original or rebuilt) here instead.
+      newVectors.add(v);
+    }
+    vecCount++;
+    if ((vecCount % 10000) == 0) {
+      LOG.info(" - vectors: " + vecCount + " docs.");
+    }
+    if (newVectors.size() == 0) {
+      prunedVecCount++;
+      if ((prunedVecCount % 1000) == 0) {
+        LOG.info(" - deleted pruned vectors: " + prunedVecCount);
+      }
+      return null;
+    }
+    return (TermFreqVector[])newVectors.toArray(new TermFreqVector[newVectors.size()]);
+  }
+  
+  /**
+   * Applies {@link TermPruningPolicy} to term positions.
+   */
+  @Override
+  public TermPositions termPositions() throws IOException {
+    return new PruningTermPositions(in.termPositions());
+  }
+
+  /**
+   * Applies {@link TermPruningPolicy} to term enum.
+   */
+  @Override
+  public TermEnum terms() throws IOException {
+    return new PruningTermEnum(in.terms());
+  }
+  
+  /** TermEnum that skips terms rejected by the {@link TermPruningPolicy}. */
+  class PruningTermEnum extends FilterTermEnum {
+    
+    public PruningTermEnum(TermEnum in) {
+      super(in);
+    }
+    
+    @Override
+    public boolean next() throws IOException {
+      for ( ; ; ) {
+        if (!super.next()) {
+          return false;
+        }
+        termCount++;
+        if ((termCount % 50000) == 0) {
+          LOG.info(" - terms: " + termCount + " (" + term() + "), deleted: " + delTermCount);
+        }
+        // first check whether the whole field is configured for removal,
+        // then let the policy inspect the individual term
+        if (termPolicy.pruneAllPostings(term().field())) {
+          delTermCount++;
+          continue;
+        }
+        if (!termPolicy.pruneTermEnum(in)) {
+          return true;
+        }
+        delTermCount++;
+      }
+    }
+
+  }
+  
+  /**
+   * TermPositions that skips individual postings rejected by the
+   * {@link TermPruningPolicy}, and optionally hides payloads.
+   */
+  class PruningTermPositions extends FilterTermPositions {
+    protected Term curTerm = null;
+    
+    public PruningTermPositions(TermPositions in) {
+      super(in);
+    }
+    
+    @Override
+    public void seek(Term t) throws IOException {
+      super.seek(t);
+      informPolicy(t);
+    }
+    
+    @Override
+    public void seek(TermEnum termEnum) throws IOException {
+      super.seek(termEnum);
+      informPolicy(termEnum.term());
+    }
+    
+    // let the policy prepare its per-term state before postings are consumed
+    private void informPolicy(Term t) throws IOException {
+      termPolicy.initTermPositions((TermPositions)super.in, t);
+      curTerm = new Term(t.field(), t.text());
+    }
+    
+    @Override
+    public boolean next() throws IOException {
+      for ( ; ; ) {
+        if (!super.next()) {
+          return false;
+        }
+        if (termPolicy.pruneTermPositions((TermPositions)super.in, curTerm)) {
+          continue;
+        }
+        break;
+      }
+      return true;
+    }
+    
+    @Override
+    public boolean isPayloadAvailable() {
+      if (!super.isPayloadAvailable()) {
+        return false;
+      }
+      // payload exists but may be configured for removal
+      if (termPolicy.prunePayload((TermPositions)in, curTerm)) {
+        return false;
+      }
+      return true;
+    }
+  }
+}
\ No newline at end of file

Property changes on: contrib/pruning/src/java/org/apache/lucene/index/PruningReader.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/pruning/src/java/org/apache/lucene/index/TermPruningPolicy.java
===================================================================
--- contrib/pruning/src/java/org/apache/lucene/index/TermPruningPolicy.java	(revision 0)
+++ contrib/pruning/src/java/org/apache/lucene/index/TermPruningPolicy.java	(revision 0)
@@ -0,0 +1,146 @@
+package org.apache.lucene.index;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+
+/**
+ * Superclass of term pruning policies.
+ */
+public abstract class TermPruningPolicy extends PruningPolicy {
+  /** Per-field flags: bitwise OR of {@link PruningPolicy} operation constants. */
+  protected Map<String, Integer> fieldFlags;
+  /** Input reader whose data is being pruned. */
+  protected IndexReader in;
+  
+  /**
+   * Construct a policy.
+   * @param in input reader
+   * @param fieldFlags a map, where keys are field names and values
+   * are bitwise-OR flags of operations to be performed (see
+   * {@link PruningPolicy} for more details). May be null, in which case
+   * no per-field operations are configured.
+   */
+  protected TermPruningPolicy(IndexReader in, Map<String, Integer> fieldFlags) {
+    this.in = in;
+    if (fieldFlags != null) {
+      this.fieldFlags = fieldFlags;
+    } else {
+      // Collections.emptyMap() is type-safe, unlike the raw EMPTY_MAP
+      // constant, so no unchecked-warning suppression is needed
+      this.fieldFlags = Collections.emptyMap();
+    }
+  }
+  
+  /**
+   * Term vector pruning.
+   * @param docNumber document number
+   * @param field field name
+   * @return true if the complete term vector for this field should be
+   * removed (as specified by {@link PruningPolicy#DEL_VECTOR} flag).
+   * @throws IOException
+   */
+  public boolean pruneWholeTermVector(int docNumber, String field)
+      throws IOException {
+    return fieldFlags.containsKey(field) && 
+            (fieldFlags.get(field) & DEL_VECTOR) != 0;
+  }
+  
+  /**
+   * Pruning of all postings for a field
+   * @param field field name
+   * @return true if all postings for all terms in this field should be
+   * removed (as specified by {@link PruningPolicy#DEL_POSTINGS}).
+   * @throws IOException
+   */
+  public boolean pruneAllPostings(String field) throws IOException {
+    return fieldFlags.containsKey(field) && 
+            (fieldFlags.get(field) & DEL_POSTINGS) != 0;
+  }
+  
+  /**
+   * Called when moving {@link TermPositions} to a new {@link Term}.
+   * @param in input term positions
+   * @param t current term
+   * @throws IOException
+   */
+  public abstract void initTermPositions(TermPositions in, Term t)
+    throws IOException;
+
+  /**
+   * Called when checking for the presence of payload for the current
+   * term at a current position
+   * @param in positioned term positions
+   * @param curTerm current term associated with these positions
+   * @return true if the payload should be removed, false otherwise.
+   */
+  public boolean prunePayload(TermPositions in, Term curTerm) {
+    return fieldFlags.containsKey(curTerm.field()) &&
+            (fieldFlags.get(curTerm.field()) & DEL_PAYLOADS) != 0;
+  }
+
+  /**
+   * Pruning of individual terms in term vectors.
+   * @param docNumber document number
+   * @param field field name
+   * @param terms array of terms
+   * @param freqs array of term frequencies
+   * @param v the original term frequency vector
+   * @return 0 if no terms are to be removed, positive number to indicate
+   * how many terms need to be removed. The same number of entries in the terms
+   * array must be set to null to indicate which terms to remove.
+   * @throws IOException
+   */
+  public abstract int pruneTermVectorTerms(int docNumber, String field,
+          String[] terms, int[] freqs, TermFreqVector v) throws IOException;
+
+  /**
+   * Pruning of all postings for a term.
+   * @param te positioned term enum.
+   * @return true if all postings for this term should be removed, false
+   * otherwise.
+   * @throws IOException
+   */
+  public abstract boolean pruneTermEnum(TermEnum te) throws IOException;
+
+  /**
+   * Prune individual postings per term.
+   * @param termPositions positioned term positions. Implementations MUST NOT
+   * advance this by calling {@link TermPositions} methods that advance either
+   * the position pointer (next, skipTo) or term pointer (seek).
+   * @param t current term
+   * @return true if the current posting should be removed, false otherwise.
+   * @throws IOException
+   */
+  public abstract boolean pruneTermPositions(TermPositions termPositions, Term t)
+      throws IOException;
+}

Property changes on: contrib/pruning/src/java/org/apache/lucene/index/TermPruningPolicy.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/pruning/src/java/org/apache/lucene/index/CarmelTermPruningPolicy.java
===================================================================
--- contrib/pruning/src/java/org/apache/lucene/index/CarmelTermPruningPolicy.java	(revision 0)
+++ contrib/pruning/src/java/org/apache/lucene/index/CarmelTermPruningPolicy.java	(revision 0)
@@ -0,0 +1,203 @@
+package org.apache.lucene.index;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopScoreDocCollector;
+
+/**
+ * This class produces a subset of the input index, by removing postings data
+ * for those terms where their in-document frequency is below a specified
+ * threshold. The net effect of this processing is a much smaller index that for
+ * many types of queries returns nearly identical top-N results as compared with
+ * the original index, but with increased performance.
+ * <p>
+ * See the following paper for more details about this method: <a
+ * href="http://portal.acm.org/citation.cfm?id=383958">Static index pruning for
+ * information retrieval systems, D. Carmel et al, ACM SIGIR 2001</a>.
+ * Conclusions of this paper indicate that it's best to use per-term thresholds,
+ * but in practice this is tedious for large number of terms - instead, this
+ * implementation allows to specify three levels of thresholds: one default
+ * threshold, then a threshold per field, and then a threshold per term. These
+ * thresholds are applied so that always the most specific one takes precedence.
+ * <p>
+ * Please note that while this method produces good results for term queries it
+ * often leads to poor results for phrase queries (because we remove postings
+ * without considering whether they belong to an important phrase).
+ * 
+ * <p>Thresholds in this method of pruning are expressed as the percentage of
+ * the top-N scoring documents per term that are retained. The list of top-N
+ * documents is established by using a regular {@link IndexSearcher}
+ * and {@link Similarity} to run a simple {@link TermQuery}.
+ * <p>
+ * <small>See the following papers for a discussion of this problem and the
+ * proposed solutions to improve the quality of a pruned index (not implemented
+ * here):
+ * <ul>
+ * <li><a href="http://portal.acm.org/citation.cfm?id=1148235">Pruned query
+ * evaluation using pre-computed impacts, V. Anh et al, ACM SIGIR 2006</a></li>
+ * <li><a href="http://portal.acm.org/citation.cfm?id=1183614.1183644"> A
+ * document-centric approach to static index pruning in text retrieval systems,
+ * S. Buettcher et al, ACM SIGIR 2006</a></li>
+ * <li><a href="http://oak.cs.ucla.edu/~cho/papers/ntoulas-sigir07.pdf">
+ * Pruning Policies for Two-Tiered Inverted Index with Correctness Guarantee, A.
+ * Ntoulas et al, ACM SIGIR 2007.</a></li>
+ * </ul>
+ * </small>
+ * 
+ * <p>
+ * As the threshold values increase, the total size of the index decreases,
+ * search performance increases, and recall decreases (i.e. search quality
+ * deteriorates). NOTE: especially phrase recall deteriorates significantly at
+ * higher threshold values.
+ * <p>
+ * Primary purpose of this class is to produce small first-tier indexes that fit
+ * completely in RAM, and store these indexes using
+ * {@link IndexWriter#addIndexes(IndexReader[])}. <b>NOTE: If the input index is
+ * optimized (i.e. doesn't contain deletions) then the index produced via
+ * {@link IndexWriter#addIndexes(IndexReader[])} will preserve internal document
+ * id-s so that they are in sync with the original index.</b> This means that
+ * all other auxiliary information not necessary for first-tier processing, such
+ * as some stored fields, can also be removed, to be quickly retrieved on-demand
+ * from the original index using the same internal document id. See
+ * {@link StorePruningPolicy} for information about removing stored fields.
+ * <p>
+ * Threshold values can be specified globally (for terms in all fields) using
+ * <code>defaultThreshold</code> parameter, and can be overridden using per-field
+ * or per-term values supplied in a <code>thresholds</code> map. Keys in this
+ * map are either field names, or terms in <code>field:text</code> format. The
+ * precedence of these values is the following: first a per-term threshold is
+ * used if present, then per-field threshold if present, and finally the default
+ * threshold.
+ */
+
+public class CarmelTermPruningPolicy extends TermPruningPolicy {
+  /** Cursor into {@link #docs} while iterating postings of the current term. */
+  int docsPos = 0;
+  /** Threshold in effect for the current term. */
+  float curThr;
+  /** Default threshold, used when no per-field / per-term override exists. */
+  float defThreshold;
+  /** Per-field and per-term ("field:text") threshold overrides. */
+  Map<String, Float> thresholds;
+  /** Top-scoring docs (re-sorted by doc id) retained for the current term. */
+  ScoreDoc[] docs = null;
+  IndexSearcher is;
+  Similarity sim;
+
+  /**
+   * Constructor.
+   * @param in input reader
+   * @param fieldFlags per-field pruning flags (see {@link PruningPolicy})
+   * @param thresholds per-field and per-term ("field:text") threshold
+   * overrides, may be null
+   * @param defThreshold default retention threshold
+   * @param sim similarity used to score term queries; if null, a
+   * {@link DefaultSimilarity} is used
+   */
+  protected CarmelTermPruningPolicy(IndexReader in,
+          Map<String, Integer> fieldFlags, Map<String, Float> thresholds,
+          float defThreshold, Similarity sim) {
+    super(in, fieldFlags);
+    this.defThreshold = defThreshold;
+    if (thresholds != null) {
+      this.thresholds = thresholds;
+    } else {
+      this.thresholds = Collections.emptyMap();
+    }
+    if (sim != null) {
+      this.sim = sim;
+    } else {
+      // BUG fix: this previously assigned to the parameter instead of the
+      // field, leaving this.sim null when no Similarity was supplied
+      this.sim = new DefaultSimilarity();
+    }
+    is = new IndexSearcher(in);
+    is.setSimilarity(this.sim);
+  }
+
+  // too costly - pass everything at this stage
+  @Override
+  public boolean pruneTermEnum(TermEnum te) throws IOException {
+    return false;
+  }
+
+  @Override
+  public void initTermPositions(TermPositions tp, Term t) throws IOException {
+    // most specific threshold wins: per-term, then per-field, then default
+    curThr = defThreshold;
+    String termKey = t.field() + ":" + t.text();
+    if (thresholds.containsKey(termKey)) {
+      curThr = thresholds.get(termKey);
+    } else if (thresholds.containsKey(t.field())) {
+      curThr = thresholds.get(t.field());
+    }
+    // number of top-scoring docs to retain: the threshold fraction of the
+    // term's docFreq, but at least 100 so that rare terms remain intact
+    int df = in.docFreq(t);
+    int count = Math.round((float)df * curThr);
+    if (count < 100) count = 100;
+    TopScoreDocCollector collector = TopScoreDocCollector.create(count, true);
+    TermQuery tq = new TermQuery(t);
+    is.search(tq, collector);
+    docs = collector.topDocs().scoreDocs;
+    // re-sort by doc id so retained docs can be merged against the
+    // postings, which are enumerated in doc id order
+    Arrays.sort(docs, ByDocComparator.INSTANCE);
+    if (docs.length > count) {
+      ScoreDoc[] subset = new ScoreDoc[count];
+      System.arraycopy(docs, 0, subset, 0, count);
+      docs = subset;
+    }
+    docsPos = 0;
+  }
+
+  @Override
+  public boolean pruneTermPositions(TermPositions termPositions, Term t)
+          throws IOException {
+    if (docsPos >= docs.length) { // used up all doc id-s
+      return true; // skip any remaining docs
+    }
+    // advance the cursor until it is at or past the current posting's doc
+    while ((docsPos < docs.length - 1) && termPositions.doc() > docs[docsPos].doc) {
+      docsPos++;
+    }
+    if (termPositions.doc() == docs[docsPos].doc) {
+      // pass
+      docsPos++; // move to next doc id
+      return false;
+    } else if (termPositions.doc() < docs[docsPos].doc) {
+      return true; // skip this one - it's less important
+    }
+    // every retained doc must appear in the postings (both come from the
+    // same reader), so the cursor should never fall behind the enumeration
+    throw new IOException("termPositions.doc > docs[docsPos].doc");
+  }
+
+  // it probably doesn't make sense to prune term vectors using this method,
+  // due to its overhead
+  @Override
+  public int pruneTermVectorTerms(int docNumber, String field, String[] terms,
+          int[] freqs, TermFreqVector tfv) throws IOException {
+    return 0;
+  }
+
+  /** Orders {@link ScoreDoc}-s by ascending doc id. */
+  public static class ByDocComparator implements Comparator<ScoreDoc> {
+    public static final ByDocComparator INSTANCE = new ByDocComparator();
+
+    @Override
+    public int compare(ScoreDoc o1, ScoreDoc o2) {
+      // doc ids are non-negative, so the subtraction cannot overflow
+      return o1.doc - o2.doc;
+    }    
+  }
+  
+}

Property changes on: contrib/pruning/src/java/org/apache/lucene/index/CarmelTermPruningPolicy.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/pruning/build.xml
===================================================================
--- contrib/pruning/build.xml	(revision 0)
+++ contrib/pruning/build.xml	(revision 0)
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+ 
+        http://www.apache.org/licenses/LICENSE-2.0
+ 
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+<project name="pruning" default="default">
+
+  <description>
+    Pruning Lucene indexes by various criteria
+  </description>
+
+  <import file="../contrib-build.xml"/>
+</project>

Property changes on: contrib/pruning/build.xml
___________________________________________________________________
Added: svn:eol-style
   + native

