Index: contrib/pruning/pom.xml.template
===================================================================
--- contrib/pruning/pom.xml.template	(revision 0)
+++ contrib/pruning/pom.xml.template	(revision 0)
@@ -0,0 +1,38 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+    
+    http://www.apache.org/licenses/LICENSE-2.0
+    
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+  -->
+
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.lucene</groupId>
+    <artifactId>lucene-contrib</artifactId>
+    <version>@version@</version>
+  </parent>
+  <groupId>org.apache.lucene</groupId>
+  <artifactId>lucene-pruning</artifactId>
+  <name>Lucene Pruning</name>
+  <version>@version@</version>
+  <description>
+    Pruning Lucene indexes by various criteria.
+  </description>
+  <packaging>jar</packaging>
+</project>
Index: contrib/pruning/src/test/org/apache/lucene/index/TestTFPruningReader.java
===================================================================
--- contrib/pruning/src/test/org/apache/lucene/index/TestTFPruningReader.java	(revision 0)
+++ contrib/pruning/src/test/org/apache/lucene/index/TestTFPruningReader.java	(revision 0)
@@ -0,0 +1,140 @@
+package org.apache.lucene.index;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.RAMDirectory;
+
+import junit.framework.TestCase;
+
+public class TestTFPruningReader extends TestCase {
+  RAMDirectory sourceDir = new RAMDirectory();
+  
+  private void assertTD(IndexReader ir, Term t, int[] ids) throws Exception {
+    TermPositions td = ir.termPositions(t);
+    assertNotNull(td);
+    try {
+      int i = 0;
+      while(td.next()) {
+        assertEquals(t + ", i=" + i, ids[i], td.doc());
+        i++;
+      }
+      assertEquals(ids.length, i);
+    } finally {
+      td.close();
+    }
+  }
+  
+  private void assertTDCount(IndexReader ir, Term t, int count) throws Exception {
+    TermPositions td = ir.termPositions(t);
+    assertNotNull(td);
+    try {
+      int i = 0;
+      while (td.next()) i++;
+      assertEquals(t.toString(), count, i);
+    } finally {
+      td.close();
+    }
+  }
+  
+  public void setUp() throws Exception {
+    IndexWriter iw = new IndexWriter(sourceDir, new WhitespaceAnalyzer(),MaxFieldLength.LIMITED);
+    Document doc = new Document();
+    doc.add(new Field("body", "one two three four", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "0", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one two three one two three", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one two one two one two", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one three one three one three", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one one one one two", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("test", "one two one two three three three four", Field.Store.YES, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    // to be deleted
+    doc = new Document();
+    doc.add(new Field("body", "one three one three one three five five five", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO));
+    iw.addDocument(doc);
+    iw.close();
+    IndexReader ir = IndexReader.open(sourceDir, false);
+    ir.deleteDocument(5);
+    ir.close();
+  }
+
+  public void testTfPruning() throws Exception {
+    RAMDirectory targetDir = new RAMDirectory();
+    TFPruningReader tfr = new TFPruningReader(IndexReader.open(sourceDir, true), 2, null, null);
+    // verify
+    assertTD(tfr, new Term("body", "one"), new int[]{1, 2, 3, 4});
+    assertTD(tfr, new Term("body", "two"), new int[]{1, 2});
+    assertTD(tfr, new Term("body", "three"), new int[]{1, 3});
+    assertTD(tfr, new Term("test", "one"), new int[]{4});
+    assertTDCount(tfr, new Term("body", "four"), 0);
+    assertTDCount(tfr, new Term("test", "four"), 0);
+    // verify new reader
+    IndexWriter iw = new IndexWriter(targetDir, new WhitespaceAnalyzer(), MaxFieldLength.LIMITED);
+    iw.addIndexes(new IndexReader[]{tfr});
+    iw.close();
+    IndexReader ir = IndexReader.open(targetDir, true);
+    assertTD(ir, new Term("body", "one"), new int[]{1, 2, 3, 4});
+    assertTD(ir, new Term("body", "two"), new int[]{1, 2});
+    assertTD(ir, new Term("body", "three"), new int[]{1, 3});
+    assertTD(ir, new Term("test", "one"), new int[]{4});
+    tfr.close();
+    ir.close();
+  }
+  
+  public void testThresholds() throws Exception {
+    Map<String, Integer> thresholds = new HashMap<String, Integer>();
+    thresholds.put("test", 3);
+    TFPruningReader tfr = new TFPruningReader(IndexReader.open(sourceDir, true), 2, thresholds, null);
+    assertTDCount(tfr, new Term("test", "one"), 0);
+    assertTDCount(tfr, new Term("test", "two"), 0);
+    assertTD(tfr, new Term("test", "three"), new int[]{4});
+    assertTDCount(tfr, new Term("test", "four"), 0);
+  }
+  
+  public void testRemoveFields() throws Exception {
+    RAMDirectory targetDir = new RAMDirectory();
+    Map<String, Integer> removeFields = new HashMap<String, Integer>();
+    removeFields.put("test", TFPruningReader.DEL_POSTINGS | TFPruningReader.DEL_STORED);
+    TFPruningReader tfr = new TFPruningReader(IndexReader.open(sourceDir, true), 2, null, removeFields);
+    Document doc = tfr.document(4);
+    // removed stored values?
+    assertNull(doc.get("test"));
+    // removed postings ?
+    TermEnum te = tfr.terms();
+    while (te.next()) {
+      assertFalse("test".equals(te.term().field()));
+    }
+    // but vectors should be present !
+    TermFreqVector tv = tfr.getTermFreqVector(4, "test");
+    assertNotNull(tv);
+    assertEquals(4, tv.getTerms().length); // term "four" not deleted yet from TermEnum
+    // verify new reader
+    IndexWriter iw = new IndexWriter(targetDir, new WhitespaceAnalyzer(), MaxFieldLength.LIMITED);
+    iw.addIndexes(new IndexReader[]{tfr});
+    iw.close();
+    IndexReader ir = IndexReader.open(targetDir, true);
+    tv = ir.getTermFreqVector(4, "test");
+    assertNotNull(tv);
+    assertEquals(3, tv.getTerms().length); // term "four" was deleted from TermEnum
+  }
+}

Property changes on: contrib/pruning/src/test/org/apache/lucene/index/TestTFPruningReader.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/pruning/src/java/org/apache/lucene/index/TFPruningReader.java
===================================================================
--- contrib/pruning/src/java/org/apache/lucene/index/TFPruningReader.java	(revision 0)
+++ contrib/pruning/src/java/org/apache/lucene/index/TFPruningReader.java	(revision 0)
@@ -0,0 +1,434 @@
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+import java.util.logging.Logger;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
+
+/**
+ * This class produces a subset of the input index, by removing
+ * postings data for those terms where their
+ * in-document frequency is below a specified threshold. The net effect
+ * of this processing is a much smaller index that for many types of queries
+ * returns nearly identical top-N results as compared with the original index, but
+ * with increased performance.
+ * <p>See the following paper for more details about this method:
+ * <a href="http://portal.acm.org/citation.cfm?id=383958">Static index
+ * pruning for information retrieval systems, D. Carmel et al, ACM SIGIR 2001
+ * </a>. Conclusions of this paper indicate that it's best to use per-term
+ * thresholds, but in practice this is tedious for large number of terms - this
+ * implementation allows also to specify thresholds per field.
+ * <p>Please note that while this method produces good results for term
+ * queries it often leads to poor results for phrase queries (because we remove
+ * postings without considering whether they belong to an important phrase).
+ * 
+ * <p><small>See the following papers for a discussion of this problem and
+ * the proposed solutions to improve the quality of a pruned index
+ * (not implemented here):
+ * <ul>
+ * <li><a href="http://portal.acm.org/citation.cfm?id=1148235">Pruned query
+ * evaluation using pre-computed impacts, V. Anh et al, ACM SIGIR 2006</a></li>
+ * <li><a href="http://portal.acm.org/citation.cfm?id=1183614.1183644">
+ * A document-centric approach to static index pruning in text retrieval systems,
+ * S. Buettcher et al, ACM SIGIR 2006</a></li>
+ * <li><a href="http://oak.cs.ucla.edu/~cho/papers/ntoulas-sigir07.pdf">
+ * Pruning Policies for Two-Tiered Inverted Index with Correctness Guarantee,
+ * A. Ntoulas et al, ACM SIGIR 2007.</a></li>
+ * </ul>
+ * </small>
+ * 
+ * <p>As the threshold values increase, the total
+ * size of the index decreases, search performance increases, and recall
+ * decreases (i.e. search quality deteriorates). NOTE: especially phrase recall
+ * deteriorates significantly at higher threshold values.
+ * <p>Primary purpose of this class is to produce small first-tier indexes
+ * that fit completely in RAM, and store these indexes using
+ * {@link IndexWriter#addIndexes(IndexReader[])}. <b>NOTE: If the input index is
+ * optimized (i.e. doesn't contain deletions) then the index produced via
+ * {@link IndexWriter#addIndexes(IndexReader[])} will preserve internal document
+ * id-s so that they are in sync with the original index.</b> This means
+ * that all other auxiliary information not necessary for first-tier processing,
+ * such as some stored fields, can also be removed, to be quickly retrieved
+ * on-demand from the original index using the same internal document id.
+ * <p>Threshold values can be specified globally (for terms in all fields) using
+ * <code>defaultThreshold</code> parameter, and can be overridden using per-field
+ * or per-term values supplied in a <code>thresholds</code> map. Keys in this
+ * map are either field names, or terms in <code>field:text</code> format.
+ * The precedence of these values is the following: first a per-term threshold
+ * is used if present, then per-field threshold if present, and finally the
+ * default threshold.
+ * <p>If required, some or all data (postings, term vectors and stored fields)
+ * from certain fields can be unconditionally removed. The list of such fields
+ * can be supplied in <code>removeFields</code> parameter.
+ */
+public class TFPruningReader extends FilterIndexReader {
+  private static final Logger LOG = Logger.getLogger(TFPruningReader.class.getName());
+  private Map<String, Integer> thresholds;
+  private Map<String, Integer> delFields;
+  private Set<String> delAll = new HashSet<String>();
+  private int defaultThreshold;
+  private DelFieldSelector fs;
+  private int termCount = 0, docCount = 0, vecCount = 0, postCount = 0;
+  private int delTermCount = 0, delVecCount = 0, delVecCount1 = 0, delPostCount = 0;
+  
+  public static final int DEL_POSTINGS  = 0x01;
+  public static final int DEL_STORED    = 0x02;
+  public static final int DEL_VECTOR    = 0x04;
+  public static final int DEL_PAYLOADS  = 0x08;
+  public static final int DEL_ALL       = 0xff;
+  
+  public static final int DEL_VECTOR_OR_POSTINGS = DEL_VECTOR | DEL_POSTINGS;
+  
+  private static class DelFieldSelector implements FieldSelector {
+    private FieldSelector parent;
+    private Map<String, Integer> remove;
+    
+    public DelFieldSelector(Map<String, Integer> remove) {
+      this.remove = remove;
+    }
+    
+    public void setParent(FieldSelector parent) {
+      this.parent = parent;
+    }
+    
+    @Override
+    public FieldSelectorResult accept(String fieldName) {
+      if (!remove.isEmpty() && remove.containsKey(fieldName) &&
+              ((remove.get(fieldName) & DEL_STORED) > 0)) {
+        return FieldSelectorResult.NO_LOAD;
+      } else if (parent != null) {
+        return parent.accept(fieldName);
+      } else return FieldSelectorResult.LOAD;
+    }
+  };
+
+  
+  /**
+   * Constructor.
+   * @param in input reader
+   * @param defaultThreshold default value (overridden in <code>thresholds</code>)
+   * of in-document frequency. Only postings with in-document frequency higher
+   * than this value will be retained.
+   * @param thresholds map of field names to in-document frequency thresholds.
+   * Postings with in-document frequency lower than thresholds will be ignored.
+   * @param removeFields names of fields to remove, with a bitwise-OR of flags
+   * that specify what to delete for such field.
+   */
+  public TFPruningReader(IndexReader in, int defaultThreshold,
+          Map<String, Integer> thresholds, Map<String, Integer> removeFields) {
+    super(in);
+    this.defaultThreshold = defaultThreshold;
+    if (thresholds != null) {
+      this.thresholds = thresholds;
+    } else {
+      this.thresholds = Collections.emptyMap();
+    }
+    if (removeFields != null) {
+      this.delFields = removeFields;
+      for (Entry<String, Integer> e : delFields.entrySet()) {
+        if (e.getValue() == DEL_ALL) {
+          delAll.add(e.getKey());
+        }
+      }
+    } else {
+      this.delFields = Collections.emptyMap();
+    }
+    fs = new DelFieldSelector(this.delFields);
+  }
+
+  @Override
+  public Document document(final int n, FieldSelector fieldSelector)
+          throws CorruptIndexException, IOException {
+    docCount++;
+    if ((docCount % 10000) == 0) {
+      LOG.info(" - stored fields: " + docCount + " docs.");
+    }
+    if (delFields.isEmpty()) {
+      return super.document(n, fieldSelector);
+    } else {
+      fs.setParent(fieldSelector);
+      return super.document(n, fs);
+    }
+  }
+
+  @Override
+  public Collection getFieldNames(FieldOption fieldNames) {
+    Collection res = super.getFieldNames(fieldNames);
+    // for simplicity remove only fields with DEL_ALL
+    if (!delAll.isEmpty()) {
+      res.removeAll(delAll);
+    }
+    return res;
+  }
+
+
+  @Override
+  public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
+    TermFreqVector[] vectors = super.getTermFreqVectors(docNumber);
+    if (vectors == null) {
+      return null;
+    }
+    ArrayList<TermFreqVector> newVectors = new ArrayList<TermFreqVector>();
+    for (TermFreqVector v : vectors) {
+      if (v == null) {
+        continue;
+      }
+      if (delFields.containsKey(v.getField()) && 
+              ((delFields.get(v.getField()) & DEL_VECTOR) > 0)) {
+        delVecCount++;
+        if ((delVecCount % 10000) == 0) {
+          LOG.info(" - deleted vectors: " + delVecCount);
+        }
+        continue;
+      }
+      if (v.size() == 0) {
+        continue;
+      }
+      int thr = defaultThreshold;
+      if (thresholds.containsKey(v.getField())) {
+        thr = thresholds.get(v.getField());
+      }
+      int removed = 0;
+      String[] terms = v.getTerms();
+      int[] freqs = v.getTermFrequencies();
+      for (int i = 0; i < terms.length; i++) {
+        int termThr = thr;
+        // check per-term thresholds
+        String termKey = v.getField() + ":" + terms[i];
+        if (thresholds.containsKey(termKey)) {
+          termThr = thresholds.get(termKey);
+        }
+        if (freqs[i] < termThr) {
+          terms[i] = null;
+          removed++;
+        }
+      }
+      if (removed > 0 && removed < terms.length) {
+        String[] newTerms = new String[terms.length - removed];
+        int[] newFreqs = new int[terms.length - removed];
+        int j = 0;
+        for (int i = 0; i < terms.length; i++) {
+          if (terms[i] != null) {
+            newTerms[j] = terms[i];
+            newFreqs[j] = freqs[i];
+            j++;
+          }
+        }
+        // create a modified vector
+        if (v instanceof TermPositionVector) {
+          TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[terms.length - removed][];
+          boolean withOffsets = false;
+          j = 0;
+          for (int i = 0; i < terms.length; i++) {
+            if (terms[i] == null) {
+              continue;
+            }
+            offsets[j] = ((TermPositionVector)v).getOffsets(i);
+            if (offsets[j] != null && offsets[j] != TermVectorOffsetInfo.EMPTY_OFFSET_INFO) {
+              withOffsets = true;
+            }
+            j++;
+          }
+          j = 0;
+          int[][] positions = new int[terms.length - removed][];
+          boolean withPositions = false;
+          for (int i = 0; i < terms.length; i++) {
+            if (terms[i] == null) {
+              continue;
+            }
+            positions[j] = ((TermPositionVector)v).getTermPositions(i);
+            if (positions[j] != null && positions[j].length > 0) {
+              withPositions = true;
+            }
+            j++;
+          }
+          v = new SegmentTermPositionVector(v.getField(), newTerms, newFreqs,
+                  withPositions ? positions : null,
+                  withOffsets ? offsets : null);
+        } else {
+          v = new SegmentTermVector(v.getField(), newTerms, newFreqs);
+        }
+        newVectors.add(v);
+      }
+    }
+    vecCount++;
+    if ((vecCount % 10000) == 0) {
+      LOG.info(" - vectors: " + vecCount + " docs.");
+    }
+    if (newVectors.size() == 0) {
+      delVecCount1++;
+      if ((delVecCount1 % 1000) == 0) {
+        LOG.info(" - deleted pruned vectors: " + delVecCount1);
+      }
+      return null;
+    }
+    return (TermFreqVector[])newVectors.toArray(new TermFreqVector[newVectors.size()]);
+  }
+
+  @Override
+  public boolean hasDeletions() {
+    return super.hasDeletions();
+  }
+
+  @Override
+  public boolean hasNorms(String field) throws IOException {
+    return super.hasNorms(field);
+  }
+
+  @Override
+  public boolean isDeleted(int n) {
+    return super.isDeleted(n);
+  }
+
+  @Override
+  public int maxDoc() {
+    // pruning never removes documents, so delegate unchanged
+    return super.maxDoc();
+  }
+
+  @Override
+  public void norms(String f, byte[] bytes, int offset) throws IOException {
+    // norms are kept as-is; only postings/vectors/stored fields are pruned
+    super.norms(f, bytes, offset);
+  }
+
+  @Override
+  public int numDocs() {
+    // document count is unaffected by pruning
+    return super.numDocs();
+  }
+  
+  @Override
+  public TermPositions termPositions() throws IOException {
+    // wrap so postings whose in-document frequency is below threshold are skipped
+    return new TfPruningTermPositions(in.termPositions());
+  }
+
+  @Override
+  public TermEnum terms() throws IOException {
+    // wrap so terms with no surviving postings (or in removed fields) are skipped
+    return new TfPruningTermEnum(in.terms());
+  }
+  
+  private class TfPruningTermEnum extends FilterTermEnum {
+
+    @Override
+    public boolean next() throws IOException {
+      for ( ; ; ) {
+        if (!super.next()) {
+          //System.out.println("TE: end");
+          return false;
+        }
+        termCount++;
+        if ((termCount % 50000) == 0) {
+          LOG.info(" - terms: " + termCount + " (" + term() + "), deleted: " + delTermCount);
+        }
+        if (delFields.containsKey(term().field()) &&
+                (delFields.get(term().field()) & DEL_POSTINGS) > 0) {
+          delTermCount++;
+          //System.out.println("TE: remove " + term());
+          continue;
+        }
+        // check that at least one doc exceeds threshold
+        int thr = defaultThreshold;
+        String termKey = term().field() + ":" + term().text();
+        if (thresholds.containsKey(termKey)) {
+          thr = thresholds.get(termKey);
+        } else if (thresholds.containsKey(term().field())) {
+          thr = thresholds.get(term().field());
+        }
+        TermDocs td = TFPruningReader.this.termDocs(in.term());
+        boolean pass = false;
+        while (td.next()) {
+          if (td.freq() >= thr) {
+            pass = true;
+            break;
+          }
+        }
+        td.close();
+        if (pass) {
+          //System.out.println("TE: pass " + term());
+          return true;
+        }
+        delTermCount++;
+        //System.out.println("TE: skip " + term());
+      }
+    }
+
+    public TfPruningTermEnum(TermEnum in) {
+      super(in);
+    }
+    
+  }
+  
+  private class TfPruningTermPositions extends FilterTermPositions {
+    int thr;
+    String field;
+    
+    public TfPruningTermPositions(TermPositions in) {
+      super(in);
+    }
+    
+    @Override
+    public void seek(Term t) throws IOException {
+      super.seek(t);
+      setThreshold(t);
+    }
+    
+    @Override
+    public void seek(TermEnum termEnum) throws IOException {
+      super.seek(termEnum);
+      setThreshold(termEnum.term());
+    }
+    
+    private void setThreshold(Term t) {
+      field = t.field();
+      // set threshold for this field
+      thr = defaultThreshold;
+      String termKey = t.field() + ":" + t.text();
+      if (thresholds.containsKey(termKey)) {
+        thr = thresholds.get(termKey);
+      } else if (thresholds.containsKey(t.field())) {
+        thr = thresholds.get(t.field());
+      }
+      //System.out.println("TP: setThr " + t + ", thr=" + thr);
+    }
+    
+    @Override
+    public boolean next() throws IOException {
+      for ( ; ; ) {
+        if (!super.next()) {
+          return false;
+        }
+        if (super.freq() < thr) {
+          //System.out.println("TP: skip doc=" + doc() +
+          //        ", freq=" + freq() + " < " + thr);
+          continue;
+        }
+        break;
+      }
+      //System.out.println("TP: pass doc=" + doc() +
+      //        ", freq=" + freq() + " >= " + thr);
+      return true;
+    }
+    
+    @Override
+    public boolean isPayloadAvailable() {
+      boolean res = super.isPayloadAvailable();
+      if (res && delFields.containsKey(field) &&
+              (delFields.get(field) & DEL_PAYLOADS) > 0) {
+        res = false;
+      }
+      return res;
+    }
+  }
+}
\ No newline at end of file

Property changes on: contrib/pruning/src/java/org/apache/lucene/index/TFPruningReader.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java
===================================================================
--- contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java	(revision 0)
+++ contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java	(revision 0)
@@ -0,0 +1,109 @@
+package org.apache.lucene.index;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.index.IndexWriter.MaxFieldLength;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+public class PruningTool {
+
+  public static void main(String[] args) throws Exception {
+    int res = run(args);
+    System.exit(res);
+  }
+  
+  public static int run(String[] args) throws Exception {
+    if (args.length < 5) {
+      System.err.println("Usage: PruningTool (-in <path1> [-in <path2> ...]) -out <outPath> -t <NN> [-del f1,f2,..] [-conf <file>]");
+      System.err.println("\t-in path\tpath to the input index. Can specify multiple input indexes.");
+      System.err.println("\t-out path\toutput path where the output index will be stored.");
+      System.err.println("\t-t NN\tdefault threshold value (minimum in-document frequency) for all terms");
+      System.err.println("\t-del f1,f2,..\tcomma-separated list of field specs to delete (postings, vectors & stored):");
+      System.err.println("\t\tfield spec : fieldName ( ':' [pPsv] )");
+      System.err.println("\t\twhere: p - postings, P - payloads, s - stored value, v - vectors");
+      System.err.println("\t-conf file\tpath to config file with per-term thresholds");
+      return -1;
+    }
+    ArrayList<IndexReader> inputs = new ArrayList<IndexReader>();
+    Directory out = null;
+    int thr = -1;
+    Map<String, Integer> delFields = new HashMap<String, Integer>();
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-in")) {
+        Directory d = FSDirectory.open(new File(args[++i]));
+        // really skip inputs without an index instead of failing in IndexReader.open()
+        if (IndexReader.indexExists(d)) {
+          inputs.add(IndexReader.open(d, true));
+        } else System.err.println("WARN: no index in " + args[i] + ", skipping ...");
+      } else if (args[i].equals("-out")) {
+        File outFile = new File(args[++i]);
+        if (outFile.exists()) {
+          throw new Exception("Output " + outFile + " already exists.");
+        }
+        outFile.mkdirs();
+        out = FSDirectory.open(outFile);
+      } else if (args[i].equals("-t")) {
+        thr = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-del")) {
+        String[] fields = args[++i].split(",");
+        for (String f : fields) {
+          // parse field spec: a bare field name means DEL_ALL
+          String[] spec = f.split(":");
+          int opts = TFPruningReader.DEL_ALL;
+          if (spec.length > 1) {
+            opts = 0;
+            if (spec[1].indexOf('p') != -1) {
+              opts |= TFPruningReader.DEL_POSTINGS;
+            }
+            if (spec[1].indexOf('P') != -1) {
+              opts |= TFPruningReader.DEL_PAYLOADS;
+            }
+            if (spec[1].indexOf('s') != -1) {
+              opts |= TFPruningReader.DEL_STORED;
+            }
+            if (spec[1].indexOf('v') != -1) {
+              opts |= TFPruningReader.DEL_VECTOR;
+            }
+          }
+          delFields.put(spec[0], opts);
+        }
+      } else if (args[i].equals("-conf")) {
+        String confFile = args[++i];
+        System.err.println("WARN: -conf option not implemented yet.");
+      } else {
+        throw new Exception("Invalid argument: '" + args[i] + "'");
+      }
+    }
+    if (inputs.size() == 0) {
+      throw new Exception("At least one input index is required.");
+    }
+    if (out == null) {
+      throw new Exception("Output path is not set.");
+    }
+    if (thr == -1) {
+      throw new Exception("Threshold value is not set.");
+    }
+    IndexReader in;
+    if (inputs.size() == 1) {
+      in = inputs.get(0);
+    } else {
+      in = new MultiReader((IndexReader[])inputs
+              .toArray(new IndexReader[inputs.size()]), true);
+    }
+    if (in.hasDeletions()) {
+      System.err.println("WARN: input index(es) with deletions - document ID-s will NOT be preserved!");
+    }
+    TFPruningReader tfr = new TFPruningReader(in, thr, null, delFields);
+    IndexWriter iw = new IndexWriter(out, new WhitespaceAnalyzer(), MaxFieldLength.UNLIMITED);
+    iw.setUseCompoundFile(false);
+    iw.addIndexes(new IndexReader[]{tfr});
+    iw.close();
+    System.err.println("DONE.");
+    return 0;
+  }
+}

Property changes on: contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/pruning/build.xml
===================================================================
--- contrib/pruning/build.xml	(revision 0)
+++ contrib/pruning/build.xml	(revision 0)
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+ 
+        http://www.apache.org/licenses/LICENSE-2.0
+ 
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+<project name="pruning" default="default">
+
+  <description>
+    Pruning Lucene indexes by various criteria
+  </description>
+
+  <import file="../contrib-build.xml"/>
+</project>

Property changes on: contrib/pruning/build.xml
___________________________________________________________________
Added: svn:eol-style
   + native

