From 81a16fe602358830417f003f0b297c533fe06eb1 Mon Sep 17 00:00:00 2001
From: Greg Bowyer <gbowyer@apache.org>
Date: Mon, 6 May 2013 11:37:31 -0700
Subject: [PATCH] LUCENE-3917: Initial port of index pruning

---
 dev-tools/idea/.idea/modules.xml                   |   1 +
 dev-tools/idea/.idea/workspace.xml                 |  38 ++-
 lucene/module-build.xml                            |  11 +
 lucene/pruning/README.txt                          |  30 ++
 lucene/pruning/build.xml                           |  48 +++
 lucene/pruning/ivy.xml                             |  21 ++
 .../apache/lucene/index/PruningAtomicReader.java   | 296 +++++++++++++++++
 .../index/pruning/CarmelTopKTermPruningPolicy.java | 271 +++++++++++++++
 .../pruning/CarmelUniformTermPruningPolicy.java    | 170 ++++++++++
 .../apache/lucene/index/pruning/PruningPolicy.java |  34 ++
 .../apache/lucene/index/pruning/PruningTool.java   | 202 ++++++++++++
 .../index/pruning/RIDFTermPruningPolicy.java       | 109 ++++++
 .../lucene/index/pruning/StorePruningPolicy.java   | 143 ++++++++
 .../lucene/index/pruning/TFTermPruningPolicy.java  | 134 ++++++++
 .../lucene/index/pruning/TermPruningPolicy.java    | 176 ++++++++++
 .../org/apache/lucene/index/pruning/package.html   |  41 +++
 lucene/pruning/src/java/overview.html              |  26 ++
 .../org/apache/lucene/index/TestPruningReader.java | 364 +++++++++++++++++++++
 18 files changed, 2100 insertions(+), 15 deletions(-)
 create mode 100644 lucene/pruning/README.txt
 create mode 100644 lucene/pruning/build.xml
 create mode 100644 lucene/pruning/ivy.xml
 create mode 100644 lucene/pruning/src/java/org/apache/lucene/index/PruningAtomicReader.java
 create mode 100644 lucene/pruning/src/java/org/apache/lucene/index/pruning/CarmelTopKTermPruningPolicy.java
 create mode 100644 lucene/pruning/src/java/org/apache/lucene/index/pruning/CarmelUniformTermPruningPolicy.java
 create mode 100644 lucene/pruning/src/java/org/apache/lucene/index/pruning/PruningPolicy.java
 create mode 100644 lucene/pruning/src/java/org/apache/lucene/index/pruning/PruningTool.java
 create mode 100644 lucene/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java
 create mode 100644 lucene/pruning/src/java/org/apache/lucene/index/pruning/StorePruningPolicy.java
 create mode 100644 lucene/pruning/src/java/org/apache/lucene/index/pruning/TFTermPruningPolicy.java
 create mode 100644 lucene/pruning/src/java/org/apache/lucene/index/pruning/TermPruningPolicy.java
 create mode 100644 lucene/pruning/src/java/org/apache/lucene/index/pruning/package.html
 create mode 100644 lucene/pruning/src/java/overview.html
 create mode 100644 lucene/pruning/src/test/org/apache/lucene/index/TestPruningReader.java

diff --git a/dev-tools/idea/.idea/modules.xml b/dev-tools/idea/.idea/modules.xml
index d3c0bc7..291ca8b 100644
--- a/dev-tools/idea/.idea/modules.xml
+++ b/dev-tools/idea/.idea/modules.xml
@@ -29,6 +29,7 @@
       <module filepath="$PROJECT_DIR$/lucene/join/join.iml" />
       <module filepath="$PROJECT_DIR$/lucene/memory/memory.iml" />
       <module filepath="$PROJECT_DIR$/lucene/misc/misc.iml" />
+      <module filepath="$PROJECT_DIR$/lucene/pruning/pruning.iml" />
       <module filepath="$PROJECT_DIR$/lucene/queries/queries.iml" />
       <module filepath="$PROJECT_DIR$/lucene/queryparser/queryparser.iml" />
       <module filepath="$PROJECT_DIR$/lucene/sandbox/sandbox.iml" />
diff --git a/dev-tools/idea/.idea/workspace.xml b/dev-tools/idea/.idea/workspace.xml
index 265715d..ee7fb1b 100644
--- a/dev-tools/idea/.idea/workspace.xml
+++ b/dev-tools/idea/.idea/workspace.xml
@@ -130,6 +130,13 @@
       <option name="VM_PARAMETERS" value="-ea -DtempDir=temp" />
       <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
     </configuration>
+    <configuration default="false" name="Module pruning" type="JUnit" factoryName="JUnit">
+      <module name="pruning" />
+      <option name="TEST_OBJECT" value="package" />
+      <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/lucene/pruning" />
+      <option name="VM_PARAMETERS" value="-ea -DtempDir=temp" />
+      <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
+    </configuration>
     <configuration default="false" name="Module queries" type="JUnit" factoryName="JUnit">
       <module name="queries" />
       <option name="TEST_OBJECT" value="package" />
@@ -254,21 +261,22 @@
       <item index="15" class="java.lang.String" itemvalue="JUnit.Module join" />
       <item index="16" class="java.lang.String" itemvalue="JUnit.Module memory" />
       <item index="17" class="java.lang.String" itemvalue="JUnit.Module misc" />
-      <item index="18" class="java.lang.String" itemvalue="JUnit.Module queries" />
-      <item index="19" class="java.lang.String" itemvalue="JUnit.Module queryparser" />
-      <item index="20" class="java.lang.String" itemvalue="JUnit.Module sandbox" />
-      <item index="21" class="java.lang.String" itemvalue="JUnit.Module spatial" />
-      <item index="22" class="java.lang.String" itemvalue="JUnit.Module suggest" />
-      <item index="23" class="java.lang.String" itemvalue="JUnit.Solr core" />
-      <item index="24" class="java.lang.String" itemvalue="JUnit.Solr analysis-extras contrib" />
-      <item index="25" class="java.lang.String" itemvalue="JUnit.Solr clustering contrib" />
-      <item index="26" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
-      <item index="27" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
-      <item index="28" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
-      <item index="29" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
-      <item index="30" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
-      <item index="31" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
-      <item index="32" class="java.lang.String" itemvalue="JUnit.Solrj" />
+      <item index="18" class="java.lang.String" itemvalue="JUnit.Module pruning" />
+      <item index="19" class="java.lang.String" itemvalue="JUnit.Module queries" />
+      <item index="20" class="java.lang.String" itemvalue="JUnit.Module queryparser" />
+      <item index="21" class="java.lang.String" itemvalue="JUnit.Module sandbox" />
+      <item index="22" class="java.lang.String" itemvalue="JUnit.Module spatial" />
+      <item index="23" class="java.lang.String" itemvalue="JUnit.Module suggest" />
+      <item index="24" class="java.lang.String" itemvalue="JUnit.Solr core" />
+      <item index="25" class="java.lang.String" itemvalue="JUnit.Solr analysis-extras contrib" />
+      <item index="26" class="java.lang.String" itemvalue="JUnit.Solr clustering contrib" />
+      <item index="27" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
+      <item index="28" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
+      <item index="29" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
+      <item index="30" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
+      <item index="31" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
+      <item index="32" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
+      <item index="33" class="java.lang.String" itemvalue="JUnit.Solrj" />
     </list>
   </component>
 </project>
diff --git a/lucene/module-build.xml b/lucene/module-build.xml
index a35c0b2..22d5033 100644
--- a/lucene/module-build.xml
+++ b/lucene/module-build.xml
@@ -488,6 +488,17 @@
     <property name="misc-javadocs.uptodate" value="true"/>
   </target>
 
+  <property name="pruning.jar" value="${common.dir}/build/pruning/lucene-pruning-${version}.jar"/>
+  <target name="check-pruning-uptodate" unless="pruning.uptodate">
+    <module-uptodate name="pruning" jarfile="${pruning.jar}" property="pruning.uptodate"/>
+  </target>
+  <target name="jar-pruning" unless="pruning.uptodate" depends="check-pruning-uptodate">
+    <ant dir="${common.dir}/pruning" target="jar-core" inheritAll="false">
+      <propertyset refid="uptodate.and.compiled.properties"/>
+    </ant>
+    <property name="pruning.uptodate" value="true"/>
+  </target>
+
   <property name="sandbox.jar" value="${common.dir}/build/sandbox/lucene-sandbox-${version}.jar"/>
   <target name="check-sandbox-uptodate" unless="sandbox.uptodate">
     <module-uptodate name="sandbox" jarfile="${sandbox.jar}" property="sandbox.uptodate"/>
diff --git a/lucene/pruning/README.txt b/lucene/pruning/README.txt
new file mode 100644
index 0000000..670583e
--- /dev/null
+++ b/lucene/pruning/README.txt
@@ -0,0 +1,30 @@
+Static index pruning tools.
+===========================
+
+This package provides tools and API-s for static index pruning.
+
+Static pruning is an approach that reduces size of the index
+by removing terms and/or postings that are considered less
+important, i.e. they don't affect the quality of top-N
+retrieval too much.
+
+There are several different strategies for pruning, each with
+its own set of pros and cons. Plese consult the javadocs of
+TermPruningPolicy subclasses that contain also references
+to published papers on each method.
+
+There is also a simple command-line driver class that
+can apply some of the common pruning policies:
+
+Usage: PruningTool -impl (tf | carmel | carmeltopk | ridf) (-in <path1> [-in <path2> ...]) -out <outPath> -t <NN> [-del f1,f2,..] [-conf <file>] [-topkk <NN>] [-topke <NN>] [-topkr <NN>]
+  -impl (tf | carmel | carmeltopk | ridf) TermPruningPolicy implementation name: TF or CarmelUniform or or CarmelTopK or RIDFTerm
+  -in path  path to the input index. Can specify multiple input indexes.
+  -out path output path where the output index will be stored.
+  -t NN default threshold value (minimum in-document frequency) for all terms
+  -del f1,f2,.. comma-separated list of field specs to delete (postings, vectors & stored):
+    field spec : fieldName ( ':' [pPsv] )
+    where: p - postings, P - payloads, s - stored value, v - vectors
+  -conf file  path to config file with per-term thresholds
+  -topkk NN 'K' for Carmel TopK Pruning: number of guaranteed top scores
+  -topke NN 'Epsilon' for Carmel TopK Pruning: largest meaningless score difference
+  -topkr NN 'R' for Carmel TopK Pruning: planned maximal number of terms in a query on pruned index
\ No newline at end of file
diff --git a/lucene/pruning/build.xml b/lucene/pruning/build.xml
new file mode 100644
index 0000000..6cacc51
--- /dev/null
+++ b/lucene/pruning/build.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0"?>
+
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+ 
+        http://www.apache.org/licenses/LICENSE-2.0
+ 
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+<project name="pruning" default="default" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+  <description>
+    Pruning Lucene indexes by various criteria
+  </description>
+
+  <import file="../module-build.xml"/>
+
+  <path id="classpath">
+    <pathelement path="${analyzers-common.jar}"/>
+    <path refid="base.classpath"/>
+  </path>
+
+  <path id="run.classpath">
+    <path refid="classpath"/>
+    <pathelement location="${build.dir}/classes/java"/>
+  </path>
+
+  <target name="init" depends="module-build.init,jar-analyzers-common"/>
+
+  <target name="javadocs" depends="javadocs-analyzers-common,compile-core">
+    <invoke-module-javadoc>
+      <links>
+        <link href="../analyzers-common"/>
+      </links>
+    </invoke-module-javadoc>
+  </target>
+
+</project>
diff --git a/lucene/pruning/ivy.xml b/lucene/pruning/ivy.xml
new file mode 100644
index 0000000..851ae13
--- /dev/null
+++ b/lucene/pruning/ivy.xml
@@ -0,0 +1,21 @@
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+<ivy-module version="2.0">
+    <info organisation="org.apache.lucene" module="pruning"/>
+</ivy-module>
diff --git a/lucene/pruning/src/java/org/apache/lucene/index/PruningAtomicReader.java b/lucene/pruning/src/java/org/apache/lucene/index/PruningAtomicReader.java
new file mode 100644
index 0000000..2214872
--- /dev/null
+++ b/lucene/pruning/src/java/org/apache/lucene/index/PruningAtomicReader.java
@@ -0,0 +1,296 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.logging.Logger;
+
+import org.apache.lucene.index.pruning.StorePruningPolicy;
+import org.apache.lucene.index.pruning.TermPruningPolicy;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * This class produces a subset of the input index, by removing some postings
+ * data according to rules implemented in a {@link TermPruningPolicy}, and
+ * optionally it can also remove stored fields of documents according to rules
+ * implemented in a {@link StorePruningPolicy}.
+ */
+public class PruningAtomicReader extends FilterAtomicReader {
+  private static final Logger LOG = Logger.getLogger(PruningAtomicReader.class.getName());
+  
+  protected int docCount;
+  protected int vecCount;
+  protected int termCount, delTermCount;
+  protected int prunedVecCount, delVecCount;
+  
+  protected TermPruningPolicy termPolicy;
+  protected StorePruningPolicy storePolicy;
+  
+  /**
+   * Constructor.
+   * @param in input reader
+   * @param storePolicy implementation of {@link StorePruningPolicy} - if null
+   *          then stored values will be retained as is.
+   * @param termPolicy implementation of {@link TermPruningPolicy}, must not
+   * be null.
+   */
+  public PruningAtomicReader(AtomicReader in, StorePruningPolicy storePolicy,
+                             TermPruningPolicy termPolicy) {
+    super(in);
+    this.termPolicy = termPolicy;
+    assert termPolicy != null;
+    this.storePolicy = storePolicy;
+  }
+
+  @Override
+  public Fields fields() throws IOException {
+    return new PruningFields(this.termPolicy, super.fields());
+  }
+
+  /**
+   * Applies a {@link StorePruningPolicy} to stored fields of a document.
+   */
+  @Override
+  public void document(int docID, StoredFieldVisitor visitor) throws IOException {
+    docCount++;
+    if ((docCount % 10000) == 0) {
+      LOG.info(" - stored fields: " + docCount + " docs.");
+    }
+    if (storePolicy != null) {
+      storePolicy.pruneDocument(docID, visitor);
+    } else {
+      in.document(docID, visitor);
+    }
+  }
+
+  /**
+   * Applies a {@link StorePruningPolicy} to the list of available field infos.
+   */
+  @Override
+  public FieldInfos getFieldInfos() {
+    FieldInfos res = super.getFieldInfos();
+    if (storePolicy == null) {
+      return res;
+    }
+    return storePolicy.getFieldInfos(res);
+  }
+
+  /**
+   * Applies {@link TermPruningPolicy} to terms inside term vectors.
+   */
+  @Override
+  public Fields getTermVectors(final int docNumber) throws IOException {
+    Fields vectors = super.getTermVectors(docNumber);
+    if (vectors == null) {
+      return null;
+    }
+
+    return new FilterFields(vectors) {
+      @Override
+      public Terms terms(String field) throws IOException {
+        Terms terms = super.terms(field);
+        if (terms == null) return null;
+
+        if (termPolicy.pruneWholeTermVector(docNumber, field)) {
+          delVecCount++;
+          if ((delVecCount % 10000) == 0) {
+            LOG.info(" - deleted vectors: " + delVecCount);
+          }
+          return null;
+        }
+        return termPolicy.pruneTermVectorTerms(docNumber, field, terms);
+      }
+    };
+  }
+
+  /**
+   * Applies {@link TermPruningPolicy} to term positions.
+   */
+  /*
+  @Override
+  public TermPositions termPositions() throws IOException {
+    return new PruningTermPositions(in.termPositions());
+  }
+  */
+  
+  /**
+   * Applies {@link TermPruningPolicy} to term enum.
+   */
+  /*
+  @Override
+  public TermEnum terms() throws IOException {
+    return new PruningTermEnum(in.terms());
+  }
+  */
+
+  private static final class PruningDocsAndPositionsEnum extends FilterDocsAndPositionsEnum {
+    
+    protected int[] positions;
+    protected DocsAndPositionsEnum tp;
+
+    private final TermPruningPolicy termPolicy;
+    private final String field;
+    private final BytesRef term;
+
+    //TODO [Greg Bowyer] This is a bit weird
+    private final TermsEnum te;
+
+    private PruningDocsAndPositionsEnum(String field, BytesRef term, TermsEnum te,
+                                        TermPruningPolicy termPolicy, DocsAndPositionsEnum in) {
+      super(in);
+      this.tp = in;
+      this.field = field;
+      this.termPolicy = termPolicy;
+      this.term = term;
+      this.te = te;
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      int nextDoc = super.nextDoc();
+      for (;;) {
+        positions = null;
+        if (nextDoc == NO_MORE_DOCS) {
+          return NO_MORE_DOCS;
+        }
+        termPolicy.initPositionsTerm(this.field, this.te);
+        if (!termPolicy.pruneAllPositions(tp, this.term, this.field)) {
+          break;
+        }
+        nextDoc = super.nextDoc();
+      }
+      return nextDoc;
+    }
+    
+    @Override
+    public BytesRef getPayload() throws IOException {
+      return termPolicy.prunePayload(field) ? null : super.getPayload();
+    }
+
+  }
+
+  public static final class PruningTerms extends FilterTerms {
+
+    private final TermPruningPolicy termPolicy;
+    private final String field;
+
+    public PruningTerms(TermPruningPolicy termPolicy, String field, Terms in) {
+      super(in);
+      this.termPolicy = termPolicy;
+      this.field = field;
+    }
+
+    @Override
+    public TermsEnum iterator(TermsEnum reuse) throws IOException {
+      return new PruningAtomicReader.PruningTermsEnum(this.termPolicy, field, in.iterator(reuse));
+    }
+
+    //TODO [Greg Bowyer] - Is this correct ? The codec actually stores these things, but we lie
+    @Override
+    public long size() throws IOException {
+      return -1;
+    }
+  }
+
+  public final static class PruningTermsEnum extends FilterTermsEnum {
+    private final String field;
+    private final TermPruningPolicy termPolicy;
+
+    // TODO [Greg Bowyer] This is in the wrong place, all over the place
+    private long termCount = 0;
+    private long delTermCount = 0;
+
+    public PruningTermsEnum(TermPruningPolicy termPolicy, String field, TermsEnum in) {
+      super(in);
+      this.field = field;
+      this.termPolicy = termPolicy;
+    }
+
+    @Override
+    public boolean seekExact(BytesRef text, boolean useCache) throws IOException {
+      boolean toReturn = super.seekExact(text, useCache);
+      this.informPolicy();
+      return toReturn;
+    }
+
+    @Override
+    public SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException {
+      SeekStatus toReturn = super.seekCeil(text, useCache);
+      this.informPolicy();
+      return toReturn;
+    }
+
+    @Override
+    public void seekExact(long ord) throws IOException {
+      super.seekExact(ord);
+      this.informPolicy();
+    }
+
+    @Override
+    public void seekExact(BytesRef term, TermState state) throws IOException {
+      super.seekExact(term, state);
+      this.informPolicy();
+    }
+
+    @Override
+    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
+      DocsAndPositionsEnum positionsEnum = super.docsAndPositions(liveDocs, reuse, flags);
+      return new PruningDocsAndPositionsEnum(this.field, this.term(), this, this.termPolicy, positionsEnum);
+    }
+
+    private void informPolicy() throws IOException {
+      termPolicy.initPositionsTerm(this.field, this);
+    }
+
+    @Override
+    public BytesRef next() throws IOException {
+      BytesRef ref;
+      while ((ref = super.next()) != null) {
+        termCount++;
+
+        if ((termCount % 50000) == 0) {
+          LOG.info(" - terms: " + termCount + " (" + term() + "), deleted: " + delTermCount);
+        }
+
+        if (termPolicy.pruneAllFieldPostings(field) || termPolicy.pruneTermsEnum(field, in)) {
+          delTermCount++;
+          // System.out.println("TE: remove " + term());
+          continue;
+        } else {
+          break;
+        }
+      }
+      return ref;
+    }
+  }
+
+  private class PruningFields extends FilterFields {
+    private final TermPruningPolicy termPolicy;
+
+    public PruningFields(TermPruningPolicy termPolicy, Fields fields) {
+      super(fields);
+      this.termPolicy = termPolicy;
+    }
+
+    @Override
+    public Terms terms(String field) throws IOException {
+      return new PruningTerms(termPolicy, field, super.terms(field));
+    }
+  }
+}
\ No newline at end of file
diff --git a/lucene/pruning/src/java/org/apache/lucene/index/pruning/CarmelTopKTermPruningPolicy.java b/lucene/pruning/src/java/org/apache/lucene/index/pruning/CarmelTopKTermPruningPolicy.java
new file mode 100644
index 0000000..ed6d273
--- /dev/null
+++ b/lucene/pruning/src/java/org/apache/lucene/index/pruning/CarmelTopKTermPruningPolicy.java
@@ -0,0 +1,271 @@
+package org.apache.lucene.index.pruning;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopScoreDocCollector;
+import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Pruning policy with a search quality parameterized guarantee - configuration
+ * of this policy allows to specify two parameters: <b>k</b> and
+ * <b>&epsilon;</b> such that:
+ * <p>
+ * <table border="1">
+ * <tr>
+ * <td>
+ * For any <b>OR</b> query with <b>r</b> terms, the score of each of the top
+ * <b>k</b> results in the original index, should be "practically the same" as
+ * the score that document in the pruned index: the scores difference should not
+ * exceed <b>r * &epsilon;</b>.</td>
+ * </tr>
+ * </table>
+ * <p>
+ * See the following paper for more details about this method: <a
+ * href="http://portal.acm.org/citation.cfm?id=383958">Static index pruning for
+ * information retrieval systems, D. Carmel at al, ACM SIGIR 2001 </a>.
+ * <p>
+ * The claim of this pruning technique is, quoting from the above paper:
+ * <p>
+ * <table border="1">
+ * <tr>
+ * <td>
+ * Prune the index in such a way that a human
+ * "cannot distinguish the difference" between the results of a search engine
+ * whose index is pruned and one whose index is not pruned.</td>
+ * </tr>
+ * </table>
+ * <p>
+ * For indexes with a large number of terms this policy might be too slow. In
+ * such situations, the uniform pruning approach in
+ * {@link CarmelUniformTermPruningPolicy} will be faster, though it might
+ * produce inferior search quality, as that policy does not pose a theoretical
+ * guarantee on resulted search quality.
+ * <p>
+ * TODO implement also CarmelTermPruningDeltaTopPolicy
+ */
+public class CarmelTopKTermPruningPolicy extends TermPruningPolicy {
+  
+  /**
+   * Default number of guaranteed top K scores
+   */
+  public static final int DEFAULT_TOP_K = 10;
+  
+  /**
+   * Default number of query terms
+   */
+  public static final int DEFAULT_R = 1;
+  
+  /**
+   * Default largest meaningless score difference
+   */
+  public static final float DEFAULT_EPSILON = .001f;
+  
+  private int docsPos = 0;
+  private int k;
+  private ScoreDoc[] docs = null;
+  private IndexSearcher is;
+  private boolean noPruningForCurrentTerm;
+  private float scoreDelta;
+  
+  /**
+   * Constructor with default parameters
+   * 
+   * @see #DEFAULT_TOP_K
+   * @see #DEFAULT_EPSILON
+   * @see #DEFAULT_R
+   * @see DefaultSimilarity
+   * @see #CarmelTopKTermPruningPolicy(AtomicReader, Map, int, float, int, Similarity)
+   */
+  public CarmelTopKTermPruningPolicy(AtomicReader in,
+      Map<String,Integer> fieldFlags) {
+    this(in, fieldFlags, DEFAULT_TOP_K, DEFAULT_EPSILON, DEFAULT_R, null);
+  }
+  
+  /**
+   * Constructor with specific settings
+   * 
+   * @param in reader for original index
+   * @param k number of guaranteed top scores. Each top K results in the pruned
+   *          index is either also an original top K result or its original
+   *          score is indistinguishable from some original top K result.
+   * @param epsilon largest meaningless score difference Results whose scores
+   *          difference is smaller or equal to epsilon are considered
+   *          indistinguishable.
+   * @param r maximal number of terms in a query for which search quality in
+   *          pruned index is guaranteed
+   * @param sim similarity to use when selecting top docs fir each index term.
+   *          When null, {@link DefaultSimilarity} is used.
+   */
+  public CarmelTopKTermPruningPolicy(AtomicReader in, Map<String,Integer> fieldFlags,
+                                     int k, float epsilon, int r, Similarity sim) {
+    super(in, fieldFlags);
+    this.k = k;
+    is = new IndexSearcher(in);
+    is.setSimilarity(sim != null ? sim : new DefaultSimilarity());
+    scoreDelta = epsilon * r;
+  }
+  
+  // too costly - pass everything at this stage
+  @Override
+  public boolean pruneTermsEnum(String field, TermsEnum te) throws IOException {
+    return false;
+  }
+  
+  @Override
+  public void initPositionsTerm(String field, TermsEnum in) throws IOException {
+    // check if there's any point to prune this term
+    int df = in.docFreq();
+    noPruningForCurrentTerm = (df <= k);
+    if (noPruningForCurrentTerm) {
+      return;
+    }
+    // take more results (k2>k), attempting for sufficient results to avoid a
+    // second search
+    int k2 = Math.min(2 * k, k + 100); // for small k's 2*k will do, but for
+    // large ones (1000's) keep overhead
+    // smaller
+    k2 = Math.min(k2, df); // no more than the potential number of results
+    TopScoreDocCollector collector = TopScoreDocCollector.create(k2, true);
+    TermQuery tq = new TermQuery(new Term(field, in.term()));
+    is.search(tq, collector);
+    docs = collector.topDocs().scoreDocs;
+    float threshold = docs[k - 1].score - scoreDelta;
+    
+    int nLast = k2 - 1;
+    nLast = Math.min(nLast, docs.length - 1); // protect in case of deleted docs
+    if (docs[nLast].score < threshold) {
+      // this is the better/faster case - no need to go over docs again - we
+      // have top ones
+      int n = nLast;
+      while (docs[n - 1].score < threshold)
+        --n; // n == num-valid-docs == first-invalid-doc
+      ScoreDoc[] subset = new ScoreDoc[n];
+      System.arraycopy(docs, 0, subset, 0, n);
+      docs = subset;
+      // sort by doc but only after taking top scores
+      Arrays.sort(docs, ByDocComparator.INSTANCE);
+    } else {
+      // this is the worse case - must go over docs again
+      ThresholdCollector thresholdCollector = new ThresholdCollector(threshold);
+      is.search(tq, thresholdCollector);
+      docs = thresholdCollector.scoreDocs.toArray(new ScoreDoc[0]);
+    }
+    docsPos = 0;
+  }
+  
+  @Override
+  public boolean pruneAllPositions(DocsAndPositionsEnum termPositions, BytesRef t, String field) throws IOException {
+    if (noPruningForCurrentTerm) {
+      return false;
+    }
+
+    if (termPositions.docID() == DocIdSetIterator.NO_MORE_DOCS) { // used up all doc id-s
+      return true; // skip any remaining docs
+    }
+
+    // TODO [Greg Bowyer] - ReWrite this to use advance()
+    while ((docsPos < docs.length - 1) && termPositions.docID() > docs[docsPos].doc) {
+      docsPos++;
+    }
+
+    if (termPositions.docID() == docs[docsPos].doc) {
+      // pass
+      docsPos++; // move to next doc id
+      return false;
+    } else if (termPositions.docID() < docs[docsPos].doc) {
+      return true; // skip this one - it's less important
+    }
+    // should not happen!
+    throw new IOException("termPositions.doc > docs[docsPos].doc");
+  }
+  
+  // it probably doesn't make sense to prune term vectors using this method,
+  // due to its overhead
+  @Override
+  public Terms pruneTermVectorTerms(int docNumber, String field, Terms terms) throws IOException {
+    return terms;
+  }
+  
+  public static class ByDocComparator implements Comparator<ScoreDoc> {
+    public static final ByDocComparator INSTANCE = new ByDocComparator();
+    
+    public int compare(ScoreDoc o1, ScoreDoc o2) {
+      return o1.doc - o2.doc;
+    }
+  }
+  
+  /**
+   * Collect all docs with score >= higher threshold
+   */
+  private static class ThresholdCollector extends Collector {
+    
+    private List<ScoreDoc> scoreDocs = new ArrayList<ScoreDoc>();
+    private Scorer scorer;
+    private float threshold;
+    private int docBase;
+    
+    public ThresholdCollector(float threshold) {
+      this.threshold = threshold;
+    }
+    
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      return false;
+    }
+    
+    @Override
+    public void collect(int doc) throws IOException {
+      float score = scorer.score();
+      if (score >= threshold) {
+        scoreDocs.add(new ScoreDoc(docBase + doc, score));
+      }
+    }
+
+    @Override
+    public void setNextReader(AtomicReaderContext context) throws IOException {
+      this.docBase = context.docBase;
+    }
+
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      this.scorer = scorer;
+    }
+    
+  }
+}
diff --git a/lucene/pruning/src/java/org/apache/lucene/index/pruning/CarmelUniformTermPruningPolicy.java b/lucene/pruning/src/java/org/apache/lucene/index/pruning/CarmelUniformTermPruningPolicy.java
new file mode 100644
index 0000000..3149378
--- /dev/null
+++ b/lucene/pruning/src/java/org/apache/lucene/index/pruning/CarmelUniformTermPruningPolicy.java
@@ -0,0 +1,170 @@
+package org.apache.lucene.index.pruning;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopScoreDocCollector;
+import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Enhanced implementation of Carmel Uniform Pruning,
+ * <p>
+ * {@link DocsAndPositionsEnum} whose in-document frequency is below a specified
+ * threshold
+ * <p>
+ * See {@link CarmelTopKTermPruningPolicy} for link to the paper describing this
+ * policy. are pruned.
+ * <p>
+ * Conclusions of that paper indicate that it's best to compute per-term
+ * thresholds, as we do in {@link CarmelTopKTermPruningPolicy}. However for
+ * large indexes with a large number of terms that method might be too slow, and
+ * the (enhanced) uniform approach implemented here may will be faster, although
+ * it might produce inferior search quality.
+ * <p>
+ * This implementation enhances the Carmel uniform pruning approach, as it
+ * allows to specify three levels of thresholds:
+ * <ul>
+ * <li>one default threshold - globally (for terms in all fields)</li>
+ * <li>threshold per field</li>
+ * <li>threshold per term</li>
+ * </ul>
+ * <p>
+ * These thresholds are applied so that always the most specific one takes
+ * precedence: first a per-term threshold is used if present, then per-field
+ * threshold if present, and finally the default threshold.
+ * <p>
+ * Threshold are maintained in a map, keyed by either field names or terms in
+ * <code>field:text</code> format. precedence of these values is the following:
+ * <p>
+ * Thresholds in this method of pruning are expressed as the percentage of the
+ * top-N scoring documents per term that are retained. The list of top-N
+ * documents is established by using a regular {@link IndexSearcher} and
+ * {@link Similarity} to run a simple {@link TermQuery}.
+ * <p>
+ * Smaller threshold value will produce a smaller index. See
+ * {@link TermPruningPolicy} for size vs performance considerations.
+ * <p>
+ * For indexes with a large number of terms this policy might be still too slow,
+ * since it issues a term query for each term in the index. In such situations,
+ * the term frequency pruning approach in {@link TFTermPruningPolicy} will be
+ * faster, though it might produce inferior search quality.
+ */
+public class CarmelUniformTermPruningPolicy extends TermPruningPolicy {
+
+  private final float defThreshold;
+  private final Map<String,Float> thresholds;
+  private final IndexSearcher is;
+
+  private float curThr;
+  private int docsPos = 0;
+  private ScoreDoc[] docs = null;
+
+  public CarmelUniformTermPruningPolicy(AtomicReader in, Map<String,Integer> fieldFlags,
+                                        Map<String,Float> thresholds, float defThreshold,
+                                        Similarity sim) {
+    super(in, fieldFlags);
+
+    this.defThreshold = defThreshold;
+    this.thresholds = thresholds != null ? thresholds : Collections.<String, Float>emptyMap();
+
+    this.is = new IndexSearcher(in);
+    is.setSimilarity((sim != null) ? sim : new DefaultSimilarity());
+  }
+  
+  // too costly - pass everything at this stage
+  @Override
+  public boolean pruneTermsEnum(String field, TermsEnum te) throws IOException {
+    return false;
+  }
+  
+  @Override
+  public void initPositionsTerm(String field, TermsEnum in) throws IOException {
+    curThr = defThreshold;
+    String termKey = field + ":" + in.term().toString();
+    if (thresholds.containsKey(termKey)) {
+      curThr = thresholds.get(termKey);
+    } else if (thresholds.containsKey(field)) {
+      curThr = thresholds.get(field);
+    }
+    // calculate count
+    int df = in.docFreq();
+    int count = Math.round((float) df * curThr);
+    if (count < 100) count = 100;
+    TopScoreDocCollector collector = TopScoreDocCollector.create(count, true);
+    TermQuery tq = new TermQuery(new Term(field, in.term()));
+    is.search(tq, collector);
+    docs = collector.topDocs().scoreDocs;
+    Arrays.sort(docs, ByDocComparator.INSTANCE);
+    docsPos = 0;
+  }
+  
+  @Override
+  public boolean pruneAllPositions(DocsAndPositionsEnum termPositions, BytesRef t, String field) throws IOException {
+    // used up all doc id-s
+    if (termPositions.docID() == DocIdSetIterator.NO_MORE_DOCS) {
+      return true; // skip any remaining docs
+    }
+
+    // TODO [Greg Bowyer] - ReWrite this to use advance()
+    while ((docsPos < docs.length - 1) && termPositions.docID() > docs[docsPos].doc) {
+      docsPos++;
+    }
+
+    if (termPositions.docID() == docs[docsPos].doc) {
+      // pass
+      docsPos++; // move to next doc id
+      return false;
+    } else if (termPositions.docID() < docs[docsPos].doc) {
+      return true; // skip this one - it's less important
+    }
+    // should not happen!
+    throw new IOException("termPositions.doc > docs[docsPos].doc");
+  }
+  
+  // it probably doesn't make sense to prune term vectors using this method,
+  // due to its overhead
+  @Override
+  public Terms pruneTermVectorTerms(int docNumber, String field, Terms terms) throws IOException {
+    return terms;
+  }
+  
+  public static class ByDocComparator implements Comparator<ScoreDoc> {
+    public static final ByDocComparator INSTANCE = new ByDocComparator();
+    
+    public int compare(ScoreDoc o1, ScoreDoc o2) {
+      return o1.doc - o2.doc;
+    }
+  }
+
+}
diff --git a/lucene/pruning/src/java/org/apache/lucene/index/pruning/PruningPolicy.java b/lucene/pruning/src/java/org/apache/lucene/index/pruning/PruningPolicy.java
new file mode 100644
index 0000000..c956835
--- /dev/null
+++ b/lucene/pruning/src/java/org/apache/lucene/index/pruning/PruningPolicy.java
@@ -0,0 +1,34 @@
+package org.apache.lucene.index.pruning;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * General Definitions for Index Pruning, such as operations to be performed on field data.
+ */
+public class PruningPolicy {
+  /** Delete (some or all) postings for this field. */
+  public static final int DEL_POSTINGS = 0x01;
+  /** Delete (some or all) stored values for this field. */
+  public static final int DEL_STORED = 0x02;
+  /** Delete term frequency vectors for this field (whole vectors or individual terms). */
+  public static final int DEL_VECTOR = 0x04;
+  /** Delete (some or all) payloads in these fields. */
+  public static final int DEL_PAYLOADS = 0x08;
+  /** Delete all data for this field. */
+  public static final int DEL_ALL = 0xff;
+}
diff --git a/lucene/pruning/src/java/org/apache/lucene/index/pruning/PruningTool.java b/lucene/pruning/src/java/org/apache/lucene/index/pruning/PruningTool.java
new file mode 100644
index 0000000..a5c0b52
--- /dev/null
+++ b/lucene/pruning/src/java/org/apache/lucene/index/pruning/PruningTool.java
@@ -0,0 +1,202 @@
+package org.apache.lucene.index.pruning;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.CompositeReader;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.PruningAtomicReader;
+import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+
+/**
+ * A command-line tool to configure and run a {@link org.apache.lucene.index.PruningAtomicReader} on an input
+ * index and produce a pruned output index using
+ * {@link IndexWriter#addIndexes(IndexReader...)}.
+ */
+public class PruningTool {
+
+  public static void main(String[] args) throws Exception {
+    int res = run(args);
+    System.exit(res);
+  }
+  
+  public static int run(String[] args) throws Exception {
+    if (args.length < 5) {
+      System.err.println("Usage: PruningTool -impl (tf | carmel | carmeltopk | ridf) (-indexReader <path1> [-indexReader <path2> ...]) " +
+          "-out <outPath> -t <NN> [-del f1,f2,..] [-conf <file>] [-topkk <NN>] [-topke <NN>] [-topkr <NN>]");
+      System.err.println("\t-impl (tf | carmel | carmeltopk | ridf)\tTermPruningPolicy implementation name: TF or CarmelUniform or or CarmelTopK or RIDFTerm");
+      System.err.println("\t-indexReader path\tpath to the input index. Can specify multiple input indexes.");
+      System.err.println("\t-out path\toutput path where the output index will be stored.");
+      System.err.println("\t-t NN\tdefault threshold value (minimum indexReader-document frequency) for all terms");
+      System.err.println("\t-del f1,f2,..\tcomma-separated list of field specs to delete (postings, vectors & stored):");
+      System.err.println("\t\tfield spec : fieldName ( ':' [pPsv] )");
+      System.err.println("\t\twhere: p - postings, P - payloads, s - stored value, v - vectors");
+      System.err.println("\t-conf file\tpath to config file with per-term thresholds");
+      System.err.println("\t-topkk NN\t'K' for Carmel TopK Pruning: number of guaranteed top scores");
+      System.err.println("\t-topke NN\t'Epsilon' for Carmel TopK Pruning: largest meaningless score difference");
+      System.err.println("\t-topkr NN\t'R' for Carmel TopK Pruning: planned maximal number of terms indexReader a query on pruned index");
+      return -1;
+    }
+
+    List<CompositeReader> inputs = new ArrayList<CompositeReader>();
+    Directory out = null;
+    float thr = -1;
+    Map<String, Integer> delFields = new HashMap<String, Integer>();
+    
+    // parameters for top-K pruning 
+    int topkK = CarmelTopKTermPruningPolicy.DEFAULT_TOP_K;
+    float topkEpsilon = CarmelTopKTermPruningPolicy.DEFAULT_EPSILON;
+    int topkR = CarmelTopKTermPruningPolicy.DEFAULT_R;
+    
+    String impl = null;
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-indexReader")) {
+        Directory d = FSDirectory.open(new File(args[++i]));
+        if (!DirectoryReader.indexExists(d)) {
+          System.err.println("WARN: no index indexReader " + args[i] + ", skipping ...");
+        }
+        inputs.add(DirectoryReader.open(d));
+      } else if (args[i].equals("-out")) {
+        File outFile = new File(args[++i]);
+        if (outFile.exists()) {
+          throw new Exception("Output " + outFile + " already exists.");
+        }
+        //noinspection ResultOfMethodCallIgnored
+        outFile.mkdirs();
+        out = FSDirectory.open(outFile);
+      } else if (args[i].equals("-impl")) {
+        impl = args[++i];
+      } else if (args[i].equals("-t")) {
+        thr = Float.parseFloat(args[++i]);
+      } else if (args[i].equals("-topkk")) {
+        topkK = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-topke")) {
+        topkEpsilon = Float.parseFloat(args[++i]);
+      } else if (args[i].equals("-topkr")) {
+        topkR = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-del")) {
+        String[] fields = args[++i].split(",");
+        for (String f : fields) {
+          // parse field spec
+          String[] spec = f.split(":");
+          int opts = PruningPolicy.DEL_ALL;
+          if (spec.length > 0) {
+            opts = 0;
+            if (spec[1].indexOf('p') != -1) {
+              opts |= PruningPolicy.DEL_POSTINGS;
+            }
+            if (spec[1].indexOf('P') != -1) {
+              opts |= PruningPolicy.DEL_PAYLOADS;
+            }
+            if (spec[1].indexOf('s') != -1) {
+              opts |= PruningPolicy.DEL_STORED;
+            }
+            if (spec[1].indexOf('v') != -1) {
+              opts |= PruningPolicy.DEL_VECTOR;
+            }
+          }
+          delFields.put(spec[0], opts);
+        }
+      } else if (args[i].equals("-conf")) {
+        ++i;
+        System.err.println("WARN: -conf option not implemented yet.");
+      } else {
+        throw new Exception("Invalid argument: '" + args[i] + "'");
+      }
+    }
+    if (impl == null) {
+      throw new Exception("Must select algorithm implementation");
+    }
+    if (inputs.size() == 0) {
+      throw new Exception("At least one input index is required.");
+    }
+    if (out == null) {
+      throw new Exception("Output path is not set.");
+    }
+    if (thr == -1) {
+      throw new Exception("Threshold value is not set.");
+    }
+
+    /*
+    IndexReader indexReader;
+    if (inputs.size() == 1) {
+      indexReader = inputs.get(0);
+    } else {
+      indexReader = new MultiReader(inputs.toArray(new IndexReader[inputs.size()]), true);
+    }
+    */
+
+    for (CompositeReader reader : inputs) {
+      if (reader.hasDeletions()) {
+        System.err.println("WARN: input index(es) with deletions - document ID-s will NOT be preserved!");
+      }
+
+      IndexReader pruning = null;
+      StorePruningPolicy stp = null;
+
+      // TODO [Greg Bowyer] I dont see why we cant avoid this, but as a starter for ten
+      // While we transliterate the source code its a start
+      AtomicReader indexReader = SlowCompositeReaderWrapper.wrap(reader);
+
+      if (delFields.size() > 0) {
+        stp = new StorePruningPolicy(reader, delFields);
+      }
+
+      TermPruningPolicy tpp = null;
+
+      // TODO [Greg Bowyer] Can we make this a little nicer somehow ?
+      if (impl.equals("tf")) {
+        tpp = new TFTermPruningPolicy(indexReader, delFields, null, (int)thr);
+      } else if (impl.equals("carmel")) {
+        tpp = new CarmelUniformTermPruningPolicy(indexReader, delFields, null, thr, null);
+      } else if (impl.equals("carmeltopk")) {
+        tpp = new CarmelTopKTermPruningPolicy(indexReader, delFields, topkK, topkEpsilon, topkR, null);
+      } else if (impl.equals("ridf")) {
+        tpp = new RIDFTermPruningPolicy(indexReader, delFields, null, thr);
+      } else {
+        throw new Exception("Unknown algorithm: '" + impl + "'");
+      }
+
+      pruning = new PruningAtomicReader(indexReader, stp, tpp);
+      IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_50,
+              new WhitespaceAnalyzer(Version.LUCENE_50));
+      IndexWriter iw = new IndexWriter(out, cfg);
+      try {
+        iw.addIndexes(pruning);
+      } finally {
+        iw.close();
+      }
+    }
+
+    System.err.println("DONE.");
+    return 0;
+  }
+}
diff --git a/lucene/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java b/lucene/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java
new file mode 100644
index 0000000..464bebb
--- /dev/null
+++ b/lucene/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java
@@ -0,0 +1,109 @@
+package org.apache.lucene.index.pruning;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Implementation of {@link TermPruningPolicy} that uses "residual IDF"
+ * metric to determine the postings of terms to keep/remove, as defined in
+ * <a href="">http://www.dc.fi.udc.es/~barreiro/publications/blanco_barreiro_ecir2007.pdf</a>.
+ * <p>Residual IDF measures a difference between a collection-wide IDF of a term
+ * (which assumes a uniform distribution of occurrences) and the actual
+ * observed total number of occurrences of a term in all documents. Positive
+ * values indicate that a term is informative (e.g. for rare terms), negative
+ * values indicate that a term is not informative (e.g. too popular to offer
+ * good selectivity).
+ * <p>This metric produces small values close to [-1, 1], so useful ranges for
+ * thresholds under this metrics are somewhere between [0, 1]. The higher the
+ * threshold the more informative (and more rare) terms will be retained. For
+ * filtering of common words a value of close to or slightly below 0 (e.g. -0.1)
+ * should be a good starting point. 
+ * 
+ */
+public class RIDFTermPruningPolicy extends TermPruningPolicy {
+  double defThreshold;
+  Map<String, Double> thresholds;
+  double idf;
+  double maxDoc;
+  double ridf;
+
+  public RIDFTermPruningPolicy(AtomicReader in,
+          Map<String, Integer> fieldFlags, Map<String, Double> thresholds,
+          double defThreshold) {
+    super(in, fieldFlags);
+    this.defThreshold = defThreshold;
+    if (thresholds != null) {
+      this.thresholds = thresholds;
+    } else {
+      this.thresholds = Collections.emptyMap();
+    }
+    maxDoc = in.maxDoc();
+  }
+
+  @Override
+  public void initPositionsTerm(String field, TermsEnum in) throws IOException {
+    // from formula [2], not the formula [1]
+    //
+    idf = - Math.log((double)in.docFreq() / maxDoc);
+    // calculate total number of occurrences
+    long totalFreq = in.totalTermFreq();
+
+    /* ORIGINAL 3.x code
+    while (tp.next()) {
+      totalFreq += tp.freq();
+    }
+    // reposition the enum
+    tp.seek(t);
+    */
+
+    // rest of the formula [2] in the paper
+    ridf = idf + Math.log(1 - Math.pow(Math.E, - totalFreq / maxDoc));
+  }
+
+  @Override
+  public boolean pruneTermsEnum(String field, TermsEnum te) throws IOException {
+    return false;
+  }
+
+  @Override
+  public boolean pruneAllPositions(DocsAndPositionsEnum termPositions, BytesRef t, String field) throws IOException {
+    double thr = defThreshold;
+    String key = field + ":" + t.toString();
+    if (thresholds.containsKey(key)) {
+      thr = thresholds.get(key);
+    } else if (thresholds.containsKey(field)) {
+      thr = thresholds.get(field);
+    }
+    return ridf <= thr;
+  }
+
+  @Override
+  public Terms pruneTermVectorTerms(int docNumber, String field, Terms terms) throws IOException {
+    return terms;
+  }
+
+}
diff --git a/lucene/pruning/src/java/org/apache/lucene/index/pruning/StorePruningPolicy.java b/lucene/pruning/src/java/org/apache/lucene/index/pruning/StorePruningPolicy.java
new file mode 100644
index 0000000..c87619d
--- /dev/null
+++ b/lucene/pruning/src/java/org/apache/lucene/index/pruning/StorePruningPolicy.java
@@ -0,0 +1,143 @@
+package org.apache.lucene.index.pruning;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+import java.util.logging.Logger;
+
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.StoredFieldVisitor;
+
+/**
+ * Pruning policy for removing stored fields from documents.
+ */
+public class StorePruningPolicy extends PruningPolicy {
+  
+  private static final Logger LOG = Logger.getLogger(StorePruningPolicy.class.getName());
+  
+  /** Pruning in effect for each field */ 
+  protected Map<String,Integer> fieldFlags;
+  
+  /** Fields to be completely deleted */
+  protected Set<String> deleteAll;
+  
+  protected DelFieldSelector fs;
+  protected IndexReader in;
+  protected int delFields; // total number of fields deleted
+  
+  /**
+   * Constructs a policy.
+   * @param in input reader.
+   * @param fieldFlags a map where keys are field names, and flags are
+   * bitwise-OR values of flags defined in {@link PruningPolicy}.
+   */
+  public StorePruningPolicy(IndexReader in, Map<String,Integer> fieldFlags) {
+    if (fieldFlags != null) {
+      this.fieldFlags = fieldFlags;
+      deleteAll = new HashSet<String>();
+      for (Entry<String,Integer> e : fieldFlags.entrySet()) {
+        if (e.getValue() == PruningPolicy.DEL_ALL) {
+          deleteAll.add(e.getKey());
+        }
+      }
+    } else {
+      this.fieldFlags = Collections.emptyMap();
+      deleteAll = Collections.emptySet();
+    }
+    fs = new DelFieldSelector(fieldFlags);
+    this.in = in;
+  }
+  
+  /**
+   * Compute field infos that should be retained
+   * @param allInfos original field infos 
+   * @return those of the original field infos which should not be removed.
+   */
+  public FieldInfos getFieldInfos(FieldInfos allInfos) {
+    // for simplicity remove only fields with DEL_ALL
+    List<FieldInfo> res = new ArrayList<FieldInfo>(allInfos.size());
+    for (FieldInfo fi: allInfos) {
+      if (!deleteAll.contains(fi.name)) {
+        res.add(fi);
+      }
+    }
+
+    FieldInfo[] infos = new FieldInfo[res.size()];
+    res.toArray(infos);
+    return new FieldInfos(infos);
+  }
+  
+  /**
+   * Prune stored fields of a document. Note that you can also arbitrarily
+   * change values of the retrieved fields, so long as the field names belong
+   * to a list of fields returned from {@link #getFieldInfos(FieldInfos)}.
+   * @param doc document number
+   * @param visitor original field selector that limits what fields will be
+   * retrieved.
+   * @throws IOException
+   */
+  public void pruneDocument(int doc, StoredFieldVisitor visitor) throws IOException {
+    if (fieldFlags.isEmpty()) {
+      in.document(doc, visitor);
+    } else {
+      fs.setParent(visitor);
+      in.document(doc, fs);
+    }    
+  }
+  
+  class DelFieldSelector extends StoredFieldVisitor {
+    private static final long serialVersionUID = -4913592063491685103L;
+    private StoredFieldVisitor parent;
+    private Map<String, Integer> remove;
+    
+    public DelFieldSelector(Map<String,Integer> remove) {
+      this.remove = remove;
+    }
+    
+    public void setParent(StoredFieldVisitor parent) {
+      this.parent = parent;
+    }
+    
+    @Override
+    public Status needsField(FieldInfo fieldInfo) throws IOException {
+      String fieldName = fieldInfo.name;
+      if (!remove.isEmpty() && remove.containsKey(fieldName) && ((remove.get(fieldName) & DEL_STORED) > 0)) {
+        delFields++;
+        if (delFields % 10000 == 0) {
+          LOG.info(" - stored fields: removed " + delFields + " fields.");
+        }
+        return Status.NO;
+      } else if (parent != null) {
+        return parent.needsField(fieldInfo);
+      } else {
+        return Status.YES;
+      }
+
+    }
+
+  };
+
+}
diff --git a/lucene/pruning/src/java/org/apache/lucene/index/pruning/TFTermPruningPolicy.java b/lucene/pruning/src/java/org/apache/lucene/index/pruning/TFTermPruningPolicy.java
new file mode 100644
index 0000000..b6d4406
--- /dev/null
+++ b/lucene/pruning/src/java/org/apache/lucene/index/pruning/TFTermPruningPolicy.java
@@ -0,0 +1,134 @@
+package org.apache.lucene.index.pruning;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.PruningAtomicReader;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Policy for producing smaller index out of an input index, by removing postings data
+ * for those terms where their in-document frequency is below a specified
+ * threshold. 
+ * <p>
+ * Larger threshold value will produce a smaller index.
+ * See {@link TermPruningPolicy} for size vs performance considerations.
+ * <p>
+ * This implementation uses simple term frequency thresholds to remove all postings
+ * from documents where a given term occurs rarely (i.e. its TF in a document
+ * is smaller than the threshold).
+ * <p>
+ * Threshold values in this method are expressed as absolute term frequencies.
+ */
+public class TFTermPruningPolicy extends TermPruningPolicy {
+  protected Map<String,Integer> thresholds;
+  protected int defThreshold;
+  protected int curThr;
+
+  public TFTermPruningPolicy(AtomicReader in, Map<String,Integer> fieldFlags,
+          Map<String,Integer> thresholds, int defThreshold) {
+    super(in, fieldFlags);
+    this.defThreshold = defThreshold;
+    if (thresholds != null) {
+      this.thresholds = thresholds;
+    } else {
+      this.thresholds = Collections.emptyMap();
+    }
+  }
+
+  @Override
+  public boolean pruneTermsEnum(String field, TermsEnum termsEnum) throws IOException {
+    // check that at least one doc exceeds threshold
+    int thr = defThreshold;
+
+    // TODO [Greg Bowyer] I dont like this, maybe a map of maps ?
+    String termKey = field + ":" + termsEnum.term().toString();
+
+    if (thresholds.containsKey(termKey)) {
+      thr = thresholds.get(termKey);
+    } else if (thresholds.containsKey(field)) {
+      thr = thresholds.get(field);
+    }
+    // END TODO
+
+    Bits liveDocs = in.getLiveDocs();
+    DocsEnum td = termsEnum.docs(liveDocs, null, DocsEnum.FLAG_FREQS);
+
+    boolean pass = false;
+    do {
+      if (td.freq() >= thr) {
+        pass = true;
+        break;
+      }
+    } while (td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+    return !pass;
+  }
+
+  @Override
+  public void initPositionsTerm(String field, TermsEnum in) throws IOException {
+    // set threshold for this field
+    curThr = defThreshold;
+    String termKey = field + ":" + in.term().toString();
+    if (thresholds.containsKey(termKey)) {
+      curThr = thresholds.get(termKey);
+    } else if (thresholds.containsKey(field)) {
+      curThr = thresholds.get(field);
+    }
+  }
+
+  @Override
+  public boolean pruneAllPositions(DocsAndPositionsEnum termPositions, BytesRef t, String field) throws IOException {
+    return termPositions.freq() < curThr;
+  }
+
+  @Override
+  public Terms pruneTermVectorTerms(final int docNumber, final String field, final Terms terms) throws IOException {
+    return new PruningAtomicReader.PruningTerms(this, field, terms);
+    /*
+    ORIGINAL 3.x code
+    int thr = defThreshold;
+    if (thresholds.containsKey(field)) {
+      thr = thresholds.get(field);
+    }
+    int removed = 0;
+    for (int i = 0; i < terms.length; i++) {
+      // check per-term thresholds
+      int termThr = thr;
+      String t = field + ":" + terms[i];
+      if (thresholds.containsKey(t)) {
+        termThr = thresholds.get(t);
+      }
+      if (freqs[i] < termThr) {
+        terms[i] = null;
+        removed++;
+      }      
+    }
+    return removed;
+    */
+  }
+
+}
diff --git a/lucene/pruning/src/java/org/apache/lucene/index/pruning/TermPruningPolicy.java b/lucene/pruning/src/java/org/apache/lucene/index/pruning/TermPruningPolicy.java
new file mode 100644
index 0000000..bc2d7f7
--- /dev/null
+++ b/lucene/pruning/src/java/org/apache/lucene/index/pruning/TermPruningPolicy.java
@@ -0,0 +1,176 @@
+package org.apache.lucene.index.pruning;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Policy for producing smaller index out of an input index, by examining its terms
+ * and removing from the index some or all of their data as follows:
+ * <ul>
+ * <li>all terms of a certain field - see {@link #pruneAllFieldPostings(String)}</li>
+ * <li>all data of a certain term - see {@link #pruneTermsEnum(String, TermsEnum)}</li>
+ * <li>all positions of a certain term in a certain document - see #pruneAllPositions(DocsAndPositionsEnum, Term)</li>
+ * <li>some positions of a certain term in a certain document - see #pruneSomePositions(int, int[], Term)</li>
+ * </ul>
+ * <p>
+ * The pruned, smaller index would, for many types of queries return nearly 
+ * identical top-N results as compared with the original index, but with increased performance.
+ * <p>
+ * Pruning of indexes is handy for producing small first-tier indexes that fit
+ * completely in RAM, and store these indexes using {@link IndexWriter#addIndexes(IndexReader...)}
+ * <p>
+ * Interestingly, if the input index is optimized (i.e. doesn't contain deletions),
+ * then the index produced via {@link IndexWriter#addIndexes(IndexReader[])} will preserve internal document
+ * id-s so that they are in sync with the original index. This means that
+ * all other auxiliary information not necessary for first-tier processing, such
+ * as some stored fields, can also be removed, to be quickly retrieved on-demand
+ * from the original index using the same internal document id. See
+ * {@link StorePruningPolicy} for information about removing stored fields.
+ * <p>
+ * Please note that while this family of policies method produces good results for term queries it
+ * often leads to poor results for phrase queries (because postings are removed
+ * without considering whether they belong to an important phrase). 
+ * <p>
+ * Aggressive pruning policies produce smaller indexes - 
+ * search performance increases, and recall decreases (i.e. search quality
+ * deteriorates). 
+ * <p>
+ * See the following papers for a discussion of this problem and the
+ * proposed solutions to improve the quality of a pruned index (not implemented
+ * here):
+ * <small>
+ * <ul>
+ * <li><a href="http://portal.acm.org/citation.cfm?id=1148235">Pruned query
+ * evaluation using pre-computed impacts, V. Anh et al, ACM SIGIR 2006</a></li>
+ * <li><a href="http://portal.acm.org/citation.cfm?id=1183614.1183644"> A
+ * document-centric approach to static index pruning in text retrieval systems,
+ * S. Buettcher et al, ACM SIGIR 2006</a></li>
+ * <li><a href=" http://oak.cs.ucla.edu/~cho/papers/ntoulas-sigir07.pdf">
+ * Pruning Policies for Two-Tiered Inverted Index with Correctness Guarantee, A.
+ * Ntoulas et al, ACM SIGIR 2007.</a></li>
+ * </ul>
+ * </small>
+ */
+public abstract class TermPruningPolicy extends PruningPolicy {
+  /** Pruning operations to be conducted on fields. */
+  protected Map<String, Integer> fieldFlags;
+  protected AtomicReader in;
+  
+  /**
+   * Construct a policy.
+   * @param in input reader
+   * @param fieldFlags a map, where keys are field names and values
+   * are bitwise-OR flags of operations to be performed (see
+   * {@link PruningPolicy} for more details).
+   */
+  protected TermPruningPolicy(AtomicReader in, Map<String,Integer> fieldFlags) {
+    this.in = in;
+    if (fieldFlags != null) {
+      this.fieldFlags = fieldFlags;
+    } else {
+      this.fieldFlags = Collections.emptyMap();
+    }
+  }
+  
+  /**
+   * Term vector pruning.
+   * @param docNumber document number
+   * @param field field name
+   * @return true if the complete term vector for this field should be
+   * removed (as specified by {@link PruningPolicy#DEL_VECTOR} flag).
+   * @throws IOException
+   */
+  public boolean pruneWholeTermVector(int docNumber, String field) throws IOException {
+    return fieldFlags.containsKey(field) && (fieldFlags.get(field) & DEL_VECTOR) != 0;
+  }
+  
+  /**
+   * Pruning of all postings for a field
+   * @param field field name
+   * @return true if all postings for all terms in this field should be
+   * removed (as specified by {@link PruningPolicy#DEL_POSTINGS}).
+   * @throws IOException
+   */
+  public boolean pruneAllFieldPostings(String field) throws IOException {
+    return fieldFlags.containsKey(field) && (fieldFlags.get(field) & DEL_POSTINGS) != 0;
+  }
+  
+  /**
+   * Called when moving {@link DocsAndPositionsEnum} to a new {@link Term}.
+   * @param in input term enum
+   * @throws IOException
+   */
+  public abstract void initPositionsTerm(String field, TermsEnum in) throws IOException;
+
+  /**
+   * Called when checking for the presence of payload for the current
+   * term at a current position
+   * @param field the field name
+   * @return true if the payload should be removed, false otherwise.
+   */
+  public boolean prunePayload(String field) {
+    return fieldFlags.containsKey(field) && (fieldFlags.get(field) & DEL_PAYLOADS) != 0;
+  }
+
+  /**
+   * Pruning of individual terms in term vectors.
+   * @param docNumber document number
+   * @param field field name
+   * @param terms the fields terms
+   * @return The original terms if no terms are to be removed, otherwise a terms object that has the
+   * terms removed
+   * @throws IOException
+   */
+  public abstract Terms pruneTermVectorTerms(int docNumber, String field, Terms terms) throws IOException;
+
+  /**
+   * Pruning of all postings for a term (invoked once per term).
+   *
+   * @param field the field to prune
+   * @param te positioned term enum.
+   * @return true if all postings for this term should be removed, false
+   * otherwise.
+   * @throws IOException
+   */
+  public abstract boolean pruneTermsEnum(String field, TermsEnum te) throws IOException;
+
+  /**
+   * Prune <b>all</b> postings per term (invoked once per term per doc)
+   *
+   * @param termPositions positioned term positions. Implementations MUST NOT
+   * advance this by calling {@link org.apache.lucene.index.DocsAndPositionsEnum} methods that advance either
+   * the position pointer (next, skipTo) or term pointer (seek).
+   * @param term current term
+   * @param field the current field
+   * @return true if the current posting should be removed, false otherwise.
+   * @throws IOException
+   */
+  public abstract boolean pruneAllPositions(DocsAndPositionsEnum termPositions, BytesRef term, String field) throws IOException;
+
+}
diff --git a/lucene/pruning/src/java/org/apache/lucene/index/pruning/package.html b/lucene/pruning/src/java/org/apache/lucene/index/pruning/package.html
new file mode 100644
index 0000000..252a8d4
--- /dev/null
+++ b/lucene/pruning/src/java/org/apache/lucene/index/pruning/package.html
@@ -0,0 +1,41 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<HTML>
+<HEAD>
+    <TITLE>Index Pruning</TITLE>
+</HEAD>
+<BODY>
+<DIV>
+Static Index Pruning Tools
+<p>
+This package provides a framework for pruning an existing index into 
+a smaller index while retaining visible search quality as much as possible.
+</p>
+<p>
+An index can be pruned in several levels:
+<ol>
+  <li>Remove stored data: see <a href="./StorePruningPolicy.html">StorePruningPolicy</a></li>
+  <li>Remove terms data: see <a href="./TermPruningPolicy.html">TermPruningPolicy</a></li>
+</ol>
+Pruning can be applied programmatically via a <a href="../PruningReader.html">PruningReader</a>
+or with the static tool <a href="./PruningTool.html">PruningTool</a>.
+</p>
+</DIV>
+<DIV>&nbsp;</DIV>
+</BODY>
+</HTML>
diff --git a/lucene/pruning/src/java/overview.html b/lucene/pruning/src/java/overview.html
new file mode 100644
index 0000000..dc4b450
--- /dev/null
+++ b/lucene/pruning/src/java/overview.html
@@ -0,0 +1,26 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+  <head>
+    <title>
+    Static Index Pruning Tools
+    </title>
+  </head>
+  <body>
+  Static Index Pruning Tools
+  </body>
+</html>
\ No newline at end of file
diff --git a/lucene/pruning/src/test/org/apache/lucene/index/TestPruningReader.java b/lucene/pruning/src/test/org/apache/lucene/index/TestPruningReader.java
new file mode 100644
index 0000000..beb9756
--- /dev/null
+++ b/lucene/pruning/src/test/org/apache/lucene/index/TestPruningReader.java
@@ -0,0 +1,364 @@
+package org.apache.lucene.index;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.pruning.CarmelTopKTermPruningPolicy;
+import org.apache.lucene.index.pruning.PruningPolicy;
+import org.apache.lucene.index.pruning.RIDFTermPruningPolicy;
+import org.apache.lucene.index.pruning.StorePruningPolicy;
+import org.apache.lucene.index.pruning.TFTermPruningPolicy;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+
+
+public class TestPruningReader extends LuceneTestCase {
+
+  // parameters for the Carmel-TopK-Pruning 
+  private static final int R = 1; //number of terms in the query
+  private static final int K = 2; // top K results
+  private static final float EPSILON = .001f; // error in score
+
+  RAMDirectory sourceDir = new RAMDirectory();
+
+  /** once computed base on how index is created, these are the full scores, i.e. before pruning */ 
+  private static Map<Term,ScoreDoc[]> fullScores = initFullScores(); 
+  private static Map<Term,ScoreDoc[]> prunedScores = initPrunedScores(); 
+  
+  private void assertTD(AtomicReader ir, Term t, int[] ids) throws Exception {
+    DocsAndPositionsEnum td = ir.termPositionsEnum(t);
+    assertNotNull(td);
+    int i = 0;
+    while(td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+      int doc = td.docID();
+      assertEquals(t + ", i=" + i, ids[i], doc);
+      i++;
+    }
+    assertEquals(ids.length, i);
+  }
+  
+  /**
+   * Scores of the full, unpruned index.
+   */
+  private static Map<Term, ScoreDoc[]> initFullScores() {
+    HashMap<Term, ScoreDoc[]> res = new HashMap<Term, ScoreDoc[]>();
+    Term t;
+    ScoreDoc sd[]; 
+    t = new Term("body","one");
+    sd = new ScoreDoc[] {
+        new ScoreDoc(4, 0.7154686450958252f),
+        new ScoreDoc(2, 0.5310977101325989f),
+        new ScoreDoc(3, 0.5310977101325989f),
+        new ScoreDoc(1, 0.4336394667625427f),
+        new ScoreDoc(0, 0.40883922576904297f)
+        };
+    res.put(t,sd);
+    t = new Term("body","two");
+    sd = new ScoreDoc[] {
+        new ScoreDoc(2, 0.6495190262794495f),
+        new ScoreDoc(1, 0.5303300619125366f),
+        new ScoreDoc(0, 0.5f),
+        new ScoreDoc(4, 0.4375f)
+    };
+    res.put(t,sd);
+    t = new Term("body","three");
+    sd = new ScoreDoc[] {
+        new ScoreDoc(3, 0.7944550514221191f),
+        new ScoreDoc(1, 0.6486698389053345f),
+        new ScoreDoc(0, 0.6115717887878418f)
+    };
+    res.put(t,sd);
+    t = new Term("test","one");
+    sd = new ScoreDoc[] {
+        new ScoreDoc(4, 0.8468888998031616f)
+    };
+    res.put(t,sd);
+    t = new Term("allthesame","allthesame"); 
+    sd = new ScoreDoc[] {
+        new ScoreDoc(0, 0.8176784515380859f),
+        new ScoreDoc(1, 0.8176784515380859f),
+        new ScoreDoc(2, 0.8176784515380859f),
+        new ScoreDoc(3, 0.8176784515380859f),
+        new ScoreDoc(4, 0.8176784515380859f)
+    };
+    res.put(t,sd);
+    return res;
+  }
+
+  /**
+   * Expected scores of the pruned index - with EPSILON=0.001, K=2, R=1 
+   */
+  private static Map<Term, ScoreDoc[]> initPrunedScores() {
+    HashMap<Term, ScoreDoc[]> res = new HashMap<Term, ScoreDoc[]>();
+    Term t;
+    ScoreDoc sd[]; 
+    t = new Term("body","one");
+    sd = new ScoreDoc[] {
+        new ScoreDoc(4, 0.74011815f),
+        new ScoreDoc(2, 0.54939526f),
+        new ScoreDoc(3, 0.54939526f),
+    };
+    res.put(t,sd);
+    t = new Term("body","two");
+    sd = new ScoreDoc[] {
+        new ScoreDoc(2, 0.7679404f),
+        new ScoreDoc(1, 0.62702066f),
+    };
+    res.put(t,sd);
+    t = new Term("body","three");
+    sd = new ScoreDoc[] {
+        new ScoreDoc(3, 0.7679404f),
+        new ScoreDoc(1, 0.62702066f),
+    };
+    res.put(t,sd);
+    t = new Term("test","one");
+    sd = new ScoreDoc[] {
+        new ScoreDoc(4, 2.9678855f)
+    };
+    res.put(t,sd);
+    t = new Term("allthesame","allthesame"); // must keep all because all are the same! 
+    sd = new ScoreDoc[] {
+        new ScoreDoc(0, 0.84584934f),
+        new ScoreDoc(1, 0.84584934f),
+        new ScoreDoc(2, 0.84584934f),
+        new ScoreDoc(3, 0.84584934f),
+        new ScoreDoc(4, 0.84584934f)
+    };
+    res.put(t,sd);
+    return res;
+  }
+
+  private void assertTDCount(AtomicReader ir, Term t, int count) throws Exception {
+    DocsAndPositionsEnum td = ir.termPositionsEnum(t);
+    assertNotNull(td);
+    int i = 0;
+    while (td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) i++;
+    assertEquals(t.toString(), count, i);
+  }
+  
+  public void setUp() throws Exception {
+    super.setUp();
+
+    FieldType storedNotIndexed = new FieldType(TextField.TYPE_STORED);
+    storedNotIndexed.setIndexed(false);
+
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
+    IndexWriter iw = new IndexWriter(sourceDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+    Document doc = new Document();
+    doc.add(new Field("body", "one two three four", TextField.TYPE_STORED));
+    doc.add(new Field("id", "0", storedNotIndexed));
+    doc.add(new Field("allthesame", "allthesame", TextField.TYPE_STORED));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one two three one two three", TextField.TYPE_STORED));
+    doc.add(new Field("id", "1", storedNotIndexed));
+    doc.add(new Field("allthesame", "allthesame", TextField.TYPE_STORED));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one two one two one two", TextField.TYPE_STORED));
+    doc.add(new Field("id", "2", storedNotIndexed));
+    doc.add(new Field("allthesame", "allthesame", TextField.TYPE_STORED));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one three one three one three", TextField.TYPE_STORED));
+    doc.add(new Field("id", "3", storedNotIndexed));
+    doc.add(new Field("allthesame", "allthesame", TextField.TYPE_STORED));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one one one one two", TextField.TYPE_STORED));
+    doc.add(new Field("test", "one two one two three three three four", TextField.TYPE_STORED));
+    doc.add(new Field("id", "4", storedNotIndexed));
+    doc.add(new Field("allthesame", "allthesame", TextField.TYPE_STORED));
+    iw.addDocument(doc);
+    iw.close();
+  }
+  
+  public void testRIDFPruning() throws Exception {
+    RAMDirectory targetDir = new RAMDirectory();
+    CompositeReader topLevelReader = DirectoryReader.open(sourceDir);
+    AtomicReader in = SlowCompositeReaderWrapper.wrap(topLevelReader);
+    // remove only very popular terms
+    RIDFTermPruningPolicy ridf = new RIDFTermPruningPolicy(in, null, null, -0.12);
+    PruningAtomicReader tfr = new PruningAtomicReader(in, null, ridf);
+    assertTDCount(tfr, new Term("body", "one"), 0);
+    assertTD(tfr, new Term("body", "two"), new int[]{0, 1, 2, 4});
+    assertTD(tfr, new Term("body", "three"), new int[]{0, 1, 3});
+    assertTD(tfr, new Term("test", "one"), new int[]{4});
+    assertTD(tfr, new Term("body", "four"), new int[]{0});
+    assertTD(tfr, new Term("test", "four"), new int[]{4});
+    in.close();
+    topLevelReader.close();
+  }
+
+  public void testTfPruning() throws Exception {
+    RAMDirectory targetDir = new RAMDirectory();
+    CompositeReader topLevelReader = DirectoryReader.open(sourceDir);
+    AtomicReader in = SlowCompositeReaderWrapper.wrap(topLevelReader);
+    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, null, null, 2);
+    PruningAtomicReader tfr = new PruningAtomicReader(in, null, tfp);
+    // verify
+    assertTD(tfr, new Term("body", "one"), new int[]{1, 2, 3, 4});
+    assertTD(tfr, new Term("body", "two"), new int[]{1, 2});
+    assertTD(tfr, new Term("body", "three"), new int[]{1, 3});
+    assertTD(tfr, new Term("test", "one"), new int[]{4});
+    assertTDCount(tfr, new Term("body", "four"), 0);
+    assertTDCount(tfr, new Term("test", "four"), 0);
+    // verify new reader
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
+    IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+    iw.addIndexes(new AtomicReader[]{tfr});
+    iw.close();
+    DirectoryReader newIndex = DirectoryReader.open(targetDir);
+    AtomicReader ir = SlowCompositeReaderWrapper.wrap(newIndex);
+    assertTD(ir, new Term("body", "one"), new int[]{1, 2, 3, 4});
+    assertTD(ir, new Term("body", "two"), new int[]{1, 2});
+    assertTD(ir, new Term("body", "three"), new int[]{1, 3});
+    assertTD(ir, new Term("test", "one"), new int[]{4});
+    tfr.close();
+    ir.close();
+    newIndex.close();
+    topLevelReader.close();
+  }
+  
+  public void testCarmelTopKPruning() throws Exception {
+    CompositeReader topLevelReader = DirectoryReader.open(sourceDir);
+    AtomicReader in = SlowCompositeReaderWrapper.wrap(topLevelReader);
+    // validate full scores - without pruning, just to make sure we test the right thing
+    validateDocScores(fullScores, in, false, false); // validate both docs and scores
+    // prune reader
+    CarmelTopKTermPruningPolicy tfp = new CarmelTopKTermPruningPolicy(in, null, K, EPSILON, R, null);
+    PruningAtomicReader tfr = new PruningAtomicReader(in, null, tfp);
+    
+    // create the pruned index
+    RAMDirectory targetDir = new RAMDirectory();
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
+    IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+    iw.addIndexes(new AtomicReader[]{tfr});
+    iw.close();
+    in.close();
+
+    // validate scores of pruned index
+    AtomicReader ir = SlowCompositeReaderWrapper.wrap(topLevelReader);
+    validateDocScores(prunedScores, ir, false, true); // validated only docs (scores have changed after pruning)
+    ir.close();
+
+    topLevelReader.close();
+  }
+  
+  private void validateDocScores(Map<Term,ScoreDoc[]> baseScores, AtomicReader in, boolean print, boolean onlyDocs) throws IOException {
+    validateDocScores(baseScores, in, new Term("body", "one"), print, onlyDocs);
+    validateDocScores(baseScores, in, new Term("body", "two"), print, onlyDocs);
+    validateDocScores(baseScores, in, new Term("body", "three"), print, onlyDocs);
+    validateDocScores(baseScores, in, new Term("test", "one"), print, onlyDocs);
+    validateDocScores(baseScores, in, new Term("allthesame", "allthesame"), print, onlyDocs);
+  }
+  
+  /** validate the doc-scores, optionally also print them */
+  private void validateDocScores(Map<Term,ScoreDoc[]> baseScores, AtomicReader in, Term term, boolean print, boolean onlyDocs) throws IOException {
+    if (print) {
+      printDocScores(baseScores, in, term);
+    }
+    float delta = .0001f;
+    IndexSearcher is = new IndexSearcher(in);
+    TermQuery q = new TermQuery(term);
+    ScoreDoc[] sd = is.search(q, 100).scoreDocs;
+    assertNotNull("unknown result for term: "+term, baseScores.get(term));
+    assertEquals("wrong number of results!", baseScores.get(term).length, sd.length);
+    for (int i = 0; i < sd.length; i++) {
+      assertEquals("wrong doc!", baseScores.get(term)[i].doc, sd[i].doc);
+      if (!onlyDocs) {
+        assertEquals("wrong score!", baseScores.get(term)[i].score, sd[i].score, delta);
+      }
+    }
+  }
+
+  /** Print the doc scores (in a code format */
+  private void printDocScores(Map<Term,ScoreDoc[]> baseScores, AtomicReader in, Term term) throws IOException {
+    IndexSearcher is = new IndexSearcher(in);
+    TermQuery q = new TermQuery(term);
+    ScoreDoc[] scoreDocs = is.search(q, 100).scoreDocs;
+    System.out.println("t = new Term(\""+term.field+"\",\""+term.text()+"\");");
+    System.out.println("sd = new ScoreDoc[] {");
+    for (ScoreDoc sd : scoreDocs) {
+      System.out.println("    new ScoreDoc("+sd.doc+", "+sd.score+"f),");
+    }
+    System.out.println("res.put(t,sd);");
+  }
+
+  public void testThresholds() throws Exception {
+    Map<String, Integer> thresholds = new HashMap<String, Integer>();
+    thresholds.put("test", 3);
+    CompositeReader topLevelReader = DirectoryReader.open(sourceDir);
+    AtomicReader in = SlowCompositeReaderWrapper.wrap(topLevelReader);
+    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, null, thresholds, 2);
+    PruningAtomicReader tfr = new PruningAtomicReader(in, null, tfp);
+    assertTDCount(tfr, new Term("test", "one"), 0);
+    assertTDCount(tfr, new Term("test", "two"), 0);
+    assertTD(tfr, new Term("test", "three"), new int[]{4});
+    assertTDCount(tfr, new Term("test", "four"), 0);
+    in.close();
+    topLevelReader.close();
+  }
+  
+  public void testRemoveFields() throws Exception {
+    throw new AssertionError("Rewrite this test !");
+    /*
+    RAMDirectory targetDir = new RAMDirectory();
+    Map<String, Integer> removeFields = new HashMap<String, Integer>();
+    removeFields.put("test", PruningPolicy.DEL_POSTINGS | PruningPolicy.DEL_STORED);
+    CompositeReader topLevelReader = DirectoryReader.open(sourceDir);
+    AtomicReader in = SlowCompositeReaderWrapper.wrap(topLevelReader);
+    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, removeFields, null, 2);
+    StorePruningPolicy stp = new StorePruningPolicy(in, removeFields);
+    PruningAtomicReader tfr = new PruningAtomicReader(in, stp, tfp);
+    StoredDocument doc = tfr.document(4);
+    // removed stored values?
+    assertNull(doc.get("test"));
+    // removed postings ?
+    Terms terms = tfr.fields().terms("test");
+    assertEquals(terms.size(), 0);
+
+    // but vectors should be present !
+    TermFreqVector tv = tfr.getTermFreqVector(4, "test");
+    assertNotNull(tv);
+    assertEquals(4, tv.getTerms().length); // term "four" not deleted yet from TermEnum
+    // verify new reader
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
+    IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+    iw.addIndexes(new AtomicReader[]{tfr});
+    iw.close();
+    AtomicReader ir = AtomicReader.open(targetDir, true);
+    tv = ir.getTermFreqVector(4, "test");
+    assertNotNull(tv);
+    assertEquals(3, tv.getTerms().length); // term "four" was deleted from TermEnum
+    */
+  }
+
+}
-- 
1.8.1.5

