Index: lucene/build.xml
===================================================================
--- lucene/build.xml (revision 1237823)
+++ lucene/build.xml (working copy)
@@ -265,6 +265,7 @@
+
Index: lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java
===================================================================
--- lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java (revision 0)
+++ lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java (working copy)
@@ -0,0 +1,343 @@
+package org.apache.lucene.index;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.PruningReader;
+import org.apache.lucene.index.pruning.CarmelTopKTermPruningPolicy;
+import org.apache.lucene.index.pruning.PruningPolicy;
+import org.apache.lucene.index.pruning.StorePruningPolicy;
+import org.apache.lucene.index.pruning.TFTermPruningPolicy;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+
+
+public class TestPruningReader extends LuceneTestCase {
+
+ // parameters for the Carmel-TopK-Pruning
+ private static final int R = 1; //number of terms in the query
+ private static final int K = 2; // top K results
+ private static final float EPSILON = .001f; // error in score
+
+ RAMDirectory sourceDir = new RAMDirectory();
+
+ /** once computed base on how index is created, these are the full scores, i.e. before pruning */
+ private static Map fullScores = initFullScores();
+ private static Map prunedScores = initPrunedScores();
+
+  /**
+   * Asserts that the postings of term {@code t} in reader {@code ir}
+   * enumerate exactly the doc ids listed in {@code ids}, in order.
+   */
+  private void assertTD(IndexReader ir, Term t, int[] ids) throws Exception {
+    TermPositions td = ir.termPositions(t);
+    assertNotNull(td);
+    try {
+      int i = 0;
+      while(td.next()) {
+        assertEquals(t + ", i=" + i, ids[i], td.doc());
+        i++;
+      }
+      assertEquals(ids.length, i); // no extra postings beyond the expected ones
+    } finally {
+      td.close(); // always release the enumerator
+    }
+  }
+
+  /**
+   * Scores of the full, unpruned index, keyed by query term and mapped to the
+   * expected hits in score order. Once computed (based on how the index is
+   * created in {@code setUp()}); the literal format below matches the output
+   * of {@code printDocScores}, which was presumably used to capture them.
+   */
+  private static Map initFullScores() {
+    HashMap res = new HashMap();
+    Term t;
+    ScoreDoc sd[];
+    t = new Term("body","one");
+    sd = new ScoreDoc[] {
+      new ScoreDoc(4, 0.74011815f),
+      new ScoreDoc(2, 0.54939526f),
+      new ScoreDoc(3, 0.54939526f),
+      new ScoreDoc(1, 0.44857934f),
+      new ScoreDoc(0, 0.42292467f)
+    };
+    res.put(t,sd);
+    t = new Term("body","two");
+    sd = new ScoreDoc[] {
+      new ScoreDoc(2, 0.7679404f),
+      new ScoreDoc(1, 0.62702066f),
+      new ScoreDoc(0, 0.5911608f),
+      new ScoreDoc(4, 0.5172657f)
+    };
+    res.put(t,sd);
+    t = new Term("body","three");
+    sd = new ScoreDoc[] {
+      new ScoreDoc(3, 0.7679404f),
+      new ScoreDoc(1, 0.62702066f),
+      new ScoreDoc(0, 0.5911608f)
+    };
+    res.put(t,sd);
+    t = new Term("test","one");
+    sd = new ScoreDoc[] {
+      new ScoreDoc(4, 2.9678855f)
+    };
+    res.put(t,sd);
+    t = new Term("allthesame","allthesame");
+    sd = new ScoreDoc[] {
+      new ScoreDoc(0, 0.84584934f),
+      new ScoreDoc(1, 0.84584934f),
+      new ScoreDoc(2, 0.84584934f),
+      new ScoreDoc(3, 0.84584934f),
+      new ScoreDoc(4, 0.84584934f)
+    };
+    res.put(t,sd);
+    return res;
+  }
+
+  /**
+   * Expected scores of the pruned index - with EPSILON=0.001, K=2, R=1.
+   * Entries are a subset of {@code initFullScores}: for each term only the
+   * hits surviving Carmel-TopK pruning remain.
+   */
+  private static Map initPrunedScores() {
+    HashMap res = new HashMap();
+    Term t;
+    ScoreDoc sd[];
+    t = new Term("body","one");
+    sd = new ScoreDoc[] {
+      new ScoreDoc(4, 0.74011815f),
+      new ScoreDoc(2, 0.54939526f),
+      new ScoreDoc(3, 0.54939526f),
+    };
+    res.put(t,sd);
+    t = new Term("body","two");
+    sd = new ScoreDoc[] {
+      new ScoreDoc(2, 0.7679404f),
+      new ScoreDoc(1, 0.62702066f),
+    };
+    res.put(t,sd);
+    t = new Term("body","three");
+    sd = new ScoreDoc[] {
+      new ScoreDoc(3, 0.7679404f),
+      new ScoreDoc(1, 0.62702066f),
+    };
+    res.put(t,sd);
+    t = new Term("test","one");
+    sd = new ScoreDoc[] {
+      new ScoreDoc(4, 2.9678855f)
+    };
+    res.put(t,sd);
+    t = new Term("allthesame","allthesame"); // must keep all because all are the same!
+    sd = new ScoreDoc[] {
+      new ScoreDoc(0, 0.84584934f),
+      new ScoreDoc(1, 0.84584934f),
+      new ScoreDoc(2, 0.84584934f),
+      new ScoreDoc(3, 0.84584934f),
+      new ScoreDoc(4, 0.84584934f)
+    };
+    res.put(t,sd);
+    return res;
+  }
+
+  /** Asserts that term {@code t} has exactly {@code count} postings in {@code ir}. */
+  private void assertTDCount(IndexReader ir, Term t, int count) throws Exception {
+    TermPositions td = ir.termPositions(t);
+    assertNotNull(td);
+    try {
+      int i = 0;
+      while (td.next()) i++;
+      assertEquals(t.toString(), count, i);
+    } finally {
+      td.close(); // always release the enumerator
+    }
+  }
+
+  /**
+   * Builds the shared source index: five live docs (ids 0..4) with known term
+   * frequencies, plus one extra doc that is deleted right after indexing so
+   * pruning is also exercised against an index containing deletions.
+   */
+  public void setUp() throws Exception {
+    super.setUp();
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
+    IndexWriter iw = new IndexWriter(sourceDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+    Document doc = new Document();
+    doc.add(new Field("body", "one two three four", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "0", Field.Store.YES, Field.Index.NO));
+    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one two three one two three", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO));
+    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one two one two one two", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO));
+    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one three one three one three", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO));
+    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
+    iw.addDocument(doc);
+    doc = new Document();
+    doc.add(new Field("body", "one one one one two", Field.Store.YES, Field.Index.ANALYZED));
+    // only doc 4 has a "test" field, stored with term vectors - used by
+    // testThresholds and testRemoveFields
+    doc.add(new Field("test", "one two one two three three three four", Field.Store.YES, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO));
+    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
+    iw.addDocument(doc);
+    // to be deleted
+    doc = new Document();
+    doc.add(new Field("body", "one three one three one three five five five", Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO));
+    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
+    iw.addDocument(doc);
+    iw.close();
+    // delete doc 5 so the index has a deletion
+    IndexReader ir = IndexReader.open(sourceDir, false);
+    ir.deleteDocument(5);
+    ir.close();
+  }
+
+  /** Prunes all postings with in-document tf below 2 and verifies the
+   *  surviving postings after materializing the pruned reader into a new index. */
+  public void testTfPruning() throws Exception {
+    RAMDirectory targetDir = new RAMDirectory();
+    IndexReader in = IndexReader.open(sourceDir, true);
+    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, null, null, 2);
+    PruningReader tfr = new PruningReader(in, null, tfp);
+    // verify
+    // NOTE(review): direct checks against the pruning reader are disabled;
+    // pruning is verified below on the index materialized via addIndexes
+//    assertTD(tfr, new Term("body", "one"), new int[]{1, 2, 3, 4});
+//    assertTD(tfr, new Term("body", "two"), new int[]{1, 2});
+//    assertTD(tfr, new Term("body", "three"), new int[]{1, 3});
+//    assertTD(tfr, new Term("test", "one"), new int[]{4});
+//    assertTDCount(tfr, new Term("body", "four"), 0);
+//    assertTDCount(tfr, new Term("test", "four"), 0);
+    // verify new reader
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
+    IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+    iw.addIndexes(new IndexReader[]{tfr});
+    iw.close();
+    IndexReader ir = IndexReader.open(targetDir, true);
+    assertTD(ir, new Term("body", "one"), new int[]{1, 2, 3, 4});
+    assertTD(ir, new Term("body", "two"), new int[]{1, 2});
+    assertTD(ir, new Term("body", "three"), new int[]{1, 3});
+    assertTD(ir, new Term("test", "one"), new int[]{4});
+    tfr.close(); // closing the filter reader also releases the wrapped reader
+    ir.close();
+  }
+
+  /**
+   * Validates full scores first (baseline sanity check), then prunes with
+   * Carmel-TopK (K=2, EPSILON=.001, R=1) and checks the surviving doc ids.
+   */
+  public void testCarmelTopKPruning() throws Exception {
+    IndexReader in = IndexReader.open(sourceDir, true);
+    // validate full scores - without pruning, just to make sure we test the right thing
+    validateDocScores(fullScores, in, false, false); // validate both docs and scores
+    // prune reader
+    CarmelTopKTermPruningPolicy tfp = new CarmelTopKTermPruningPolicy(in, null, K, EPSILON, R, null);
+    PruningReader tfr = new PruningReader(in, null, tfp);
+
+    // create the pruned index
+    RAMDirectory targetDir = new RAMDirectory();
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
+    IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+    iw.addIndexes(new IndexReader[]{tfr});
+    iw.close();
+    // close the pruning reader (which releases the wrapped reader too) instead
+    // of closing only the inner reader and leaking tfr
+    tfr.close();
+
+    // validate scores of pruned index
+    IndexReader ir = IndexReader.open(targetDir, true);
+    validateDocScores(prunedScores, ir, false, true); // validated only docs (scores have changed after pruning)
+    ir.close();
+  }
+
+  /** Validates all five probe terms of the test index against the expected scores. */
+  private void validateDocScores(Map baseScores, IndexReader in, boolean print, boolean onlyDocs) throws IOException {
+    Term[] probes = {
+      new Term("body", "one"),
+      new Term("body", "two"),
+      new Term("body", "three"),
+      new Term("test", "one"),
+      new Term("allthesame", "allthesame")
+    };
+    for (Term probe : probes) {
+      validateDocScores(baseScores, in, probe, print, onlyDocs);
+    }
+  }
+
+  /**
+   * Validates the doc ids (and, unless {@code onlyDocs}, the scores) returned
+   * for {@code term} against the expected results in {@code baseScores};
+   * optionally also prints them in code format.
+   */
+  private void validateDocScores(Map baseScores, IndexReader in, Term term, boolean print, boolean onlyDocs) throws IOException {
+    if (print) {
+      printDocScores(baseScores, in, term);
+    }
+    float delta = .0001f; // tolerance for float score comparison
+    IndexSearcher is = new IndexSearcher(in);
+    TermQuery q = new TermQuery(term);
+    ScoreDoc[] sd = is.search(q, 100).scoreDocs;
+    // look up the expected results once (with an explicit cast for the raw
+    // map) instead of repeating the raw Map.get() in every assertion
+    ScoreDoc[] expected = (ScoreDoc[]) baseScores.get(term);
+    assertNotNull("unknown result for term: "+term, expected);
+    assertEquals("wrong number of results!", expected.length, sd.length);
+    for (int i = 0; i < sd.length; i++) {
+      assertEquals("wrong doc!", expected[i].doc, sd[i].doc);
+      if (!onlyDocs) {
+        assertEquals("wrong score!", expected[i].score, sd[i].score, delta);
+      }
+    }
+  }
+
+  /** Prints the doc scores for {@code term} in a code format that can be
+   *  pasted directly into {@code initFullScores}/{@code initPrunedScores}. */
+  private void printDocScores(Map baseScores, IndexReader in, Term term) throws IOException {
+    IndexSearcher is = new IndexSearcher(in);
+    TermQuery q = new TermQuery(term);
+    ScoreDoc[] scoreDocs = is.search(q, 100).scoreDocs;
+    System.out.println("t = new Term(\""+term.field+"\",\""+term.text+"\");");
+    System.out.println("sd = new ScoreDoc[] {");
+    for (ScoreDoc sd : scoreDocs) {
+      System.out.println("  new ScoreDoc("+sd.doc+", "+sd.score+"f),");
+    }
+    System.out.println("};"); // close the array initializer - was missing
+    System.out.println("res.put(t,sd);");
+  }
+
+  /** Verifies per-field tf thresholds: field "test" uses threshold 3 while
+   *  the default threshold is 2. */
+  public void testThresholds() throws Exception {
+    Map thresholds = new HashMap();
+    thresholds.put("test", 3);
+    IndexReader in = IndexReader.open(sourceDir, true);
+    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, null, thresholds, 2);
+    PruningReader tfr = new PruningReader(in, null, tfp);
+    try {
+      assertTDCount(tfr, new Term("test", "one"), 0);
+      assertTDCount(tfr, new Term("test", "two"), 0);
+      assertTD(tfr, new Term("test", "three"), new int[]{4});
+      assertTDCount(tfr, new Term("test", "four"), 0);
+    } finally {
+      // close the pruning reader (and the wrapped source reader) - was leaked
+      tfr.close();
+    }
+  }
+
+  /**
+   * Verifies that DEL_POSTINGS | DEL_STORED removes stored values and postings
+   * of field "test" while leaving its term vectors in place on the pruning
+   * reader, and that the materialized index reflects the pruned postings.
+   */
+  public void testRemoveFields() throws Exception {
+    RAMDirectory targetDir = new RAMDirectory();
+    Map removeFields = new HashMap();
+    removeFields.put("test", PruningPolicy.DEL_POSTINGS | PruningPolicy.DEL_STORED);
+    IndexReader in = IndexReader.open(sourceDir, true);
+    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, removeFields, null, 2);
+    StorePruningPolicy stp = new StorePruningPolicy(in, removeFields);
+    PruningReader tfr = new PruningReader(in, stp, tfp);
+    Document doc = tfr.document(4);
+    // removed stored values?
+    assertNull(doc.get("test"));
+    // removed postings ?
+    TermEnum te = tfr.terms();
+    while (te.next()) {
+      assertFalse("test".equals(te.term().field()));
+    }
+    // but vectors should be present !
+    TermFreqVector tv = tfr.getTermFreqVector(4, "test");
+    assertNotNull(tv);
+    assertEquals(4, tv.getTerms().length); // term "four" not deleted yet from TermEnum
+    // verify new reader
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
+    IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+    iw.addIndexes(new IndexReader[]{tfr});
+    iw.close();
+    tfr.close(); // close the pruning reader (and wrapped reader) - was leaked
+    IndexReader ir = IndexReader.open(targetDir, true);
+    tv = ir.getTermFreqVector(4, "test");
+    assertNotNull(tv);
+    assertEquals(3, tv.getTerms().length); // term "four" was deleted from TermEnum
+    ir.close(); // was leaked
+  }
+
+}
Property changes on: lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java (working copy)
@@ -0,0 +1,94 @@
+package org.apache.lucene.index.pruning;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+
+/**
+ * Implementation of {@link TermPruningPolicy} that uses "residual IDF"
+ * metric to determine the postings of terms to keep/remove. Residual
+ * IDF is a difference between a collection-wide IDF of a term and the
+ * observed in-document frequency of the term.
+ */
+public class RIDFTermPruningPolicy extends TermPruningPolicy {
+  /** Threshold used when no per-term / per-field threshold is configured. */
+  double defThreshold;
+  /** Optional overrides keyed by "field:text" (per-term) or "field" (per-field). */
+  Map<String, Double> thresholds;
+  /** log(docFreq / maxDoc) of the current term, set in initPositionsTerm. */
+  double df;
+  double maxDoc;
+
+  public RIDFTermPruningPolicy(IndexReader in,
+          Map fieldFlags, Map<String, Double> thresholds,
+          double defThreshold) {
+    super(in, fieldFlags);
+    this.defThreshold = defThreshold;
+    if (thresholds != null) {
+      this.thresholds = thresholds;
+    } else {
+      this.thresholds = Collections.emptyMap();
+    }
+    maxDoc = in.maxDoc();
+  }
+
+  @Override
+  public void initPositionsTerm(TermPositions tp, Term t) throws IOException {
+    // cache log of the term's collection-wide document-frequency ratio
+    // (i.e. the negated IDF) for use in pruneAllPositions
+    df = Math.log(in.docFreq(t) / maxDoc);
+  }
+
+  @Override
+  public boolean pruneTermEnum(TermEnum te) throws IOException {
+    return false; // terms themselves are never removed, only their postings
+  }
+
+  @Override
+  public boolean pruneAllPositions(TermPositions termPositions, Term t)
+          throws IOException {
+    // residual IDF = log(1 - e^(-tf/N)) - log(df/N). The exponent must be
+    // NEGATIVE: the original code used e^(+tf/N) > 1, making the argument of
+    // Math.log negative and ridf NaN, so every posting was silently pruned.
+    double ridf = Math.log(1 - Math.exp(-termPositions.freq() / maxDoc)) - df;
+    double thr = defThreshold;
+    String key = t.field() + ":" + t.text();
+    if (thresholds.containsKey(key)) {
+      thr = thresholds.get(key); // per-term override
+    } else if (thresholds.containsKey(t.field())) {
+      thr = thresholds.get(t.field()); // per-field override
+    }
+    // keep (return false) only postings whose residual IDF exceeds the threshold
+    return ridf <= thr;
+  }
+
+  @Override
+  public int pruneTermVectorTerms(int docNumber, String field, String[] terms,
+          int[] freqs, TermFreqVector v) throws IOException {
+    return 0; // term vectors are left intact by this policy
+  }
+
+  @Override
+  public int pruneSomePositions(int docNum, int[] positions, Term curTerm) {
+    return 0; // this policy either prunes all or none, so nothing to prune here
+  }
+
+}
Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/RIDFTermPruningPolicy.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/PruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/PruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/PruningPolicy.java (working copy)
@@ -0,0 +1,34 @@
+package org.apache.lucene.index.pruning;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * General definitions for index pruning: bit flags describing which parts of
+ * a field's data a pruning policy should remove. Flags may be combined with
+ * bitwise OR, e.g. {@code DEL_POSTINGS | DEL_STORED}.
+ */
+public class PruningPolicy {
+  /** Delete (some or all) postings for this field. */
+  public static final int DEL_POSTINGS = 0x01;
+  /** Delete (some or all) stored values for this field. */
+  public static final int DEL_STORED = 0x02;
+  /** Delete term frequency vectors for this field (whole vectors or individual terms). */
+  public static final int DEL_VECTOR = 0x04;
+  /** Delete (some or all) payloads in these fields. */
+  public static final int DEL_PAYLOADS = 0x08;
+  /** Delete all data for this field (all of the above flags set). */
+  public static final int DEL_ALL = 0xff;
+}
Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/PruningPolicy.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/CarmelTopKTermPruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/CarmelTopKTermPruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/CarmelTopKTermPruningPolicy.java (working copy)
@@ -0,0 +1,273 @@
+package org.apache.lucene.index.pruning;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopScoreDocCollector;
+
+/**
+ * Pruning policy with a parameterized search-quality guarantee: configuration
+ * of this policy allows to specify two parameters, k and epsilon, such that
+ * for any OR query with r terms, the score of each of the top
+ * k results in the original index is "practically the same" as
+ * the score of that document in the pruned index: the score difference does
+ * not exceed r * epsilon.
+ * The claim of this pruning technique is, quoting from the paper describing
+ * it: prune the index in such a way that a human
+ * "cannot distinguish the difference" between the results of a search engine
+ * whose index is pruned and one whose index is not pruned.
+ *
+ *
+ *
+ * For indexes with a large number of terms this policy might be too slow. In
+ * such situations, the uniform pruning approach in
+ * {@link CarmelUniformTermPruningPolicy} will be faster, though it might
+ * produce inferior search quality, as that policy does not pose a theoretical
+ * guarantee on resulted search quality.
+ *
+ * TODO implement also CarmelTermPruningDeltaTopPolicy
+ */
+public class CarmelTopKTermPruningPolicy extends TermPruningPolicy {
+
+ /**
+ * Default number of guaranteed top K scores
+ */
+ public static final int DEFAULT_TOP_K = 10;
+
+ /**
+ * Default number of query terms
+ */
+ public static final int DEFAULT_R = 1;
+
+ /**
+ * Default largest meaningless score difference
+ */
+ public static final float DEFAULT_EPSILON = .001f;
+
+ private int docsPos = 0;
+ private int k;
+ private ScoreDoc[] docs = null;
+ private IndexSearcher is;
+ private boolean noPruningForCurrentTerm;
+ private float scoreDelta;
+
+ /**
+ * Constructor with default parameters
+ *
+ * @see #DEFAULT_TOP_K
+ * @see #DEFAULT_EPSILON
+ * @see #DEFAULT_R
+ * @see DefaultSimilarity
+ * @see #CarmelTopKTermPruningPolicy(IndexReader, Map, int, float, int, Similarity)
+ */
+  public CarmelTopKTermPruningPolicy(IndexReader in,
+      Map fieldFlags) {
+    // delegate to the full constructor with the documented default parameters
+    this(in, fieldFlags, DEFAULT_TOP_K, DEFAULT_EPSILON, DEFAULT_R, null);
+  }
+
+ /**
+ * Constructor with specific settings
+ *
+ * @param in reader for original index
+ * @param k number of guaranteed top scores. Each top K results in the pruned
+ * index is either also an original top K result or its original
+ * score is indistinguishable from some original top K result.
+ * @param epsilon largest meaningless score difference Results whose scores
+ * difference is smaller or equal to epsilon are considered
+ * indistinguishable.
+ * @param r maximal number of terms in a query for which search quaility in
+ * pruned index is guaranteed
+ * @param sim similarity to use when selecting top docs fir each index term.
+ * When null, {@link DefaultSimilarity} is used.
+ */
+  public CarmelTopKTermPruningPolicy(IndexReader in,
+      Map fieldFlags, int k, float epsilon, int r,
+      Similarity sim) {
+    super(in, fieldFlags);
+    this.k = k;
+    is = new IndexSearcher(in);
+    is.setSimilarity(sim != null ? sim : new DefaultSimilarity());
+    // a score gap of at most epsilon per term, for queries of up to r terms
+    scoreDelta = epsilon * r;
+  }
+
+  // too costly - pass everything at this stage; the per-posting decision is
+  // made in pruneAllPositions instead
+  @Override
+  public boolean pruneTermEnum(TermEnum te) throws IOException {
+    return false;
+  }
+
+ @Override
+ public void initPositionsTerm(TermPositions tp, Term t) throws IOException {
+ // check if there's any point to prune this term
+ int df = in.docFreq(t);
+ noPruningForCurrentTerm = (df <= k);
+ if (noPruningForCurrentTerm) {
+ return;
+ }
+ // take more results (k2>k), attempting for sufficient results to avoid a
+ // second search
+ int k2 = Math.min(2 * k, k + 100); // for small k's 2*k will do, but for
+ // large ones (1000's) keep overhead
+ // smaller
+ k2 = Math.min(k2, df); // no more than the potential number of results
+ TopScoreDocCollector collector = TopScoreDocCollector.create(k2, true);
+ TermQuery tq = new TermQuery(t);
+ is.search(tq, collector);
+ docs = collector.topDocs().scoreDocs;
+ float threshold = docs[k - 1].score - scoreDelta;
+
+ int nLast = k2 - 1;
+ nLast = Math.min(nLast, docs.length - 1); // protect in case of deleted docs
+ if (docs[nLast].score < threshold) {
+ // this is the better/faster case - no need to go over docs again - we
+ // have top ones
+ int n = nLast;
+ while (docs[n - 1].score < threshold)
+ --n; // n == num-valid-docs == first-invalid-doc
+ ScoreDoc[] subset = new ScoreDoc[n];
+ System.arraycopy(docs, 0, subset, 0, n);
+ docs = subset;
+ // sort by doc but only after taking top scores
+ Arrays.sort(docs, ByDocComparator.INSTANCE);
+ } else {
+ // this is the worse case - must go over docs again
+ ThresholdCollector thresholdCollector = new ThresholdCollector(threshold);
+ is.search(tq, thresholdCollector);
+ docs = thresholdCollector.scoreDocs.toArray(new ScoreDoc[0]);
+ }
+ docsPos = 0;
+ }
+
+  @Override
+  public boolean pruneAllPositions(TermPositions termPositions, Term t)
+      throws IOException {
+    if (noPruningForCurrentTerm) {
+      return false; // df <= k: keep all postings of this term
+    }
+    if (docsPos >= docs.length) { // used up all doc id-s
+      return true; // skip any remaining docs
+    }
+    // docs[] is sorted by doc id; advance the cursor to the first kept doc
+    // that is >= the current posting's doc
+    while ((docsPos < docs.length - 1)
+        && termPositions.doc() > docs[docsPos].doc) {
+      docsPos++;
+    }
+    if (termPositions.doc() == docs[docsPos].doc) {
+      // pass
+      docsPos++; // move to next doc id
+      return false;
+    } else if (termPositions.doc() < docs[docsPos].doc) {
+      return true; // skip this one - it's less important
+    }
+    // should not happen! both the postings and docs[] arrive in increasing
+    // doc order, so the current doc cannot be beyond the last kept doc here
+    throw new IOException("termPositions.doc > docs[docsPos].doc");
+  }
+
+  // it probably doesn't make sense to prune term vectors using this method,
+  // due to its overhead
+  @Override
+  public int pruneTermVectorTerms(int docNumber, String field, String[] terms,
+      int[] freqs, TermFreqVector tfv) throws IOException {
+    return 0; // 0 = no term-vector terms pruned
+  }
+
+  /**
+   * Orders {@link ScoreDoc}s by ascending doc id. Typed as
+   * {@code Comparator<ScoreDoc>} so that this method actually implements
+   * {@link Comparator#compare}; with the raw type it would not compile.
+   * Doc ids are non-negative, so the subtraction cannot overflow.
+   */
+  public static class ByDocComparator implements Comparator<ScoreDoc> {
+    public static final ByDocComparator INSTANCE = new ByDocComparator();
+
+    public int compare(ScoreDoc o1, ScoreDoc o2) {
+      return o1.doc - o2.doc;
+    }
+  }
+
+  @Override
+  public int pruneSomePositions(int docNum, int[] positions, Term curTerm) {
+    return 0; // this policy either prunes all or none, so nothing to prune here
+  }
+
+ /**
+ * Collect all docs with score >= higher threshold
+ */
+  /**
+   * Collects every doc whose score is >= the given threshold, in increasing
+   * doc id order (out-of-order collection is rejected, so callers can rely
+   * on the resulting list being sorted by doc id).
+   */
+  private static class ThresholdCollector extends Collector {
+
+    // typed list instead of the raw ArrayList - avoids unchecked warnings
+    private ArrayList<ScoreDoc> scoreDocs = new ArrayList<ScoreDoc>();
+    private Scorer scorer;
+    private float threshold;
+    private int docBase;
+
+    public ThresholdCollector(float threshold) {
+      this.threshold = threshold;
+    }
+
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      return false; // callers rely on docs arriving in increasing id order
+    }
+
+    @Override
+    public void collect(int doc) throws IOException {
+      float score = scorer.score();
+      if (score >= threshold) {
+        // doc is segment-relative; rebase to the top-level doc id
+        scoreDocs.add(new ScoreDoc(docBase + doc, score));
+      }
+    }
+
+    @Override
+    public void setNextReader(IndexReader reader, int docBase)
+        throws IOException {
+      this.docBase = docBase;
+    }
+
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      this.scorer = scorer;
+    }
+
+  }
+}
Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/CarmelTopKTermPruningPolicy.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/package.html
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/package.html (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/package.html (working copy)
@@ -0,0 +1,33 @@
+
+
+
+
+ Index Pruning
+
+
+
+Static Index Pruning Tools
+
+This package provides a framework for pruning an existing index into
+a smaller index while retaining visible search quality as much as possible.
+
+
+
+
+
+
Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/package.html
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/CarmelUniformTermPruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/CarmelUniformTermPruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/CarmelUniformTermPruningPolicy.java (working copy)
@@ -0,0 +1,186 @@
+package org.apache.lucene.index.pruning;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopScoreDocCollector;
+
+/**
+ * Enhanced implementation of Carmel Uniform Pruning, in which postings of
+ * {@link TermPositions} whose in-document frequency is below a specified
+ * threshold are pruned.
+ *
+ * See {@link CarmelTopKTermPruningPolicy} for a link to the paper describing
+ * this policy.
+ *
+ *
+ * Conclusions of that paper indicate that it's best to compute per-term
+ * thresholds, as we do in {@link CarmelTopKTermPruningPolicy}. However for
+ * large indexes with a large number of terms that method might be too slow, and
+ * the (enhanced) uniform approach implemented here may well be faster, although
+ * it might produce inferior search quality.
+ *
+ * This implementation enhances the Carmel uniform pruning approach, as it
+ * allows to specify three levels of thresholds:
+ *
+ * <ul>
+ * <li>one default threshold - globally (for terms in all fields)</li>
+ * <li>threshold per field</li>
+ * <li>threshold per term</li>
+ * </ul>
+ *
+ *
+ *
+ * These thresholds are applied so that always the most specific one takes
+ * precedence: first a per-term threshold is used if present, then per-field
+ * threshold if present, and finally the default threshold.
+ *
+ * Thresholds are maintained in a map, keyed by either field names or terms in
+ * field:text format; the most specific matching entry takes precedence.
+ *
+ * Thresholds in this method of pruning are expressed as the percentage of the
+ * top-N scoring documents per term that are retained. The list of top-N
+ * documents is established by using a regular {@link IndexSearcher} and
+ * {@link Similarity} to run a simple {@link TermQuery}.
+ *
+ * Smaller threshold value will produce a smaller index. See
+ * {@link TermPruningPolicy} for size vs performance considerations.
+ *
+ * For indexes with a large number of terms this policy might be still too slow,
+ * since it issues a term query for each term in the index. In such situations,
+ * the term frequency pruning approach in {@link TFTermPruningPolicy} will be
+ * faster, though it might produce inferior search quality.
+ */
+public class CarmelUniformTermPruningPolicy extends TermPruningPolicy {
+ int docsPos = 0;
+ float curThr;
+ float defThreshold;
+ Map thresholds;
+ ScoreDoc[] docs = null;
+ IndexSearcher is;
+ Similarity sim;
+
+ public CarmelUniformTermPruningPolicy(IndexReader in,
+ Map fieldFlags, Map thresholds,
+ float defThreshold, Similarity sim) {
+ super(in, fieldFlags);
+ this.defThreshold = defThreshold;
+ if (thresholds != null) {
+ this.thresholds = thresholds;
+ } else {
+ this.thresholds = Collections.emptyMap();
+ }
+ if (sim != null) {
+ this.sim = sim;
+ } else {
+ this.sim = new DefaultSimilarity(); // assign the field, not the parameter
+ }
+ is = new IndexSearcher(in);
+ is.setSimilarity(this.sim); // use the field so the default is picked up too
+ }
+
+ // too costly - pass everything at this stage
+ @Override
+ public boolean pruneTermEnum(TermEnum te) throws IOException {
+ return false;
+ }
+
+ @Override
+ public void initPositionsTerm(TermPositions tp, Term t) throws IOException {
+ curThr = defThreshold;
+ String termKey = t.field() + ":" + t.text();
+ if (thresholds.containsKey(termKey)) {
+ curThr = thresholds.get(termKey);
+ } else if (thresholds.containsKey(t.field())) {
+ curThr = thresholds.get(t.field());
+ }
+ // calculate count
+ int df = in.docFreq(t);
+ int count = Math.round((float) df * curThr);
+ if (count < 100) count = 100;
+ TopScoreDocCollector collector = TopScoreDocCollector.create(count, true);
+ TermQuery tq = new TermQuery(t);
+ is.search(tq, collector);
+ docs = collector.topDocs().scoreDocs;
+ if (docs.length > count) {
+ // TODO deadcode: can topSDcollector(count) produce more than count
+ // results?
+ // take top subset *before* sorting by ID
+ ScoreDoc[] subset = new ScoreDoc[count];
+ System.arraycopy(docs, 0, subset, 0, count);
+ docs = subset;
+ }
+ Arrays.sort(docs, ByDocComparator.INSTANCE);
+ docsPos = 0;
+ }
+
+ @Override
+ public boolean pruneAllPositions(TermPositions termPositions, Term t)
+ throws IOException {
+ if (docsPos >= docs.length) { // used up all doc id-s
+ return true; // skip any remaining docs
+ }
+ while ((docsPos < docs.length - 1)
+ && termPositions.doc() > docs[docsPos].doc) {
+ docsPos++;
+ }
+ if (termPositions.doc() == docs[docsPos].doc) {
+ // pass
+ docsPos++; // move to next doc id
+ return false;
+ } else if (termPositions.doc() < docs[docsPos].doc) {
+ return true; // skip this one - it's less important
+ }
+ // doc id is beyond the last retained doc - it did not make the top-N cut
+ return true;
+ }
+
+ // it probably doesn't make sense to prune term vectors using this method,
+ // due to its overhead
+ @Override
+ public int pruneTermVectorTerms(int docNumber, String field, String[] terms,
+ int[] freqs, TermFreqVector tfv) throws IOException {
+ return 0;
+ }
+
+ public static class ByDocComparator implements Comparator {
+ public static final ByDocComparator INSTANCE = new ByDocComparator();
+
+ public int compare(ScoreDoc o1, ScoreDoc o2) {
+ return o1.doc - o2.doc;
+ }
+ }
+
+ @Override
+ public int pruneSomePositions(int docNum, int[] positions, Term curTerm) {
+ return 0; // this policy either prunes all or none, so nothing to prune here
+ }
+
+}
Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/CarmelUniformTermPruningPolicy.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/StorePruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/StorePruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/StorePruningPolicy.java (working copy)
@@ -0,0 +1,136 @@
+package org.apache.lucene.index.pruning;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+import java.util.logging.Logger;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexReader;
+
+/**
+ * Pruning policy for removing stored fields from documents.
+ */
+public class StorePruningPolicy extends PruningPolicy {
+
+ private static final Logger LOG = Logger.getLogger(StorePruningPolicy.class.getName());
+
+ /** Pruning in effect for each field */
+ protected Map fieldFlags;
+
+ /** Fields to be completely deleted */
+ protected Set deleteAll;
+
+ protected DelFieldSelector fs;
+ protected IndexReader in;
+ protected int delFields; // total number of fields deleted
+
+ /**
+ * Constructs a policy.
+ * @param in input reader.
+ * @param fieldFlags a map where keys are field names, and flags are
+ * bitwise-OR values of flags defined in {@link PruningPolicy}.
+ */
+ public StorePruningPolicy(IndexReader in, Map fieldFlags) {
+ if (fieldFlags != null) {
+ this.fieldFlags = fieldFlags;
+ deleteAll = new HashSet();
+ for (Entry e : fieldFlags.entrySet()) {
+ if (e.getValue() == PruningPolicy.DEL_ALL) {
+ deleteAll.add(e.getKey());
+ }
+ }
+ } else {
+ this.fieldFlags = Collections.emptyMap();
+ deleteAll = Collections.emptySet();
+ }
+ fs = new DelFieldSelector(this.fieldFlags); // never pass null - accept() would NPE
+ this.in = in;
+ }
+
+ /**
+ * Compute field infos that should be retained
+ * @param allInfos original field infos
+ * @return those of the original field infos which should not be removed.
+ */
+ public FieldInfos getFieldInfos(FieldInfos allInfos) {
+ // for simplicity remove only fields with DEL_ALL
+ FieldInfos res = new FieldInfos();
+ for (FieldInfo fi: allInfos) {
+ if (!deleteAll.contains(fi.name)) {
+ res.add(fi);
+ }
+ }
+ return res;
+ }
+
+ /**
+ * Prune stored fields of a document. Note that you can also arbitrarily
+ * change values of the retrieved fields, so long as the field names belong
+ * to a list of fields returned from {@link #getFieldInfos(FieldInfos)}.
+ * @param doc document number
+ * @param parent original field selector that limits what fields will be
+ * retrieved.
+ * @return a pruned instance of a Document.
+ * @throws IOException
+ */
+ public Document pruneDocument(int doc, FieldSelector parent) throws IOException {
+ if (fieldFlags.isEmpty()) {
+ return in.document(doc, parent);
+ } else {
+ fs.setParent(parent);
+ return in.document(doc, fs);
+ }
+ }
+
+ class DelFieldSelector implements FieldSelector {
+ private static final long serialVersionUID = -4913592063491685103L;
+ private FieldSelector parent;
+ private Map remove;
+
+ public DelFieldSelector(Map remove) {
+ this.remove = remove;
+ }
+
+ public void setParent(FieldSelector parent) {
+ this.parent = parent;
+ }
+
+ public FieldSelectorResult accept(String fieldName) {
+ if (!remove.isEmpty() && remove.containsKey(fieldName) &&
+ ((remove.get(fieldName) & DEL_STORED) > 0)) {
+ delFields++;
+ if (delFields % 10000 == 0) {
+ LOG.info(" - stored fields: removed " + delFields + " fields.");
+ }
+ return FieldSelectorResult.NO_LOAD;
+ } else if (parent != null) {
+ return parent.accept(fieldName);
+ } else return FieldSelectorResult.LOAD;
+ }
+ };
+
+}
Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/StorePruningPolicy.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/PruningTool.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/PruningTool.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/PruningTool.java (working copy)
@@ -0,0 +1,176 @@
+package org.apache.lucene.index.pruning;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.index.PruningReader;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+
+/**
+ * A command-line tool to configure and run a {@link PruningReader} on an input
+ * index and produce a pruned output index using
+ * {@link IndexWriter#addIndexes(IndexReader...)}.
+ */
+public class PruningTool {
+
+ public static void main(String[] args) throws Exception {
+ int res = run(args);
+ System.exit(res);
+ }
+
+ public static int run(String[] args) throws Exception {
+ if (args.length < 5) {
+ System.err.println("Usage: PruningTool -impl (tf | carmel | carmeltopk | ridf) (-in <path> [-in <path> ...]) " +
+ "-out <path> -t <NN> [-del f1,f2,..] [-conf <file>] [-topkk <NN>] [-topke <NN>] [-topkr <NN>]");
+ System.err.println("\t-impl (tf | carmel | carmeltopk | ridf)\tTermPruningPolicy implementation name: TF or CarmelUniform or CarmelTopK or RIDFTerm");
+ System.err.println("\t-in path\tpath to the input index. Can specify multiple input indexes.");
+ System.err.println("\t-out path\toutput path where the output index will be stored.");
+ System.err.println("\t-t NN\tdefault threshold value (minimum in-document frequency) for all terms");
+ System.err.println("\t-del f1,f2,..\tcomma-separated list of field specs to delete (postings, vectors & stored):");
+ System.err.println("\t\tfield spec : fieldName ( ':' [pPsv] )");
+ System.err.println("\t\twhere: p - postings, P - payloads, s - stored value, v - vectors");
+ System.err.println("\t-conf file\tpath to config file with per-term thresholds");
+ System.err.println("\t-topkk NN\t'K' for Carmel TopK Pruning: number of guaranteed top scores");
+ System.err.println("\t-topke NN\t'Epsilon' for Carmel TopK Pruning: largest meaningless score difference");
+ System.err.println("\t-topkr NN\t'R' for Carmel TopK Pruning: planned maximal number of terms in a query on pruned index");
+ return -1;
+ }
+ ArrayList<IndexReader> inputs = new ArrayList<IndexReader>();
+ Directory out = null;
+ float thr = -1;
+ Map<String, Integer> delFields = new HashMap<String, Integer>();
+
+ // parameters for top-K pruning
+ int topkK = CarmelTopKTermPruningPolicy.DEFAULT_TOP_K;
+ float topkEpsilon = CarmelTopKTermPruningPolicy.DEFAULT_EPSILON;
+ int topkR = CarmelTopKTermPruningPolicy.DEFAULT_R;
+
+ String impl = null;
+ for (int i = 0; i < args.length; i++) {
+ if (args[i].equals("-in")) {
+ Directory d = FSDirectory.open(new File(args[++i]));
+ if (IndexReader.indexExists(d)) {
+ inputs.add(IndexReader.open(d, true));
+ } else // really skip the missing index, as the warning promises
+ System.err.println("WARN: no index in " + args[i] + ", skipping ...");
+ } else if (args[i].equals("-out")) {
+ File outFile = new File(args[++i]);
+ if (outFile.exists()) {
+ throw new Exception("Output " + outFile + " already exists.");
+ }
+ outFile.mkdirs();
+ out = FSDirectory.open(outFile);
+ } else if (args[i].equals("-impl")) {
+ impl = args[++i];
+ } else if (args[i].equals("-t")) {
+ thr = Float.parseFloat(args[++i]);
+ } else if (args[i].equals("-topkk")) {
+ topkK = Integer.parseInt(args[++i]);
+ } else if (args[i].equals("-topke")) {
+ topkEpsilon = Float.parseFloat(args[++i]);
+ } else if (args[i].equals("-topkr")) {
+ topkR = Integer.parseInt(args[++i]);
+ } else if (args[i].equals("-del")) {
+ String[] fields = args[++i].split(",");
+ for (String f : fields) {
+ // parse field spec; a bare field name (no ':') means DEL_ALL
+ String[] spec = f.split(":");
+ int opts = PruningPolicy.DEL_ALL;
+ if (spec.length > 1) {
+ opts = 0;
+ if (spec[1].indexOf('p') != -1) {
+ opts |= PruningPolicy.DEL_POSTINGS;
+ }
+ if (spec[1].indexOf('P') != -1) {
+ opts |= PruningPolicy.DEL_PAYLOADS;
+ }
+ if (spec[1].indexOf('s') != -1) {
+ opts |= PruningPolicy.DEL_STORED;
+ }
+ if (spec[1].indexOf('v') != -1) {
+ opts |= PruningPolicy.DEL_VECTOR;
+ }
+ }
+ delFields.put(spec[0], opts);
+ }
+ } else if (args[i].equals("-conf")) {
+ ++i;
+ System.err.println("WARN: -conf option not implemented yet.");
+ } else {
+ throw new Exception("Invalid argument: '" + args[i] + "'");
+ }
+ }
+ if (impl == null) {
+ throw new Exception("Must select algorithm implementation");
+ }
+ if (inputs.size() == 0) {
+ throw new Exception("At least one input index is required.");
+ }
+ if (out == null) {
+ throw new Exception("Output path is not set.");
+ }
+ if (thr == -1) {
+ throw new Exception("Threshold value is not set.");
+ }
+ IndexReader in;
+ if (inputs.size() == 1) {
+ in = inputs.get(0);
+ } else {
+ in = new MultiReader(inputs.toArray(new IndexReader[inputs.size()]), true);
+ }
+ if (in.hasDeletions()) {
+ System.err.println("WARN: input index(es) with deletions - document ID-s will NOT be preserved!");
+ }
+ IndexReader pruning = null;
+ StorePruningPolicy stp = null;
+ if (delFields.size() > 0) {
+ stp = new StorePruningPolicy(in, delFields);
+ }
+ TermPruningPolicy tpp = null;
+ if (impl.equals("tf")) {
+ tpp = new TFTermPruningPolicy(in, delFields, null, (int)thr);
+ } else if (impl.equals("carmel")) {
+ tpp = new CarmelUniformTermPruningPolicy(in, delFields, null, thr, null);
+ } else if (impl.equals("carmeltopk")) {
+ tpp = new CarmelTopKTermPruningPolicy(in, delFields, topkK, topkEpsilon, topkR, null);
+ } else if (impl.equals("ridf")) {
+ tpp = new RIDFTermPruningPolicy(in, delFields, null, thr);
+ } else {
+ throw new Exception("Unknown algorithm: '" + impl + "'");
+ }
+ pruning = new PruningReader(in, stp, tpp);
+ IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_31,
+ new WhitespaceAnalyzer(Version.LUCENE_31));
+ IndexWriter iw = new IndexWriter(out, cfg);
+ iw.addIndexes(new IndexReader[]{pruning});
+ iw.close();
+ System.err.println("DONE.");
+ return 0;
+ }
+}
Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/PruningTool.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/TFTermPruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/TFTermPruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/TFTermPruningPolicy.java (working copy)
@@ -0,0 +1,133 @@
+package org.apache.lucene.index.pruning;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+
+/**
+ * Policy for producing smaller index out of an input index, by removing postings data
+ * for those terms where their in-document frequency is below a specified
+ * threshold.
+ *
+ * Larger threshold value will produce a smaller index.
+ * See {@link TermPruningPolicy} for size vs performance considerations.
+ *
+ * This implementation uses simple term frequency thresholds to remove all postings
+ * from documents where a given term occurs rarely (i.e. its TF in a document
+ * is smaller than the threshold).
+ *
+ * Threshold values in this method are expressed as absolute term frequencies.
+ */
+public class TFTermPruningPolicy extends TermPruningPolicy {
+ protected Map thresholds;
+ protected int defThreshold;
+ protected int curThr;
+
+ public TFTermPruningPolicy(IndexReader in, Map fieldFlags,
+ Map thresholds, int defThreshold) {
+ super(in, fieldFlags);
+ this.defThreshold = defThreshold;
+ if (thresholds != null) {
+ this.thresholds = thresholds;
+ } else {
+ this.thresholds = Collections.emptyMap();
+ }
+ }
+
+ @Override
+ public boolean pruneTermEnum(TermEnum te) throws IOException {
+ // check that at least one doc exceeds threshold
+ int thr = defThreshold;
+ String termKey = te.term().field() + ":" + te.term().text();
+ if (thresholds.containsKey(termKey)) {
+ thr = thresholds.get(termKey);
+ } else if (thresholds.containsKey(te.term().field())) {
+ thr = thresholds.get(te.term().field());
+ }
+ TermDocs td = in.termDocs(te.term());
+ boolean pass = false;
+ while (td.next()) { // must advance first - a fresh TermDocs is unpositioned
+ if (td.freq() >= thr) {
+ pass = true;
+ break;
+ }
+ }
+ td.close();
+ return !pass;
+ }
+
+ @Override
+ public void initPositionsTerm(TermPositions in, Term t) throws IOException {
+ // set threshold for this field
+ curThr = defThreshold;
+ String termKey = t.field() + ":" + t.text();
+ if (thresholds.containsKey(termKey)) {
+ curThr = thresholds.get(termKey);
+ } else if (thresholds.containsKey(t.field())) {
+ curThr = thresholds.get(t.field());
+ }
+ }
+
+ @Override
+ public boolean pruneAllPositions(TermPositions termPositions, Term t)
+ throws IOException {
+ if (termPositions.freq() < curThr) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public int pruneTermVectorTerms(int docNumber, String field, String[] terms,
+ int[] freqs, TermFreqVector tfv)
+ throws IOException {
+ int thr = defThreshold;
+ if (thresholds.containsKey(field)) {
+ thr = thresholds.get(field);
+ }
+ int removed = 0;
+ for (int i = 0; i < terms.length; i++) {
+ // check per-term thresholds
+ int termThr = thr;
+ String t = field + ":" + terms[i];
+ if (thresholds.containsKey(t)) {
+ termThr = thresholds.get(t);
+ }
+ if (freqs[i] < termThr) {
+ terms[i] = null;
+ removed++;
+ }
+ }
+ return removed;
+ }
+
+ @Override
+ public int pruneSomePositions(int docNum, int[] positions, Term curTerm) {
+ return 0; //this policy either prunes all or none, so nothing to prune here
+ }
+
+}
Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/TFTermPruningPolicy.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/TermPruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/TermPruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/pruning/TermPruningPolicy.java (working copy)
@@ -0,0 +1,206 @@
+package org.apache.lucene.index.pruning;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+
+/**
+ * Policy for producing smaller index out of an input index, by examining its terms
+ * and removing from the index some or all of their data as follows:
+ *
+ * <ul>
+ * <li>all terms of a certain field - see {@link #pruneAllFieldPostings(String)}</li>
+ * <li>all data of a certain term - see {@link #pruneTermEnum(TermEnum)}</li>
+ * <li>all positions of a certain term in a certain document - see
+ * #pruneAllPositions(TermPositions, Term)</li>
+ * <li>some positions of a certain term in a certain document - see
+ * #pruneSomePositions(int, int[], Term)</li>
+ * </ul>
+ *
+ *
+ * The pruned, smaller index would, for many types of queries return nearly
+ * identical top-N results as compared with the original index, but with increased performance.
+ *
+ * Pruning of indexes is handy for producing small first-tier indexes that fit
+ * completely in RAM, and store these indexes using {@link IndexWriter#addIndexes(IndexReader...)}
+ *
+ * Interestingly, if the input index is optimized (i.e. doesn't contain deletions),
+ * then the index produced via {@link IndexWriter#addIndexes(IndexReader[])} will preserve internal document
+ * id-s so that they are in sync with the original index. This means that
+ * all other auxiliary information not necessary for first-tier processing, such
+ * as some stored fields, can also be removed, to be quickly retrieved on-demand
+ * from the original index using the same internal document id. See
+ * {@link StorePruningPolicy} for information about removing stored fields.
+ *
+ * Please note that while this family of policies produces good results for term queries it
+ * often leads to poor results for phrase queries (because postings are removed
+ * without considering whether they belong to an important phrase).
+ *
+ * See the following papers for a discussion of this problem and the
+ * proposed solutions to improve the quality of a pruned index (not implemented
+ * here):
+ *
+ *