Index: lucene/build.xml =================================================================== --- lucene/build.xml (revision 1234831) +++ lucene/build.xml (working copy) @@ -265,6 +265,7 @@ + Index: lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java =================================================================== --- lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java (revision 0) +++ lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java (working copy) @@ -0,0 +1,342 @@ +package org.apache.lucene.index; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.PruningPolicy; +import org.apache.lucene.index.PruningReader; +import org.apache.lucene.index.StorePruningPolicy; +import org.apache.lucene.index.TFTermPruningPolicy; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; + + +public class TestPruningReader extends LuceneTestCase { + + // parameters for the Carmel-TopK-Pruning + private static final int R = 1; //number of terms in the query + private static final int K = 2; // top K results + private static final float EPSILON = .001f; // error in score + + RAMDirectory sourceDir = new RAMDirectory(); + + /** once computed based on how index is created, these are the full scores, i.e. before pruning */ + private static Map fullScores = initFullScores(); + private static Map prunedScores = initPrunedScores(); + + private void assertTD(IndexReader ir, Term t, int[] ids) throws Exception { + TermPositions td = ir.termPositions(t); + assertNotNull(td); + try { + int i = 0; + while(td.next()) { + assertEquals(t + ", i=" + i, ids[i], td.doc()); + i++; + } + assertEquals(ids.length, i); + } finally { + td.close(); + } + } + + /** + * Scores of the full, unpruned index. 
+ */ + private static Map initFullScores() { + HashMap res = new HashMap(); + Term t; + ScoreDoc sd[]; + t = new Term("body","one"); + sd = new ScoreDoc[] { + new ScoreDoc(4, 0.74011815f), + new ScoreDoc(2, 0.54939526f), + new ScoreDoc(3, 0.54939526f), + new ScoreDoc(1, 0.44857934f), + new ScoreDoc(0, 0.42292467f) + }; + res.put(t,sd); + t = new Term("body","two"); + sd = new ScoreDoc[] { + new ScoreDoc(2, 0.7679404f), + new ScoreDoc(1, 0.62702066f), + new ScoreDoc(0, 0.5911608f), + new ScoreDoc(4, 0.5172657f) + }; + res.put(t,sd); + t = new Term("body","three"); + sd = new ScoreDoc[] { + new ScoreDoc(3, 0.7679404f), + new ScoreDoc(1, 0.62702066f), + new ScoreDoc(0, 0.5911608f) + }; + res.put(t,sd); + t = new Term("test","one"); + sd = new ScoreDoc[] { + new ScoreDoc(4, 2.9678855f) + }; + res.put(t,sd); + t = new Term("allthesame","allthesame"); + sd = new ScoreDoc[] { + new ScoreDoc(0, 0.84584934f), + new ScoreDoc(1, 0.84584934f), + new ScoreDoc(2, 0.84584934f), + new ScoreDoc(3, 0.84584934f), + new ScoreDoc(4, 0.84584934f) + }; + res.put(t,sd); + return res; + } + + /** + * Expected scores of the pruned index - with EPSILON=0.001, K=2, R=1 + */ + private static Map initPrunedScores() { + HashMap res = new HashMap(); + Term t; + ScoreDoc sd[]; + t = new Term("body","one"); + sd = new ScoreDoc[] { + new ScoreDoc(4, 0.74011815f), + new ScoreDoc(2, 0.54939526f), + new ScoreDoc(3, 0.54939526f), + }; + res.put(t,sd); + t = new Term("body","two"); + sd = new ScoreDoc[] { + new ScoreDoc(2, 0.7679404f), + new ScoreDoc(1, 0.62702066f), + }; + res.put(t,sd); + t = new Term("body","three"); + sd = new ScoreDoc[] { + new ScoreDoc(3, 0.7679404f), + new ScoreDoc(1, 0.62702066f), + }; + res.put(t,sd); + t = new Term("test","one"); + sd = new ScoreDoc[] { + new ScoreDoc(4, 2.9678855f) + }; + res.put(t,sd); + t = new Term("allthesame","allthesame"); // must keep all because all are the same! 
+ sd = new ScoreDoc[] { + new ScoreDoc(0, 0.84584934f), + new ScoreDoc(1, 0.84584934f), + new ScoreDoc(2, 0.84584934f), + new ScoreDoc(3, 0.84584934f), + new ScoreDoc(4, 0.84584934f) + }; + res.put(t,sd); + return res; + } + + private void assertTDCount(IndexReader ir, Term t, int count) throws Exception { + TermPositions td = ir.termPositions(t); + assertNotNull(td); + try { + int i = 0; + while (td.next()) i++; + assertEquals(t.toString(), count, i); + } finally { + td.close(); + } + } + + public void setUp() throws Exception { + super.setUp(); + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT); + IndexWriter iw = new IndexWriter(sourceDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); + Document doc = new Document(); + doc.add(new Field("body", "one two three four", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "0", Field.Store.YES, Field.Index.NO)); + doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED)); + iw.addDocument(doc); + doc = new Document(); + doc.add(new Field("body", "one two three one two three", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO)); + doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED)); + iw.addDocument(doc); + doc = new Document(); + doc.add(new Field("body", "one two one two one two", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO)); + doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED)); + iw.addDocument(doc); + doc = new Document(); + doc.add(new Field("body", "one three one three one three", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO)); + doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED)); + iw.addDocument(doc); + doc = new Document(); + doc.add(new Field("body", "one one 
one one two", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("test", "one two one two three three three four", Field.Store.YES, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS_OFFSETS)); + doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO)); + doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED)); + iw.addDocument(doc); + // to be deleted + doc = new Document(); + doc.add(new Field("body", "one three one three one three five five five", Field.Store.YES, Field.Index.ANALYZED)); + doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO)); + doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED)); + iw.addDocument(doc); + iw.close(); + IndexReader ir = IndexReader.open(sourceDir, false); + ir.deleteDocument(5); + ir.close(); + } + + public void testTfPruning() throws Exception { + RAMDirectory targetDir = new RAMDirectory(); + IndexReader in = IndexReader.open(sourceDir, true); + TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, null, null, 2); + PruningReader tfr = new PruningReader(in, null, tfp); + // verify +// assertTD(tfr, new Term("body", "one"), new int[]{1, 2, 3, 4}); +// assertTD(tfr, new Term("body", "two"), new int[]{1, 2}); +// assertTD(tfr, new Term("body", "three"), new int[]{1, 3}); +// assertTD(tfr, new Term("test", "one"), new int[]{4}); +// assertTDCount(tfr, new Term("body", "four"), 0); +// assertTDCount(tfr, new Term("test", "four"), 0); + // verify new reader + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT); + IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); + iw.addIndexes(new IndexReader[]{tfr}); + iw.close(); + IndexReader ir = IndexReader.open(targetDir, true); + assertTD(ir, new Term("body", "one"), new int[]{1, 2, 3, 4}); + assertTD(ir, new Term("body", "two"), new int[]{1, 2}); + assertTD(ir, new Term("body", "three"), new int[]{1, 3}); + assertTD(ir, 
new Term("test", "one"), new int[]{4}); + tfr.close(); + ir.close(); + } + + public void testCarmelTopKPruning() throws Exception { + IndexReader in = IndexReader.open(sourceDir, true); + // validate full scores - without pruning, just to make sure we test the right thing + validateDocScores(fullScores, in, false, false); // validate both docs and scores + // prune reader + CarmelTopKTermPruningPolicy tfp = new CarmelTopKTermPruningPolicy(in, null, K, EPSILON, R, null); + PruningReader tfr = new PruningReader(in, null, tfp); + + // create the pruned index + RAMDirectory targetDir = new RAMDirectory(); + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT); + IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); + iw.addIndexes(new IndexReader[]{tfr}); + iw.close(); + in.close(); + + // validate scores of pruned index + IndexReader ir = IndexReader.open(targetDir, true); + validateDocScores(prunedScores, ir, false, true); // validate only docs (scores have changed after pruning) + ir.close(); + } + + private void validateDocScores(Map baseScores, IndexReader in, boolean print, boolean onlyDocs) throws IOException { + validateDocScores(baseScores, in, new Term("body", "one"), print, onlyDocs); + validateDocScores(baseScores, in, new Term("body", "two"), print, onlyDocs); + validateDocScores(baseScores, in, new Term("body", "three"), print, onlyDocs); + validateDocScores(baseScores, in, new Term("test", "one"), print, onlyDocs); + validateDocScores(baseScores, in, new Term("allthesame", "allthesame"), print, onlyDocs); + } + + /** validate the doc-scores, optionally also print them */ + private void validateDocScores(Map baseScores, IndexReader in, Term term, boolean print, boolean onlyDocs) throws IOException { + if (print) { + printDocScores(baseScores, in, term); + } + float delta = .0001f; + IndexSearcher is = new IndexSearcher(in); + TermQuery q = new TermQuery(term); + ScoreDoc[] sd = 
is.search(q, 100).scoreDocs; + assertNotNull("unknown result for term: "+term, baseScores.get(term)); + assertEquals("wrong number of results!", baseScores.get(term).length, sd.length); + for (int i = 0; i < sd.length; i++) { + assertEquals("wrong doc!", baseScores.get(term)[i].doc, sd[i].doc); + if (!onlyDocs) { + assertEquals("wrong score!", baseScores.get(term)[i].score, sd[i].score, delta); + } + } + } + + /** Print the doc scores (in a code format) */ + private void printDocScores(Map baseScores, IndexReader in, Term term) throws IOException { + IndexSearcher is = new IndexSearcher(in); + TermQuery q = new TermQuery(term); + ScoreDoc[] scoreDocs = is.search(q, 100).scoreDocs; + System.out.println("t = new Term(\""+term.field+"\",\""+term.text+"\");"); + System.out.println("sd = new ScoreDoc[] {"); + for (ScoreDoc sd : scoreDocs) { + System.out.println(" new ScoreDoc("+sd.doc+", "+sd.score+"f),"); + } + System.out.println("res.put(t,sd);"); + } + + public void testThresholds() throws Exception { + Map thresholds = new HashMap(); + thresholds.put("test", 3); + IndexReader in = IndexReader.open(sourceDir, true); + TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, null, thresholds, 2); + PruningReader tfr = new PruningReader(in, null, tfp); + assertTDCount(tfr, new Term("test", "one"), 0); + assertTDCount(tfr, new Term("test", "two"), 0); + assertTD(tfr, new Term("test", "three"), new int[]{4}); + assertTDCount(tfr, new Term("test", "four"), 0); + } + + public void testRemoveFields() throws Exception { + RAMDirectory targetDir = new RAMDirectory(); + Map removeFields = new HashMap(); + removeFields.put("test", PruningPolicy.DEL_POSTINGS | PruningPolicy.DEL_STORED); + IndexReader in = IndexReader.open(sourceDir, true); + TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, removeFields, null, 2); + StorePruningPolicy stp = new StorePruningPolicy(in, removeFields); + PruningReader tfr = new PruningReader(in, stp, tfp); + Document doc = tfr.document(4); + // 
removed stored values? + assertNull(doc.get("test")); + // removed postings ? + TermEnum te = tfr.terms(); + while (te.next()) { + assertFalse("test".equals(te.term().field())); + } + // but vectors should be present ! + TermFreqVector tv = tfr.getTermFreqVector(4, "test"); + assertNotNull(tv); + assertEquals(4, tv.getTerms().length); // term "four" not deleted yet from TermEnum + // verify new reader + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT); + IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); + iw.addIndexes(new IndexReader[]{tfr}); + iw.close(); + IndexReader ir = IndexReader.open(targetDir, true); + tv = ir.getTermFreqVector(4, "test"); + assertNotNull(tv); + assertEquals(3, tv.getTerms().length); // term "four" was deleted from TermEnum + } + +} Property changes on: lucene/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/RIDFTermPruningPolicy.java =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/RIDFTermPruningPolicy.java (revision 0) +++ lucene/contrib/pruning/src/java/org/apache/lucene/index/RIDFTermPruningPolicy.java (working copy) @@ -0,0 +1,88 @@ +package org.apache.lucene.index; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.IOException; +import java.util.Collections; +import java.util.Map; + +/** + * Implementation of {@link TermPruningPolicy} that uses "residual IDF" + * metric to determine the postings of terms to keep/remove. Residual + * IDF is a difference between a collection-wide IDF of a term and the + * observed in-document frequency of the term. + */ +public class RIDFTermPruningPolicy extends TermPruningPolicy { + double defThreshold; + Map thresholds; + double df; + double maxDoc; + + protected RIDFTermPruningPolicy(IndexReader in, + Map fieldFlags, Map thresholds, + double defThreshold) { + super(in, fieldFlags); + this.defThreshold = defThreshold; + if (thresholds != null) { + this.thresholds = thresholds; + } else { + this.thresholds = Collections.emptyMap(); + } + maxDoc = in.maxDoc(); + } + + @Override + public void initPositionsTerm(TermPositions tp, Term t) throws IOException { + df = Math.log(in.docFreq(t) / maxDoc); + } + + @Override + public boolean pruneTermEnum(TermEnum te) throws IOException { + return false; + } + + @Override + public boolean pruneAllPositions(TermPositions termPositions, Term t) + throws IOException { + double ridf = Math.log(1 - Math.pow(Math.E, termPositions.freq() / maxDoc)) - df; + double thr = defThreshold; + String key = t.field() + ":" + t.text(); + if (thresholds.containsKey(key)) { + thr = thresholds.get(key); + } else if (thresholds.containsKey(t.field())) { + thr = thresholds.get(t.field()); + } + if (ridf > thr) { + return false; // keep + } else { + return true; + } + } + + @Override + 
public int pruneTermVectorTerms(int docNumber, String field, String[] terms, + int[] freqs, TermFreqVector v) throws IOException { + return 0; + } + + @Override + public int pruneSomePositions(int docNum, int[] positions, Term curTerm) { + return 0; //this policy either prunes all or none, so nothing to prune here + } + +} Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/RIDFTermPruningPolicy.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningPolicy.java =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningPolicy.java (revision 0) +++ lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningPolicy.java (working copy) @@ -0,0 +1,34 @@ +package org.apache.lucene.index; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * General Definitions for Index Pruning, such as operations to be performed on field data. + */ +public class PruningPolicy { + /** Delete (some or all) postings for this field. 
*/ + public static final int DEL_POSTINGS = 0x01; + /** Delete (some or all) stored values for this field. */ + public static final int DEL_STORED = 0x02; + /** Delete term frequency vectors for this field (whole vectors or individual terms). */ + public static final int DEL_VECTOR = 0x04; + /** Delete (some or all) payloads in these fields. */ + public static final int DEL_PAYLOADS = 0x08; + /** Delete all data for this field. */ + public static final int DEL_ALL = 0xff; +} Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningPolicy.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningReader.java =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningReader.java (revision 0) +++ lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningReader.java (working copy) @@ -0,0 +1,333 @@ +package org.apache.lucene.index; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.logging.Logger; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FilterIndexReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.SegmentTermPositionVector; +import org.apache.lucene.index.SegmentTermVector; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositionVector; +import org.apache.lucene.index.TermVectorOffsetInfo; + +/** + * This class produces a subset of the input index, by removing some postings + * data according to rules implemented in a {@link TermPruningPolicy}, and + * optionally it can also remove stored fields of documents according to rules + * implemented in a {@link StorePruningPolicy}. + */ +public class PruningReader extends FilterIndexReader { + private static final Logger LOG = Logger.getLogger(PruningReader.class.getName()); + + protected int docCount; + protected int vecCount; + protected int termCount, delTermCount; + protected int prunedVecCount, delVecCount; + + protected TermPruningPolicy termPolicy; + protected StorePruningPolicy storePolicy; + + /** + * Constructor. + * @param in input reader + * @param storePolicy implementation of {@link StorePruningPolicy} - if null + * then stored values will be retained as is. + * @param termPolicy implementation of {@link TermPruningPolicy}, must not + * be null. 
+ */ + public PruningReader(IndexReader in, StorePruningPolicy storePolicy, + TermPruningPolicy termPolicy) { + super(in); + this.termPolicy = termPolicy; + assert termPolicy != null; + this.storePolicy = storePolicy; + } + + @Override + public IndexReader[] getSequentialSubReaders() { + IndexReader[] orig = super.getSequentialSubReaders(); + if (orig == null) { + return null; + } + IndexReader[] res = new IndexReader[orig.length]; + for (int i = 0; i < res.length; i++) { + res[i] = new PruningReader(orig[i], storePolicy, termPolicy); + } + return res; + } + + /** + * Applies a {@link StorePruningPolicy} to stored fields of a document. + */ + @Override + public Document document(final int n, FieldSelector fieldSelector) + throws CorruptIndexException, IOException { + docCount++; + if ((docCount % 10000) == 0) { + LOG.info(" - stored fields: " + docCount + " docs."); + } + if (storePolicy != null) { + return storePolicy.pruneDocument(n, fieldSelector); + } else { + return in.document(n, fieldSelector); + } + } + + /** + * Applies a {@link StorePruningPolicy} to the list of available field infos. + */ + @Override + public FieldInfos getFieldInfos() { + FieldInfos res = super.getFieldInfos(); + if (storePolicy == null) { + return res; + } + return storePolicy.getFieldInfos(res); + } + + /** + * Applies {@link TermPruningPolicy} to terms inside term vectors. 
+ */ + @Override + public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException { + TermFreqVector[] vectors = super.getTermFreqVectors(docNumber); + if (vectors == null) { + return null; + } + ArrayList newVectors = new ArrayList(); + for (TermFreqVector v : vectors) { + if (v == null) { + continue; + } + if (termPolicy.pruneWholeTermVector(docNumber, v.getField())) { + delVecCount++; + if ((delVecCount % 10000) == 0) { + LOG.info(" - deleted vectors: " + delVecCount); + } + continue; + } + if (v.size() == 0) { + continue; + } + String[] terms = v.getTerms(); + int[] freqs = v.getTermFrequencies(); + + int removed = termPolicy.pruneTermVectorTerms(docNumber, v.getField(), terms, freqs, v); + if (removed > 0 && removed < terms.length) { + String[] newTerms = new String[terms.length - removed]; + int[] newFreqs = new int[terms.length - removed]; + int j = 0; + for (int i = 0; i < terms.length; i++) { + if (terms[i] != null) { + newTerms[j] = terms[i]; + newFreqs[j] = freqs[i]; + j++; + } + } + // create a modified vector + if (v instanceof TermPositionVector) { + TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[terms.length - removed][]; + boolean withOffsets = false; + j = 0; + for (int i = 0; i < terms.length; i++) { + if (terms[i] == null) { + continue; + } + offsets[j] = ((TermPositionVector) v).getOffsets(i); + if (offsets[j] != null && offsets[j] != TermVectorOffsetInfo.EMPTY_OFFSET_INFO) { + withOffsets = true; + } + j++; + } + j = 0; + int[][] positions = new int[terms.length - removed][]; + boolean withPositions = false; + for (int i = 0; i < terms.length; i++) { + if (terms[i] == null) { + continue; + } + positions[j] = ((TermPositionVector) v).getTermPositions(i); + if (positions[j] != null && positions[j].length > 0) { + withPositions = true; + } + j++; + } + v = new SegmentTermPositionVector(v.getField(), newTerms, newFreqs, + withPositions ? positions : null, + withOffsets ? 
offsets : null); + } else { + v = new SegmentTermVector(v.getField(), newTerms, newFreqs); + } + newVectors.add(v); + } + } + vecCount++; + if ((vecCount % 10000) == 0) { + LOG.info(" - vectors: " + vecCount + " docs."); + } + if (newVectors.size() == 0) { + prunedVecCount++; + if ((prunedVecCount % 1000) == 0) { + LOG.info(" - deleted pruned vectors: " + prunedVecCount); + } + return null; + } + return newVectors.toArray(new TermFreqVector[newVectors.size()]); + } + + /** + * Applies {@link TermPruningPolicy} to term positions. + */ + @Override + public TermPositions termPositions() throws IOException { + return new PruningTermPositions(in.termPositions()); + } + + /** + * Applies {@link TermPruningPolicy} to term enum. + */ + @Override + public TermEnum terms() throws IOException { + return new PruningTermEnum(in.terms()); + } + + private class PruningTermEnum extends FilterTermEnum { + + private PruningTermEnum(TermEnum in) { + super(in); + } + + @Override + public boolean next() throws IOException { + for (;;) { + if (!super.next()) { + // System.out.println("TE: end"); + return false; + } + termCount++; + if ((termCount % 50000) == 0) { + LOG.info(" - terms: " + termCount + " (" + term() + "), deleted: " + delTermCount); + } + if (termPolicy.pruneAllFieldPostings(term().field()) + || termPolicy.pruneTermEnum(in)) { + delTermCount++; + // System.out.println("TE: remove " + term()); + continue; + } + // System.out.println("TE: pass " + term()); + return true; + } + } + + } + + private class PruningTermPositions extends FilterTermPositions { + + protected Term curTerm = null; + protected int[] positions; + protected TermPositions tp; + protected int curFreq; + protected int posPos; + + private PruningTermPositions(TermPositions in) { + super(in); + tp = in; + } + + @Override + public void seek(Term t) throws IOException { + super.seek(t); + informPolicy(t); + } + + @Override + public void seek(TermEnum termEnum) throws IOException { + super.seek(termEnum); + 
informPolicy(termEnum.term()); + } + + private void informPolicy(Term t) throws IOException { + termPolicy.initPositionsTerm(tp, t); + curTerm = new Term(t.field(), t.text()); + } + + @Override + public boolean next() throws IOException { + for (;;) { + positions = null; + if (!super.next()) { + return false; + } + if (termPolicy.pruneAllPositions(tp, curTerm)) { + continue; + } + break; + } + // prepare the positions + positions = new int[tp.freq()]; + for (int i = 0; i < positions.length; i++) { + positions[i] = tp.nextPosition(); + } + int pruned = termPolicy.pruneSomePositions(tp.doc(), positions, curTerm); + if (pruned > 0) { + int[] newPositions = new int[positions.length - pruned]; + int j = 0; + for (int i = 0; i < positions.length; i++) { + if (positions[i] < 0) { + continue; + } else { + newPositions[j++] = positions[i]; + } + } + positions = newPositions; + } + curFreq = positions.length; + posPos = 0; + return true; + } + + @Override + public int nextPosition() throws IOException { + return positions[posPos++]; + } + + @Override + public int freq() { + return curFreq; + } + + @Override + public boolean isPayloadAvailable() { + if (!super.isPayloadAvailable()) { + return false; + } + if (termPolicy.prunePayload((TermPositions) in, curTerm)) { + return false; + } + return true; + } + } +} \ No newline at end of file Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningReader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelTopKTermPruningPolicy.java =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelTopKTermPruningPolicy.java (revision 0) +++ lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelTopKTermPruningPolicy.java (working copy) @@ -0,0 +1,273 @@ +package org.apache.lucene.index; + +/* + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Map; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopScoreDocCollector; + +/** + * Pruning policy with a search quality parameterized guarantee - configuration + * of this policy allows to specify two parameters: k and + * ε such that: + *

+ * + * + * + * + *
+ * For any OR query with r terms, the score of each of the top + * k results in the original index, should be "practically the same" as + * the score that document in the pruned index: the scores difference should not + * exceed r * ε.
+ *

+ * See the following paper for more details about this method: Static index pruning for + * information retrieval systems, D. Carmel at al, ACM SIGIR 2001 . + *

+ * The claim of this pruning technique is, quoting from the above paper: + *

+ * + * + * + * + *
+ * Prune the index in such a way that a human + * "cannot distinguish the difference" between the results of a search engine + * whose index is pruned and one whose index is not pruned.
+ *

+ * For indexes with a large number of terms this policy might be too slow. In + * such situations, the uniform pruning approach in + * {@link CarmelUniformTermPruningPolicy} will be faster, though it might + * produce inferior search quality, as that policy does not pose a theoretical + * guarantee on resulted search quality. + *

+ * TODO implement also CarmelTermPruningDeltaTopPolicy + */ +public class CarmelTopKTermPruningPolicy extends TermPruningPolicy { + + /** + * Default number of guaranteed top K scores + */ + public static final int DEFAULT_TOP_K = 10; + + /** + * Default number of query terms + */ + public static final int DEFAULT_R = 1; + + /** + * Default largest meaningless score difference + */ + public static final float DEFAULT_EPSILON = .001f; + + private int docsPos = 0; + private int k; + private ScoreDoc[] docs = null; + private IndexSearcher is; + private boolean noPruningForCurrentTerm; + private float scoreDelta; + + /** + * Constructor with default parameters + * + * @see #DEFAULT_TOP_K + * @see #DEFAULT_EPSILON + * @see #DEFAULT_R + * @see DefaultSimilarity + * @see #CarmelTopKTermPruningPolicy(IndexReader, Map, int, float, int, Similarity) + */ + protected CarmelTopKTermPruningPolicy(IndexReader in, + Map fieldFlags) { + this(in, fieldFlags, DEFAULT_TOP_K, DEFAULT_EPSILON, DEFAULT_R, null); + } + + /** + * Constructor with specific settings + * + * @param in reader for original index + * @param k number of guaranteed top scores. Each top K results in the pruned + * index is either also an original top K result or its original + * score is indistinguishable from some original top K result. + * @param epsilon largest meaningless score difference Results whose scores + * difference is smaller or equal to epsilon are considered + * indistinguishable. + * @param r maximal number of terms in a query for which search quaility in + * pruned index is guaranteed + * @param sim similarity to use when selecting top docs fir each index term. + * When null, {@link DefaultSimilarity} is used. + */ + protected CarmelTopKTermPruningPolicy(IndexReader in, + Map fieldFlags, int k, float epsilon, int r, + Similarity sim) { + super(in, fieldFlags); + this.k = k; + is = new IndexSearcher(in); + is.setSimilarity(sim != null ? 
sim : new DefaultSimilarity()); + scoreDelta = epsilon * r; + } + + // too costly - pass everything at this stage + @Override + public boolean pruneTermEnum(TermEnum te) throws IOException { + return false; + } + + @Override + public void initPositionsTerm(TermPositions tp, Term t) throws IOException { + // check if there's any point to prune this term + int df = in.docFreq(t); + noPruningForCurrentTerm = (df <= k); + if (noPruningForCurrentTerm) { + return; + } + // take more results (k2>k), attempting for sufficient results to avoid a + // second search + int k2 = Math.min(2 * k, k + 100); // for small k's 2*k will do, but for + // large ones (1000's) keep overhead + // smaller + k2 = Math.min(k2, df); // no more than the potential number of results + TopScoreDocCollector collector = TopScoreDocCollector.create(k2, true); + TermQuery tq = new TermQuery(t); + is.search(tq, collector); + docs = collector.topDocs().scoreDocs; + float threshold = docs[k - 1].score - scoreDelta; + + int nLast = k2 - 1; + nLast = Math.min(nLast, docs.length - 1); // protect in case of deleted docs + if (docs[nLast].score < threshold) { + // this is the better/faster case - no need to go over docs again - we + // have top ones + int n = nLast; + while (docs[n - 1].score < threshold) + --n; // n == num-valid-docs == first-invalid-doc + ScoreDoc[] subset = new ScoreDoc[n]; + System.arraycopy(docs, 0, subset, 0, n); + docs = subset; + // sort by doc but only after taking top scores + Arrays.sort(docs, ByDocComparator.INSTANCE); + } else { + // this is the worse case - must go over docs again + ThresholdCollector thresholdCollector = new ThresholdCollector(threshold); + is.search(tq, thresholdCollector); + docs = thresholdCollector.scoreDocs.toArray(new ScoreDoc[0]); + } + docsPos = 0; + } + + @Override + public boolean pruneAllPositions(TermPositions termPositions, Term t) + throws IOException { + if (noPruningForCurrentTerm) { + return false; + } + if (docsPos >= docs.length) { // used up 
all doc id-s + return true; // skip any remaining docs + } + while ((docsPos < docs.length - 1) + && termPositions.doc() > docs[docsPos].doc) { + docsPos++; + } + if (termPositions.doc() == docs[docsPos].doc) { + // pass + docsPos++; // move to next doc id + return false; + } else if (termPositions.doc() < docs[docsPos].doc) { + return true; // skip this one - it's less important + } + // should not happen! + throw new IOException("termPositions.doc > docs[docsPos].doc"); + } + + // it probably doesn't make sense to prune term vectors using this method, + // due to its overhead + @Override + public int pruneTermVectorTerms(int docNumber, String field, String[] terms, + int[] freqs, TermFreqVector tfv) throws IOException { + return 0; + } + + public static class ByDocComparator implements Comparator { + public static final ByDocComparator INSTANCE = new ByDocComparator(); + + public int compare(ScoreDoc o1, ScoreDoc o2) { + return o1.doc - o2.doc; + } + } + + @Override + public int pruneSomePositions(int docNum, int[] positions, Term curTerm) { + return 0; // this policy either prunes all or none, so nothing to prune here + } + + /** + * Collect all docs with score >= higher threshold + */ + private static class ThresholdCollector extends Collector { + + private ArrayList scoreDocs = new ArrayList(); + private Scorer scorer; + private float threshold; + private int docBase; + + public ThresholdCollector(float threshold) { + this.threshold = threshold; + } + + @Override + public boolean acceptsDocsOutOfOrder() { + return false; + } + + @Override + public void collect(int doc) throws IOException { + float score = scorer.score(); + if (score >= threshold) { + scoreDocs.add(new ScoreDoc(docBase + doc, score)); + } + } + + @Override + public void setNextReader(IndexReader reader, int docBase) + throws IOException { + this.docBase = docBase; + } + + @Override + public void setScorer(Scorer scorer) throws IOException { + this.scorer = scorer; + } + + } +} Property changes 
on: lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelTopKTermPruningPolicy.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/package.html =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/package.html (revision 0) +++ lucene/contrib/pruning/src/java/org/apache/lucene/index/package.html (working copy) @@ -0,0 +1,33 @@ + + + + + Index Pruning + + +

+Static Index Pruning Tools +

+This package provides a framework for pruning an existing index into +a smaller index while retaining visible search quality as much as possible. +

+ +
+
 
+ + Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/package.html ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelUniformTermPruningPolicy.java =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelUniformTermPruningPolicy.java (revision 0) +++ lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelUniformTermPruningPolicy.java (working copy) @@ -0,0 +1,186 @@ +package org.apache.lucene.index; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.Map; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopScoreDocCollector; + +/** + * Enhanced implementation of Carmel Uniform Pruning, + *

+ * {@link TermPositions} whose in-document frequency is below a specified + * threshold are pruned. + *

+ * See {@link CarmelTopKTermPruningPolicy} for a link to the paper describing this + * policy. + *

+ * Conclusions of that paper indicate that it's best to compute per-term + * thresholds, as we do in {@link CarmelTopKTermPruningPolicy}. However for + * large indexes with a large number of terms that method might be too slow, and + * the (enhanced) uniform approach implemented here may well be faster, although + * it might produce inferior search quality. + *

+ * This implementation enhances the Carmel uniform pruning approach, as it + * allows to specify three levels of thresholds: + *

    + *
  • one default threshold - globally (for terms in all fields)
  • + *
  • threshold per field
  • + *
  • threshold per term
  • + *
+ *

+ * These thresholds are applied so that always the most specific one takes + * precedence: first a per-term threshold is used if present, then per-field + * threshold if present, and finally the default threshold. + *

+ * Thresholds are maintained in a map, keyed by either field names or terms in + * field:text format. Precedence of these values is the following: + *

+ * Thresholds in this method of pruning are expressed as the percentage of the + * top-N scoring documents per term that are retained. The list of top-N + * documents is established by using a regular {@link IndexSearcher} and + * {@link Similarity} to run a simple {@link TermQuery}. + *

+ * Smaller threshold value will produce a smaller index. See + * {@link TermPruningPolicy} for size vs performance considerations. + *

+ * For indexes with a large number of terms this policy might be still too slow, + * since it issues a term query for each term in the index. In such situations, + * the term frequency pruning approach in {@link TFTermPruningPolicy} will be + * faster, though it might produce inferior search quality. + */ +public class CarmelUniformTermPruningPolicy extends TermPruningPolicy { + int docsPos = 0; + float curThr; + float defThreshold; + Map thresholds; + ScoreDoc[] docs = null; + IndexSearcher is; + Similarity sim; + + protected CarmelUniformTermPruningPolicy(IndexReader in, + Map fieldFlags, Map thresholds, + float defThreshold, Similarity sim) { + super(in, fieldFlags); + this.defThreshold = defThreshold; + if (thresholds != null) { + this.thresholds = thresholds; + } else { + this.thresholds = Collections.emptyMap(); + } + if (sim != null) { + this.sim = sim; + } else { + sim = new DefaultSimilarity(); + } + is = new IndexSearcher(in); + is.setSimilarity(sim); + } + + // too costly - pass everything at this stage + @Override + public boolean pruneTermEnum(TermEnum te) throws IOException { + return false; + } + + @Override + public void initPositionsTerm(TermPositions tp, Term t) throws IOException { + curThr = defThreshold; + String termKey = t.field() + ":" + t.text(); + if (thresholds.containsKey(termKey)) { + curThr = thresholds.get(termKey); + } else if (thresholds.containsKey(t.field())) { + curThr = thresholds.get(t.field()); + } + // calculate count + int df = in.docFreq(t); + int count = Math.round((float) df * curThr); + if (count < 100) count = 100; + TopScoreDocCollector collector = TopScoreDocCollector.create(count, true); + TermQuery tq = new TermQuery(t); + is.search(tq, collector); + docs = collector.topDocs().scoreDocs; + if (docs.length > count) { + // TODO deadcode: can topSDcollector(count) produce more than count + // results? 
+ // take top subset *before* sorting by ID + ScoreDoc[] subset = new ScoreDoc[count]; + System.arraycopy(docs, 0, subset, 0, count); + docs = subset; + } + Arrays.sort(docs, ByDocComparator.INSTANCE); + docsPos = 0; + } + + @Override + public boolean pruneAllPositions(TermPositions termPositions, Term t) + throws IOException { + if (docsPos >= docs.length) { // used up all doc id-s + return true; // skip any remaining docs + } + while ((docsPos < docs.length - 1) + && termPositions.doc() > docs[docsPos].doc) { + docsPos++; + } + if (termPositions.doc() == docs[docsPos].doc) { + // pass + docsPos++; // move to next doc id + return false; + } else if (termPositions.doc() < docs[docsPos].doc) { + return true; // skip this one - it's less important + } + // should not happen! + throw new IOException("termPositions.doc > docs[docsPos].doc"); + } + + // it probably doesn't make sense to prune term vectors using this method, + // due to its overhead + @Override + public int pruneTermVectorTerms(int docNumber, String field, String[] terms, + int[] freqs, TermFreqVector tfv) throws IOException { + return 0; + } + + public static class ByDocComparator implements Comparator { + public static final ByDocComparator INSTANCE = new ByDocComparator(); + + public int compare(ScoreDoc o1, ScoreDoc o2) { + return o1.doc - o2.doc; + } + } + + @Override + public int pruneSomePositions(int docNum, int[] positions, Term curTerm) { + return 0; // this policy either prunes all or none, so nothing to prune here + } + +} Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelUniformTermPruningPolicy.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/StorePruningPolicy.java =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/StorePruningPolicy.java (revision 0) 
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/StorePruningPolicy.java (working copy) @@ -0,0 +1,123 @@ +package org.apache.lucene.index; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.Map.Entry; +import java.util.logging.Logger; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.document.FieldSelectorResult; +import org.apache.lucene.index.IndexReader; + +/** + * This class implements rules for removing stored fields from documents. + */ +public class StorePruningPolicy extends PruningPolicy { + private static final Logger LOG = Logger.getLogger(StorePruningPolicy.class.getName()); + protected Map fieldFlags; + protected Set deleteAll; + protected DelFieldSelector fs; + protected IndexReader in; + protected int delFields; + + /** + * Constructs a policy. + * @param in input reader. + * @param fieldFlags a map where keys are field names, and flags are + * bitwise-OR values of flags defined in {@link PruningPolicy}. 
+ */ + public StorePruningPolicy(IndexReader in, Map fieldFlags) { + if (fieldFlags != null) { + this.fieldFlags = fieldFlags; + deleteAll = new HashSet(); + for (Entry e : fieldFlags.entrySet()) { + if (e.getValue() == PruningPolicy.DEL_ALL) { + deleteAll.add(e.getKey()); + } + } + } else { + this.fieldFlags = Collections.emptyMap(); + deleteAll = Collections.emptySet(); + } + fs = new DelFieldSelector(fieldFlags); + this.in = in; + } + + public FieldInfos getFieldInfos(FieldInfos allInfos) { + // for simplicity remove only fields with DEL_ALL + FieldInfos res = new FieldInfos(); + for (FieldInfo fi: allInfos) { + if (!deleteAll.contains(fi.name)) { + res.add(fi); + } + } + return res; + } + + /** + * Prune stored fields of a document. Note that you can also arbitrarily + * change values of the retrieved fields, so long as the field names belong + * to a list of fields returned from {@link #getFieldInfos(FieldInfos)}. + * @param doc document number + * @param parent original field selector that limits what fields will be + * retrieved. + * @return a pruned instance of a Document. 
+ * @throws IOException + */ + public Document pruneDocument(int doc, FieldSelector parent) throws IOException { + if (fieldFlags.isEmpty()) { + return in.document(doc, parent); + } else { + fs.setParent(parent); + return in.document(doc, fs); + } + } + + class DelFieldSelector implements FieldSelector { + private static final long serialVersionUID = -4913592063491685103L; + private FieldSelector parent; + private Map remove; + + public DelFieldSelector(Map remove) { + this.remove = remove; + } + + public void setParent(FieldSelector parent) { + this.parent = parent; + } + + public FieldSelectorResult accept(String fieldName) { + if (!remove.isEmpty() && remove.containsKey(fieldName) && + ((remove.get(fieldName) & DEL_STORED) > 0)) { + delFields++; + if (delFields % 10000 == 0) { + LOG.info(" - stored fields: removed " + delFields + " fields."); + } + return FieldSelectorResult.NO_LOAD; + } else if (parent != null) { + return parent.accept(fieldName); + } else return FieldSelectorResult.LOAD; + } + }; + +} Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/StorePruningPolicy.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java (revision 0) +++ lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java (working copy) @@ -0,0 +1,174 @@ +package org.apache.lucene.index; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; + +/** + * A command-line tool to configure and run a {@link PruningReader} on an input + * index and produce a pruned output index using + * {@link IndexWriter#addIndexes(IndexReader...)}. + */ +public class PruningTool { + + public static void main(String[] args) throws Exception { + int res = run(args); + System.exit(res); + } + + public static int run(String[] args) throws Exception { + if (args.length < 5) { + System.err.println("Usage: PruningTool -impl (tf | carmel | carmeltopk | ridf) (-in [-in ...]) " + + "-out -t [-del f1,f2,..] [-conf ] [-topkk ] [-topke ] [-topkr ]"); + System.err.println("\t-impl (tf | carmel | carmeltopk | ridf)\tTermPruningPolicy implementation name: TF or CarmelUniform or or CarmelTopK or RIDFTerm"); + System.err.println("\t-in path\tpath to the input index. 
Can specify multiple input indexes."); + System.err.println("\t-out path\toutput path where the output index will be stored."); + System.err.println("\t-t NN\tdefault threshold value (minimum in-document frequency) for all terms"); + System.err.println("\t-del f1,f2,..\tcomma-separated list of field specs to delete (postings, vectors & stored):"); + System.err.println("\t\tfield spec : fieldName ( ':' [pPsv] )"); + System.err.println("\t\twhere: p - postings, P - payloads, s - stored value, v - vectors"); + System.err.println("\t-conf file\tpath to config file with per-term thresholds"); + System.err.println("\t-topkk NN\t'K' for Carmel TopK Pruning: number of guaranteed top scores"); + System.err.println("\t-topke NN\t'Epsilon' for Carmel TopK Pruning: largest meaningless score difference"); + System.err.println("\t-topkr NN\t'R' for Carmel TopK Pruning: planned maximal number of terms in a query on pruned index"); + return -1; + } + ArrayList inputs = new ArrayList(); + Directory out = null; + float thr = -1; + Map delFields = new HashMap(); + + // parameters for top-K pruning + int topkK = CarmelTopKTermPruningPolicy.DEFAULT_TOP_K; + float topkEpsilon = CarmelTopKTermPruningPolicy.DEFAULT_EPSILON; + int topkR = CarmelTopKTermPruningPolicy.DEFAULT_R; + + String impl = null; + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-in")) { + Directory d = FSDirectory.open(new File(args[++i])); + if (!IndexReader.indexExists(d)) { + System.err.println("WARN: no index in " + args[i] + ", skipping ..."); + } + inputs.add(IndexReader.open(d, true)); + } else if (args[i].equals("-out")) { + File outFile = new File(args[++i]); + if (outFile.exists()) { + throw new Exception("Output " + outFile + " already exists."); + } + outFile.mkdirs(); + out = FSDirectory.open(outFile); + } else if (args[i].equals("-impl")) { + impl = args[++i]; + } else if (args[i].equals("-t")) { + thr = Float.parseFloat(args[++i]); + } else if (args[i].equals("-topkk")) { + topkK = 
Integer.parseInt(args[++i]); + } else if (args[i].equals("-topke")) { + topkEpsilon = Float.parseFloat(args[++i]); + } else if (args[i].equals("-topkr")) { + topkR = Integer.parseInt(args[++i]); + } else if (args[i].equals("-del")) { + String[] fields = args[++i].split(","); + for (String f : fields) { + // parse field spec + String[] spec = f.split(":"); + int opts = PruningPolicy.DEL_ALL; + if (spec.length > 0) { + opts = 0; + if (spec[1].indexOf('p') != -1) { + opts |= PruningPolicy.DEL_POSTINGS; + } + if (spec[1].indexOf('P') != -1) { + opts |= PruningPolicy.DEL_PAYLOADS; + } + if (spec[1].indexOf('s') != -1) { + opts |= PruningPolicy.DEL_STORED; + } + if (spec[1].indexOf('v') != -1) { + opts |= PruningPolicy.DEL_VECTOR; + } + } + delFields.put(spec[0], opts); + } + } else if (args[i].equals("-conf")) { + ++i; + System.err.println("WARN: -conf option not implemented yet."); + } else { + throw new Exception("Invalid argument: '" + args[i] + "'"); + } + } + if (impl == null) { + throw new Exception("Must select algorithm implementation"); + } + if (inputs.size() == 0) { + throw new Exception("At least one input index is required."); + } + if (out == null) { + throw new Exception("Output path is not set."); + } + if (thr == -1) { + throw new Exception("Threshold value is not set."); + } + IndexReader in; + if (inputs.size() == 1) { + in = inputs.get(0); + } else { + in = new MultiReader(inputs.toArray(new IndexReader[inputs.size()]), true); + } + if (in.hasDeletions()) { + System.err.println("WARN: input index(es) with deletions - document ID-s will NOT be preserved!"); + } + IndexReader pruning = null; + StorePruningPolicy stp = null; + if (delFields.size() > 0) { + stp = new StorePruningPolicy(in, delFields); + } + TermPruningPolicy tpp = null; + if (impl.equals("tf")) { + tpp = new TFTermPruningPolicy(in, delFields, null, (int)thr); + } else if (impl.equals("carmel")) { + tpp = new CarmelUniformTermPruningPolicy(in, delFields, null, thr, null); + } else if 
(impl.equals("carmeltopk")) { + tpp = new CarmelTopKTermPruningPolicy(in, delFields, topkK, topkEpsilon, topkR, null); + } else if (impl.equals("ridf")) { + tpp = new RIDFTermPruningPolicy(in, delFields, null, thr); + } else { + throw new Exception("Unknown algorithm: '" + impl + "'"); + } + pruning = new PruningReader(in, stp, tpp); + IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_31, + new WhitespaceAnalyzer(Version.LUCENE_31)); + IndexWriter iw = new IndexWriter(out, cfg); + iw.addIndexes(new IndexReader[]{pruning}); + iw.close(); + System.err.println("DONE."); + return 0; + } +} Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningTool.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/TFTermPruningPolicy.java =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/TFTermPruningPolicy.java (revision 0) +++ lucene/contrib/pruning/src/java/org/apache/lucene/index/TFTermPruningPolicy.java (working copy) @@ -0,0 +1,133 @@ +package org.apache.lucene.index; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collections; +import java.util.Map; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositions; + +/** + * Policy for producing smaller index out of an input index, by removing postings data + * for those terms where their in-document frequency is below a specified + * threshold. + *

+ * Larger threshold value will produce a smaller index. + * See {@link TermPruningPolicy} for size vs performance considerations. + *

+ * This implementation uses simple term frequency thresholds to remove all postings + * from documents where a given term occurs rarely (i.e. its TF in a document + * is smaller than the threshold). + *

+ * Threshold values in this method are expressed as absolute term frequencies. + */ +public class TFTermPruningPolicy extends TermPruningPolicy { + protected Map thresholds; + protected int defThreshold; + protected int curThr; + + protected TFTermPruningPolicy(IndexReader in, Map fieldFlags, + Map thresholds, int defThreshold) { + super(in, fieldFlags); + this.defThreshold = defThreshold; + if (thresholds != null) { + this.thresholds = thresholds; + } else { + this.thresholds = Collections.emptyMap(); + } + } + + @Override + public boolean pruneTermEnum(TermEnum te) throws IOException { + // check that at least one doc exceeds threshold + int thr = defThreshold; + String termKey = te.term().field() + ":" + te.term().text(); + if (thresholds.containsKey(termKey)) { + thr = thresholds.get(termKey); + } else if (thresholds.containsKey(te.term().field())) { + thr = thresholds.get(te.term().field()); + } + TermDocs td = in.termDocs(te.term()); + boolean pass = false; + do { + if (td.freq() >= thr) { + pass = true; + break; + } + } while (td.next()); + td.close(); + return !pass; + } + + @Override + public void initPositionsTerm(TermPositions in, Term t) throws IOException { + // set threshold for this field + curThr = defThreshold; + String termKey = t.field() + ":" + t.text(); + if (thresholds.containsKey(termKey)) { + curThr = thresholds.get(termKey); + } else if (thresholds.containsKey(t.field())) { + curThr = thresholds.get(t.field()); + } + } + + @Override + public boolean pruneAllPositions(TermPositions termPositions, Term t) + throws IOException { + if (termPositions.freq() < curThr) { + return true; + } else { + return false; + } + } + + @Override + public int pruneTermVectorTerms(int docNumber, String field, String[] terms, + int[] freqs, TermFreqVector tfv) + throws IOException { + int thr = defThreshold; + if (thresholds.containsKey(field)) { + thr = thresholds.get(field); + } + int removed = 0; + for (int i = 0; i < terms.length; i++) { + // check per-term 
thresholds + int termThr = thr; + String t = field + ":" + terms[i]; + if (thresholds.containsKey(t)) { + termThr = thresholds.get(t); + } + if (freqs[i] < termThr) { + terms[i] = null; + removed++; + } + } + return removed; + } + + @Override + public int pruneSomePositions(int docNum, int[] positions, Term curTerm) { + return 0; //this policy either prunes all or none, so nothing to prune here + } + +} Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/TFTermPruningPolicy.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/TermPruningPolicy.java =================================================================== --- lucene/contrib/pruning/src/java/org/apache/lucene/index/TermPruningPolicy.java (revision 0) +++ lucene/contrib/pruning/src/java/org/apache/lucene/index/TermPruningPolicy.java (working copy) @@ -0,0 +1,205 @@ +package org.apache.lucene.index; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Collections; +import java.util.Map; + +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositions; + +/** + * Policy for producing smaller index out of an input index, by examining its terms + * and removing from the index some or all of their data as follows: + *

    + *
  • all terms of a certain field - see {@link #pruneAllFieldPostings(String)}
  • + *
  • all data of a certain term - see {@link #pruneTermEnum(TermEnum)}
  • + *
  • all positions of a certain term in a certain document - see #pruneAllPositions(TermPositions, Term)
  • + *
  • some positions of a certain term in a certain document - see #pruneSomePositions(int, int[], Term)
  • + *
+ *

+ * The pruned, smaller index would, for many types of queries return nearly + * identical top-N results as compared with the original index, but with increased performance. + *

+ * Pruning of indexes is handy for producing small first-tier indexes that fit + * completely in RAM, and store these indexes using {@link IndexWriter#addIndexes(IndexReader...)} + *

+ * Interestingly, if the input index is optimized (i.e. doesn't contain deletions), + * then the index produced via {@link IndexWriter#addIndexes(IndexReader[])} will preserve internal document + * id-s so that they are in sync with the original index. This means that + * all other auxiliary information not necessary for first-tier processing, such + * as some stored fields, can also be removed, to be quickly retrieved on-demand + * from the original index using the same internal document id. See + * {@link StorePruningPolicy} for information about removing stored fields. + *

+ * Please note that while this family of policies produces good results for term queries it + often leads to poor results for phrase queries (because postings are removed + without considering whether they belong to an important phrase). + *

+ * Aggressive pruning policies produce smaller indexes - + * search performance increases, and recall decreases (i.e. search quality + * deteriorates). + *

+ * See the following papers for a discussion of this problem and the + * proposed solutions to improve the quality of a pruned index (not implemented + * here): + * + *

+ * + */ +public abstract class TermPruningPolicy extends PruningPolicy { + protected Map fieldFlags; + protected IndexReader in; + + /** + * Construct a policy. + * @param in input reader + * @param fieldFlags a map, where keys are field names and values + * are bitwise-OR flags of operations to be performed (see + * {@link PruningPolicy} for more details). + */ + protected TermPruningPolicy(IndexReader in, Map fieldFlags) { + this.in = in; + if (fieldFlags != null) { + this.fieldFlags = fieldFlags; + } else { + this.fieldFlags = Collections.emptyMap(); + } + } + + /** + * Term vector pruning. + * @param docNumber document number + * @param field field name + * @return true if the complete term vector for this field should be + * removed (as specified by {@link PruningPolicy#DEL_VECTOR} flag). + * @throws IOException + */ + public boolean pruneWholeTermVector(int docNumber, String field) + throws IOException { + if (fieldFlags.containsKey(field) && + (fieldFlags.get(field) & DEL_VECTOR) != 0) { + return true; + } else { + return false; + } + } + + /** + * Pruning of all postings for a field + * @param field field name + * @return true if all postings for all terms in this field should be + * removed (as specified by {@link PruningPolicy#DEL_POSTINGS}). + * @throws IOException + */ + public boolean pruneAllFieldPostings(String field) throws IOException { + if (fieldFlags.containsKey(field) && + (fieldFlags.get(field) & DEL_POSTINGS) != 0) { + return true; + } else { + return false; + } + } + + /** + * Called when moving {@link TermPositions} to a new {@link Term}. 
+ * @param in input term positions + * @param t current term + * @throws IOException + */ + public abstract void initPositionsTerm(TermPositions in, Term t) + throws IOException; + + /** + * Called when checking for the presence of payload for the current + * term at a current position + * @param in positioned term positions + * @param curTerm current term associated with these positions + * @return true if the payload should be removed, false otherwise. + */ + public boolean prunePayload(TermPositions in, Term curTerm) { + if (fieldFlags.containsKey(curTerm.field()) && + (fieldFlags.get(curTerm.field()) & DEL_PAYLOADS) != 0) { + return true; + } + return false; + } + + /** + * Pruning of individual terms in term vectors. + * @param docNumber document number + * @param field field name + * @param terms array of terms + * @param freqs array of term frequencies + * @param v the original term frequency vector + * @return 0 if no terms are to be removed, positive number to indicate + * how many terms need to be removed. The same number of entries in the terms + * array must be set to null to indicate which terms to remove. + * @throws IOException + */ + public abstract int pruneTermVectorTerms(int docNumber, String field, + String[] terms, int[] freqs, TermFreqVector v) throws IOException; + + /** + * Pruning of all postings for a term (invoked once per term). + * @param te positioned term enum. + * @return true if all postings for this term should be removed, false + * otherwise. + * @throws IOException + */ + public abstract boolean pruneTermEnum(TermEnum te) throws IOException; + + /** + * Prune all postings per term (invoked once per term per doc) + * @param termPositions positioned term positions. Implementations MUST NOT + * advance this by calling {@link TermPositions} methods that advance either + * the position pointer (next, skipTo) or term pointer (seek). + * @param t current term + * @return true if the current posting should be removed, false otherwise. 
+ * @throws IOException + */ + public abstract boolean pruneAllPositions(TermPositions termPositions, Term t) + throws IOException; + + /** + * Prune some postings per term (invoked once per term per doc). + * @param docNum current document number + * @param positions original term positions in the document (and indirectly + * term frequency) + * @param curTerm current term + * @return 0 if no postings are to be removed, or positive number to indicate + * how many postings need to be removed. The same number of entries in the + * positions array must be set to -1 to indicate which positions to remove. + */ + public abstract int pruneSomePositions(int docNum, int[] positions, + Term curTerm); + +} Property changes on: lucene/contrib/pruning/src/java/org/apache/lucene/index/TermPruningPolicy.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/README.txt =================================================================== --- lucene/contrib/pruning/README.txt (revision 0) +++ lucene/contrib/pruning/README.txt (working copy) @@ -0,0 +1,30 @@ +Static index pruning tools. +=========================== + +This package provides tools and API-s for static index pruning. + +Static pruning is an approach that reduces size of the index +by removing terms and/or postings that are considered less +important, i.e. they don't affect the quality of top-N +retrieval too much. + +There are several different strategies for pruning, each with +its own set of pros and cons. Please consult the javadocs of +TermPruningPolicy subclasses that contain also references +to published papers on each method. + +There is also a simple command-line driver class that +can apply some of the common pruning policies: + +Usage: PruningTool -impl (tf | carmel | carmeltopk | ridf) (-in [-in ...]) -out -t [-del f1,f2,..]
[-conf ] [-topkk ] [-topke ] [-topkr ] + -impl (tf | carmel | carmeltopk | ridf) TermPruningPolicy implementation name: TF or CarmelUniform or CarmelTopK or RIDFTerm + -in path path to the input index. Can specify multiple input indexes. + -out path output path where the output index will be stored. + -t NN default threshold value (minimum in-document frequency) for all terms + -del f1,f2,.. comma-separated list of field specs to delete (postings, vectors & stored): + field spec : fieldName ( ':' [pPsv] ) + where: p - postings, P - payloads, s - stored value, v - vectors + -conf file path to config file with per-term thresholds + -topkk NN 'K' for Carmel TopK Pruning: number of guaranteed top scores + -topke NN 'Epsilon' for Carmel TopK Pruning: largest meaningless score difference + -topkr NN 'R' for Carmel TopK Pruning: planned maximal number of terms in a query on pruned index \ No newline at end of file Property changes on: lucene/contrib/pruning/README.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native Index: lucene/contrib/pruning/build.xml =================================================================== --- lucene/contrib/pruning/build.xml (revision 0) +++ lucene/contrib/pruning/build.xml (working copy) @@ -0,0 +1,27 @@ + + + + + + + + Pruning Lucene indexes by various criteria + + + + Property changes on: lucene/contrib/pruning/build.xml ___________________________________________________________________ Added: svn:mime-type ## -0,0 +1 ## +text/plain Index: dev-tools/idea/.idea/ant.xml =================================================================== --- dev-tools/idea/.idea/ant.xml (revision 1234831) +++ dev-tools/idea/.idea/ant.xml (working copy) @@ -25,6 +25,7 @@ + Index: dev-tools/idea/.idea/workspace.xml =================================================================== --- dev-tools/idea/.idea/workspace.xml (revision 1234831) +++ dev-tools/idea/.idea/workspace.xml (working copy) 
@@ -221,6 +221,13 @@ + + + + @@ -253,6 +260,7 @@ + Index: dev-tools/idea/.idea/modules.xml =================================================================== --- dev-tools/idea/.idea/modules.xml (revision 1234831) +++ dev-tools/idea/.idea/modules.xml (working copy) @@ -25,6 +25,7 @@ + Index: dev-tools/idea/lucene/contrib/pruning/pruning.iml =================================================================== --- dev-tools/idea/lucene/contrib/pruning/pruning.iml (revision 0) +++ dev-tools/idea/lucene/contrib/pruning/pruning.iml (working copy) @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + Property changes on: dev-tools/idea/lucene/contrib/pruning/pruning.iml ___________________________________________________________________ Added: svn:mime-type ## -0,0 +1 ## +text/plain Index: dev-tools/eclipse/dot.classpath =================================================================== --- dev-tools/eclipse/dot.classpath (revision 1234831) +++ dev-tools/eclipse/dot.classpath (working copy) @@ -52,6 +52,8 @@ + + Index: dev-tools/maven/lucene/contrib/pruning/pom.xml.template =================================================================== --- dev-tools/maven/lucene/contrib/pruning/pom.xml.template (revision 0) +++ dev-tools/maven/lucene/contrib/pruning/pom.xml.template (working copy) @@ -0,0 +1,66 @@ + + + 4.0.0 + + org.apache.lucene + lucene-parent + @version@ + ../../pom.xml + + org.apache.lucene + lucene-pruning + jar + Lucene Pruning + Pruning Lucene indexes by various criteria. 
+ + lucene/contrib/pruning + ../../build/contrib/pruning + + + + ${project.groupId} + lucene-core + ${project.version} + + + ${project.groupId} + lucene-test-framework + ${project.version} + test + + + + ${build-directory} + ${build-directory}/classes/java + ${build-directory}/classes/test + src/java + src/test + + + ${project.build.testSourceDirectory} + + **/*.java + + + + + Property changes on: dev-tools/maven/lucene/contrib/pruning/pom.xml.template ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* Index: dev-tools/maven/lucene/contrib/pom.xml.template =================================================================== --- dev-tools/maven/lucene/contrib/pom.xml.template (revision 1234831) +++ dev-tools/maven/lucene/contrib/pom.xml.template (working copy) @@ -48,6 +48,7 @@ spatial spellchecker xml-query-parser + pruning ../build/contrib/lucene-contrib-aggregator