();
+
+ // parameters for top-K pruning
+ int topkK = CarmelTopKTermPruningPolicy.DEFAULT_TOP_K;
+ float topkEpsilon = CarmelTopKTermPruningPolicy.DEFAULT_EPSILON;
+ int topkR = CarmelTopKTermPruningPolicy.DEFAULT_R;
+
+ String impl = null;
+ for (int i = 0; i < args.length; i++) {
+ if (args[i].equals("-in")) {
+ Directory d = FSDirectory.open(new File(args[++i]));
+ if (!IndexReader.indexExists(d)) {
+ System.err.println("WARN: no index in " + args[i] + ", skipping ...");
+ // BUGFIX: actually skip this input as the message promises; previously
+ // control fell through and IndexReader.open(d) threw on the missing index.
+ continue;
+ }
+ inputs.add(IndexReader.open(d, true));
+ } else if (args[i].equals("-out")) {
+ File outFile = new File(args[++i]);
+ if (outFile.exists()) {
+ throw new Exception("Output " + outFile + " already exists.");
+ }
+ outFile.mkdirs();
+ out = FSDirectory.open(outFile);
+ } else if (args[i].equals("-impl")) {
+ impl = args[++i];
+ } else if (args[i].equals("-t")) {
+ thr = Float.parseFloat(args[++i]);
+ } else if (args[i].equals("-topkk")) {
+ topkK = Integer.parseInt(args[++i]);
+ } else if (args[i].equals("-topke")) {
+ topkEpsilon = Float.parseFloat(args[++i]);
+ } else if (args[i].equals("-topkr")) {
+ topkR = Integer.parseInt(args[++i]);
+ } else if (args[i].equals("-del")) {
+ String[] fields = args[++i].split(",");
+ for (String f : fields) {
+ // parse field spec: "name" or "name:flags", flags being any of p,P,s,v
+ String[] spec = f.split(":");
+ int opts = PruningPolicy.DEL_ALL;
+ // BUGFIX: a flags part exists only when the spec contains ':' (length > 1).
+ // The old test 'spec.length > 0' was always true for a non-empty token,
+ // and spec[1] threw ArrayIndexOutOfBoundsException for a bare field name.
+ if (spec.length > 1) {
+ opts = 0;
+ if (spec[1].indexOf('p') != -1) {
+ opts |= PruningPolicy.DEL_POSTINGS;
+ }
+ if (spec[1].indexOf('P') != -1) {
+ opts |= PruningPolicy.DEL_PAYLOADS;
+ }
+ if (spec[1].indexOf('s') != -1) {
+ opts |= PruningPolicy.DEL_STORED;
+ }
+ if (spec[1].indexOf('v') != -1) {
+ opts |= PruningPolicy.DEL_VECTOR;
+ }
+ }
+ delFields.put(spec[0], opts);
+ }
+ } else if (args[i].equals("-conf")) {
+ ++i;
+ System.err.println("WARN: -conf option not implemented yet.");
+ } else {
+ throw new Exception("Invalid argument: '" + args[i] + "'");
+ }
+ }
+ // validate required arguments before doing any work
+ if (impl == null) {
+ throw new Exception("Must select algorithm implementation");
+ }
+ if (inputs.size() == 0) {
+ throw new Exception("At least one input index is required.");
+ }
+ if (out == null) {
+ throw new Exception("Output path is not set.");
+ }
+ if (thr == -1) {
+ throw new Exception("Threshold value is not set.");
+ }
+ // wrap multiple inputs in a MultiReader (which takes ownership and closes them)
+ IndexReader in;
+ if (inputs.size() == 1) {
+ in = inputs.get(0);
+ } else {
+ in = new MultiReader((IndexReader[])inputs
+ .toArray(new IndexReader[inputs.size()]), true);
+ }
+ if (in.hasDeletions()) {
+ System.err.println("WARN: input index(es) with deletions - document ID-s will NOT be preserved!");
+ }
+ IndexReader pruning = null;
+ StorePruningPolicy stp = null;
+ if (delFields.size() > 0) {
+ stp = new StorePruningPolicy(in, delFields);
+ }
+ // select the term-pruning algorithm
+ TermPruningPolicy tpp = null;
+ if (impl.equals("tf")) {
+ tpp = new TFTermPruningPolicy(in, delFields, null, (int)thr);
+ } else if (impl.equals("carmel")) {
+ tpp = new CarmelUniformTermPruningPolicy(in, delFields, null, thr, null);
+ } else if (impl.equals("carmeltopk")) {
+ tpp = new CarmelTopKTermPruningPolicy(in, delFields, topkK, topkEpsilon, topkR, null);
+ } else if (impl.equals("ridf")) {
+ tpp = new RIDFTermPruningPolicy(in, delFields, null, thr);
+ } else {
+ throw new Exception("Unknown algorithm: '" + impl + "'");
+ }
+ // copy the pruned view of the input into the output index
+ pruning = new PruningReader(in, stp, tpp);
+ IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_31,
+ new WhitespaceAnalyzer(Version.LUCENE_31));
+ IndexWriter iw = new IndexWriter(out, cfg);
+ iw.addIndexes(new IndexReader[]{pruning});
+ iw.close();
+ System.err.println("DONE.");
+ return 0;
+ }
+}
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningPolicy.java (revision 0)
@@ -0,0 +1,34 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * General Definitions for Index Pruning, such as operations to be performed on field data.
+ * <p>The <code>DEL_*</code> constants are bit flags: combine them with bitwise
+ * OR (e.g. <code>DEL_POSTINGS | DEL_STORED</code>) to request several
+ * operations for the same field. {@link #DEL_ALL} has all bits set and
+ * therefore implies every other flag.
+ * <p>This class is the common superclass of concrete pruning policies
+ * (e.g. {@link TermPruningPolicy}), which is why it is instantiable rather
+ * than a pure constant holder.
+ */
+public class PruningPolicy {
+ /** Delete (some or all) postings for this field. */
+ public static final int DEL_POSTINGS = 0x01;
+ /** Delete (some or all) stored values for this field. */
+ public static final int DEL_STORED = 0x02;
+ /** Delete term frequency vectors for this field (whole vectors or individual terms). */
+ public static final int DEL_VECTOR = 0x04;
+ /** Delete (some or all) payloads in these fields. */
+ public static final int DEL_PAYLOADS = 0x08;
+ /** Delete all data for this field. */
+ public static final int DEL_ALL = 0xff;
+}
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/TFTermPruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/TFTermPruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/TFTermPruningPolicy.java (revision 0)
@@ -0,0 +1,133 @@
+package org.apache.lucene.index;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+
+/**
+ * Policy for producing smaller index out of an input index, by removing postings data
+ * for those terms where their in-document frequency is below a specified
+ * threshold.
+ *
+ * Larger threshold value will produce a smaller index.
+ * See {@link TermPruningPolicy} for size vs performance considerations.
+ *
+ * This implementation uses simple term frequency thresholds to remove all postings
+ * from documents where a given term occurs rarely (i.e. its TF in a document
+ * is smaller than the threshold).
+ *
+ * Threshold values in this method are expressed as absolute term frequencies.
+ */
+public class TFTermPruningPolicy extends TermPruningPolicy {
+ /** Per-field ("field") and per-term ("field:text") threshold overrides. */
+ protected Map<String, Integer> thresholds;
+ /** Threshold applied when no per-field / per-term override matches. */
+ protected int defThreshold;
+ /** Threshold selected for the term last passed to {@link #initPositionsTerm}. */
+ protected int curThr;
+
+ /**
+ * @param in input reader
+ * @param fieldFlags per-field DEL_* flags (see {@link PruningPolicy}), may be null
+ * @param thresholds per-field ("field") or per-term ("field:text") threshold
+ * overrides, may be null
+ * @param defThreshold default minimum in-document frequency
+ */
+ protected TFTermPruningPolicy(IndexReader in, Map<String, Integer> fieldFlags,
+ Map<String, Integer> thresholds, int defThreshold) {
+ super(in, fieldFlags);
+ this.defThreshold = defThreshold;
+ if (thresholds != null) {
+ this.thresholds = thresholds;
+ } else {
+ this.thresholds = Collections.emptyMap();
+ }
+ }
+
+ @Override
+ public boolean pruneTermEnum(TermEnum te) throws IOException {
+ // check that at least one doc exceeds threshold
+ int thr = defThreshold;
+ String termKey = te.term().field() + ":" + te.term().text();
+ if (thresholds.containsKey(termKey)) {
+ thr = thresholds.get(termKey);
+ } else if (thresholds.containsKey(te.term().field())) {
+ thr = thresholds.get(te.term().field());
+ }
+ TermDocs td = in.termDocs(te.term());
+ boolean pass = false;
+ try {
+ // BUGFIX: a freshly obtained TermDocs is positioned before the first
+ // document, so next() must be called before doc()/freq(). The original
+ // do/while read freq() of the unpositioned enumerator.
+ while (td.next()) {
+ if (td.freq() >= thr) {
+ pass = true;
+ break;
+ }
+ }
+ } finally {
+ td.close(); // don't leak the enumerator if next()/freq() throws
+ }
+ return !pass;
+ }
+
+ @Override
+ public void initPositionsTerm(TermPositions in, Term t) throws IOException {
+ // set threshold for this field (or for this exact term, if overridden)
+ curThr = defThreshold;
+ String termKey = t.field() + ":" + t.text();
+ if (thresholds.containsKey(termKey)) {
+ curThr = thresholds.get(termKey);
+ } else if (thresholds.containsKey(t.field())) {
+ curThr = thresholds.get(t.field());
+ }
+ }
+
+ @Override
+ public boolean pruneAllPositions(TermPositions termPositions, Term t)
+ throws IOException {
+ // prune the whole posting when its in-document frequency is below threshold
+ return termPositions.freq() < curThr;
+ }
+
+ @Override
+ public int pruneTermVectorTerms(int docNumber, String field, String[] terms,
+ int[] freqs, TermFreqVector tfv)
+ throws IOException {
+ int thr = defThreshold;
+ if (thresholds.containsKey(field)) {
+ thr = thresholds.get(field);
+ }
+ int removed = 0;
+ for (int i = 0; i < terms.length; i++) {
+ // check per-term thresholds
+ int termThr = thr;
+ String t = field + ":" + terms[i];
+ if (thresholds.containsKey(t)) {
+ termThr = thresholds.get(t);
+ }
+ if (freqs[i] < termThr) {
+ terms[i] = null; // contract: null-out pruned slots, return their count
+ removed++;
+ }
+ }
+ return removed;
+ }
+
+ @Override
+ public int pruneSomePositions(int docNum, int[] positions, Term curTerm) {
+ return 0; //this policy either prunes all or none, so nothing to prune here
+ }
+
+}
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningReader.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningReader.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/PruningReader.java (revision 0)
@@ -0,0 +1,321 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.logging.Logger;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FilterIndexReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.SegmentTermPositionVector;
+import org.apache.lucene.index.SegmentTermVector;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+
+/**
+ * This class produces a subset of the input index, by removing some postings
+ * data according to rules implemented in a {@link TermPruningPolicy}, and
+ * optionally it can also remove stored fields of documents according to rules
+ * implemented in a {@link StorePruningPolicy}.
+ */
+public class PruningReader extends FilterIndexReader {
+ private static final Logger LOG = Logger.getLogger(PruningReader.class.getName());
+
+ protected int docCount;
+ protected int vecCount;
+ protected int termCount, delTermCount;
+ protected int prunedVecCount, delVecCount;
+
+ protected TermPruningPolicy termPolicy;
+ protected StorePruningPolicy storePolicy;
+
+ /**
+ * Constructor.
+ * @param in input reader
+ * @param storePolicy implementation of {@link StorePruningPolicy} - if null
+ * then stored values will be retained as is.
+ * @param termPolicy implementation of {@link TermPruningPolicy}, must not
+ * be null.
+ */
+ public PruningReader(IndexReader in, StorePruningPolicy storePolicy,
+ TermPruningPolicy termPolicy) {
+ super(in);
+ this.termPolicy = termPolicy;
+ assert termPolicy != null;
+ this.storePolicy = storePolicy;
+ }
+
+ /**
+ * Applies a {@link StorePruningPolicy} to stored fields of a document.
+ */
+ @Override
+ public Document document(final int n, FieldSelector fieldSelector)
+ throws CorruptIndexException, IOException {
+ docCount++;
+ if ((docCount % 10000) == 0) {
+ LOG.info(" - stored fields: " + docCount + " docs.");
+ }
+ if (storePolicy != null) {
+ return storePolicy.pruneDocument(n, fieldSelector);
+ } else {
+ return in.document(n, fieldSelector);
+ }
+ }
+
+ /**
+ * Applies a {@link StorePruningPolicy} to the list of available field names.
+ */
+ @Override
+ public Collection getFieldNames(FieldOption fieldNames) {
+ Collection res = super.getFieldNames(fieldNames);
+ if (storePolicy == null) {
+ return res;
+ }
+ return storePolicy.getFieldNames(res);
+ }
+
+ /**
+ * Applies {@link TermPruningPolicy} to terms inside term vectors.
+ */
+ @Override
+ public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
+ TermFreqVector[] vectors = super.getTermFreqVectors(docNumber);
+ if (vectors == null) {
+ return null;
+ }
+ ArrayList newVectors = new ArrayList();
+ for (TermFreqVector v : vectors) {
+ if (v == null) {
+ continue;
+ }
+ if (termPolicy.pruneWholeTermVector(docNumber, v.getField())) {
+ delVecCount++;
+ if ((delVecCount % 10000) == 0) {
+ LOG.info(" - deleted vectors: " + delVecCount);
+ }
+ continue;
+ }
+ if (v.size() == 0) {
+ continue;
+ }
+ String[] terms = v.getTerms();
+ int[] freqs = v.getTermFrequencies();
+
+ int removed = termPolicy.pruneTermVectorTerms(docNumber, v.getField(), terms, freqs, v);
+ if (removed > 0 && removed < terms.length) {
+ String[] newTerms = new String[terms.length - removed];
+ int[] newFreqs = new int[terms.length - removed];
+ int j = 0;
+ for (int i = 0; i < terms.length; i++) {
+ if (terms[i] != null) {
+ newTerms[j] = terms[i];
+ newFreqs[j] = freqs[i];
+ j++;
+ }
+ }
+ // create a modified vector
+ if (v instanceof TermPositionVector) {
+ TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[terms.length - removed][];
+ boolean withOffsets = false;
+ j = 0;
+ for (int i = 0; i < terms.length; i++) {
+ if (terms[i] == null) {
+ continue;
+ }
+ offsets[j] = ((TermPositionVector) v).getOffsets(i);
+ if (offsets[j] != null && offsets[j] != TermVectorOffsetInfo.EMPTY_OFFSET_INFO) {
+ withOffsets = true;
+ }
+ j++;
+ }
+ j = 0;
+ int[][] positions = new int[terms.length - removed][];
+ boolean withPositions = false;
+ for (int i = 0; i < terms.length; i++) {
+ if (terms[i] == null) {
+ continue;
+ }
+ positions[j] = ((TermPositionVector) v).getTermPositions(i);
+ if (positions[j] != null && positions[j].length > 0) {
+ withPositions = true;
+ }
+ j++;
+ }
+ v = new SegmentTermPositionVector(v.getField(), newTerms, newFreqs,
+ withPositions ? positions : null,
+ withOffsets ? offsets : null);
+ } else {
+ v = new SegmentTermVector(v.getField(), newTerms, newFreqs);
+ }
+ newVectors.add(v);
+ }
+ }
+ vecCount++;
+ if ((vecCount % 10000) == 0) {
+ LOG.info(" - vectors: " + vecCount + " docs.");
+ }
+ if (newVectors.size() == 0) {
+ prunedVecCount++;
+ if ((prunedVecCount % 1000) == 0) {
+ LOG.info(" - deleted pruned vectors: " + prunedVecCount);
+ }
+ return null;
+ }
+ return (TermFreqVector[]) newVectors.toArray(new TermFreqVector[newVectors.size()]);
+ }
+
+ /**
+ * Applies {@link TermPruningPolicy} to term positions.
+ */
+ @Override
+ public TermPositions termPositions() throws IOException {
+ return new PruningTermPositions(in.termPositions());
+ }
+
+ /**
+ * Applies {@link TermPruningPolicy} to term enum.
+ */
+ @Override
+ public TermEnum terms() throws IOException {
+ return new PruningTermEnum(in.terms());
+ }
+
+ private class PruningTermEnum extends FilterTermEnum {
+
+ private PruningTermEnum(TermEnum in) {
+ super(in);
+ }
+
+ @Override
+ public boolean next() throws IOException {
+ for (;;) {
+ if (!super.next()) {
+ // System.out.println("TE: end");
+ return false;
+ }
+ termCount++;
+ if ((termCount % 50000) == 0) {
+ LOG.info(" - terms: " + termCount + " (" + term() + "), deleted: " + delTermCount);
+ }
+ if (termPolicy.pruneAllFieldPostings(term().field())
+ || termPolicy.pruneTermEnum(in)) {
+ delTermCount++;
+ // System.out.println("TE: remove " + term());
+ continue;
+ }
+ // System.out.println("TE: pass " + term());
+ return true;
+ }
+ }
+
+ }
+
+ private class PruningTermPositions extends FilterTermPositions {
+
+ protected Term curTerm = null;
+ protected int[] positions;
+ protected TermPositions tp;
+ protected int curFreq;
+ protected int posPos;
+
+ private PruningTermPositions(TermPositions in) {
+ super(in);
+ tp = in;
+ }
+
+ @Override
+ public void seek(Term t) throws IOException {
+ super.seek(t);
+ informPolicy(t);
+ }
+
+ @Override
+ public void seek(TermEnum termEnum) throws IOException {
+ super.seek(termEnum);
+ informPolicy(termEnum.term());
+ }
+
+ private void informPolicy(Term t) throws IOException {
+ termPolicy.initPositionsTerm(tp, t);
+ curTerm = new Term(t.field(), t.text());
+ }
+
+ @Override
+ public boolean next() throws IOException {
+ for (;;) {
+ positions = null;
+ if (!super.next()) {
+ return false;
+ }
+ if (termPolicy.pruneAllPositions(tp, curTerm)) {
+ continue;
+ }
+ break;
+ }
+ // prepare the positions
+ positions = new int[tp.freq()];
+ for (int i = 0; i < positions.length; i++) {
+ positions[i] = tp.nextPosition();
+ }
+ int pruned = termPolicy.pruneSomePositions(tp.doc(), positions, curTerm);
+ if (pruned > 0) {
+ int[] newPositions = new int[positions.length - pruned];
+ int j = 0;
+ for (int i = 0; i < positions.length; i++) {
+ if (positions[i] < 0) {
+ continue;
+ } else {
+ newPositions[j++] = positions[i];
+ }
+ }
+ positions = newPositions;
+ }
+ curFreq = positions.length;
+ posPos = 0;
+ return true;
+ }
+
+ @Override
+ public int nextPosition() throws IOException {
+ return positions[posPos++];
+ }
+
+ @Override
+ public int freq() {
+ return curFreq;
+ }
+
+ @Override
+ public boolean isPayloadAvailable() {
+ if (!super.isPayloadAvailable()) {
+ return false;
+ }
+ if (termPolicy.prunePayload((TermPositions) in, curTerm)) {
+ return false;
+ }
+ return true;
+ }
+ }
+}
\ No newline at end of file
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/TermPruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/TermPruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/TermPruningPolicy.java (revision 0)
@@ -0,0 +1,205 @@
+package org.apache.lucene.index;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+
+/**
+ * Policy for producing smaller index out of an input index, by examining its terms
+ * and removing from the index some or all of their data as follows:
+ *
+ * - all terms of a certain field - see {@link #pruneAllFieldPostings(String)}
+ * - all data of a certain term - see {@link #pruneTermEnum(TermEnum)}
+ * - all positions of a certain term in a certain document - see #pruneAllPositions(TermPositions, Term)
+ * - some positions of a certain term in a certain document - see #pruneSomePositions(int, int[], Term)
+ *
+ *
+ * The pruned, smaller index would, for many types of queries return nearly
+ * identical top-N results as compared with the original index, but with increased performance.
+ *
+ * Pruning of indexes is handy for producing small first-tier indexes that fit
+ * completely in RAM, and store these indexes using {@link IndexWriter#addIndexes(IndexReader...)}
+ *
+ * Interestingly, if the input index is optimized (i.e. doesn't contain deletions),
+ * then the index produced via {@link IndexWriter#addIndexes(IndexReader[])} will preserve internal document
+ * id-s so that they are in sync with the original index. This means that
+ * all other auxiliary information not necessary for first-tier processing, such
+ * as some stored fields, can also be removed, to be quickly retrieved on-demand
+ * from the original index using the same internal document id. See
+ * {@link StorePruningPolicy} for information about removing stored fields.
+ *
+ * Please note that while this family of policies method produces good results for term queries it
+ * often leads to poor results for phrase queries (because postings are removed
+ * without considering whether they belong to an important phrase).
+ *
+ * Aggressive pruning policies produce smaller indexes -
+ * search performance increases, and recall decreases (i.e. search quality
+ * deteriorates).
+ *
+ * See e.g. Carmel et al., "Static Index Pruning for Information Retrieval
+ * Systems" (SIGIR 2001) for a discussion of this problem and the proposed
+ * solutions to improve the quality of a pruned index (not implemented here).
+ */
+public abstract class TermPruningPolicy extends PruningPolicy {
+ /** Field name -> bitwise OR of DEL_* flags requested for that field. */
+ protected Map<String, Integer> fieldFlags;
+ /** The input reader whose data is being pruned. */
+ protected IndexReader in;
+
+ /**
+ * Construct a policy.
+ * @param in input reader
+ * @param fieldFlags a map, where keys are field names and values
+ * are bitwise-OR flags of operations to be performed (see
+ * {@link PruningPolicy} for more details). May be null, meaning that
+ * no per-field operations are requested.
+ */
+ protected TermPruningPolicy(IndexReader in, Map<String, Integer> fieldFlags) {
+ this.in = in;
+ if (fieldFlags != null) {
+ this.fieldFlags = fieldFlags;
+ } else {
+ this.fieldFlags = Collections.emptyMap();
+ }
+ }
+
+ /** Returns true if the field was configured with the given DEL_* flag. */
+ private boolean hasFlag(String field, int flag) {
+ Integer flags = fieldFlags.get(field);
+ return flags != null && (flags.intValue() & flag) != 0;
+ }
+
+ /**
+ * Term vector pruning.
+ * @param docNumber document number
+ * @param field field name
+ * @return true if the complete term vector for this field should be
+ * removed (as specified by {@link PruningPolicy#DEL_VECTOR} flag).
+ * @throws IOException
+ */
+ public boolean pruneWholeTermVector(int docNumber, String field)
+ throws IOException {
+ return hasFlag(field, DEL_VECTOR);
+ }
+
+ /**
+ * Pruning of all postings for a field
+ * @param field field name
+ * @return true if all postings for all terms in this field should be
+ * removed (as specified by {@link PruningPolicy#DEL_POSTINGS}).
+ * @throws IOException
+ */
+ public boolean pruneAllFieldPostings(String field) throws IOException {
+ return hasFlag(field, DEL_POSTINGS);
+ }
+
+ /**
+ * Called when moving {@link TermPositions} to a new {@link Term}.
+ * @param in input term positions
+ * @param t current term
+ * @throws IOException
+ */
+ public abstract void initPositionsTerm(TermPositions in, Term t)
+ throws IOException;
+
+ /**
+ * Called when checking for the presence of payload for the current
+ * term at a current position
+ * @param in positioned term positions
+ * @param curTerm current term associated with these positions
+ * @return true if the payload should be removed, false otherwise.
+ */
+ public boolean prunePayload(TermPositions in, Term curTerm) {
+ return hasFlag(curTerm.field(), DEL_PAYLOADS);
+ }
+
+ /**
+ * Pruning of individual terms in term vectors.
+ * @param docNumber document number
+ * @param field field name
+ * @param terms array of terms
+ * @param freqs array of term frequencies
+ * @param v the original term frequency vector
+ * @return 0 if no terms are to be removed, positive number to indicate
+ * how many terms need to be removed. The same number of entries in the terms
+ * array must be set to null to indicate which terms to remove.
+ * @throws IOException
+ */
+ public abstract int pruneTermVectorTerms(int docNumber, String field,
+ String[] terms, int[] freqs, TermFreqVector v) throws IOException;
+
+ /**
+ * Pruning of all postings for a term (invoked once per term).
+ * @param te positioned term enum.
+ * @return true if all postings for this term should be removed, false
+ * otherwise.
+ * @throws IOException
+ */
+ public abstract boolean pruneTermEnum(TermEnum te) throws IOException;
+
+ /**
+ * Prune all postings per term (invoked once per term per doc)
+ * @param termPositions positioned term positions. Implementations MUST NOT
+ * advance this by calling {@link TermPositions} methods that advance either
+ * the position pointer (next, skipTo) or term pointer (seek).
+ * @param t current term
+ * @return true if the current posting should be removed, false otherwise.
+ * @throws IOException
+ */
+ public abstract boolean pruneAllPositions(TermPositions termPositions, Term t)
+ throws IOException;
+
+ /**
+ * Prune some postings per term (invoked once per term per doc).
+ * @param docNum current document number
+ * @param positions original term positions in the document (and indirectly
+ * term frequency)
+ * @param curTerm current term
+ * @return 0 if no postings are to be removed, or positive number to indicate
+ * how many postings need to be removed. The same number of entries in the
+ * positions array must be set to -1 to indicate which positions to remove.
+ */
+ public abstract int pruneSomePositions(int docNum, int[] positions,
+ Term curTerm);
+
+}
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelTopKTermPruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelTopKTermPruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelTopKTermPruningPolicy.java (revision 0)
@@ -0,0 +1,273 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.search.Collector;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopScoreDocCollector;
+
+/**
+ * Pruning policy with a search quality parameterized guarantee - configuration
+ * of this policy allows to specify two parameters, <code>k</code> and
+ * <code>epsilon</code>, such that:
+ * <blockquote>
+ * For any OR query with r terms, the score of each of the top
+ * k results in the original index should be "practically the same" as
+ * the score of that document in the pruned index: the scores difference
+ * should not exceed r * epsilon.
+ * </blockquote>
+ * See the following paper for more details about this method: "Static index
+ * pruning for information retrieval systems", D. Carmel et al., ACM SIGIR 2001.
+ * <p>
+ * The claim of this pruning technique is, quoting from the above paper:
+ * <blockquote>
+ * Prune the index in such a way that a human
+ * "cannot distinguish the difference" between the results of a search engine
+ * whose index is pruned and one whose index is not pruned.
+ * </blockquote>
+ * For indexes with a large number of terms this policy might be too slow. In
+ * such situations, the uniform pruning approach in
+ * {@link CarmelUniformTermPruningPolicy} will be faster, though it might
+ * produce inferior search quality, as that policy does not pose a theoretical
+ * guarantee on resulted search quality.
+ * <p>
+ * TODO implement also CarmelTermPruningDeltaTopPolicy
+ */
+public class CarmelTopKTermPruningPolicy extends TermPruningPolicy {
+
+  /** Default number of guaranteed top K scores. */
+  public static final int DEFAULT_TOP_K = 10;
+
+  /** Default number of query terms. */
+  public static final int DEFAULT_R = 1;
+
+  /** Default largest meaningless score difference. */
+  public static final float DEFAULT_EPSILON = .001f;
+
+  // index into 'docs' of the next doc id that should be retained
+  private int docsPos = 0;
+  private int k;
+  // doc ids (sorted ascending by id) whose postings are kept for the current term
+  private ScoreDoc[] docs = null;
+  private IndexSearcher is;
+  // set per term by initPositionsTerm: true when nothing should be pruned
+  private boolean noPruningForCurrentTerm;
+  // epsilon * r: the largest "meaningless" score difference
+  private float scoreDelta;
+
+  /**
+   * Constructor with default parameters.
+   *
+   * @see #DEFAULT_TOP_K
+   * @see #DEFAULT_EPSILON
+   * @see #DEFAULT_R
+   * @see DefaultSimilarity
+   * @see #CarmelTopKTermPruningPolicy(IndexReader, Map, int, float, int, Similarity)
+   */
+  protected CarmelTopKTermPruningPolicy(IndexReader in,
+      Map<String, Integer> fieldFlags) {
+    this(in, fieldFlags, DEFAULT_TOP_K, DEFAULT_EPSILON, DEFAULT_R, null);
+  }
+
+  /**
+   * Constructor with specific settings.
+   *
+   * @param in reader for original index
+   * @param fieldFlags per-field pruning flags (see {@link PruningPolicy})
+   * @param k number of guaranteed top scores. Each top K result in the pruned
+   *          index is either also an original top K result or its original
+   *          score is indistinguishable from some original top K result.
+   * @param epsilon largest meaningless score difference. Results whose score
+   *          difference is smaller or equal to epsilon are considered
+   *          indistinguishable.
+   * @param r maximal number of terms in a query for which search quality in
+   *          the pruned index is guaranteed
+   * @param sim similarity to use when selecting top docs for each index term.
+   *          When null, {@link DefaultSimilarity} is used.
+   */
+  protected CarmelTopKTermPruningPolicy(IndexReader in,
+      Map<String, Integer> fieldFlags, int k, float epsilon, int r,
+      Similarity sim) {
+    super(in, fieldFlags);
+    this.k = k;
+    is = new IndexSearcher(in);
+    is.setSimilarity(sim != null ? sim : new DefaultSimilarity());
+    scoreDelta = epsilon * r;
+  }
+
+  // too costly - pass everything at this stage
+  @Override
+  public boolean pruneTermEnum(TermEnum te) throws IOException {
+    return false;
+  }
+
+  /**
+   * Select, for the current term, the set of documents whose postings are
+   * retained: all docs whose score is not "meaningfully" below the k-th best.
+   */
+  @Override
+  public void initPositionsTerm(TermPositions tp, Term t) throws IOException {
+    // check if there's any point to prune this term
+    int df = in.docFreq(t);
+    noPruningForCurrentTerm = (df <= k);
+    if (noPruningForCurrentTerm) {
+      return;
+    }
+    // take more results (k2 > k), attempting for sufficient results to avoid
+    // a second search
+    int k2 = Math.min(2 * k, k + 100); // for small k's 2*k will do, but for
+                                       // large ones (1000's) keep overhead smaller
+    k2 = Math.min(k2, df); // no more than the potential number of results
+    TopScoreDocCollector collector = TopScoreDocCollector.create(k2, true);
+    TermQuery tq = new TermQuery(t);
+    is.search(tq, collector);
+    docs = collector.topDocs().scoreDocs;
+    if (docs.length < k) {
+      // FIX: deleted docs may reduce the number of hits below k - previously
+      // this caused an ArrayIndexOutOfBoundsException at docs[k - 1]. All
+      // remaining postings are then within the top k, so keep them all.
+      noPruningForCurrentTerm = true;
+      return;
+    }
+    float threshold = docs[k - 1].score - scoreDelta;
+
+    int nLast = k2 - 1;
+    nLast = Math.min(nLast, docs.length - 1); // protect in case of deleted docs
+    if (docs[nLast].score < threshold) {
+      // this is the better/faster case - no need to go over docs again - we
+      // have top ones
+      int n = nLast;
+      while (docs[n - 1].score < threshold)
+        --n; // n == num-valid-docs == first-invalid-doc
+      ScoreDoc[] subset = new ScoreDoc[n];
+      System.arraycopy(docs, 0, subset, 0, n);
+      docs = subset;
+      // sort by doc but only after taking top scores
+      Arrays.sort(docs, ByDocComparator.INSTANCE);
+    } else {
+      // this is the worse case - must go over docs again
+      ThresholdCollector thresholdCollector = new ThresholdCollector(threshold);
+      is.search(tq, thresholdCollector);
+      docs = thresholdCollector.scoreDocs.toArray(new ScoreDoc[0]);
+    }
+    docsPos = 0;
+  }
+
+  @Override
+  public boolean pruneAllPositions(TermPositions termPositions, Term t)
+      throws IOException {
+    if (noPruningForCurrentTerm) {
+      return false;
+    }
+    if (docsPos >= docs.length) { // used up all doc id-s
+      return true; // skip any remaining docs
+    }
+    while ((docsPos < docs.length - 1)
+        && termPositions.doc() > docs[docsPos].doc) {
+      docsPos++;
+    }
+    if (termPositions.doc() == docs[docsPos].doc) {
+      // pass
+      docsPos++; // move to next doc id
+      return false;
+    } else if (termPositions.doc() < docs[docsPos].doc) {
+      return true; // skip this one - it's less important
+    }
+    // should not happen: every retained doc must appear in the postings
+    throw new IOException("termPositions.doc > docs[docsPos].doc");
+  }
+
+  // it probably doesn't make sense to prune term vectors using this method,
+  // due to its overhead
+  @Override
+  public int pruneTermVectorTerms(int docNumber, String field, String[] terms,
+      int[] freqs, TermFreqVector tfv) throws IOException {
+    return 0;
+  }
+
+  /** Orders {@link ScoreDoc}-s by ascending doc id. */
+  public static class ByDocComparator implements Comparator<ScoreDoc> {
+    public static final ByDocComparator INSTANCE = new ByDocComparator();
+
+    public int compare(ScoreDoc o1, ScoreDoc o2) {
+      // doc ids are non-negative, so the subtraction cannot overflow
+      return o1.doc - o2.doc;
+    }
+  }
+
+  @Override
+  public int pruneSomePositions(int docNum, int[] positions, Term curTerm) {
+    return 0; // this policy either prunes all or none, so nothing to prune here
+  }
+
+  /**
+   * Collect all docs with score >= threshold.
+   */
+  private static class ThresholdCollector extends Collector {
+
+    private ArrayList<ScoreDoc> scoreDocs = new ArrayList<ScoreDoc>();
+    private Scorer scorer;
+    private float threshold;
+    private int docBase;
+
+    public ThresholdCollector(float threshold) {
+      this.threshold = threshold;
+    }
+
+    @Override
+    public boolean acceptsDocsOutOfOrder() {
+      return false;
+    }
+
+    @Override
+    public void collect(int doc) throws IOException {
+      float score = scorer.score();
+      if (score >= threshold) {
+        scoreDocs.add(new ScoreDoc(docBase + doc, score));
+      }
+    }
+
+    @Override
+    public void setNextReader(IndexReader reader, int docBase)
+        throws IOException {
+      this.docBase = docBase;
+    }
+
+    @Override
+    public void setScorer(Scorer scorer) throws IOException {
+      this.scorer = scorer;
+    }
+
+  }
+}
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelUniformTermPruningPolicy.java
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelUniformTermPruningPolicy.java (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/CarmelUniformTermPruningPolicy.java (revision 0)
@@ -0,0 +1,186 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Map;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopScoreDocCollector;
+
+/**
+ * Enhanced implementation of Carmel Uniform Pruning: for every term, only a
+ * fraction of its top-scoring postings (as ranked by a simple
+ * {@link TermQuery} search) is retained, and all other postings for that term
+ * are pruned.
+ * <p>
+ * See {@link CarmelTopKTermPruningPolicy} for a link to the paper describing
+ * this family of pruning methods.
+ * <p>
+ * Conclusions of that paper indicate that it's best to compute per-term
+ * thresholds, as done in {@link CarmelTopKTermPruningPolicy}. However, for
+ * large indexes with a large number of terms that method might be too slow,
+ * and the (enhanced) uniform approach implemented here will likely be faster,
+ * although it might produce inferior search quality.
+ * <p>
+ * This implementation enhances the Carmel uniform pruning approach by
+ * allowing to specify three levels of thresholds:
+ * <ul>
+ * <li>one default threshold - globally (for terms in all fields)</li>
+ * <li>a threshold per field</li>
+ * <li>a threshold per term</li>
+ * </ul>
+ * These thresholds are applied so that always the most specific one takes
+ * precedence: first a per-term threshold is used if present, then a per-field
+ * threshold if present, and finally the default threshold.
+ * <p>
+ * Thresholds are maintained in a map, keyed by either field names or terms in
+ * field:text format. Threshold values are treated as a fraction of a term's
+ * document frequency: for each term, the top (df * threshold) scoring
+ * documents are retained, where the top docs are established by using a
+ * regular {@link IndexSearcher} and {@link Similarity} to run a simple
+ * {@link TermQuery}.
+ * <p>
+ * A smaller threshold value will produce a smaller index. See
+ * {@link TermPruningPolicy} for size vs. performance considerations.
+ * <p>
+ * For indexes with a large number of terms this policy might be still too
+ * slow, since it issues a term query for each term in the index. In such
+ * situations, the term frequency pruning approach in
+ * {@link TFTermPruningPolicy} will be faster, though it might produce
+ * inferior search quality.
+ */
+public class CarmelUniformTermPruningPolicy extends TermPruningPolicy {
+  // index into 'docs' of the next doc id that should be retained
+  int docsPos = 0;
+  // threshold in effect for the current term (fraction of df to retain)
+  float curThr;
+  float defThreshold;
+  // per-field ("field") and per-term ("field:text") threshold overrides
+  Map<String, Float> thresholds;
+  // doc ids (sorted ascending by id) whose postings are kept for the current term
+  ScoreDoc[] docs = null;
+  IndexSearcher is;
+  Similarity sim;
+
+  /**
+   * Create a policy with the supplied thresholds.
+   *
+   * @param in reader for original index
+   * @param fieldFlags per-field pruning flags (see {@link PruningPolicy})
+   * @param thresholds per-field / per-term threshold overrides, keyed by
+   *          field name or "field:text" format; may be null
+   * @param defThreshold default threshold applied when no override matches
+   * @param sim similarity to use when selecting top docs for each index term.
+   *          When null, {@link DefaultSimilarity} is used.
+   */
+  protected CarmelUniformTermPruningPolicy(IndexReader in,
+      Map<String, Integer> fieldFlags, Map<String, Float> thresholds,
+      float defThreshold, Similarity sim) {
+    super(in, fieldFlags);
+    this.defThreshold = defThreshold;
+    this.thresholds = thresholds != null ? thresholds
+        : Collections.<String, Float> emptyMap();
+    // FIX: previously, when sim == null only the constructor parameter was
+    // reassigned and the 'sim' field was left null
+    this.sim = sim != null ? sim : new DefaultSimilarity();
+    is = new IndexSearcher(in);
+    is.setSimilarity(this.sim);
+  }
+
+  // too costly - pass everything at this stage
+  @Override
+  public boolean pruneTermEnum(TermEnum te) throws IOException {
+    return false;
+  }
+
+  /**
+   * Select, for the current term, the top-scoring documents whose postings
+   * are retained, using the most specific threshold available for the term.
+   */
+  @Override
+  public void initPositionsTerm(TermPositions tp, Term t) throws IOException {
+    curThr = defThreshold;
+    String termKey = t.field() + ":" + t.text();
+    if (thresholds.containsKey(termKey)) {
+      curThr = thresholds.get(termKey); // per-term override wins
+    } else if (thresholds.containsKey(t.field())) {
+      curThr = thresholds.get(t.field()); // then per-field override
+    }
+    // calculate how many postings to retain for this term
+    int df = in.docFreq(t);
+    int count = Math.round(df * curThr);
+    if (count < 100) count = 100; // always keep at least 100 postings per term
+    TopScoreDocCollector collector = TopScoreDocCollector.create(count, true);
+    TermQuery tq = new TermQuery(t);
+    is.search(tq, collector);
+    docs = collector.topDocs().scoreDocs;
+    if (docs.length > count) {
+      // TODO deadcode: can TopScoreDocCollector(count) produce more than
+      // count results?
+      // take top subset *before* sorting by ID
+      ScoreDoc[] subset = new ScoreDoc[count];
+      System.arraycopy(docs, 0, subset, 0, count);
+      docs = subset;
+    }
+    // postings are visited in doc id order, so order the kept docs the same way
+    Arrays.sort(docs, ByDocComparator.INSTANCE);
+    docsPos = 0;
+  }
+
+  @Override
+  public boolean pruneAllPositions(TermPositions termPositions, Term t)
+      throws IOException {
+    if (docsPos >= docs.length) { // used up all doc id-s
+      return true; // skip any remaining docs
+    }
+    while ((docsPos < docs.length - 1)
+        && termPositions.doc() > docs[docsPos].doc) {
+      docsPos++;
+    }
+    if (termPositions.doc() == docs[docsPos].doc) {
+      // pass
+      docsPos++; // move to next doc id
+      return false;
+    } else if (termPositions.doc() < docs[docsPos].doc) {
+      return true; // skip this one - it's less important
+    }
+    // should not happen: every retained doc must appear in the postings
+    throw new IOException("termPositions.doc > docs[docsPos].doc");
+  }
+
+  // it probably doesn't make sense to prune term vectors using this method,
+  // due to its overhead
+  @Override
+  public int pruneTermVectorTerms(int docNumber, String field, String[] terms,
+      int[] freqs, TermFreqVector tfv) throws IOException {
+    return 0;
+  }
+
+  /** Orders {@link ScoreDoc}-s by ascending doc id. */
+  public static class ByDocComparator implements Comparator<ScoreDoc> {
+    public static final ByDocComparator INSTANCE = new ByDocComparator();
+
+    public int compare(ScoreDoc o1, ScoreDoc o2) {
+      // doc ids are non-negative, so the subtraction cannot overflow
+      return o1.doc - o2.doc;
+    }
+  }
+
+  @Override
+  public int pruneSomePositions(int docNum, int[] positions, Term curTerm) {
+    return 0; // this policy either prunes all or none, so nothing to prune here
+  }
+
+}
Index: lucene/contrib/pruning/src/java/org/apache/lucene/index/package.html
===================================================================
--- lucene/contrib/pruning/src/java/org/apache/lucene/index/package.html (revision 0)
+++ lucene/contrib/pruning/src/java/org/apache/lucene/index/package.html (revision 0)
@@ -0,0 +1,33 @@
+
+
+
+
+ Index Pruning
+
+
+
+Static Index Pruning Tools
+
+This package provides a framework for pruning an existing index into
+a smaller index while retaining visible search quality as much as possible.
+
+
+
+
+
+
Index: lucene/contrib/pruning/README.txt
===================================================================
--- lucene/contrib/pruning/README.txt (revision 0)
+++ lucene/contrib/pruning/README.txt (revision 0)
@@ -0,0 +1,30 @@
+Static index pruning tools.
+===========================
+
+This package provides tools and API-s for static index pruning.
+
+Static pruning is an approach that reduces size of the index
+by removing terms and/or postings that are considered less
+important, i.e. they don't affect the quality of top-N
+retrieval too much.
+
+There are several different strategies for pruning, each with
+its own set of pros and cons. Please consult the javadocs of
+TermPruningPolicy subclasses, which also contain references
+to published papers on each method.
+
+There is also a simple command-line driver class that
+can apply some of the common pruning policies:
+
+Usage: PruningTool -impl (tf | carmel | carmeltopk | ridf) (-in [-in ...]) -out -t [-del f1,f2,..] [-conf ] [-topkk ] [-topke ] [-topkr ]
+ -impl (tf | carmel | carmeltopk | ridf) TermPruningPolicy implementation name: TF or CarmelUniform or CarmelTopK or RIDFTerm
+ -in path path to the input index. Can specify multiple input indexes.
+ -out path output path where the output index will be stored.
+ -t NN default threshold value (minimum in-document frequency) for all terms
+ -del f1,f2,.. comma-separated list of field specs to delete (postings, vectors & stored):
+ field spec : fieldName ( ':' [pPsv] )
+ where: p - postings, P - payloads, s - stored value, v - vectors
+ -conf file path to config file with per-term thresholds
+ -topkk NN 'K' for Carmel TopK Pruning: number of guaranteed top scores
+ -topke NN 'Epsilon' for Carmel TopK Pruning: largest meaningless score difference
+ -topkr NN 'R' for Carmel TopK Pruning: planned maximal number of terms in a query on pruned index
\ No newline at end of file
Property changes on: lucene/contrib/pruning/README.txt
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/contrib/pruning/build.xml
===================================================================
--- lucene/contrib/pruning/build.xml (revision 0)
+++ lucene/contrib/pruning/build.xml (revision 0)
@@ -0,0 +1,27 @@
+
+
+
+
+
+
+
+ Pruning Lucene indexes by various criteria
+
+
+
+
Property changes on: lucene/contrib/pruning/build.xml
___________________________________________________________________
Added: svn:eol-style
+ native