StatefulFilter builds on the {@link Filter} class so as to
+ * make it capable of remembering terms across segments spanning the whole
+ * underlying index. The need for making filters stateful stems from the fact
+ * that some, although not most, filters care about what terms they may have
+ * come across in prior segments. Note that, by design, the reader provided to a
+ * {@link Filter} typically only represents a single segment.
+ */
+public abstract class StatefulFilter extends Filter {
+ protected static final long serialVersionUID = 1219268102015079116L;
+
+ // The directory of the index being searched; set on each state reset, and null before the first search and after teardown.
+ protected Directory directory;
+ // The current set of segments seen under the above {@link #directory}.
+ protected Set+ * If this reader corresponds to the first segment in the index, then we setup + * our state by re-initializing it. Similarly, if this reader corresponds to + * the last segment in the index, then we clean up after ourselves. + * + * @return a DocIdSet that provides the documents which should be permitted or + * prohibited in search results. NOTE: null can be returned if + * no documents will be accepted by this Filter. + * + * @see DocIdBitSet + */ + @Override + public DocIdSet getDocIdSet(IndexReader reader) throws IOException { + maybeSetupState(reader); + DocIdSet docIdSet = getStatefulDocIdSet(reader, getTermsEnum()); + maybeTeardownState(reader); + return docIdSet; + } + + /** + * Creates a {@link DocIdSet} enumerating the documents that should be + * permitted in search results. NOTE: null can be returned if no + * documents are accepted by this Filter. + *
+ * Note: The returned {@link DocIdSet} must refer to document IDs for that + * segment, not for the top-level reader. Although this method will be called + * once per segment in the index during searching, it is passed a + * {@link StatefulTermsEnum} that has a "memory" of past terms seen in prior + * segments. + * + * @param reader + * a {@link IndexReader} instance opened on the index currently + * searched on. Note, it is likely that the provided reader does not + * represent the whole underlying index i.e. if the index has more + * than one segment the given reader only represents a single + * segment. + * @param termsEnum + * a {@link StatefulTermsEnum} whose memory may impact the + * composition of the returned {@link DocIdSet}. + * + * @return a DocIdSet that provides the documents which should be permitted or + * prohibited in search results. NOTE: null can be returned if + * no documents will be accepted by this Filter. + * + * @see DocIdBitSet + */ + protected abstract DocIdSet getStatefulDocIdSet(IndexReader reader, + StatefulTermsEnum termsEnum) throws IOException; + + /** + * Obtain a copy of the {@StatefulTermsEnum} to use as the + * model for thread-specific ones. + * + * @return a {@link StatefulTermsEnum} that has been properly populated + */ + protected abstract StatefulTermsEnum getTermsEnumModel(); + + /** + * @return the current thread's copy of the {link @StatefulTermsEnum} + */ + protected StatefulTermsEnum getTermsEnum() { + return threadLocalTermsEnum.get(); + } + + /** + * Set the current thread's copy of the {@link StatefulTermsEnum}. + */ + protected void setTermsEnum(StatefulTermsEnum termsEnum) { + threadLocalTermsEnum.set(termsEnum); + } + + /** + * Check to see if the low-level reader corresponds to a directory different + * from the one provided before, or to the first segment of that directory. 
If + * so, then we have sufficient cause to clear the state of the filter, on + * account of the fact that the filter is dealing with a brand-new search. + * + * @param reader + * the {@link IndexReader} provided to the + * {@link #getDocIdSet(IndexReader)} method + * @throws IOException + */ + protected void maybeSetupState(IndexReader reader) throws IOException { + boolean resetState = false; + String readerQName = getReaderQName((SegmentReader) reader); + if (directory == null || !isRelatedTo(reader, topLevelReader)) { + resetState = true; + } else { + if (reader instanceof SegmentReader) { + if (segmentReaders.contains(readerQName)) { + resetState = true; + } + } else { + resetState = true; + } + } + if (resetState) { + resetState(reader); + } + if (reader instanceof SegmentReader) { + if (!segmentReaders.contains(readerQName)) { + segmentReaders.add(readerQName); + lowLevelSegmentCount++; + } + } + lowLevelReader = reader; + } + + /** + * Check to see if the low-level reader corresponds to the last segment in the + * whole underlying index. If so, then we have sufficient cause to clear the + * state of the filter, on account of the fact that the filter is done with + * the current search. + * + * @param reader + * the {@link IndexReader} provided to the + * {@link #getDocIdSet(IndexReader)} method + * @throws IOException + */ + protected void maybeTeardownState(IndexReader reader) throws IOException { + if (isLastReader(reader)) { + resetState(null); + } + } + + /** + * Calculate the number of segments that fall directly or indirectly + * underneath the given reader. 
+ * + * @param reader + * a top-level or low-level {@link IndexReader} + * @return + */ + protected int getSegmentCount(IndexReader reader) { + if (reader == null) { + return 0; + } + try { + SegmentInfos si = new SegmentInfos(); + si.read(reader.directory()); + return si.size(); + } catch (Exception e) { + int segmentCount = 0; + IndexReader[] subReaders = reader.getSequentialSubReaders(); + if (subReaders != null) { + for (IndexReader subReader : subReaders) { + segmentCount += getSegmentCount(subReader); + } + } + return segmentCount; + } + } + + /** + * Check to see if the given (low-level) sub-reader is part of the given + * (top-level) reader. It does so by walking through the sequence of + * sub-readers in the given (top-level) reader until it finds a match (or + * not). + * + * @param subReader + * a (low-level) reader + * @param reader + * a (top-level) reader + * @return true iff the subReader is part of the reader + */ + protected boolean isRelatedTo(IndexReader subReader, IndexReader reader) { + if (subReader.equals(reader)) { + return true; + } + IndexReader[] subReaders = reader.getSequentialSubReaders(); + if (subReaders != null) { + for (IndexReader sequentialSubReader : subReaders) { + if (sequentialSubReader.equals(subReader)) { + return true; + } + if (isRelatedTo(subReader, sequentialSubReader)) { + return true; + } + } + } + return false; + } + + /** + * Obtain a qualified name for the given reader, which is based on its + * directory's lock ID, plus if it corresponds to a segment, the name of the + * segment. + * + * @param reader + * an arbitrary index reader + * @return the qualified name for the given reader + */ + protected String getReaderQName(IndexReader reader) { + return reader.directory().getLockID() + + ((reader instanceof SegmentReader) ? 
((SegmentReader) reader) + .getSegmentName() : ""); + } + + /** + * Check to see if the given reader is the last reader, with respect to the + * top-level reader as defined in {@link #topLevelReader}. + * + *
+ * This method works with the assumption that the {@link Searcher} will pass + * any given reader that is part of the index to this filter exactly once. + * Because we keep track of how many readers have been filtered so far, we can + * tell whether or not the given reader is the last one based on how many + * total readers exist in the whole underlying index. + * + * @param reader + * a (low-level) index reader + * @return true iff the given reader is the last one in the index being + * searched. + */ + protected boolean isLastReader(IndexReader reader) { + return lowLevelSegmentCount == topLevelSegmentCount; + } + + /** + * Reset the state of the filter by clearing the segments herein, and the + * memory of this thread's copy of the {@link StatefulTermsEnum}. + * + * @throws IOException + * @throws CorruptIndexException + */ + protected void resetState(IndexReader reader) throws CorruptIndexException, + IOException { + segmentReaders.clear(); + getTermsEnum().resetTermsEnum(); + if (reader != null) { + directory = reader.directory(); + setTopLevelReader(reader); + } else { + directory = null; + setTopLevelReader(null); + } + topLevelSegmentCount = getSegmentCount(topLevelReader); + lowLevelSegmentCount = 0; + } + + /** + * Set the top-level reader corresponding to the whole underlying index that + * is currently being searched. + * + *
+ * Note that if the provided reader is a low-level one (for instance, if it is + * a {@link SegmentReader}), then we try to determine as best as we can what + * the top-level reader should be. On the other hand, if the provided reader + * does not appear to be a low-level reader, then we take it that it is being + * user-defined, in which case, we try not to overwrite that. + *
+ * + *+ * As previously mentioned, there are certain cases where it is impossible to + * tell what the top-level reader is just by looking at the low-level reader + * (such as when the whole underlying index is defined by a + * {@link MultiReader}). That said, in the event the whole underlying index + * spans exactly one directory, then chances are that this method will be able + * to determine the top-level reader correctly. Given that the system-defined + * mechanism of detecting top-level readers may not always be accurate, we + * highly recommend that users of any type of {@link StatefulFilter} + * explicitly set the top-level reader by using this method. + *
+ * + * @param reader + * a top- or low-level reader + * @return + */ + public StatefulFilter setTopLevelReader(IndexReader reader) { + if (reader instanceof SegmentReader) { + if (!topLevelReaderUserDefined) { + try { + topLevelReader = IndexReader.open(reader.directory(), true); + } catch (Exception e) { + e.printStackTrace(); + topLevelReader = reader; + } + } + } else { + topLevelReader = reader; + topLevelReaderUserDefined = true; + } + return this; + } + + /** + * @return the top-level reader corresponding to the whole underlying index + */ + public IndexReader getTopLevelReader() { + return topLevelReader; + } + + /** + * @return the low-level reader corresponding to the segment being filtered + */ + protected IndexReader getLowLevelReader() { + return lowLevelReader; + } + + /** + * TheStatefulTermsEnum abstract class builds on the
+ * {@link FilteredTermsEnum} by essentially giving it a "memory" of past
+ * terms. More importantly, it acts as a reusable {@link FilteredTermsEnum} in
+ * the sense that its actual {@link TermsEnum} can be reset over and over
+ * during an instance's life. Note that, by default, a
+ * {@link FilteredTermsEnum} applies to exactly one underlying
+ * {@link TermsEnum}.
+ *
+ * @author Karthick Sankarachary
+ */
+ public abstract class StatefulTermsEnum extends FilteredTermsEnum implements
+ Cloneable {
+ // The name of the field to filter on.
+ protected String field = null;
+ // The value of the text to initially seek on.
+ protected String initialText = null;
+ // A flag to indicate when to end this enum.
+ protected boolean endEnum = false;
+ // The terms for the given {@link #field} in the top-level reader.
+ protected Terms topLevelTerms = null;
+
+ // A cache of the text of the word in terms belonging to current
+ // and/or previous segments. As an aside, using a byte-based Trie data
+ // structure instead of a {@link HashSet} might improve performance.
+ private Set