Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt	(revision 945833)
+++ lucene/CHANGES.txt	(working copy)
@@ -394,6 +394,17 @@
 * LUCENE-2467: Fixed memory leaks in IndexWriter when large documents
   are indexed.  (Mike McCandless)
 
+* LUCENE-2468: Fix CachingWrapperFilter and CachingSpanFilter to
+  consider two SegmentReaders equal if only deletions have changed.
+  This should be a performance gain (higher cache hit rate) in apps
+  that reopen readers, or use a near-real-time reader
+  (IndexWriter.getReader()). This change may introduce invalid search
+  results (allowing deleted docs to be returned) for certain advanced
+  apps, so a new expert ctor was added to both filters to enforce
+  deletions at a performance cost.  (Shay Banon via Mike
+  McCandless)
+
+
 New features
 
 * LUCENE-2128: Parallelized fetching document frequencies during weight
Index: lucene/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java	(revision 945833)
+++ lucene/src/test/org/apache/lucene/search/TestCachingWrapperFilter.java	(working copy)
@@ -24,11 +24,15 @@
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.OpenBitSet;
 import org.apache.lucene.util.OpenBitSetDISI;
+import org.apache.lucene.util.BytesRef;
 
 public class TestCachingWrapperFilter extends LuceneTestCase {
   public void testCachingWorks() throws Exception {
@@ -94,4 +98,65 @@
 
     reader.close();
   }
+
+  public void testEnforceDeletions() throws Exception {
+    Directory dir = new MockRAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
+    IndexReader reader = writer.getReader();
+    IndexSearcher searcher = new IndexSearcher(reader);
+
+    // add a doc, refresh the reader, and check that it's there
+    Document doc = new Document();
+    doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED));
+    writer.addDocument(doc);
+
+    reader = refreshReader(reader);
+    searcher = new IndexSearcher(reader);
+
+    TopDocs docs = searcher.search(new MatchAllDocsQuery(), 1);
+    assertEquals("Should find a hit...", 1, docs.totalHits);
+
+    Filter filter = new Filter() {
+      @Override
+      public DocIdSet getDocIdSet(final IndexReader r) throws IOException {
+        return new DocIdSet() {
+          @Override
+          public DocIdSetIterator iterator() throws IOException {
+            return r.termDocsEnum(r.getDeletedDocs(),
+                                  "id",
+                                  new BytesRef("1"));
+          }
+        };
+      }
+    };
+
+    filter = new CachingWrapperFilter(filter, true);
+
+    docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
+    assertEquals("[query + filter] Should find a hit...", 1, docs.totalHits);
+    ConstantScoreQuery constantScore = new ConstantScoreQuery(filter);
+    docs = searcher.search(constantScore, 1);
+    assertEquals("[just filter] Should find a hit...", 1, docs.totalHits);
+
+    // now delete the doc, refresh the reader, and see that it's not there
+    writer.deleteDocuments(new Term("id", "1"));
+
+    reader = refreshReader(reader);
+    searcher = new IndexSearcher(reader);
+
+    docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
+    assertEquals("[query + filter] Should *not* find a hit...", 0, docs.totalHits);
+
+    docs = searcher.search(constantScore, 1);
+    assertEquals("[just filter] Should *not* find a hit...", 0, docs.totalHits);
+  }
+
+  private static IndexReader refreshReader(IndexReader reader) throws IOException {
+    IndexReader oldReader = reader;
+    reader = reader.reopen();
+    if (reader != oldReader) {
+      oldReader.close();
+    }
+    return reader;
+  }
 }
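The test above exercises the strict path (enforceDeletions=true). From the application side the choice looks like this; a minimal sketch, not part of the patch, with QueryWrapperFilter standing in for any Filter an app might cache:

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.CachingWrapperFilter;
    import org.apache.lucene.search.Filter;
    import org.apache.lucene.search.QueryWrapperFilter;
    import org.apache.lucene.search.TermQuery;

    public class EnforceDeletionsUsage {
      public static Filter cachingFilter(boolean strictDeletes) {
        // Any Filter works here; QueryWrapperFilter is just a concrete example.
        Filter raw = new QueryWrapperFilter(new TermQuery(new Term("type", "article")));
        // false (same as the one-arg ctor): the cached DocIdSet is shared
        // across deletions-only reopens -- fast, but deleted docs can leak
        // through queries that don't check deletions themselves.
        // true: the cache is keyed on the reader instance, so a reopened
        // segment regenerates its DocIdSet and deleted docs stay out.
        return new CachingWrapperFilter(raw, strictDeletes);
      }
    }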
Index: lucene/src/test/org/apache/lucene/search/CachingWrapperFilterHelper.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/CachingWrapperFilterHelper.java	(revision 945833)
+++ lucene/src/test/org/apache/lucene/search/CachingWrapperFilterHelper.java	(working copy)
@@ -43,12 +43,19 @@
 
   @Override
   public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+    final Object key;
+    if (enforceDeletions) {
+      key = reader;
+    } else {
+      key = reader.getCoreCacheKey();
+    }
+
     if (cache == null) {
-      cache = new WeakHashMap<IndexReader,DocIdSet>();
+      cache = new WeakHashMap<Object,DocIdSet>();
     }
 
     synchronized (cache) {  // check cache
-      DocIdSet cached = cache.get(reader);
+      DocIdSet cached = cache.get(key);
       if (shouldHaveCache) {
         Assert.assertNotNull("Cache should have data ", cached);
       } else {
@@ -62,7 +69,7 @@
 
     final DocIdSet bits = filter.getDocIdSet(reader);
     synchronized (cache) {  // update cache
-      cache.put(reader, bits);
+      cache.put(key, bits);
     }
 
     return bits;
Index: lucene/src/java/org/apache/lucene/search/CachingSpanFilter.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/CachingSpanFilter.java	(revision 945833)
+++ lucene/src/java/org/apache/lucene/search/CachingSpanFilter.java	(working copy)
@@ -33,15 +33,38 @@
   /**
    * A transient Filter cache (package private because of test)
   */
-  private transient Map<IndexReader,SpanFilterResult> cache;
+  private transient Map<Object,SpanFilterResult> cache;
 
   private final ReentrantLock lock = new ReentrantLock();
 
+  private final boolean enforceDeletions;
+
   /**
    * @param filter Filter to cache results of
    */
   public CachingSpanFilter(SpanFilter filter) {
+    this(filter, false);
+  }
+
+  /**
+   * Expert: by default, the cached filter will be shared
+   * across reopened segments that only had changes to their
+   * deletions.  This is a big performance gain, since you
+   * don't hit a cache miss on a reopened reader for prior
+   * segments.  However, in some cases this can cause
+   * invalid query results, allowing deleted documents to be
+   * returned.  This only happens if the query does not
+   * rule out deleted documents on its own, as is the case
+   * with ConstantScoreQuery.
+   *
+   * @param filter Filter to cache results of
+   * @param enforceDeletions if true, a reopened segment
+   *   will be forced to regenerate the cache so that
+   *   deleted documents are not present
+   */
+  public CachingSpanFilter(SpanFilter filter, boolean enforceDeletions) {
     this.filter = filter;
+    this.enforceDeletions = enforceDeletions;
   }
 
   @Override
@@ -51,12 +74,19 @@
   }
 
   private SpanFilterResult getCachedResult(IndexReader reader) throws IOException {
+    final Object key;
+    if (enforceDeletions) {
+      key = reader;
+    } else {
+      key = reader.getCoreCacheKey();
+    }
+
     lock.lock();
     try {
      if (cache == null) {
-        cache = new WeakHashMap<IndexReader,SpanFilterResult>();
+        cache = new WeakHashMap<Object,SpanFilterResult>();
      }
-      final SpanFilterResult cached = cache.get(reader);
+      final SpanFilterResult cached = cache.get(key);
       if (cached != null) return cached;
     } finally {
       lock.unlock();
     }
 
     final SpanFilterResult result = filter.bitSpans(reader);
     lock.lock();
     try {
-      cache.put(reader, result);
+      cache.put(key, result);
     } finally {
       lock.unlock();
     }
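Both caching filters pick the key the same way, and the default sharing works because a deletions-only reopen produces a new SegmentReader over the same core. A sketch of that invariant, assuming a single-segment index and an open writer as in the test above (the helper method is hypothetical):

    // Assumes an open IndexWriter "writer" on a single-segment index,
    // set up as in testEnforceDeletions above.
    static void showCoreSharing(IndexWriter writer) throws IOException {
      IndexReader r1 = writer.getReader();
      writer.deleteDocuments(new Term("id", "1"));
      IndexReader r2 = r1.reopen();

      // The per-segment readers are what the caching filters actually see:
      IndexReader seg1 = r1.getSequentialSubReaders()[0];
      IndexReader seg2 = r2.getSequentialSubReaders()[0];

      assert seg1 != seg2;                                     // deletions changed
      assert seg1.getCoreCacheKey() == seg2.getCoreCacheKey(); // same core
      // Default caching keys on the core, so seg2 reuses seg1's DocIdSet;
      // enforceDeletions=true keys on the reader and regenerates instead.
    }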
Index: lucene/src/java/org/apache/lucene/search/TopScoreDocCollector.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/TopScoreDocCollector.java	(revision 945833)
+++ lucene/src/java/org/apache/lucene/search/TopScoreDocCollector.java	(working copy)
@@ -108,7 +108,6 @@
    * objects.
    */
   public static TopScoreDocCollector create(int numHits, boolean docsScoredInOrder) {
-    
     if (docsScoredInOrder) {
       return new InOrderTopScoreDocCollector(numHits);
     } else {
Index: lucene/src/java/org/apache/lucene/search/CachingWrapperFilter.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/CachingWrapperFilter.java	(revision 945833)
+++ lucene/src/java/org/apache/lucene/search/CachingWrapperFilter.java	(working copy)
@@ -35,15 +35,38 @@
   /**
    * A transient Filter cache (package private because of test)
   */
-  transient Map<IndexReader,DocIdSet> cache;
+  transient Map<Object,DocIdSet> cache;
 
   private final ReentrantLock lock = new ReentrantLock();
 
+  protected final boolean enforceDeletions;
+
   /**
    * @param filter Filter to cache results of
    */
   public CachingWrapperFilter(Filter filter) {
+    this(filter, false);
+  }
+
+  /**
+   * Expert: by default, the cached filter will be shared
+   * across reopened segments that only had changes to their
+   * deletions.  This is a big performance gain, since you
+   * don't hit a cache miss on a reopened reader for prior
+   * segments.  However, in some cases this can cause
+   * invalid query results, allowing deleted documents to be
+   * returned.  This only happens if the query does not
+   * rule out deleted documents on its own, as is the case
+   * with ConstantScoreQuery.
+   *
+   * @param filter Filter to cache results of
+   * @param enforceDeletions if true, a reopened segment
+   *   will be forced to regenerate the cache so that
+   *   deleted documents are not present
+   */
+  public CachingWrapperFilter(Filter filter, boolean enforceDeletions) {
     this.filter = filter;
+    this.enforceDeletions = enforceDeletions;
   }
 
   /** Provide the DocIdSet to be cached, using the DocIdSet provided
@@ -66,13 +89,20 @@
   @Override
   public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
+    final Object key;
+    if (enforceDeletions) {
+      key = reader;
+    } else {
+      key = reader.getCoreCacheKey();
+    }
+
     lock.lock();
     try {
       if (cache == null) {
-        cache = new WeakHashMap<IndexReader,DocIdSet>();
+        cache = new WeakHashMap<Object,DocIdSet>();
       }
-
-      final DocIdSet cached = cache.get(reader);
+
+      final DocIdSet cached = cache.get(key);
       if (cached != null) return cached;
     } finally {
       lock.unlock();
     }
@@ -82,7 +112,7 @@
     if (docIdSet != null) {
       lock.lock();
       try {
-        cache.put(reader, docIdSet);
+        cache.put(key, docIdSet);
       } finally {
         lock.unlock();
       }
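Flipping the new test to the default ctor shows the leak the javadoc warns about. A hedged sketch of that negative case, assuming the same setup as testEnforceDeletions but with the anonymous filter kept in a variable rawFilter (hypothetical name) before wrapping:

    // Default ctor: the cache key is the core, not the reader.
    Filter stale = new CachingWrapperFilter(rawFilter);
    searcher.search(new MatchAllDocsQuery(), stale, 1);    // cache now holds the pre-delete DocIdSet

    writer.deleteDocuments(new Term("id", "1"));
    reader = refreshReader(reader);                        // deletions-only reopen: same core
    searcher = new IndexSearcher(reader);

    TopDocs d = searcher.search(new MatchAllDocsQuery(), stale, 1);
    assertEquals(0, d.totalHits);                          // the query itself skips deleted docs
    d = searcher.search(new ConstantScoreQuery(stale), 1);
    assertEquals(1, d.totalHits);                          // stale cached set leaks the deleted doc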
Index: lucene/src/java/org/apache/lucene/search/TopDocsCollector.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/TopDocsCollector.java	(revision 945833)
+++ lucene/src/java/org/apache/lucene/search/TopDocsCollector.java	(working copy)
@@ -116,7 +116,7 @@
    * search execution collected.
    */
   public final TopDocs topDocs(int start, int howMany) {
-    
+
     // In case pq was populated with sentinel values, there might be less
     // results than pq.size(). Therefore return all results until either
     // pq.size() or totalHits.
@@ -141,7 +141,7 @@
 
     // Get the requested results from pq.
     populateResults(results, howMany);
-    
+
     return newTopDocs(results, start);
   }
Index: lucene/src/java/org/apache/lucene/search/IndexSearcher.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/IndexSearcher.java	(revision 945833)
+++ lucene/src/java/org/apache/lucene/search/IndexSearcher.java	(working copy)
@@ -162,7 +162,7 @@
       throw new IllegalArgumentException("nDocs must be > 0");
     }
 
-    nDocs = Math.min(nDocs, reader.numDocs());
+    nDocs = Math.min(nDocs, reader.maxDoc());
     TopScoreDocCollector collector = TopScoreDocCollector.create(nDocs, !weight.scoresDocsOutOfOrder());
     search(weight, filter, collector);
@@ -190,7 +190,7 @@
                        Sort sort, boolean fillFields)
       throws IOException {
 
-    nDocs = Math.min(nDocs, reader.numDocs());
+    nDocs = Math.min(nDocs, reader.maxDoc());
     TopFieldCollector collector = TopFieldCollector.create(sort, nDocs, fillFields,
                                                            fieldSortDoTrackScores, fieldSortDoMaxScore,
                                                            !weight.scoresDocsOutOfOrder());
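The numDocs() -> maxDoc() change fits the caching change: a cached filter that does not enforce deletions can match up to maxDoc() candidates, so clamping the collector at the live-doc count could silently drop hits (this rationale is inferred from the rest of the patch, not stated in it). Worked numbers:

    public class ClampExample {
      public static void main(String[] args) {
        // Assumed segment state, for illustration only:
        int maxDoc = 10;   // doc IDs 0..9 allocated in the segment
        int numDocs = 6;   // live docs after 4 deletions
        int requested = 8; // hits the caller asked for

        // Old clamp: could truncate results when a non-enforcing cached
        // filter still matches up to maxDoc candidate docs.
        int oldClamp = Math.min(requested, numDocs); // 6
        // New clamp: a safe upper bound on collectable hits.
        int newClamp = Math.min(requested, maxDoc);  // 8
        System.out.println(oldClamp + " -> " + newClamp);
      }
    }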
Index: lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java	(revision 945833)
+++ lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java	(working copy)
@@ -154,7 +154,7 @@
 
     /** Remove this reader from the cache, if present. */
     public void purge(IndexReader r) {
-      Object readerKey = r.getFieldCacheKey();
+      Object readerKey = r.getCoreCacheKey();
       synchronized(readerCache) {
         readerCache.remove(readerKey);
       }
@@ -163,7 +163,7 @@
     public Object get(IndexReader reader, Entry key) throws IOException {
       Map innerCache;
       Object value;
-      final Object readerKey = reader.getFieldCacheKey();
+      final Object readerKey = reader.getCoreCacheKey();
       synchronized (readerCache) {
         innerCache = readerCache.get(readerKey);
         if (innerCache == null) {
Index: lucene/src/java/org/apache/lucene/index/SegmentReader.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/SegmentReader.java	(revision 945833)
+++ lucene/src/java/org/apache/lucene/index/SegmentReader.java	(working copy)
@@ -1293,7 +1293,7 @@
   // share the underlying postings data) will map to the
   // same entry in the FieldCache.  See LUCENE-1579.
   @Override
-  public final Object getFieldCacheKey() {
+  public final Object getCoreCacheKey() {
     return core;
   }
Index: lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/FilterIndexReader.java	(revision 945833)
+++ lucene/src/java/org/apache/lucene/index/FilterIndexReader.java	(working copy)
@@ -309,8 +309,8 @@
    * contents of the FieldCache, you must override this
    * method to provide a different key */
   @Override
-  public Object getFieldCacheKey() {
-    return in.getFieldCacheKey();
+  public Object getCoreCacheKey() {
+    return in.getCoreCacheKey();
   }
 
   /** {@inheritDoc} */
Index: lucene/src/java/org/apache/lucene/index/IndexReader.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/IndexReader.java	(revision 945833)
+++ lucene/src/java/org/apache/lucene/index/IndexReader.java	(working copy)
@@ -1367,7 +1367,7 @@
   }
 
   /** Expert */
-  public Object getFieldCacheKey() {
+  public Object getCoreCacheKey() {
     return this;
   }
Index: lucene/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java	(revision 945833)
+++ lucene/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java	(working copy)
@@ -274,7 +274,7 @@
     if (obj instanceof IndexReader) {
       IndexReader[] subs = ((IndexReader)obj).getSequentialSubReaders();
       for (int j = 0; (null != subs) && (j < subs.length); j++) {
-        all.add(subs[j].getFieldCacheKey());
+        all.add(subs[j].getCoreCacheKey());
       }
     }
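The getFieldCacheKey -> getCoreCacheKey rename generalizes the contract: the key identifies a segment's shared core state and can back any per-segment cache, not just the FieldCache. A sketch of an application-side cache built the same way (class and method names are illustrative, mirroring the FieldCacheImpl pattern above):

    import java.util.Map;
    import java.util.WeakHashMap;
    import org.apache.lucene.index.IndexReader;

    // Illustrative: a user-level per-segment cache keyed on the core, so
    // entries survive deletions-only reopens just like the FieldCache.
    public class PerCoreCache<V> {
      private final Map<Object,V> cache = new WeakHashMap<Object,V>();

      public synchronized V get(IndexReader reader) {
        return cache.get(reader.getCoreCacheKey());
      }

      public synchronized void put(IndexReader reader, V value) {
        cache.put(reader.getCoreCacheKey(), value);
      }

      // Mirrors FieldCacheImpl.purge(IndexReader): drop one core's entry.
      public synchronized void purge(IndexReader reader) {
        cache.remove(reader.getCoreCacheKey());
      }
    }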