Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 883337) +++ CHANGES.txt (working copy) @@ -20,6 +20,12 @@ * LUCENE-2086: When resolving deleted terms, do so in term sort order for better performance. (Bogdan Ghidireac via Mike McCandless) +* LUCENE-2075: Terms dict cache is now shared across threads instead + of being stored separately in thread local storage. Also fixed + terms dict so that the cache is used when seeking the thread local + term enum, which will be important for MultiTermQuery impls that do + lots of seeking (Mike McCandless, Uwe Schindler, Yonik Seeley) + Build ======================= Release 3.0.0 2009-11-25 ======================= Index: src/java/org/apache/lucene/index/TermInfo.java =================================================================== --- src/java/org/apache/lucene/index/TermInfo.java (revision 883337) +++ src/java/org/apache/lucene/index/TermInfo.java (working copy) @@ -19,7 +19,7 @@ /** A TermInfo is the record of information stored for a term.*/ -final class TermInfo { +class TermInfo { /** The number of documents which contain the term. 
*/ int docFreq = 0; @@ -42,6 +42,28 @@ skipOffset = ti.skipOffset; } + public boolean equals(Object obj) { + if (obj instanceof TermInfo) { + TermInfo other = (TermInfo) obj; + return other.docFreq == docFreq && + other.freqPointer == freqPointer && + other.proxPointer == proxPointer && + other.skipOffset == skipOffset; + } else { + return false; + } + } + + public int hashCode() { + final int PRIME = 17; + int result = 1; + result = PRIME * result + docFreq; + result = (int) (PRIME * result + freqPointer); + result = (int) (PRIME * result + proxPointer); + result = (int) (PRIME * result + skipOffset); + return result; + } + final void set(int docFreq, long freqPointer, long proxPointer, int skipOffset) { this.docFreq = docFreq; Index: src/java/org/apache/lucene/index/TermInfosReader.java =================================================================== --- src/java/org/apache/lucene/index/TermInfosReader.java (revision 883337) +++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy) @@ -21,7 +21,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.cache.Cache; -import org.apache.lucene.util.cache.SimpleLRUCache; +import org.apache.lucene.util.cache.DoubleBarrelLRUCache; import org.apache.lucene.util.CloseableThreadLocal; /** This stores a monotonically increasing set of pairs in a @@ -44,15 +44,23 @@ private final int totalIndexInterval; private final static int DEFAULT_CACHE_SIZE = 1024; + + // Just adds term's ord to TermInfo + private final static class TermInfoAndOrd extends TermInfo { + final int termOrd; + public TermInfoAndOrd(TermInfo ti, int termOrd) { + super(ti); + this.termOrd = termOrd; + } + } + + private final Cache termsCache = new DoubleBarrelLRUCache(DEFAULT_CACHE_SIZE); /** * Per-thread resources managed by ThreadLocal */ private static final class ThreadResources { SegmentTermEnum termEnum; - - // Used for caching the least recently looked-up Terms - Cache termInfoCache; } TermInfosReader(Directory dir, 
String seg, FieldInfos fis, int readBufferSize, int indexDivisor) @@ -130,6 +138,7 @@ if (origEnum != null) origEnum.close(); threadResources.close(); + termsCache.close(); } /** Returns the number of term/value pairs in the set. */ @@ -142,8 +151,6 @@ if (resources == null) { resources = new ThreadResources(); resources.termEnum = terms(); - // Cache does not have to be thread-safe, it is only used by one thread at the same time - resources.termInfoCache = new SimpleLRUCache(DEFAULT_CACHE_SIZE); threadResources.set(resources); } return resources; @@ -176,26 +183,20 @@ /** Returns the TermInfo for a Term in the set, or null. */ TermInfo get(Term term) throws IOException { - return get(term, true); + return get(term, false); } /** Returns the TermInfo for a Term in the set, or null. */ - private TermInfo get(Term term, boolean useCache) throws IOException { + private TermInfo get(Term term, boolean mustSeekEnum) throws IOException { if (size == 0) return null; ensureIndexIsRead(); - TermInfo ti; + TermInfoAndOrd tiOrd = termsCache.get(term); ThreadResources resources = getThreadResources(); - Cache cache = null; - if (useCache) { - cache = resources.termInfoCache; - // check the cache first if the term was recently looked up - ti = cache.get(term); - if (ti != null) { - return ti; - } + if (!mustSeekEnum && tiOrd != null) { + return tiOrd; } // optimize sequential access: first try scanning cached enum w/o seeking @@ -208,16 +209,23 @@ || term.compareTo(indexTerms[enumOffset]) < 0) { // no need to seek + final TermInfo ti; + int numScans = enumerator.scanTo(term); if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { ti = enumerator.termInfo(); - if (cache != null && numScans > 1) { + if (numScans > 1) { // we only want to put this TermInfo into the cache if // scanEnum skipped more than one dictionary entry. 
// This prevents RangeQueries or WildcardQueries to // wipe out the cache when they iterate over a large numbers // of terms in order - cache.put(term, ti); + if (tiOrd == null) { + termsCache.put(term, new TermInfoAndOrd(ti, (int) enumerator.position)); + } else { + assert ti.equals(tiOrd); + assert (int) enumerator.position == tiOrd.termOrd; + } } } else { ti = null; @@ -228,12 +236,24 @@ } // random-access: must seek - seekEnum(enumerator, getIndexOffset(term)); + final int indexPos; + if (tiOrd != null) { + indexPos = tiOrd.termOrd / totalIndexInterval; + } else { + // Must do binary search: + indexPos = getIndexOffset(term); + } + + seekEnum(enumerator, indexPos); enumerator.scanTo(term); + final TermInfo ti; if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { ti = enumerator.termInfo(); - if (cache != null) { - cache.put(term, ti); + if (tiOrd == null) { + termsCache.put(term, new TermInfoAndOrd(ti, (int) enumerator.position)); + } else { + assert ti.equals(tiOrd); + assert (int) enumerator.position == tiOrd.termOrd; } } else { ti = null; @@ -294,9 +314,7 @@ /** Returns an enumeration of terms starting at or after the named term. */ public SegmentTermEnum terms(Term term) throws IOException { - // don't use the cache in this call because we want to reposition the - // enumeration - get(term, false); + get(term, true); return (SegmentTermEnum)getThreadResources().termEnum.clone(); } } Index: src/java/org/apache/lucene/util/cache/DoubleBarrelLRUCache.java =================================================================== --- src/java/org/apache/lucene/util/cache/DoubleBarrelLRUCache.java (revision 0) +++ src/java/org/apache/lucene/util/cache/DoubleBarrelLRUCache.java (revision 0) @@ -0,0 +1,128 @@ +package org.apache.lucene.util.cache; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.Map; + +/** + * Simple concurrent LRU cache, using a "double barrel" + * approach where two ConcurrentHashMaps record entries. + * + *

<p>At any given time, one hash is primary and the other + * is secondary. {@link #get} first checks primary, and if + * that's a miss, checks secondary. If secondary has the + * entry, it's promoted to primary. Once primary is full, + * secondary is cleared and the two are swapped.</p>

+ * + *

<p>This is not as space efficient as other possible + * concurrent approaches (see LUCENE-2075): to achieve + * perfect LRU(N) it requires 2*N storage. But, this + * approach is relatively simple and seems in practice to + * not grow unbounded in size when under hideously high + * load.</p>

+ * + *

NOTE: this class is meant only to be used internally + * by Lucene; it's only public so it can be shared across + * packages. This means the API is freely subject to + * change, and, the class could be removed entirely, in any + * Lucene release. Use directly at your own risk! + */ + +final public class DoubleBarrelLRUCache extends Cache { + private final Map cache1; + private final Map cache2; + private final AtomicInteger putCount = new AtomicInteger(); + private final AtomicInteger swapCount = new AtomicInteger(); + private final int maxSize; + + public DoubleBarrelLRUCache(int maxSize) { + this.maxSize = maxSize; + cache1 = new ConcurrentHashMap(); + cache2 = new ConcurrentHashMap(); + } + + private final boolean swapped() { + return (swapCount.get()&1) != 0; + } + + @Override + public boolean containsKey(Object k) { + return false; + } + + @Override + public void close() { + } + + @Override @SuppressWarnings("unchecked") + public V get(Object key) { + final Map primary; + final Map secondary; + if (swapped()) { + primary = cache2; + secondary = cache1; + } else { + primary = cache1; + secondary = cache2; + } + + // Try primary first + V result = primary.get(key); + if (result == null) { + // Not found -- try secondary + result = secondary.get(key); + if (result != null) { + // Promote to primary + put((K) key, result); + } + } + return result; + } + + @Override + public void put(K key, V value) { + final Map primary; + final Map secondary; + if (swapped()) { + primary = cache2; + secondary = cache1; + } else { + primary = cache1; + secondary = cache2; + } + primary.put(key, value); + + if (putCount.getAndIncrement() % maxSize == maxSize-1) { + // Time to swap + + // NOTE: there is saturation risk here, that the + // thread that's doing the clear() takes too long to + // do so, while other threads continue to add to + // primary, but in practice this seems not to be an + // issue (see LUCENE-2075 for benchmark & details) + + // First clear secondary +
secondary.clear(); + + // Second, swap + swapCount.getAndIncrement(); + } + } +} Property changes on: src/java/org/apache/lucene/util/cache/DoubleBarrelLRUCache.java ___________________________________________________________________ Added: svn:eol-style + native