Index: src/java/org/apache/lucene/search/similar/LRULinkedHashMap.java =================================================================== --- src/java/org/apache/lucene/search/similar/LRULinkedHashMap.java (revision 0) +++ src/java/org/apache/lucene/search/similar/LRULinkedHashMap.java (revision 0) @@ -0,0 +1,37 @@ +/** + * Copyright 2004-2005 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similar; + +import java.util.Map; +import java.util.LinkedHashMap; + +/** + * Class that wraps a deletion policy around a LinkedHashMap to + * provide a basic resizable LRU cache + */ +public class LRULinkedHashMap extends LinkedHashMap +{ + public LRULinkedHashMap(int initialCapacity, float loadFactor, boolean accessOrder) { + super(initialCapacity, loadFactor, accessOrder); + } + private int MAX_TERMS = 0; + public void setMaxTerms(int terms) { + MAX_TERMS = terms; + } + public boolean removeEldestEntry(Map.Entry eldest) { + return size() > MAX_TERMS; + } +} \ No newline at end of file Index: src/java/org/apache/lucene/search/similar/MoreLikeThis.java =================================================================== --- src/java/org/apache/lucene/search/similar/MoreLikeThis.java (revision 798477) +++ src/java/org/apache/lucene/search/similar/MoreLikeThis.java (working copy) @@ -36,6 +36,7 @@ import java.util.Set; import java.util.HashMap; import java.util.Map; +import java.util.WeakHashMap; 
import java.util.Collection; import java.util.Iterator; import java.io.IOException; @@ -287,6 +288,16 @@ private float boostFactor = 1; /** + * Cache term frequencies for recently accessed terms + */ + private WeakHashMap termFreqCache = null; + + /** + * Term cache size + */ + private int termFreqCacheSize = 0; + + /** * Returns the boost factor used when boosting terms * @return the boost factor used when boosting terms */ @@ -301,6 +312,14 @@ public void setBoostFactor(float boostFactor) { this.boostFactor = boostFactor; } + + /** + * Sets the term cache size (this value is used per field) + * @param size size limit for each per-field term cache; zero or less bypasses the cache + */ + public void setTermCacheSize(int size) { + this.termFreqCacheSize = size; + } /** * Constructor requiring an IndexReader. @@ -310,11 +329,42 @@ } public MoreLikeThis(IndexReader ir, Similarity sim){ - this.ir = ir; - this.similarity = sim; + this(ir, sim, 1000); } + /** + * Constructor specifying IndexReader and cache size, using a DefaultSimilarity. The + * cache size is per index field. + * + * @param ir Reference to an open IndexReader + * @param termCacheSize Size limit for the LRU cache for each index field + */ + public MoreLikeThis(IndexReader ir, int termCacheSize ){ + this(ir, new DefaultSimilarity(), termCacheSize); + } + /** + * Constructor specifying IndexReader and custom similarity and cache size values. The + * cache size is per index field. 
+ * + * @param ir Reference to an open IndexReader + * @param sim Similarity implementation + * @param termCacheSize Size limit for the LRU cache for each index field + */ + public MoreLikeThis(IndexReader ir, Similarity sim, int termCacheSize ){ + this.termFreqCacheSize = termCacheSize; + this.ir = ir; + this.similarity = sim; + this.createCache(); + } + + private Map createCache(){ + this.termFreqCache = new WeakHashMap(); + Map cache = new HashMap(); + this.termFreqCache.put(this.ir, cache); + return cache; + } + public Similarity getSimilarity() { return similarity; } @@ -649,7 +699,8 @@ String topField = fieldNames[0]; int docFreq = 0; for (int i = 0; i < fieldNames.length; i++) { - int freq = ir.docFreq(new Term(fieldNames[i], word)); + + int freq = this.getTermFrequency( word, fieldNames[i] ); topField = (freq > docFreq) ? fieldNames[i] : topField; docFreq = (freq > docFreq) ? freq : docFreq; } @@ -678,6 +729,43 @@ } /** + * Fetches the document frequency of a term from the index, via the LRU cache + */ + protected int getTermFrequency( String term, String field ) throws IOException + { + // allow bypass of the cache + if ( this.termFreqCacheSize <= 0 ) return ir.docFreq(new Term(field, term)); + + // get the cache for this specific field + Map cache = (Map)this.termFreqCache.get(this.ir); + if ( cache == null ) cache = this.createCache(); + LRULinkedHashMap fieldCache = (LRULinkedHashMap)cache.get(field); + int freq = 0; + + // make sure this field has a cache + if ( fieldCache == null ) { + fieldCache = new LRULinkedHashMap(this.termFreqCacheSize+1, 0.75f, true); + fieldCache.setMaxTerms(this.termFreqCacheSize); + cache.put( field, fieldCache ); + } + + Integer intbox = (Integer)fieldCache.get(term); + if ( intbox != null ) + { + // use the cached version + freq = intbox.intValue(); + } + else + { + // fetch from the index and cache + freq = ir.docFreq(new Term(field, term)); + fieldCache.put(term,new Integer(freq)); + } + + return freq; + } + + /** + * 
Describe the parameters that control how the "more like this" query is formed. */ public String describeParams() { @@ -972,6 +1060,8 @@ x = 1; } } + - } + + Index: src/java/org/apache/lucene/search/similar/MoreLikeThisQuery.java =================================================================== --- src/java/org/apache/lucene/search/similar/MoreLikeThisQuery.java (revision 798477) +++ src/java/org/apache/lucene/search/similar/MoreLikeThisQuery.java (working copy) @@ -43,6 +43,7 @@ private String likeText; private String[] moreLikeFields; private Analyzer analyzer; + private int termCacheSize = -1; float percentTermsToMatch=0.3f; int minTermFrequency=1; int maxQueryTerms=5; @@ -55,15 +56,20 @@ */ public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) { + this(likeText, moreLikeFields, analyzer, -1 ); + } + public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer, int termCacheSize ) + { this.likeText=likeText; this.moreLikeFields=moreLikeFields; this.analyzer=analyzer; + this.termCacheSize=termCacheSize; } public Query rewrite(IndexReader reader) throws IOException { MoreLikeThis mlt=new MoreLikeThis(reader); - + if ( this.termCacheSize > -1 ) mlt.setTermCacheSize(this.termCacheSize); mlt.setFieldNames(moreLikeFields); mlt.setAnalyzer(analyzer); mlt.setMinTermFreq(minTermFrequency); Index: src/test/org/apache/lucene/search/similar/TestLRULinkedHashMap.java =================================================================== --- src/test/org/apache/lucene/search/similar/TestLRULinkedHashMap.java (revision 0) +++ src/test/org/apache/lucene/search/similar/TestLRULinkedHashMap.java (revision 0) @@ -0,0 +1,86 @@ +package org.apache.lucene.search.similar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriter.MaxFieldLength; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; + +public class TestLRULinkedHashMap extends LuceneTestCase { + + public void testSize() + { + // build a simple cache and make sure the limit works + LRULinkedHashMap cache = new LRULinkedHashMap(2, 0.75f, true); + cache.setMaxTerms(2); + cache.put("one",new Integer(1)); + assert(cache.size() == 1); + cache.put("two",new Integer(2)); + assert(cache.size() == 2); + cache.put("three",new Integer(3)); + assert(cache.size() == 2); + } + + public void testOldestExpiry() + { + // build a simple cache and make sure the right items are expired + LRULinkedHashMap cache = new LRULinkedHashMap(2, 0.75f, true); + cache.setMaxTerms(2); + cache.put("one",new Integer(1)); + cache.put("two",new 
Integer(2)); + cache.put("three",new Integer(3)); + + // test that the oldest item has been dropped when there are no accesses + assert(cache.size() == 2); + assert(cache.get("one") == null); + assert(cache.get("two").equals(new Integer(2))); + assert(cache.get("three").equals(new Integer(3))); + } + + public void testLastAccessExpiry() + { + // build a simple cache and make sure the right items are expired + LRULinkedHashMap cache = new LRULinkedHashMap(2, 0.75f, true); + cache.setMaxTerms(2); + cache.put("one",new Integer(1)); + cache.put("two",new Integer(2)); + + // test that accessing a term changes the expiry order + Integer accessTest = (Integer)cache.get("one"); + cache.put("three",new Integer(3)); + + assert(cache.get("one").equals(new Integer(1))); + assert(cache.get("two") == null); + assert(cache.get("three").equals(new Integer(3))); + } +} Index: src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java =================================================================== --- src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java (revision 798477) +++ src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java (working copy) @@ -35,6 +35,7 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.index.Term; public class TestMoreLikeThis extends LuceneTestCase { private RAMDirectory directory; @@ -123,4 +124,22 @@ } return originalValues; } + + public void testFreqCache() throws IOException + { + MoreLikeThis mlt = new MoreLikeThis(reader,3); + + // check the right values come out + assert(mlt.getTermFrequency("current","text") == 0); + assert(mlt.getTermFrequency("lucene","text") == 2); + assert(mlt.getTermFrequency("release","text") == 1); + + // check the values are old cached values despite a new document + IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(), true, MaxFieldLength.UNLIMITED); + addDoc(writer, 
"current lucene nightly release"); + writer.close(); + assert(mlt.getTermFrequency("current","text") == 0); + assert(mlt.getTermFrequency("lucene","text") == 2); + assert(mlt.getTermFrequency("release","text") == 1); + } }