Index: FuzzyQuery.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java,v retrieving revision 1.11 diff -u -r1.11 FuzzyQuery.java --- FuzzyQuery.java 8 Nov 2004 00:10:39 -0000 1.11 +++ FuzzyQuery.java 4 Jan 2005 21:18:17 -0000 @@ -104,16 +104,15 @@ ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount); try { - do { float minScore = 0.0f; - float score = 0.0f; + do { Term t = enumerator.term(); if (t != null) { - score = enumerator.difference(); + float score = enumerator.difference(); // terms come in alphabetical order, therefore if queue is full and score // not bigger than minScore, we can skip if(stQueue.size() < maxClauseCount || score > minScore){ - stQueue.insert(new ScoreTerm(t, score)); + stQueue.insert(new ScoreTerm(t, score, enumerator.docFreq())); minScore = ((ScoreTerm)stQueue.top()).score; // maintain minScore } } @@ -122,11 +121,13 @@ enumerator.close(); } - BooleanQuery query = new BooleanQuery(); + BooleanQuery query = new ExpandedTermsQuery(); + ExpandedTermSimilarity etSim=new ExpandedTermSimilarity(); int size = stQueue.size(); for(int i = 0; i < size; i++){ ScoreTerm st = (ScoreTerm) stQueue.pop(); - TermQuery tq = new TermQuery(st.term); // found a match + TermQuery tq = new ExpandedTermQuery(st.term,etSim); // found a match + etSim.addDocFreq(st.docFreq); //update the docFreq tq.setBoost(getBoost() * st.score); // set the boost query.add(tq, BooleanClause.Occur.SHOULD); // add to query } @@ -141,11 +142,14 @@ private static class ScoreTerm{ public Term term; public float score; + public int docFreq; + - public ScoreTerm(Term term, float score){ - this.term = term; - this.score = score; - } + public ScoreTerm(Term term, float score, int docFreq){ + this.term = term; + this.score = score; + this.docFreq=docFreq; + } } private static class ScoreTermQueue extends PriorityQueue { Index: MultiTermQuery.java 
=================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/search/MultiTermQuery.java,v retrieving revision 1.12 diff -u -r1.12 MultiTermQuery.java --- MultiTermQuery.java 27 Aug 2004 20:20:47 -0000 1.12 +++ MultiTermQuery.java 4 Jan 2005 21:18:18 -0000 @@ -51,12 +51,15 @@ public Query rewrite(IndexReader reader) throws IOException { FilteredTermEnum enumerator = getEnum(reader); - BooleanQuery query = new BooleanQuery(); + BooleanQuery query = new ExpandedTermsQuery(); + ExpandedTermSimilarity etSim=new ExpandedTermSimilarity(); + try { do { Term t = enumerator.term(); if (t != null) { - TermQuery tq = new TermQuery(t); // found a match + etSim.addDocFreq(enumerator.docFreq()); + TermQuery tq = new ExpandedTermQuery(t,etSim); // found a match tq.setBoost(getBoost() * enumerator.difference()); // set the boost query.add(tq, BooleanClause.Occur.SHOULD); // add to query } Index: PrefixQuery.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/search/PrefixQuery.java,v retrieving revision 1.10 diff -u -r1.10 PrefixQuery.java --- PrefixQuery.java 27 Aug 2004 20:20:47 -0000 1.10 +++ PrefixQuery.java 4 Jan 2005 21:18:18 -0000 @@ -35,7 +35,9 @@ public Term getPrefix() { return prefix; } public Query rewrite(IndexReader reader) throws IOException { - BooleanQuery query = new BooleanQuery(); + BooleanQuery query = new ExpandedTermsQuery(); + ExpandedTermSimilarity etSim=new ExpandedTermSimilarity(); + TermEnum enumerator = reader.terms(prefix); try { String prefixText = prefix.text(); @@ -45,7 +47,8 @@ if (term != null && term.text().startsWith(prefixText) && term.field() == prefixField) { - TermQuery tq = new TermQuery(term); // found a match + etSim.addDocFreq(enumerator.docFreq()); + TermQuery tq = new ExpandedTermQuery(term, etSim); // found a match tq.setBoost(getBoost()); // set the boost query.add(tq, 
BooleanClause.Occur.SHOULD); // add to query //System.out.println("added " + term); Index: RangeQuery.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/search/RangeQuery.java,v retrieving revision 1.14 diff -u -r1.14 RangeQuery.java --- RangeQuery.java 23 Nov 2004 20:54:47 -0000 1.14 +++ RangeQuery.java 4 Jan 2005 21:18:18 -0000 @@ -64,7 +64,9 @@ public Query rewrite(IndexReader reader) throws IOException { - BooleanQuery query = new BooleanQuery(); +// BooleanQuery query = new BooleanQuery(); + BooleanQuery query = new ExpandedTermsQuery(); + ExpandedTermSimilarity etSim=new ExpandedTermSimilarity(); TermEnum enumerator = reader.terms(lowerTerm); try { @@ -87,7 +89,8 @@ if ((compare < 0) || (!inclusive && compare == 0)) break; } - TermQuery tq = new TermQuery(term); // found a match + TermQuery tq = new ExpandedTermQuery(term, etSim); // found a match + etSim.addDocFreq(enumerator.docFreq()); tq.setBoost(getBoost()); // set the boost query.add(tq, BooleanClause.Occur.SHOULD); // add to query } Index: ExpandedTermQuery.java =================================================================== RCS file: ExpandedTermQuery.java diff -N ExpandedTermQuery.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ ExpandedTermQuery.java 1 Jan 1970 00:00:00 -0000 @@ -0,0 +1,37 @@ +package org.apache.lucene.search; + +import org.apache.lucene.index.Term; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Uses a custom similarity to score terms expanded in queries that produce multiple terms + */ +public class ExpandedTermQuery extends TermQuery +{ + Similarity similarity; + public ExpandedTermQuery(Term t, Similarity similarity) + { + super(t); + this.similarity=similarity; + } + + public Similarity getSimilarity(Searcher searcher) + { + return similarity; + } +} Index: ExpandedTermSimilarity.java =================================================================== RCS file: ExpandedTermSimilarity.java diff -N ExpandedTermSimilarity.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ ExpandedTermSimilarity.java 1 Jan 1970 00:00:00 -0000 @@ -0,0 +1,44 @@ +package org.apache.lucene.search; +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Similarity implementation used by multi-term queries. When searching for an expanded term + * e.g. accom* we don't want the rarest forms (eg mis-spellings like accomadation) being + * boosted above other forms. This class uses the most common document frequency as the basis + * of idf calculation used by ALL members of a set of expanded terms. 
+ * @author MAHarwood + */ +class ExpandedTermSimilarity extends DefaultSimilarity +{ + int maxDocFreq; + + /** + * ignores the docFreq of individual terms (we don't want a really scarce term eg a mis-spelling + * being boosted to the top of the results), instead the frequency of the most common term in the + * list of expanded terms is used for all expanded terms. + */ + public float idf(int docFreq, int numDocs) + { + return (float)(Math.log(numDocs/(double)(maxDocFreq+1)) + 1.0); + } + + //records the maximum docFreq of an expanded term to remember the most common term + public void addDocFreq(int docFreq) + { + maxDocFreq=Math.max(docFreq,maxDocFreq); + } +} Index: ExpandedTermsQuery.java =================================================================== RCS file: ExpandedTermsQuery.java diff -N ExpandedTermsQuery.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ ExpandedTermsQuery.java 1 Jan 1970 00:00:00 -0000 @@ -0,0 +1,37 @@ +package org.apache.lucene.search; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Overrides coord similarity function to prevent expanded terms from devaluing scores. + */ +public class ExpandedTermsQuery extends BooleanQuery +{ + static Similarity similarity=new DefaultSimilarity() + { + public float coord(int overlap, int maxOverlap) + { + return 1; + } + }; + + public Similarity getSimilarity(Searcher searcher) + { + return similarity; + } +}