Index: src/java/org/apache/lucene/search/PhraseQuery.java =================================================================== --- src/java/org/apache/lucene/search/PhraseQuery.java (revision 807534) +++ src/java/org/apache/lucene/search/PhraseQuery.java (working copy) @@ -60,6 +60,52 @@ public int getSlop() { return slop; } /** + * Sets sub-phrase (partial phrase) config + */ + private SubPhraseConfig subPhraseConf; + + /** + * Config that fine tunes sub phrase (partial phrase) matches. + */ + static class SubPhraseConfig { + /** + * How much more valuable is an N word sub-phrase + * compared to an N-1 word sub-phrase. + * Each subphrase is scored as + * score += (sub-phrase length to the power of phraseBoost) + * So with phraseBoost = 2, a 4 words long sub-phrase adds a score of 16 + * while 3 word long sub phrase adds a score of 9. + */ + public int phraseBoost = 2; + /** + * Ignore idf when scoring. + */ + public boolean ignoreIdf = false; + /** + * Ignore field norms when scoring. + */ + public boolean ignoreFieldNorms = false; + /** + * Ignore duplicate sub phrases. For example, "sub1 sub2" is a + * duplicate of "sub1 sub2". But "sub1" is not a duplicate of "sub1 sub2" + */ + public boolean ignoreDuplicates = false; + /** + * When more than one sub-phrase matched, pick the longest for scoring. + */ + public boolean matchOnlyLongest = false; + } + + /** + * If the object is supplied, the query is treated as a sub-phrase query. + * + * @param subPhraseConf + */ + public void setSubPhraseConf(SubPhraseConfig subPhraseConf) { + this.subPhraseConf = subPhraseConf; + } + + /** * Adds a term to the end of the query phrase. * The relative position of the term is the one immediately after the last term added. 
*/ @@ -118,6 +164,9 @@ this.similarity = getSimilarity(searcher); idf = similarity.idf(terms, searcher); + // if sub phrase config is present and it ignores idf, do it here + if (subPhraseConf != null && subPhraseConf.ignoreIdf) + idf = 1.0f; } public String toString() { return "weight(" + PhraseQuery.this + ")"; } @@ -148,6 +197,11 @@ tps[i] = p; } + // If sub-phrase is configured use it. Else revert to existing logic. + if (subPhraseConf != null) { + return new SubPhraseScorer(this, tps, getPositions(), similarity, + reader.norms(field), terms, subPhraseConf); + } else if (slop == 0) // optimize exact case return new ExactPhraseScorer(this, tps, getPositions(), similarity, reader.norms(field)); @@ -219,8 +273,12 @@ Explanation fieldNormExpl = new Explanation(); byte[] fieldNorms = reader.norms(field); + // if sub phrase config is present and is configured to ignore field norms + // show the same in explain float fieldNorm = - fieldNorms!=null ? Similarity.decodeNorm(fieldNorms[doc]) : 1.0f; + fieldNorms != null + && (subPhraseConf == null || !subPhraseConf.ignoreFieldNorms) + ? Similarity.decodeNorm(fieldNorms[doc]) : 1.0f; fieldNormExpl.setValue(fieldNorm); fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); fieldExpl.addDetail(fieldNormExpl); Index: src/java/org/apache/lucene/search/SubPhraseScorer.java =================================================================== --- src/java/org/apache/lucene/search/SubPhraseScorer.java (revision 0) +++ src/java/org/apache/lucene/search/SubPhraseScorer.java (revision 0) @@ -0,0 +1,705 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; + +import org.apache.lucene.index.*; +import org.apache.lucene.util.PriorityQueue; + +/** + * Phrase Query scorer that scores based on sub-phrases (partial phrases) + * For example when queries like "3 bed homes new york swimming pool" + * are run against multiple fields each holding a different piece of information + * like city, beds, amenities etc. we need to score based on sub-phrase + * matches. + *
+ * Example: + * doc1 : "one two three sub1 sub2 sub3 four sub4" + * doc2 : "one two three sub1 sub2 four sub4 sub3" + * q : " none ten sub1 sub2 sub3 sub4 something" + * doc1 should score higher than doc2 since it has 3-word plus 1-word matches + * whereas doc2 has 2-word plus 1-word plus 1-word + * + * The difference between N-word vs N-1 word sub phrase score must be + * configurable. There should be a way to ignore matches except longest, like + * ignoring sub4 match above. We should also be able to ignore scoring factors + * outside of this doc so that the match is based on phrase match alone. + * + * Shingles look non-intuitive and expensive for this since the query as well as + * all fields of all documents need to be indexed with all possible (1...N)-gram + * shingles and then a boolean OR query fired. + */ + + class SubPhraseScorer extends Scorer { + /** + * config + */ + private PhraseQuery.SubPhraseConfig conf; + + /** + * Fields copied from ExactScorer as it is. + */ + private Weight weight; + protected byte[] norms; + protected float value; + /** + * Holds the score for current document. + */ + private float score; + + /** + * The below is for iterating over documents. + */ + private boolean firstTime = true; + private boolean more = true; + + /** + * This is a linked list holding all PhrasePositions. + * It is always kept sorted by doc id. First one holding the smallest doc id. + * The first N PPs, with same doc id, represent all the terms found in current + * document. The first node of this linked list, is the next minimum doc id + * that needs to be processed + */ + protected SubPhraseQueue pq; + protected SubPhrasePositions first, last; + + /** + * This class is used to score each document. It receives the first N nodes + * with same doc id, meaning all PPs for terms found in current doc. 
It then + * iterates through all position:offset tuples for all these terms, always finding + * the next minimum position for the current doc, maintaining a current sequence + * where each term in the sequence has its offset as well as position + * incremented by 1. Whenever the sequence breaks, it scores it and starts a + * new sequence to represent the new sub-phrase being matched. + */ + protected PerDocScorer perDoc; + + SubPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, + Similarity similarity, byte[] norms, ArrayList terms, + PhraseQuery.SubPhraseConfig subPhraseConf) { + super(similarity); + + this.norms = norms; + this.weight = weight; + this.value = weight.getValue(); + this.conf = subPhraseConf; + // create linked list + for (int i = 0; i < tps.length; i++) { + SubPhrasePositions pp = new SubPhrasePositions(tps[i], + offsets[i], terms.get(i)); + if (last != null) { + last.next = pp; + } else + first = pp; + last = pp; + } + // this queue is used for initial sorting of PPs + pq = new SubPhraseQueue(tps.length); + // this is used for scoring individual docs. + perDoc = new PerDocScorer(tps.length); + } + + /** + * First document in linked list is the current doc. + * + * @return + */ + public int doc() { + return first != null ? first.doc : NO_MORE_DOCS; + } + + /** + * First document in linked list is the current doc. + * + * @return + */ + public int docID() { + return doc(); + } + + /** + * Increment TermPositions for all terms with doc id as current doc, + * and move each to correct position in the list so that next minimum doc is the + * first node of the list. + * + * @return + * @throws IOException + */ + public boolean next() throws IOException { + return nextDoc() != NO_MORE_DOCS; + } + + /** + * Increment TermPositions for all terms with doc id as current doc, + * and move each to correct position in the list so that next minimum doc is the + * first node of the list. 
+ * + * @return + * @throws IOException + */ + public int nextDoc() throws IOException { + if (firstTime) { + // sort the list and init term positions + init(); + firstTime = false; + } else { + // increment TPs for current doc and move them to correct pos. + doNext(); + } + perDoc.reset(); + score = perDoc.score(); + return doc(); + } + + /** + * Increment TermPositions for all terms with doc id as current doc, + * and move each to correct position in the list so that next minimum doc + * past the target is the first node of the list. + * + * @return + * @throws IOException + */ + public boolean skipTo(int target) throws IOException { + return advance(target) != NO_MORE_DOCS; + } + + /** + * Increment TermPositions for all terms with doc id as current doc, + * and move each to correct position in the list so that next minimum doc + * past the target is the first node of the list. + * + * @return + * @throws IOException + */ + public int advance(int target) throws IOException { + firstTime = false; + boolean more = false; + SubPhrasePositions last = null; + SubPhrasePositions pp = first; + while (pp != null) { + // increment each TP past target + if (!pp.skipTo(target)) { + // if no more docs for TP, remove from list + pp = remove(pp, last); + } else { + more = true; + last = pp; + pp = last.next; + } + } + this.more = more; + // sort it + if (more) + sort(); + perDoc.reset(); + score = perDoc.score(); + return doc(); + } + + /** + * Calculate score. Ignore fields norms if config says so. + * + * @return + * @throws IOException + */ + public float score() throws IOException { + float raw = getSimilarity().tf(score) * value; + float nrms = !conf.ignoreFieldNorms ? + Similarity.decodeNorm(norms[first.doc]) : 1.0f; + return raw * nrms; + } + + /** + * Explain scoring + * + * @param doc The document number for the explanation. 
+ * @return + * @throws IOException + */ + public Explanation explain(final int doc) throws IOException { + Explanation tfExplanation = new Explanation(); + while (next() && doc() < doc) { + } + float phraseFreq = (doc() == doc) ? score : 0.0f; + float tfval = getSimilarity().tf(phraseFreq); + tfExplanation.setValue(tfval); + tfExplanation.setDescription("tf(subPhraseScore=" + phraseFreq + ")"); + return tfExplanation; + } + + public String toString() { + return "scorer(" + weight + ")"; + } + + /** + * Increment all TPs for current doc. Move to correct position. + * Remove those that don't have any more docs. + * + * @throws IOException + */ + private void doNext() throws IOException { + boolean more = false; + SubPhrasePositions pp = first; + SubPhrasePositions last = null; + int cur = first.doc; + // iterate only those that match current doc. + // The list is in sorted order. + while (pp != null && pp.doc == cur) { + // increment TP + if (!pp.next()) + // If no more docs, remove it + pp = remove(pp, last); + else { + // Move the TP to correct position in sorted list + // Either first remains where it was and loop ends or second becomes + // first now, + moveFirst(); + pp = first; + } + } + if (first != null) + more = true; + this.more = more; + } + + /** + * Move the first item in the list to correct postion in the sorted list. + */ + private void moveFirst() { + if (first == null || first.next == null) + return; + SubPhrasePositions pp = first.next; + SubPhrasePositions prev = first; + // traverse to find correct position. 
If same doc id, sort on offset so that + // nodes are sorted in the same order as they appear in query + while (pp != null && + ((pp.doc < first.doc) || + (pp.doc == first.doc && pp.offset < first.offset))) { + prev = pp; + pp = pp.next; + } + // insert in correct position + if (pp == null) { + // move past all + prev.next = first; + first = first.next; + prev.next.next = null; + } else { + // insert before pp + if (prev != first) { + SubPhrasePositions tmp = first.next; + prev.next = first; + first.next = pp; + first = tmp; + } + } + } + + /** + * Remove a node from list + * + * @param pp + * @param last + * @return + * @throws IOException + */ + public SubPhrasePositions remove(SubPhrasePositions pp, + SubPhrasePositions last) + throws IOException { + SubPhrasePositions next; + if (pp == first) { + first = first.next; + next = first; + } else { + last.next = pp.next; + next = last.next; + } + return next; + } + + /** + * Init all TPs and sort. + * remove those that don't have any docs. + * + * @throws IOException + */ + private void init() throws IOException { + boolean more = false; + SubPhrasePositions last = null; + SubPhrasePositions pp = first; + while (pp != null) { + if (!pp.next()) { + pp = remove(pp, last); + } else { + more = true; + last = pp; + pp = last.next; + } + } + this.more = more; + if (more) + sort(); + } + + /** + * sort using priority queue + */ + private void sort() { + pq.clear(); + for (SubPhrasePositions pp = first; pp != null; pp = pp.next) + pq.put(pp); + pqToList(); + } + + /** + * convert a queue to linked list by reading. Creates a sorted list + */ + protected final void pqToList() { + last = first = null; + while (pq.top() != null) { + SubPhrasePositions pp = (SubPhrasePositions) pq.pop(); + if (last != null) { + last.next = pp; + } else + first = pp; + last = pp; + pp.next = null; + } + } + + /** + * Scores each document. Always reads the next minimum position for any + * term. 
If its offset is one greater than last read, and its position is one greater + * than last read, adds it to current sequence length. If not, the current sub + * sequence has ended and can be scored and score is added. + */ + class PerDocScorer { + /** + * How many terms are being asked to be matched for this doc ? + */ + int termCount; + /** + * How many are remaining ? Zero means document is processed. + */ + int curTerms; + /** + * used as init flag. We start by sorting by positions. + */ + boolean sorted; + /** + * cache the score and return to guard against multiple calls. + */ + boolean docScored; + /** + * computed score + */ + int score; + /** + * current sequence length + */ + int curSeqLen = 0; + /** + * which term the current sequence started with ? + */ + int curOffset = 0; + /** + * which position in the document did the current sequence start at ? + */ + int curPos = 0; + /** + * what is the longest match score so far ? + */ + int longestMatchScore = 0; + /** + * the sorted list used to pick the next minimum. + */ + SubPhrasePositions[] sortedOffsets; + /** + * set to remember matches already seen. The sequences are converted + * to long, where each bit represents their position in query. So a sequence + * containing the second and third words of query gets stored as "0...110" = 6 + * So it works only for queries with up to 64 terms. Is there a better way ? + */ + HashSet