Index: src/java/org/apache/lucene/search/SubPhraseQuery.java =================================================================== --- src/java/org/apache/lucene/search/SubPhraseQuery.java (revision 0) +++ src/java/org/apache/lucene/search/SubPhraseQuery.java (revision 0) @@ -0,0 +1,370 @@ +package org.apache.lucene.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Set; +import java.util.ArrayList; + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Explanation.IDFExplanation; +import org.apache.lucene.util.ToStringUtils; + +/** A Query that scores documents by the sub-phrases (runs of consecutive query terms) + * they contain; scoring can be tuned with a {@link SubPhraseConfig}. + */ +public class SubPhraseQuery extends Query { + + /** Field shared by all terms; set from the first term added. */ + private String field; + /** Terms of the phrase, in the order they were added. */ + private ArrayList terms = new ArrayList(4); + /** Relative position of each term, parallel to {@link #terms}. */ + private ArrayList positions = new ArrayList(4); + /** Largest relative position added so far. */ + private int maxPosition = 0; + + /** Constructs an empty sub-phrase query. */ + public SubPhraseQuery() {} + + /** + * Sub-phrase (partial phrase) config; stays null until {@link #setSubPhraseConf} is called. + */ + private SubPhraseConfig subPhraseConf; + + /** + * Config that fine tunes sub phrase (partial phrase) matches. 
+   */
+  public static class SubPhraseConfig {
+    /**
+     * How much more valuable an N-word sub-phrase is compared to an
+     * (N-1)-word sub-phrase. Each sub-phrase is scored as
+     * score += (sub-phrase length to the power of phraseBoost).
+     * So with phraseBoost = 2, a 4-word sub-phrase adds a score of 16
+     * while a 3-word sub-phrase adds a score of 9.
+     */
+    public int phraseBoost = 2;
+    /**
+     * Ignore idf when scoring.
+     */
+    public boolean ignoreIdf = false;
+    /**
+     * Ignore field norms when scoring.
+     */
+    public boolean ignoreFieldNorms = false;
+    /**
+     * Ignore duplicate sub-phrases. For example, a second "sub1 sub2" is a
+     * duplicate of "sub1 sub2", but "sub1" is not a duplicate of "sub1 sub2".
+     */
+    public boolean ignoreDuplicates = false;
+    /**
+     * When more than one sub-phrase matched, pick the longest for scoring.
+     */
+    public boolean matchOnlyLongest = false;
+
+    /** Two configs are equal when all five settings are equal. */
+    public boolean equals(Object o) {
+      if (this == o) return true;
+      if (o == null || getClass() != o.getClass()) return false;
+
+      SubPhraseConfig that = (SubPhraseConfig) o;
+      return phraseBoost == that.phraseBoost
+          && ignoreIdf == that.ignoreIdf
+          && ignoreFieldNorms == that.ignoreFieldNorms
+          && ignoreDuplicates == that.ignoreDuplicates
+          && matchOnlyLongest == that.matchOnlyLongest;
+    }
+
+    /** Hash over the same five settings that {@link #equals} compares. */
+    public int hashCode() {
+      int result = phraseBoost;
+      result = 31 * result + (ignoreIdf ? 1 : 0);
+      result = 31 * result + (ignoreFieldNorms ? 1 : 0);
+      result = 31 * result + (ignoreDuplicates ? 1 : 0);
+      result = 31 * result + (matchOnlyLongest ? 1 : 0);
+      return result;
+    }
+  }
+
+  /**
+   * Sets the sub-phrase scoring configuration for this query.
+   * The config class is public so that callers outside this package can
+   * actually construct the argument this public setter requires.
+   *
+   * @param subPhraseConf the configuration to score with; may be null, in which
+   *        case the scorer is built with a default-constructed config
+   */
+  public void setSubPhraseConf(SubPhraseConfig subPhraseConf) {
+    this.subPhraseConf = subPhraseConf;
+  }
+
+  /**
+   * Adds a term to the end of the query phrase.
+   * The relative position of the term is the one immediately after the last term added.
+   *
+   * @param term the term to append
+   */
+  public void add(Term term) {
+    int position = 0;
+    if (positions.size() > 0)
+      position = ((Integer) positions.get(positions.size() - 1)).intValue() + 1;
+
+    add(term, position);
+  }
+
+  /**
+   * Adds a term to the end of the query phrase.
+   * The relative position of the term within the phrase is specified explicitly.
+   * This allows e.g. phrases with more than one term at the same position
+   * or phrases with gaps (e.g. in connection with stopwords).
+   *
+   * @param term the term to add; must be in the same field as the first term added
+   * @param position relative position of the term within the phrase
+   * @throws IllegalArgumentException if the term's field differs from the field
+   *         of the first term added
+   */
+  public void add(Term term, int position) {
+    if (terms.size() == 0)
+      field = term.field();
+    else if (!term.field().equals(field))   // value comparison; does not rely on interning
+      throw new IllegalArgumentException("All phrase terms must be in the same field: " + term);
+
+    terms.add(term);
+    positions.add(Integer.valueOf(position));
+    if (position > maxPosition) maxPosition = position;
+  }
+
+  /** Returns the set of terms in this phrase. */
+  public Term[] getTerms() {
+    return (Term[])terms.toArray(new Term[0]);
+  }
+
+  /**
+   * Returns the relative positions of terms in this phrase.
+   */
+  public int[] getPositions() {
+    int[] result = new int[positions.size()];
+    for (int i = 0; i < positions.size(); i++)
+      result[i] = ((Integer) positions.get(i)).intValue();
+    return result;
+  }
+
+  /**
+   * Creates the weight for this query.
+   * A single-term query is delegated to a plain TermQuery, but only while no
+   * sub-phrase config is set: a TermQuery cannot honor settings such as
+   * ignoreIdf or ignoreFieldNorms, so with a config present the regular
+   * sub-phrase weight is used even for one term.
+   *
+   * @param searcher the searcher the weight is created for
+   * @return the weight to score this query with
+   * @throws IOException propagated from weight creation
+   */
+  public Weight createWeight(Searcher searcher) throws IOException {
+    if (terms.size() == 1 && subPhraseConf == null) {   // optimize one-term case
+      Term term = (Term)terms.get(0);
+      Query termQuery = new TermQuery(term);
+      termQuery.setBoost(getBoost());
+      return termQuery.createWeight(searcher);
+    }
+    return new SubPhraseWeight(searcher);
+  }
+
+  /**
+   * @see org.apache.lucene.search.Query#extractTerms(java.util.Set)
+   */
+  public void extractTerms(Set queryTerms) {
+    queryTerms.addAll(terms);
+  }
+
+  /** Prints a user-readable version of this query.
*/
+  public String toString(String f) {
+    StringBuffer buffer = new StringBuffer();
+    if (field != null && !field.equals(f)) {
+      buffer.append(field);
+      buffer.append(":");
+    }
+
+    buffer.append("\"");
+    // one slot per relative position; terms sharing a position are joined with '|'
+    String[] pieces = new String[maxPosition + 1];
+    for (int i = 0; i < terms.size(); i++) {
+      int pos = ((Integer)positions.get(i)).intValue();
+      String s = pieces[pos];
+      if (s == null) {
+        s = ((Term)terms.get(i)).text();
+      } else {
+        s = s + "|" + ((Term)terms.get(i)).text();
+      }
+      pieces[pos] = s;
+    }
+    for (int i = 0; i < pieces.length; i++) {
+      if (i > 0) {
+        buffer.append(' ');
+      }
+      String s = pieces[i];
+      if (s == null) {
+        buffer.append('?');   // gap: no term was added at this position
+      } else {
+        buffer.append(s);
+      }
+    }
+    buffer.append("\"");
+
+    buffer.append(ToStringUtils.boost(getBoost()));
+
+    return buffer.toString();
+  }
+
+  /**
+   * Returns true iff o is equal to this: same boost, same sub-phrase config
+   * (both absent, or both present and equal), same terms and same positions.
+   */
+  public boolean equals(Object o) {
+    if (!(o instanceof SubPhraseQuery))
+      return false;
+    SubPhraseQuery other = (SubPhraseQuery)o;
+    // subPhraseConf is null until setSubPhraseConf() is called, so compare null-safely
+    boolean sameConf = (this.subPhraseConf == null)
+      ? other.subPhraseConf == null
+      : this.subPhraseConf.equals(other.subPhraseConf);
+    return (this.getBoost() == other.getBoost())
+      && sameConf
+      && this.terms.equals(other.terms)
+      && this.positions.equals(other.positions);
+  }
+
+  /** Returns a hash code value for this object; an absent config contributes 0. */
+  public int hashCode() {
+    int confHash = (subPhraseConf == null) ? 0 : subPhraseConf.hashCode();
+    return Float.floatToIntBits(getBoost())
+      ^ confHash
+      ^ terms.hashCode()
+      ^ positions.hashCode();
+  }
+
+  /** Weight for a SubPhraseQuery; builds the SubPhraseScorer and the explanation. */
+  private class SubPhraseWeight extends Weight {
+    private Similarity similarity;
+    private float value;
+    private float idf;
+    private float queryNorm;
+    private float queryWeight;
+    private IDFExplanation idfExp;
+
+    public SubPhraseWeight(Searcher searcher)
+      throws IOException {
+      this.similarity = getSimilarity(searcher);
+
+      idfExp = similarity.idfExplain(terms, searcher);
+      idf = idfExp.getIdf();
+      // if sub phrase config is present and it ignores idf, do it here
+      if (subPhraseConf != null && subPhraseConf.ignoreIdf)
+        idf = 1.0f;
+    }
+
+    public String toString() { return "weight(" + SubPhraseQuery.this + ")"; }
+
+    public Query getQuery() { return SubPhraseQuery.this; }
+    public float getValue() { return value; }
+
+    public float sumOfSquaredWeights() {
+      queryWeight = idf * getBoost();             // compute query weight
+      return queryWeight * queryWeight;           // square it
+    }
+
+    public void normalize(float queryNorm) {
+      this.queryNorm = queryNorm;
+      queryWeight *= queryNorm;                   // normalize query weight
+      value = queryWeight * idf;                  // idf for document
+    }
+
+    /**
+     * Builds the scorer over the positions of all query terms.
+     *
+     * @return the scorer, or null when the query has no terms or the reader
+     *         returns no TermPositions for one of them
+     */
+    public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
+      if (terms.size() == 0)                      // optimize zero-term case
+        return null;
+
+      TermPositions[] tps = new TermPositions[terms.size()];
+      for (int i = 0; i < terms.size(); i++) {
+        TermPositions p = reader.termPositions((Term)terms.get(i));
+        if (p == null)
+          return null;
+        tps[i] = p;
+      }
+
+      // Fall back to default settings when no config was supplied. A local is
+      // used on purpose: assigning the default to the query's own field would
+      // change the query's equals()/hashCode() as a side effect of searching.
+      SubPhraseConfig conf = (subPhraseConf != null) ? subPhraseConf : new SubPhraseConfig();
+
+      return new SubPhraseScorer(this, tps, getPositions(), similarity,
+                                 reader.norms(field), terms, conf);
+    }
+
+    /**
+     * Explains the score of one document as queryWeight * fieldWeight.
+     *
+     * @param reader the reader the document lives in
+     * @param doc the document number to explain
+     */
+    public Explanation explain(IndexReader reader, int doc)
+      throws IOException {
+
+      Explanation result = new Explanation();
+      result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
+
+      StringBuffer docFreqs = new StringBuffer();
+      StringBuffer query = new StringBuffer();
+      query.append('\"');
+      docFreqs.append(idfExp.explain());
+      for (int i = 0; i < terms.size(); i++) {
+        if (i != 0) {
+          query.append(" ");
+        }
+
+        Term term = (Term)terms.get(i);
+
+        query.append(term.text());
+      }
+      query.append('\"');
+
+      Explanation idfExpl =
+        new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
+
+      // explain query weight
+      Explanation queryExpl = new Explanation();
+      queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
+
+      Explanation boostExpl = new Explanation(getBoost(), "boost");
+      if (getBoost() != 1.0f)
+        queryExpl.addDetail(boostExpl);
+      queryExpl.addDetail(idfExpl);
+
+      Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
+      queryExpl.addDetail(queryNormExpl);
+
+      queryExpl.setValue(boostExpl.getValue() *
+                         idfExpl.getValue() *
+                         queryNormExpl.getValue());
+
+      result.addDetail(queryExpl);
+
+      // explain field weight
+      Explanation fieldExpl = new Explanation();
+      fieldExpl.setDescription("fieldWeight("+field+":"+query+" in "+doc+
+                               "), product of:");
+
+      Scorer scorer = scorer(reader, true, false);
+      if (scorer == null) {
+        return new Explanation(0.0f, "no matching docs");
+      }
+      Explanation tfExpl = scorer.explain(doc);
+      fieldExpl.addDetail(tfExpl);
+      fieldExpl.addDetail(idfExpl);
+
+      Explanation fieldNormExpl = new Explanation();
+      byte[] fieldNorms = reader.norms(field);
+      // if sub phrase config is present and is configured to ignore field norms
+      // show the same in explain
+      float fieldNorm =
+        fieldNorms != null
+          && (subPhraseConf == null || !subPhraseConf.ignoreFieldNorms)
+          ? Similarity.decodeNorm(fieldNorms[doc]) : 1.0f;
+      fieldNormExpl.setValue(fieldNorm);
+      fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
+      fieldExpl.addDetail(fieldNormExpl);
+
+      fieldExpl.setValue(tfExpl.getValue() *
+                         idfExpl.getValue() *
+                         fieldNormExpl.getValue());
+
+      result.addDetail(fieldExpl);
+
+      // combine them
+      result.setValue(queryExpl.getValue() * fieldExpl.getValue());
+
+      // a query weight of exactly 1 adds nothing, so report the field weight alone
+      if (queryExpl.getValue() == 1.0f)
+        return fieldExpl;
+
+      return result;
+    }
+  }
+}
Index: src/java/org/apache/lucene/search/SubPhraseScorer.java
===================================================================
--- src/java/org/apache/lucene/search/SubPhraseScorer.java (revision 0)
+++ src/java/org/apache/lucene/search/SubPhraseScorer.java (revision 0)
@@ -0,0 +1,705 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; + +import org.apache.lucene.index.*; +import org.apache.lucene.util.PriorityQueue; + +/** + * Phrase Query scorer that scores based on sub-phrases (partial phrases) + * For example when queries like "3 bed homes new york swimming pool" + * are run against multiple fields each holding a different piece of information + * like city, beds, amenities etc. we need to score based on sub-phrase + * matches. + *

+ * Example: + * doc1 : "one two three sub1 sub2 sub3 four sub4" + * doc2 : "one two three sub1 sub2 four sub4 sub3" + * q : " none ten sub1 sub2 sub3 sub4 something" + * doc1 should score higher than doc2 since it has 3-word plus 1-word matches + * whereas doc2 has 2-word plus 1-word plus 1-word matches.

+ * The difference between N-word vs N-1 word sub phrase score must be + * configurable. There should be a way to ignore all matches except the longest, like + * ignoring the sub4 match above. We should also be able to ignore scoring factors + * outside of this doc so that the match is based on phrase match alone.

+ * Shingles look non-intuitive and expensive for this since the query as well as + * all fields of all documents need to be indexed with all possible (1...N)-gram + * shingles and then a boolean OR query fired. + */ + +class SubPhraseScorer extends Scorer { + /** + * Sub-phrase scoring config, as passed to the constructor. + */ + private SubPhraseQuery.SubPhraseConfig conf; + + /** + * Fields copied from ExactScorer as it is. + */ + private Weight weight; + protected byte[] norms; + protected float value; + /** + * Holds the sub-phrase score computed for the current document. + */ + private float score; + + /** + * The below is for iterating over documents. + */ + private boolean firstTime = true; + private boolean more = true; + + /** + * This is a linked list holding all SubPhrasePositions. + * It is always kept sorted by doc id, the first one holding the smallest doc id. + * The first N PPs, with same doc id, represent all the terms found in current + * document. The first node of this linked list is the next minimum doc id + * that needs to be processed. + */ + protected SubPhraseQueue pq; + protected SubPhrasePositions first, last; + + /** + * This class is used to score each document. It receives the first N nodes + * with same doc id, meaning all PPs for terms found in current doc. It then + * iterates through all position:offset tuples for all these terms, always finding + * the next minimum position for the current doc, maintaining a current sequence + * where each term in the sequence has its offset as well as position + * incremented by 1. Whenever the sequence breaks, it scores it and starts + * new sequence to represent the new sub-phrase being matched. 
+ */ + protected PerDocScorer perDoc; + + /** + * Wraps each TermPositions in a SubPhrasePositions node and chains the nodes + * into the linked list, in query order. + * + * @param tps one TermPositions per query term + * @param offsets position of each term within the query, parallel to tps + * @param norms field norms; read by score() unless the config ignores them + * @param terms the query terms, parallel to tps + * @param subPhraseConf scoring config; dereferenced without a null check + */ + SubPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, + Similarity similarity, byte[] norms, ArrayList terms, + SubPhraseQuery.SubPhraseConfig subPhraseConf) { + super(similarity); + + this.norms = norms; + this.weight = weight; + this.value = weight.getValue(); + this.conf = subPhraseConf; + // create linked list + for (int i = 0; i < tps.length; i++) { + SubPhrasePositions pp = new SubPhrasePositions(tps[i], + offsets[i], terms.get(i)); + if (last != null) { + last.next = pp; + } else + first = pp; + last = pp; + } + // this queue is used for initial sorting of PPs + pq = new SubPhraseQueue(tps.length); + // this is used for scoring individual docs. + perDoc = new PerDocScorer(tps.length); + } + + /** + * First document in linked list is the current doc. + * + * @return the doc id of the head of the list, or NO_MORE_DOCS when the list is empty + */ + public int doc() { + return first != null ? first.doc : NO_MORE_DOCS; + } + + /** + * First document in linked list is the current doc. + * + * @return same value as {@link #doc()} + */ + public int docID() { + return doc(); + } + + /** + * Increment TermPositions for all terms with doc id as current doc, + * and move each to correct position in the list so that next minimum doc is the + * first node of the list. + * + * @return true unless nextDoc() returned NO_MORE_DOCS + * @throws IOException propagated from nextDoc() + */ + public boolean next() throws IOException { + return nextDoc() != NO_MORE_DOCS; + } + + /** + * Increment TermPositions for all terms with doc id as current doc, + * and move each to correct position in the list so that next minimum doc is the + * first node of the list. + * + * @return the new current doc id, as reported by doc() + * @throws IOException propagated from init() or doNext() + */ + public int nextDoc() throws IOException { + if (firstTime) { + // sort the list and init term positions + init(); + firstTime = false; + } else { + // increment TPs for current doc and move them to correct pos. 
+      doNext();
+    }
+    perDoc.reset();
+    score = perDoc.score();
+    return doc();
+  }
+
+  /**
+   * Moves to the first document at or past the target.
+   *
+   * @param target the smallest acceptable doc id
+   * @return true unless advance() returned NO_MORE_DOCS
+   * @throws IOException propagated from advance()
+   */
+  public boolean skipTo(int target) throws IOException {
+    return advance(target) != NO_MORE_DOCS;
+  }
+
+  /**
+   * Moves the TermPositions that are on the current doc, or still before the
+   * target, to their first doc at or past the target, drops the exhausted
+   * ones, and re-sorts the list so that the next minimum doc is its first node.
+   * TermPositions already on a later doc at or past the target are left alone.
+   *
+   * @param target the smallest acceptable doc id
+   * @return the new current doc id, as reported by doc()
+   * @throws IOException propagated from the underlying TermPositions
+   */
+  public int advance(int target) throws IOException {
+    // Before the first call no TermPositions is positioned yet, so every one
+    // of them has to be moved; afterwards only the stale ones are.
+    boolean unpositioned = firstTime;
+    firstTime = false;
+    int cur = doc();
+    boolean more = false;
+    SubPhrasePositions last = null;
+    SubPhrasePositions pp = first;
+    while (pp != null) {
+      // TermPositions.skipTo() always moves past the current entry, so it must
+      // only be applied to entries on the current doc or still before target;
+      // an entry already on a later doc >= target would otherwise lose that doc.
+      boolean stale = unpositioned || pp.doc == cur || pp.doc < target;
+      if (stale && !pp.skipTo(target)) {
+        // if no more docs for TP, remove from list
+        pp = remove(pp, last);
+      } else {
+        more = true;
+        last = pp;
+        pp = last.next;
+      }
+    }
+    this.more = more;
+    // sort it
+    if (more)
+      sort();
+    perDoc.reset();
+    score = perDoc.score();
+    return doc();
+  }
+
+  /**
+   * Calculate score. Field norms are left out when the config says so, or
+   * when no norms array was supplied (the weight's explain() guards the
+   * same array against null in the same way).
+   *
+   * @return tf(sub-phrase score) * weight value, times the decoded field norm
+   *         when norms are in use
+   * @throws IOException declared by the Scorer contract
+   */
+  public float score() throws IOException {
+    float raw = getSimilarity().tf(score) * value;
+    if (conf.ignoreFieldNorms || norms == null)
+      return raw;
+    return raw * Similarity.decodeNorm(norms[first.doc]);
+  }
+
+  /**
+   * Explain scoring
+   *
+   * @param doc The document number for the explanation.
+   * @return explanation of the tf factor for the document
+   * @throws IOException propagated from next()
+   */
+  public Explanation explain(final int doc) throws IOException {
+    Explanation tfExplanation = new Explanation();
+    while (next() && doc() < doc) {
+    }
+    float phraseFreq = (doc() == doc) ?
score : 0.0f; + float tfval = getSimilarity().tf(phraseFreq); + tfExplanation.setValue(tfval); + tfExplanation.setDescription("tf(subPhraseScore=" + phraseFreq + ")"); + return tfExplanation; + } + + public String toString() { + return "scorer(" + weight + ")"; + } + + /** + * Increment all TPs for current doc. Move to correct position. + * Remove those that don't have any more docs. + * Reads first.doc without a null check, so the list must not be empty. + * + * @throws IOException propagated from the underlying TermPositions + */ + private void doNext() throws IOException { + boolean more = false; + SubPhrasePositions pp = first; + SubPhrasePositions last = null; + int cur = first.doc; + // iterate only those that match current doc. + // The list is in sorted order. + while (pp != null && pp.doc == cur) { + // increment TP + if (!pp.next()) + // If no more docs, remove it + pp = remove(pp, last); + else { + // Move the TP to correct position in sorted list + // Either first remains where it was and loop ends or second becomes + // first now, + moveFirst(); + pp = first; + } + } + if (first != null) + more = true; + this.more = more; + } + + /** + * Move the first item in the list to correct position in the sorted list. + */ + private void moveFirst() { + if (first == null || first.next == null) + return; + SubPhrasePositions pp = first.next; + SubPhrasePositions prev = first; + // traverse to find correct position. 
If same doc id, sort on offset so that + // nodes are sorted in the same order as they appear in query + while (pp != null && + ((pp.doc < first.doc) || + (pp.doc == first.doc && pp.offset < first.offset))) { + prev = pp; + pp = pp.next; + } + // insert in correct position + if (pp == null) { + // move past all + prev.next = first; + first = first.next; + prev.next.next = null; + } else { + // insert before pp + if (prev != first) { + SubPhrasePositions tmp = first.next; + prev.next = first; + first.next = pp; + first = tmp; + } + } + } + + /** + * Remove a node from list + * + * @param pp the node to unlink + * @param last the node preceding pp; only dereferenced when pp is not the head + * @return the node that followed pp (the new head when pp was the head), or null + * @throws IOException declared but not thrown by this method's own code + */ + public SubPhrasePositions remove(SubPhrasePositions pp, + SubPhrasePositions last) + throws IOException { + SubPhrasePositions next; + if (pp == first) { + first = first.next; + next = first; + } else { + last.next = pp.next; + next = last.next; + } + return next; + } + + /** + * Init all TPs and sort. + * Remove those that don't have any docs. + * + * @throws IOException propagated from the underlying TermPositions + */ + private void init() throws IOException { + boolean more = false; + SubPhrasePositions last = null; + SubPhrasePositions pp = first; + while (pp != null) { + if (!pp.next()) { + pp = remove(pp, last); + } else { + more = true; + last = pp; + pp = last.next; + } + } + this.more = more; + if (more) + sort(); + } + + /** + * Sort using the priority queue: load every node into pq, then rebuild the list from it. + */ + private void sort() { + pq.clear(); + for (SubPhrasePositions pp = first; pp != null; pp = pp.next) + pq.put(pp); + pqToList(); + } + + /** + * Convert the queue to a linked list by popping until it is empty. Creates a sorted list + * and resets both first and last. + */ + protected final void pqToList() { + last = first = null; + while (pq.top() != null) { + SubPhrasePositions pp = (SubPhrasePositions) pq.pop(); + if (last != null) { + last.next = pp; + } else + first = pp; + last = pp; + pp.next = null; + } + } + + /** + * Scores each document. Always reads the next minimum position for any + * term. 
If its offset is one greater than last read, and its position is one greater + * than last read, adds it to current sequence length. If not, the current sub + * sequence has ended and can be scored and score is added. + */ + class PerDocScorer { + /** + * How many terms are being asked to be matched for this doc ? + */ + int termCount; + /** + * How many are remaining ? Zero means document is processed. + */ + int curTerms; + /** + * used as init flag. We start by sorting by positions. + */ + boolean sorted; + /** + * cache the score and return to guard against multiple calls. + */ + boolean docScored; + /** + * computed score + */ + int score; + /** + * current sequence length + */ + int curSeqLen = 0; + /** + * which query term (offset in query) did the current sequence start with ? + */ + int curOffset = 0; + /** + * which position in the document did the current sequence start at ? + */ + int curPos = 0; + /** + * what is the longest match score so far ? + */ + int longestMatchScore = 0; + /** + * the sorted list used to pick the next minimum. + */ + SubPhrasePositions[] sortedOffsets; + /** + * set to remember matches already seen. The sequences are converted + * to long, where each bit represents their position in query. So a sequence + * containing second and third words of query gets stored as "0...110" = 6 + * So it works only for queries with up to 64 terms. Is there a better way ? + * NOTE(review): confirm that the encoder in scoreSubPhrase() shifts a long, not an int. 
+     */
+    HashSet duplicates;
+
+    /**
+     * Allocates the per-document work arrays.
+     *
+     * @param length number of query terms, i.e. the maximum number of
+     *        SubPhrasePositions that can be on one document
+     */
+    public PerDocScorer(int length) {
+      termCount = length;
+      sortedOffsets = new SubPhrasePositions[termCount];
+      if (conf.ignoreDuplicates)
+        duplicates = new HashSet(100);
+    }
+
+    /**
+     * Clear the per-document state; called for every new document.
+     */
+    void reset() {
+      sorted = false;
+      docScored = false;
+      curTerms = 0;
+      longestMatchScore = 0;
+      score = 0;
+      // null-safe: the set only exists once duplicate detection has been used
+      if (duplicates != null)
+        duplicates.clear();
+    }
+
+    /**
+     * Walks all term positions of the current document in ascending position
+     * order, splits them into sub-sequences (runs where both the query offset
+     * and the document position grow by one) and scores each run.
+     *
+     * @return the score of the current document; 0 when the list is empty.
+     *         The value is cached until the next reset().
+     */
+    int score() {
+      if (first == null)
+        return 0;
+      if (docScored)
+        return score;
+      // get first
+      SubPhrasePositions min = nextMin();
+      curSeqLen = 1;
+      curOffset = min.offset;
+      curPos = min.position;
+      // get next
+      min = nextMin();
+      while (min != null) {
+        // if below matches, it is a continuation of the current sub sequence.
+        if (min.offset == curOffset + curSeqLen &&
+            min.position == curPos + curSeqLen)
+          curSeqLen++;
+        else {
+          // sub sequence ended. Score it.
+          scoreSubPhrase();
+          curSeqLen = 1;
+          curOffset = min.offset;
+          curPos = min.position;
+        }
+        min = nextMin();
+      }
+      // score the last sequence found.
+      scoreSubPhrase();
+      docScored = true;
+      if (conf.matchOnlyLongest) {
+        score = longestMatchScore;
+      }
+      return score;
+    }
+
+    /**
+     * Actual scoring algorithm:
+     * score += (seq len) ^ phraseBoost
+     * When duplicates are ignored, a run whose set of query offsets was
+     * already seen in this document adds nothing to the score.
+     */
+    private void scoreSubPhrase() {
+      int s = 1;
+      for (int i = 1; i <= conf.phraseBoost; i++)
+        s *= curSeqLen;
+      if (s > longestMatchScore)
+        longestMatchScore = s;
+      score += s;
+      if (conf.ignoreDuplicates) {
+        if (duplicates == null)
+          duplicates = new HashSet(100);
+        // Encode the run as a bit set of query offsets. The shift must be done
+        // on a long (1L): an int shift only uses the low 5 bits of j, so
+        // offsets of 32 and above would collide with low offsets.
+        // Even so, only offsets 0..63 are distinguishable.
+        long encodedSeq = 0;
+        for (int j = curOffset; j < curOffset + curSeqLen; j++) {
+          encodedSeq |= (1L << j);
+        }
+        // add() returns false when the run was already recorded: take it back out
+        if (!duplicates.add(Long.valueOf(encodedSeq))) {
+          score -= s;
+        }
+      }
+    }
+
+    /**
+     * Get next min position.
+     * Increments the first item and moves to correct position.
+     * Removes those that have been exhausted.
+     *
+     * @return the entry with the smallest position still unread in the
+     *         current document, or null when all positions were consumed
+     * @throws RuntimeException wrapping the IOException when reading the next
+     *         position fails; swallowing it would silently truncate the score
+     */
+    private SubPhrasePositions nextMin() {
+      if (!sorted) {
+        // first call for this document: build the position-sorted array
+        sorted = true;
+        sortAsc();
+        return sortedOffsets[0];
+      }
+      if (curTerms == 0) {
+        return null;
+      }
+      try {
+        incrementAndFindMin();
+      } catch (IOException e) {
+        throw new RuntimeException("failed to read next term position in doc " + doc(), e);
+      }
+      return (curTerms == 0) ? null : sortedOffsets[0];
+    }
+
+    /**
+     * Increment first item and move it to correct position; drop it from the
+     * array when it has no positions left in the current document.
+     *
+     * @throws IOException propagated from the underlying TermPositions
+     */
+    private void incrementAndFindMin() throws IOException {
+      if (!sortedOffsets[0].nextPosition()) {
+        // exhausted: shift the remaining entries one slot to the left
+        curTerms--;
+        for (int i = 0; i < curTerms; i++)
+          sortedOffsets[i] = sortedOffsets[i + 1];
+      } else {
+        // bubble the advanced entry to the right until the array is ordered again
+        SubPhrasePositions pos = sortedOffsets[0];
+        int k = 1;
+        while (k < curTerms && pos.position > sortedOffsets[k].position) {
+          sortedOffsets[k - 1] = sortedOffsets[k];
+          k++;
+        }
+        sortedOffsets[k - 1] = pos;
+      }
+    }
+
+    /**
+     * Sort first time: insertion-sorts, by position, all list nodes that are
+     * on the current document into sortedOffsets and sets curTerms to their count.
+     */
+    private void sortAsc() {
+      sortedOffsets[0] = first;
+      curTerms = 1;
+      int cur = doc();
+      int i = 0;
+      SubPhrasePositions it = first.next;
+      while (it != null && it.doc == cur) {
+        int j = i;
+        while (j >= 0 && it.position < sortedOffsets[j].position) {
+          sortedOffsets[j + 1] = sortedOffsets[j];
+          j--;
+        }
+        sortedOffsets[j + 1] = it;
+        it = it.next;
+        i++;
+      }
+      curTerms += i;
+    }
+  }
+
+  /**
+   * Wrapper for TermPositions so that we can put it in a linked list.
+   */
+  static class SubPhrasePositions {
+    /**
+     * next() and skipTo() iterate over documents.
+     * firstPosition() and nextPosition() iterate over positions in a given doc.
+     */
+    /**
+     * current doc
+     */
+    int doc;
+    /**
+     * current position
+     */
+    int position = -1;
+    /**
+     * positions to read
+     */
+    int count;
+    /**
+     * offset in query
+     */
+    int offset;
+    /**
+     * underlying TP
+     */
+    TermPositions tp;
+    /**
+     * pointer to next
+     */
+    SubPhrasePositions next;
+    /**
+     * hold actual term for debugging ?
+ */ + String term = ""; + + SubPhrasePositions(TermPositions t, int o, Object o1) { + tp = t; + offset = o; + term = ((Term) o1).text(); + } + + /** + * Moves to the next document and loads its first position. + * On exhaustion closes the TermPositions and sets doc to Integer.MAX_VALUE. + */ + boolean next() throws IOException { + if (!tp.next()) { + tp.close(); + doc = Integer.MAX_VALUE; + return false; + } + doc = tp.doc(); + firstPosition(); + return true; + } + + /** + * Same as next(), but delegates to tp.skipTo(target). + */ + boolean skipTo(int target) throws IOException { + if (!tp.skipTo(target)) { + tp.close(); + doc = Integer.MAX_VALUE; + return false; + } + doc = tp.doc(); + firstPosition(); + return true; + } + + /** + * Resets count to the term frequency in the current doc and reads the first position. + */ + void firstPosition() throws IOException { + count = tp.freq(); + nextPosition(); + } + + /** + * Reads the next position of the current doc; returns false once count positions were read. + */ + boolean nextPosition() throws IOException { + if (count-- > 0) { + position = tp.nextPosition(); + return true; + } else + return false; + } + } + + /** + * Simple queue that sorts by doc id, then by position and then by offset + */ + static class SubPhraseQueue extends PriorityQueue { + SubPhraseQueue(int size) { + initialize(size); + } + + protected boolean lessThan(Object o1, Object o2) { + SubPhrasePositions pp1 = (SubPhrasePositions) o1; + SubPhrasePositions pp2 = (SubPhrasePositions) o2; + if (pp1.doc == pp2.doc) + if (pp1.position == pp2.position) + return pp1.offset < pp2.offset; + else + return pp1.position < pp2.position; + else + return pp1.doc < pp2.doc; + } + } +} Index: src/test/org/apache/lucene/search/TestSubPhraseQuery.java =================================================================== --- src/test/org/apache/lucene/search/TestSubPhraseQuery.java (revision 0) +++ src/test/org/apache/lucene/search/TestSubPhraseQuery.java (revision 0) @@ -0,0 +1,172 @@ +package org.apache.lucene.search; + +import junit.framework.TestCase; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.RAMDirectory; + +public 
class TestSubPhraseQuery extends TestCase { + + private IndexSearcher searcher; + private RAMDirectory directory; + private Analyzer analyzer; + + /** Corpus indexed by setUp() into field "f", one document per entry. */ + String[] docs = new String[] + { + "sub1 one sub2 two sub3 three sub1 four five sub2 sub3", + "one two sub1 sub2 sub3 sub4 three four sub3 sub2 sub1", + "one two three four", + "one two three sub2 sub1 four five six sub1 sub3", + "sub1 sub2 sub3", + "one two three sub3 four sub2 sub3 sub4" + }; + + /** Corpus indexed by testSubPhraseIgnoreDuplicates(); entry 3 repeats the run "sub1 sub2". */ + String[] docsForIgnoreDuplicateTest = new String[] + { + "sub1 one sub2 two sub3 three sub1", + "one two sub1 sub2 sub3 sub4 three four sub3 sub2 sub1", + "one two three four", + "one two three sub1 sub2 four five six sub1 sub2 sub1 sub2 sub1 sub2", + "sub1 sub2 sub3", + "one two three sub3 four sub2 sub3 sub4" + }; + + /** Indexes {@link #docs} into a fresh RAMDirectory and opens a searcher on it. */ + protected void setUp() throws Exception { + + directory = new RAMDirectory(); + //analyzer = new WhitespaceAnalyzer(); + analyzer = new StandardAnalyzer(); + IndexWriter writer = new IndexWriter(directory, + analyzer, true); + + for (String content : docs) { + Document doc = new Document(); + doc.add(new Field("f", content, Field.Store.YES, + Field.Index.TOKENIZED)); + writer.addDocument(doc); + } + + writer.close(); + + searcher = new IndexSearcher(directory); + } + + /** Ranking with phraseBoost 2, duplicates ignored, idf and field norms switched off. */ + public void testSubPhrase() throws Exception { + String search = "sub1 sub2 sub3 sub4"; + SubPhraseQuery pq = new SubPhraseQuery(); + String[] terms = search.split("\\s+"); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + pq.add(new Term("f", term)); + } + + SubPhraseQuery.SubPhraseConfig conf = new SubPhraseQuery.SubPhraseConfig(); + conf.ignoreIdf = true; + conf.ignoreFieldNorms = true; + conf.matchOnlyLongest = false; + conf.ignoreDuplicates = true; + conf.phraseBoost = 2; + pq.setSubPhraseConf(conf); + + Hits hits = searcher.search(pq); + assertTrue("returned correct # ", (hits.length() == 5)); + assertTrue("returned correct match", (hits.id(0) == 1)); + assertTrue("returned correct match", (hits.id(1) == 5)); + 
assertTrue("returned correct match", (hits.id(2) == 4)); + assertTrue("returned correct match", (hits.id(3) == 0)); + } + + /** Same query with matchOnlyLongest on: only the best-scoring sub-phrase of a doc counts. */ + public void testSubPhraseMatchLongest() throws Exception { + String search = "sub1 sub2 sub3 sub4"; + SubPhraseQuery pq = new SubPhraseQuery(); + String[] terms = search.split("\\s+"); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + pq.add(new Term("f", term)); + } + + SubPhraseQuery.SubPhraseConfig conf = new SubPhraseQuery.SubPhraseConfig(); + conf.ignoreIdf = true; + conf.ignoreFieldNorms = true; + conf.matchOnlyLongest = true; + conf.ignoreDuplicates = true; + conf.phraseBoost = 2; + pq.setSubPhraseConf(conf); + + Hits hits = searcher.search(pq); + assertTrue("returned correct # ", (hits.length() == 5)); + assertTrue("returned correct match", (hits.id(0) == 1)); + assertTrue("returned correct match", (hits.score(1) == hits.score(2))); + assertTrue("returned correct match", (hits.id(3) == 0)); + } + + /** Same query with phraseBoost 1 and duplicates counted. */ + public void testSubPhrasePhraseBoost() throws Exception { + String search = "sub1 sub2 sub3 sub4"; + SubPhraseQuery pq = new SubPhraseQuery(); + String[] terms = search.split("\\s+"); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + pq.add(new Term("f", term)); + } + // phraseBoost is 1 here (the other tests above use 2), so a sub-phrase scores its plain length + SubPhraseQuery.SubPhraseConfig conf = new SubPhraseQuery.SubPhraseConfig(); + conf.ignoreIdf = true; + conf.ignoreFieldNorms = true; + conf.matchOnlyLongest = false; + conf.ignoreDuplicates = false; + conf.phraseBoost = 1; + pq.setSubPhraseConf(conf); + + Hits hits = searcher.search(pq); + assertTrue("returned correct # ", (hits.length() == 5)); + assertTrue("returned correct match", (hits.id(0) == 1)); + assertTrue("returned correct match", (hits.id(1) == 0)); + } + + /** Uses its own index built from docsForIgnoreDuplicateTest; the locals below shadow the fixture fields. */ + public void testSubPhraseIgnoreDuplicates() throws Exception { + + RAMDirectory directory = new RAMDirectory(); + //analyzer = new WhitespaceAnalyzer(); + StandardAnalyzer analyzer = new 
StandardAnalyzer(); + IndexWriter writer = new IndexWriter(directory, + analyzer, true); + + for (String content : docsForIgnoreDuplicateTest) { + Document doc = new Document(); + doc.add(new Field("f", content, Field.Store.YES, + Field.Index.TOKENIZED)); + writer.addDocument(doc); + } + + writer.close(); + + IndexSearcher searcher = new IndexSearcher(directory); + + String search = "sub1 sub2 sub3 sub4"; + SubPhraseQuery pq = new SubPhraseQuery(); + String[] terms = search.split("\\s+"); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + pq.add(new Term("f", term)); + } + // ignoreDuplicates is on with phraseBoost 1: a sub-phrase seen again in the same doc adds nothing + SubPhraseQuery.SubPhraseConfig conf = new SubPhraseQuery.SubPhraseConfig(); + conf.ignoreIdf = true; + conf.ignoreFieldNorms = true; + conf.matchOnlyLongest = false; + conf.ignoreDuplicates = true; + conf.phraseBoost = 1; + pq.setSubPhraseConf(conf); + + Hits hits = searcher.search(pq); + assertTrue("returned correct # ", (hits.length() == 5)); + assertTrue("returned correct match", (hits.id(0) == 1)); + assertTrue("returned correct match", (hits.id(1) == 5)); + assertTrue("returned correct match", (hits.score(2) == hits.score(3))); + assertTrue("returned correct match", (hits.id(4) == 3)); + } +}