Index: src/java/org/apache/lucene/search/SubPhraseQuery.java
===================================================================
--- src/java/org/apache/lucene/search/SubPhraseQuery.java (revision 0)
+++ src/java/org/apache/lucene/search/SubPhraseQuery.java (revision 0)
@@ -0,0 +1,370 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Set;
+import java.util.ArrayList;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Explanation.IDFExplanation;
+import org.apache.lucene.util.ToStringUtils;
+
+/** A Query that matches documents containing a particular sequence of terms.
+ */
+public class SubPhraseQuery extends Query {
+
+  /** The field all phrase terms must belong to; set by the first add(). */
+  private String field;
+  /** Terms of the phrase, in the order they were added. */
+  private ArrayList terms = new ArrayList(4);
+  /** Relative position (Integer) of each term, parallel to terms. */
+  private ArrayList positions = new ArrayList(4);
+  /** Largest position added so far; sizes the toString() layout. */
+  private int maxPosition = 0;
+
+  /** Constructs an empty phrase query. */
+  public SubPhraseQuery() {}
+
+  /**
+   * Sub-phrase (partial phrase) config. When null, no config has been
+   * supplied via setSubPhraseConf(); the weight/scorer code treats a
+   * null config as plain defaults.
+   */
+  private SubPhraseConfig subPhraseConf;
+
+ /**
+ * Config that fine tunes sub phrase (partial phrase) matches.
+ */
+  /**
+   * Config that fine tunes sub phrase (partial phrase) matches.
+   * A plain value holder; equals/hashCode are defined so that queries
+   * carrying equal configs compare equal.
+   */
+  static class SubPhraseConfig {
+    /**
+     * How much more valuable is a N word sub-phrase
+     * compared to a N-1 word sub-phrase.
+     * Each subphrase is scored as
+     * score += (sub-phrase length to the power of phraseBoost)
+     * So with phraseBoost = 2, a 4 words long sub-phrase adds a score of 16
+     * while 3 word long sub phrase adds a score of 9.
+     */
+    public int phraseBoost = 2;
+    /**
+     * Ignore idf when scoring.
+     */
+    public boolean ignoreIdf = false;
+    /**
+     * Ignore field norms when scoring.
+     */
+    public boolean ignoreFieldNorms = false;
+    /**
+     * Ignore duplicate sub phrases. For example, "sub1 sub2" is a
+     * duplicate of "sub1 sub2". But "sub1" is not duplicate of "sub1 sub2"
+     */
+    public boolean ignoreDuplicates = false;
+    /**
+     * When more than one sub-phrase matched, pick the longest for scoring.
+     */
+    public boolean matchOnlyLongest = false;
+
+    /** Value equality over all five flags/settings. */
+    public boolean equals(Object o) {
+      if (this == o) return true;
+      if (o == null || getClass() != o.getClass()) return false;
+
+      SubPhraseConfig that = (SubPhraseConfig) o;
+
+      if (ignoreDuplicates != that.ignoreDuplicates) return false;
+      if (ignoreFieldNorms != that.ignoreFieldNorms) return false;
+      if (ignoreIdf != that.ignoreIdf) return false;
+      if (matchOnlyLongest != that.matchOnlyLongest) return false;
+      if (phraseBoost != that.phraseBoost) return false;
+
+      return true;
+    }
+
+    /** Consistent with equals(): combines the same five fields. */
+    public int hashCode() {
+      int result = phraseBoost;
+      result = 31 * result + (ignoreIdf ? 1 : 0);
+      result = 31 * result + (ignoreFieldNorms ? 1 : 0);
+      result = 31 * result + (ignoreDuplicates ? 1 : 0);
+      result = 31 * result + (matchOnlyLongest ? 1 : 0);
+      return result;
+    }
+  }
+
+  /**
+   * If the object is supplied, the query is treated as a sub-phrase query.
+   *
+   * @param subPhraseConf the sub-phrase scoring configuration
+   */
+  public void setSubPhraseConf(SubPhraseConfig subPhraseConf) {
+    this.subPhraseConf = subPhraseConf;
+  }
+
+  /**
+   * Adds a term to the end of the query phrase.
+   * The relative position of the term is the one immediately after the last term added.
+   */
+  public void add(Term term) {
+    int position = 0;
+    // next position = last explicit position + 1 (0 for the first term)
+    if(positions.size() > 0)
+      position = ((Integer) positions.get(positions.size()-1)).intValue() + 1;
+
+    add(term, position);
+  }
+
+  /**
+   * Adds a term to the end of the query phrase.
+   * The relative position of the term within the phrase is specified explicitly.
+   * This allows e.g. phrases with more than one term at the same position
+   * or phrases with gaps (e.g. in connection with stopwords).
+   *
+   * @param term term to add; must be in the same field as all prior terms
+   * @param position explicit relative position of the term
+   * @throws IllegalArgumentException if the term's field differs from the
+   *         field of previously added terms
+   */
+  public void add(Term term, int position) {
+    if (terms.size() == 0)
+      field = term.field();
+    // compare field names by value: interning of field strings is not
+    // guaranteed, so reference comparison (!=) can reject valid terms
+    else if (!term.field().equals(field))
+      throw new IllegalArgumentException("All phrase terms must be in the same field: " + term);
+
+    terms.add(term);
+    // valueOf() uses the boxed-Integer cache instead of always allocating
+    positions.add(Integer.valueOf(position));
+    if (position > maxPosition) maxPosition = position;
+  }
+
+  /** Returns the set of terms in this phrase. */
+  public Term[] getTerms() {
+    Term[] arr = new Term[terms.size()];
+    return (Term[]) terms.toArray(arr);
+  }
+
+  /**
+   * Returns the relative positions of terms in this phrase.
+   */
+  public int[] getPositions() {
+    int[] out = new int[positions.size()];
+    int idx = 0;
+    for (java.util.Iterator it = positions.iterator(); it.hasNext(); idx++)
+      out[idx] = ((Integer) it.next()).intValue();
+    return out;
+  }
+
+  /**
+   * Creates the Weight for this query. A single-term phrase is delegated
+   * to an ordinary TermQuery weight; otherwise a SubPhraseWeight is built.
+   */
+  public Weight createWeight(Searcher searcher) throws IOException {
+    if (terms.size() == 1) { // optimize one-term case
+      Term term = (Term)terms.get(0);
+      Query termQuery = new TermQuery(term);
+      termQuery.setBoost(getBoost());
+      return termQuery.createWeight(searcher);
+    }
+    return new SubPhraseWeight(searcher);
+  }
+
+  /**
+   * @see org.apache.lucene.search.Query#extractTerms(java.util.Set)
+   */
+  public void extractTerms(Set queryTerms) {
+    queryTerms.addAll(terms);
+  }
+
+  /** Prints a user-readable version of this query. */
+  public String toString(String f) {
+    StringBuffer buffer = new StringBuffer();
+    // only prefix with the field name when it differs from the default field
+    if (field != null && !field.equals(f)) {
+      buffer.append(field);
+      buffer.append(":");
+    }
+
+    buffer.append("\"");
+    // group terms by position; terms sharing a position are joined with '|'
+    String[] pieces = new String[maxPosition + 1];
+    for (int i = 0; i < terms.size(); i++) {
+      int pos = ((Integer)positions.get(i)).intValue();
+      String s = pieces[pos];
+      if (s == null) {
+        s = ((Term)terms.get(i)).text();
+      } else {
+        s = s + "|" + ((Term)terms.get(i)).text();
+      }
+      pieces[pos] = s;
+    }
+    for (int i = 0; i < pieces.length; i++) {
+      if (i > 0) {
+        buffer.append(' ');
+      }
+      String s = pieces[i];
+      if (s == null) {
+        // '?' marks a position gap (e.g. a removed stopword)
+        buffer.append('?');
+      } else {
+        buffer.append(s);
+      }
+    }
+    buffer.append("\"");
+
+    buffer.append(ToStringUtils.boost(getBoost()));
+
+    return buffer.toString();
+  }
+
+ /** Returns true iff o is equal to this. */
+ public boolean equals(Object o) {
+ if (!(o instanceof SubPhraseQuery))
+ return false;
+ SubPhraseQuery other = (SubPhraseQuery)o;
+ return (this.getBoost() == other.getBoost())
+ && (this.subPhraseConf.equals(other.subPhraseConf))
+ && this.terms.equals(other.terms)
+ && this.positions.equals(other.positions);
+ }
+
+ /** Returns a hash code value for this object.*/
+ public int hashCode() {
+ return Float.floatToIntBits(getBoost())
+ ^ subPhraseConf.hashCode()
+ ^ terms.hashCode()
+ ^ positions.hashCode();
+ }
+
+ private class SubPhraseWeight extends Weight {
+ private Similarity similarity;
+ private float value;
+ private float idf;
+ private float queryNorm;
+ private float queryWeight;
+ private IDFExplanation idfExp;
+
+    /**
+     * Computes idf over all phrase terms; neutralizes it when the
+     * sub-phrase config asks for idf to be ignored.
+     */
+    public SubPhraseWeight(Searcher searcher)
+      throws IOException {
+      this.similarity = getSimilarity(searcher);
+
+      idfExp = similarity.idfExplain(terms, searcher);
+      idf = idfExp.getIdf();
+      // if sub phrase config is present and it ignores idf, do it here
+      if (subPhraseConf != null && subPhraseConf.ignoreIdf)
+        idf = 1.0f;
+    }
+
+    public String toString() { return "weight(" + SubPhraseQuery.this + ")"; }
+
+    public Query getQuery() { return SubPhraseQuery.this; }
+    public float getValue() { return value; }
+
+    /** Query-side weight: (idf * boost)^2. */
+    public float sumOfSquaredWeights() {
+      queryWeight = idf * getBoost(); // compute query weight
+      return queryWeight * queryWeight; // square it
+    }
+
+    public void normalize(float queryNorm) {
+      this.queryNorm = queryNorm;
+      queryWeight *= queryNorm; // normalize query weight
+      // idf appears again here: it is applied on both the query and the
+      // document side, matching Lucene's standard term weighting
+      value = queryWeight * idf; // idf for document
+    }
+
+    /**
+     * Builds the sub-phrase scorer over this reader's term positions.
+     * Returns null for an empty phrase or when a term has no positions.
+     */
+    public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
+      if (terms.size() == 0) // optimize zero-term case
+        return null;
+
+      TermPositions[] tps = new TermPositions[terms.size()];
+      for (int i = 0; i < terms.size(); i++) {
+        TermPositions p = reader.termPositions((Term)terms.get(i));
+        if (p == null)
+          return null;
+        tps[i] = p;
+      }
+
+      // If sub-phrase is configured use it, else fall back to defaults.
+      // Use a local config instead of assigning to the query's field:
+      // mutating the shared query here changed its equals()/hashCode()
+      // as a hidden side effect of creating a scorer.
+      SubPhraseQuery.SubPhraseConfig conf =
+          subPhraseConf != null ? subPhraseConf : new SubPhraseConfig();
+
+      return new SubPhraseScorer(this, tps, getPositions(), similarity,
+          reader.norms(field), terms, conf);
+    }
+
+    /**
+     * Explains a document's score: query weight (boost * idf * queryNorm)
+     * times field weight (tf * idf * fieldNorm).
+     */
+    public Explanation explain(IndexReader reader, int doc)
+      throws IOException {
+
+      Explanation result = new Explanation();
+      result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
+
+      StringBuffer docFreqs = new StringBuffer();
+      StringBuffer query = new StringBuffer();
+      query.append('\"');
+      docFreqs.append(idfExp.explain());
+      for (int i = 0; i < terms.size(); i++) {
+        if (i != 0) {
+          query.append(" ");
+        }
+
+        Term term = (Term)terms.get(i);
+
+        query.append(term.text());
+      }
+      query.append('\"');
+
+      Explanation idfExpl =
+        new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
+
+      // explain query weight
+      Explanation queryExpl = new Explanation();
+      queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
+
+      Explanation boostExpl = new Explanation(getBoost(), "boost");
+      if (getBoost() != 1.0f)
+        queryExpl.addDetail(boostExpl);
+      queryExpl.addDetail(idfExpl);
+
+      Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
+      queryExpl.addDetail(queryNormExpl);
+
+      queryExpl.setValue(boostExpl.getValue() *
+                         idfExpl.getValue() *
+                         queryNormExpl.getValue());
+
+      result.addDetail(queryExpl);
+
+      // explain field weight
+      Explanation fieldExpl = new Explanation();
+      fieldExpl.setDescription("fieldWeight("+field+":"+query+" in "+doc+
+                               "), product of:");
+
+      Scorer scorer = scorer(reader, true, false);
+      if (scorer == null) {
+        return new Explanation(0.0f, "no matching docs");
+      }
+      Explanation tfExpl = scorer.explain(doc);
+      fieldExpl.addDetail(tfExpl);
+      fieldExpl.addDetail(idfExpl);
+
+      Explanation fieldNormExpl = new Explanation();
+      byte[] fieldNorms = reader.norms(field);
+      // if sub phrase config is present and is configured to ignore field norms
+      // show the same in explain; also treat missing norms as 1.0
+      float fieldNorm =
+        fieldNorms != null
+            && (subPhraseConf == null || !subPhraseConf.ignoreFieldNorms)
+            ? Similarity.decodeNorm(fieldNorms[doc]) : 1.0f;
+      fieldNormExpl.setValue(fieldNorm);
+      fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
+      fieldExpl.addDetail(fieldNormExpl);
+
+      fieldExpl.setValue(tfExpl.getValue() *
+                         idfExpl.getValue() *
+                         fieldNormExpl.getValue());
+
+      result.addDetail(fieldExpl);
+
+      // combine them
+      result.setValue(queryExpl.getValue() * fieldExpl.getValue());
+
+      // when the query weight is neutral, the field explanation suffices
+      if (queryExpl.getValue() == 1.0f)
+        return fieldExpl;
+
+      return result;
+    }
+ }
+}
Index: src/java/org/apache/lucene/search/SubPhraseScorer.java
===================================================================
--- src/java/org/apache/lucene/search/SubPhraseScorer.java (revision 0)
+++ src/java/org/apache/lucene/search/SubPhraseScorer.java (revision 0)
@@ -0,0 +1,705 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Phrase Query scorer that scores based on sub-phrases (partial phrases)
+ * For example when queries like "3 bed homes new york swimming pool"
+ * are run against multiple fields each holding a different piece of information
+ * like city, beds, amenities etc. we need to score based on sub-phrase
+ * matches.
+ *
+ * Example:
+ * doc1 : "one two three sub1 sub2 sub3 four sub4"
+ * doc2 : "one two three sub1 sub2 four sub4 sub3"
+ * q : " none ten sub1 sub2 sub3 sub4 something"
+ * doc1 should score higher than doc2 since it has 3-word plus 1-word matches
+ * where as doc2 has 2-word plus 1-word plus 1-word
+ *
+ * The difference between N-word vs N-1 word sub phrase score must be
+ * configurable There should be way to ignore matches except longest, like
+ * ignoring sub4 match above. We should be also able to ignore scoring factors
+ * outside of this doc so that the match is based on phrase match alone.
+ *
+ * Shingles look non-intuitive and expensive for this since the query as well as
+ * all fields of all documents need to be indexed with all possible (1...N)-gram
+ * shingles and then a boolean OR query fired.
+ */
+
+class SubPhraseScorer extends Scorer {
+  /**
+   * config
+   */
+  private SubPhraseQuery.SubPhraseConfig conf;
+
+  /**
+   * Fields copied from ExactScorer as it is.
+   */
+  private Weight weight;
+  protected byte[] norms;
+  protected float value;
+  /**
+   * Holds the score for current document.
+   */
+  private float score;
+
+  /**
+   * The below is for iterating over documents.
+   */
+  private boolean firstTime = true;
+  private boolean more = true;
+
+  /**
+   * This is a linked list holding all PhrasePositions.
+   * It is always kept sorted by doc id, first one holding the smallest doc id.
+   * The first N PPs, with same doc id, represent all the terms found in current
+   * document. The first node of this linked list, is the next minimum doc id
+   * that needs to be processed
+   */
+  protected SubPhraseQueue pq;
+  protected SubPhrasePositions first, last;
+
+  /**
+   * This class is used to score each document. It receives the first N nodes
+   * with same doc id, meaning all PPs for terms found in current doc. It then
+   * iterates through all position:offset tuples for all these terms, always finding
+   * the next minimum position for the current doc, maintaining a current sequence
+   * where each term in the sequence has its offset as well as position
+   * incremented by 1. Whenever the sequence breaks, it scores it and starts
+   * new sequence to represent the new sub-phrase being matched.
+   */
+  protected PerDocScorer perDoc;
+
+  /**
+   * Builds the scorer: wraps each TermPositions in a linked-list node
+   * (keeping the query-order offset) and prepares the helper structures.
+   */
+  SubPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets,
+                  Similarity similarity, byte[] norms, ArrayList terms,
+                  SubPhraseQuery.SubPhraseConfig subPhraseConf) {
+    super(similarity);
+
+    this.norms = norms;
+    this.weight = weight;
+    this.value = weight.getValue();
+    this.conf = subPhraseConf;
+    // create linked list
+    for (int i = 0; i < tps.length; i++) {
+      SubPhrasePositions pp = new SubPhrasePositions(tps[i],
+          offsets[i], terms.get(i));
+      if (last != null) {
+        last.next = pp;
+      } else
+        first = pp;
+      last = pp;
+    }
+    // this queue is used for initial sorting of PPs
+    pq = new SubPhraseQueue(tps.length);
+    // this is used for scoring individual docs.
+    perDoc = new PerDocScorer(tps.length);
+  }
+
+  /**
+   * First document in linked list is the current doc.
+   *
+   * @return doc id of the first node, or NO_MORE_DOCS when the list is empty
+   */
+  public int doc() {
+    return first != null ? first.doc : NO_MORE_DOCS;
+  }
+
+  /**
+   * First document in linked list is the current doc.
+   *
+   * @return doc id of the first node, or NO_MORE_DOCS when the list is empty
+   */
+  public int docID() {
+    return doc();
+  }
+
+  /**
+   * Old-style iteration API: delegates to nextDoc().
+   *
+   * @return true while there are more documents
+   * @throws IOException
+   */
+  public boolean next() throws IOException {
+    return nextDoc() != NO_MORE_DOCS;
+  }
+
+  /**
+   * Increment TermPositions for all terms with doc id as current doc,
+   * and move each to correct position in the list so that next minimum doc is the
+   * first node of the list. Also computes and caches the score for the
+   * document that ends up at the head of the list.
+   *
+   * @return the new current doc id, or NO_MORE_DOCS
+   * @throws IOException
+   */
+  public int nextDoc() throws IOException {
+    if (firstTime) {
+      // sort the list and init term positions
+      init();
+      firstTime = false;
+    } else {
+      // increment TPs for current doc and move them to correct pos.
+      doNext();
+    }
+    // score the doc now at the head of the list
+    perDoc.reset();
+    score = perDoc.score();
+    return doc();
+  }
+
+  /**
+   * Old-style skip API: delegates to advance().
+   *
+   * @return true if a document at or past target exists
+   * @throws IOException
+   */
+  public boolean skipTo(int target) throws IOException {
+    return advance(target) != NO_MORE_DOCS;
+  }
+
+  /**
+   * Increment TermPositions for all terms with doc id as current doc,
+   * and move each to correct position in the list so that next minimum doc
+   * past the target is the first node of the list.
+   *
+   * @return the new current doc id, or NO_MORE_DOCS
+   * @throws IOException
+   */
+  public int advance(int target) throws IOException {
+    // skipTo() on each TP also performs first-time initialization
+    firstTime = false;
+    boolean more = false;
+    // local 'last' deliberately shadows the field: it tracks the
+    // predecessor during this single traversal only
+    SubPhrasePositions last = null;
+    SubPhrasePositions pp = first;
+    while (pp != null) {
+      // increment each TP past target
+      if (!pp.skipTo(target)) {
+        // if no more docs for TP, remove from list
+        pp = remove(pp, last);
+      } else {
+        more = true;
+        last = pp;
+        pp = last.next;
+      }
+    }
+    this.more = more;
+    // sort it
+    if (more)
+      sort();
+    perDoc.reset();
+    score = perDoc.score();
+    return doc();
+  }
+
+  /**
+   * Calculate score for the current document. Field norms are skipped when
+   * the config says so, or when the field stores no norms at all.
+   *
+   * @return tf of the accumulated sub-phrase score times the query weight,
+   *         optionally multiplied by the decoded field norm
+   * @throws IOException
+   */
+  public float score() throws IOException {
+    float raw = getSimilarity().tf(score) * value;
+    // norms can be null (e.g. norms omitted for the field); treat that as
+    // 1.0f, exactly as SubPhraseWeight.explain() already does, instead of
+    // throwing NullPointerException here
+    float nrms = (norms != null && !conf.ignoreFieldNorms) ?
+        Similarity.decodeNorm(norms[first.doc]) : 1.0f;
+    return raw * nrms;
+  }
+
+  /**
+   * Explain scoring. NOTE(review): this advances the scorer forward until
+   * it reaches (or passes) the target doc, so it must be called on a fresh
+   * scorer — confirm callers never reuse it for search afterwards.
+   *
+   * @param doc The document number for the explanation.
+   * @return tf explanation built from the accumulated sub-phrase score
+   * @throws IOException
+   */
+  public Explanation explain(final int doc) throws IOException {
+    Explanation tfExplanation = new Explanation();
+    // step forward until the requested doc is reached or passed
+    while (next() && doc() < doc) {
+    }
+    float phraseFreq = (doc() == doc) ? score : 0.0f;
+    float tfval = getSimilarity().tf(phraseFreq);
+    tfExplanation.setValue(tfval);
+    tfExplanation.setDescription("tf(subPhraseScore=" + phraseFreq + ")");
+    return tfExplanation;
+  }
+
+  public String toString() {
+    return "scorer(" + weight + ")";
+  }
+
+  /**
+   * Increment all TPs for current doc. Move to correct position.
+   * Remove those that don't have any more docs.
+   *
+   * @throws IOException
+   */
+  private void doNext() throws IOException {
+    boolean more = false;
+    SubPhrasePositions pp = first;
+    SubPhrasePositions last = null;
+    int cur = first.doc;
+    // iterate only those that match current doc.
+    // The list is in sorted order.
+    while (pp != null && pp.doc == cur) {
+      // increment TP
+      if (!pp.next())
+        // If no more docs, remove it
+        pp = remove(pp, last);
+      else {
+        // Move the TP to correct position in sorted list.
+        // Either first remains where it was and loop ends or second becomes
+        // first now,
+        moveFirst();
+        pp = first;
+      }
+    }
+    // 'more' is true as long as any node survived the advancing above
+    if (first != null)
+      more = true;
+    this.more = more;
+  }
+
+  /**
+   * Move the first item in the list to its correct position in the sorted list.
+   */
+  private void moveFirst() {
+    if (first == null || first.next == null)
+      return;
+    SubPhrasePositions pp = first.next;
+    SubPhrasePositions prev = first;
+    // traverse to find correct position. If same doc id, sort on offset so that
+    // nodes are sorted in the same order as they appear in query
+    while (pp != null &&
+           ((pp.doc < first.doc) ||
+            (pp.doc == first.doc && pp.offset < first.offset))) {
+      prev = pp;
+      pp = pp.next;
+    }
+    // insert in correct position
+    if (pp == null) {
+      // move past all: 'first' becomes the new tail
+      prev.next = first;
+      first = first.next;
+      prev.next.next = null;
+    } else {
+      // insert before pp
+      // (when prev == first the loop never ran, so 'first' is already in place)
+      if (prev != first) {
+        SubPhrasePositions tmp = first.next;
+        prev.next = first;
+        first.next = pp;
+        first = tmp;
+      }
+    }
+  }
+
+  /**
+   * Remove a node from the list.
+   *
+   * @param pp the node to unlink
+   * @param last the node immediately preceding pp, or null if pp is first
+   * @return the node that now occupies pp's place in the traversal
+   * @throws IOException
+   */
+  public SubPhrasePositions remove(SubPhrasePositions pp,
+                                   SubPhrasePositions last)
+    throws IOException {
+    SubPhrasePositions next;
+    if (pp == first) {
+      first = first.next;
+      next = first;
+    } else {
+      last.next = pp.next;
+      next = last.next;
+    }
+    return next;
+  }
+
+  /**
+   * Init all TPs and sort.
+   * Remove those that don't have any docs.
+   *
+   * @throws IOException
+   */
+  private void init() throws IOException {
+    boolean more = false;
+    SubPhrasePositions last = null;
+    SubPhrasePositions pp = first;
+    while (pp != null) {
+      // advance each TP to its first doc; drop exhausted ones
+      if (!pp.next()) {
+        pp = remove(pp, last);
+      } else {
+        more = true;
+        last = pp;
+        pp = last.next;
+      }
+    }
+    this.more = more;
+    if (more)
+      sort();
+  }
+
+  /**
+   * sort using priority queue
+   */
+  private void sort() {
+    pq.clear();
+    for (SubPhrasePositions pp = first; pp != null; pp = pp.next)
+      pq.put(pp);
+    pqToList();
+  }
+
+  /**
+   * Convert the queue to a linked list by draining it; creates a sorted list.
+   */
+  protected final void pqToList() {
+    last = first = null;
+    while (pq.top() != null) {
+      SubPhrasePositions pp = (SubPhrasePositions) pq.pop();
+      if (last != null) {
+        last.next = pp;
+      } else
+        first = pp;
+      last = pp;
+      pp.next = null;
+    }
+  }
+
+  /**
+   * Scores each document. Always reads the next minimum position for any
+   * term. If its offset is one greater than the last read, and its position is
+   * one greater than the last read, it extends the current sequence. If not,
+   * the current sub-sequence has ended and can be scored, and its score is added.
+   */
+ class PerDocScorer {
+    /**
+     * How many terms are being asked to be matched for this doc ?
+     */
+    int termCount;
+    /**
+     * How many are remaining ? Zero means document is processed.
+     */
+    int curTerms;
+    /**
+     * used as init flag. We start by sorting by positions.
+     */
+    boolean sorted;
+    /**
+     * cache the score and return it, to guard against multiple calls.
+     */
+    boolean docScored;
+    /**
+     * computed score
+     */
+    int score;
+    /**
+     * current sequence length
+     */
+    int curSeqLen = 0;
+    /**
+     * which term did the current sequence start with ?
+     */
+    int curOffset = 0;
+    /**
+     * which position in the document did the current sequence start at ?
+     */
+    int curPos = 0;
+    /**
+     * what is the longest match score so far ?
+     */
+    int longestMatchScore = 0;
+    /**
+     * the sorted list used to pick the next minimum.
+     */
+    SubPhrasePositions[] sortedOffsets;
+    /**
+     * set to remember matches already seen. The sequences are converted
+     * to long, where each bit represents their position in query. So a sequence
+     * containing the second and third words of the query gets stored as "0...110" = 6.
+     * So it works only for queries with up to 64 terms. Is there a better way ?
+     */
+    HashSet duplicates;
+
+    /**
+     * Create objects
+     *
+     * @param length number of query terms
+     */
+    public PerDocScorer(int length) {
+      termCount = length;
+      sortedOffsets = new SubPhrasePositions[termCount];
+      if (conf.ignoreDuplicates)
+        duplicates = new HashSet(100);
+    }
+
+    /**
+     * clear for every new document
+     */
+    void reset() {
+      sorted = false;
+      docScored = false;
+      curTerms = 0;
+      longestMatchScore = 0;
+      score = 0;
+      if (conf.ignoreDuplicates)
+        duplicates.clear();
+    }
+
+    /**
+     * Find sub sequences: walks position minima in order, extending the
+     * current run while both query offset and document position advance
+     * by one, scoring each run as it ends.
+     *
+     * @return the accumulated (or longest-only) sub-phrase score
+     */
+    int score() {
+      if (first == null)
+        return 0;
+      // result is cached: nextMin() consumes positions, so recomputing
+      // would yield a different answer
+      if (docScored)
+        return score;
+      // get first
+      SubPhrasePositions min = nextMin();
+      curSeqLen = 1;
+      curOffset = min.offset;
+      curPos = min.position;
+      // get next
+      min = nextMin();
+      while (min != null) {
+        // if below matches, its continuation of a sub sequence.
+        if (min.offset == curOffset + curSeqLen &&
+            min.position == curPos + curSeqLen)
+          curSeqLen++;
+        else {
+          // sub sequence ended. Score it.
+          scoreSubPhrase();
+          curSeqLen = 1;
+          curOffset = min.offset;
+          curPos = min.position;
+        }
+        min = nextMin();
+      }
+      // score the last sequence found.
+      scoreSubPhrase();
+      docScored = true;
+      if (conf.matchOnlyLongest) {
+        score = longestMatchScore;
+      }
+      return score;
+    }
+
+    /**
+     * Actual scoring algorithm:
+     *   score += (seq len) ^ phraseBoost
+     * Also tracks the best single-run score for matchOnlyLongest and,
+     * when ignoreDuplicates is on, cancels out runs already seen.
+     */
+    private void scoreSubPhrase() {
+      int s = 1;
+      for (int i = 1; i <= conf.phraseBoost; i++)
+        s *= curSeqLen;
+      if (s > longestMatchScore)
+        longestMatchScore = s;
+      score += s;
+      // duplicate detection works for the queries whose
+      // # terms = sizeof(Long) bits.
+      if (conf.ignoreDuplicates) {
+        long encodedSeq = 0;
+        // encode sequence as a long, one bit per query offset.
+        // Must shift a long literal: "1 << j" is an int shift whose
+        // distance is masked to 5 bits, so offsets >= 32 would wrap and
+        // corrupt the encoding despite the documented 64-term limit.
+        for (int j = curOffset; j < curOffset + curSeqLen; j++) {
+          encodedSeq |= (1L << j);
+        }
+        if (duplicates.contains(encodedSeq)) {
+          score -= s;
+        }
+        duplicates.add(encodedSeq);
+      }
+    }
+
+    /**
+     * Get next min position.
+     * Increments the first item and moves it to correct position;
+     * removes those that have been exhausted.
+     *
+     * @return the PP holding the minimum remaining position, or null
+     */
+    private SubPhrasePositions nextMin() {
+      if (!sorted) {
+        // first call for this doc: sort the PPs by position
+        sorted = true;
+        sortAsc();
+        return sortedOffsets[0];
+      } else if (curTerms == 0) {
+        return null;
+      } else {
+        try {
+          incrementAndFindMin();
+          if (curTerms == 0)
+            return null;
+          else
+            return sortedOffsets[0];
+        } catch (IOException e) {
+          // NOTE(review): IOException is silently swallowed and treated as
+          // end-of-positions — confirm this best-effort behavior is intended.
+          // e.printStackTrace();
+        }
+      }
+      return null;
+    }
+
+    /**
+     * Increment first item and move it to correct position
+     * (one insertion pass keeps the array sorted by position).
+     *
+     * @throws IOException
+     */
+    private void incrementAndFindMin() throws IOException {
+      if (!sortedOffsets[0].nextPosition()) {
+        // exhausted: shift the remaining entries left, shrinking the window
+        curTerms--;
+        for (int i = 0; i < curTerms; i++)
+          sortedOffsets[i] = sortedOffsets[i + 1];
+      } else {
+        SubPhrasePositions pos = sortedOffsets[0];
+        int k = 1;
+        while (k < curTerms && pos.position > sortedOffsets[k].position) {
+          sortedOffsets[k - 1] = sortedOffsets[k];
+          k++;
+        }
+        sortedOffsets[k - 1] = pos;
+      }
+    }
+
+    /**
+     * Sort first time: insertion sort (by position) of all PPs belonging
+     * to the current doc into sortedOffsets.
+     */
+    private void sortAsc() {
+      sortedOffsets[0] = first;
+      curTerms = 1;
+      int cur = doc();
+      int i = 0;
+      SubPhrasePositions it = first.next;
+      // only nodes sharing the current doc id participate
+      while (it != null && it.doc == cur) {
+        int j = i;
+        while (j >= 0 && it.position < sortedOffsets[j].position) {
+          sortedOffsets[j + 1] = sortedOffsets[j];
+          j--;
+        }
+        sortedOffsets[j + 1] = it;
+        it = it.next;
+        i++;
+      }
+      curTerms += i;
+    }
+ }
+
+  /**
+   * Wrapper for TermPositions so that we can put it in a linked list.
+   */
+  static class SubPhrasePositions {
+    /**
+     * next() and skipTo() iterate over documents.
+     * firstPosition() and nextPosition() iterate over positions in a given doc.
+     */
+    /**
+     * current doc
+     */
+    int doc;
+    /**
+     * current position
+     */
+    int position = -1;
+    /**
+     * positions remaining to read in the current doc
+     */
+    int count;
+    /**
+     * offset in query
+     */
+    int offset;
+    /**
+     * underlying TP
+     */
+    TermPositions tp;
+    /**
+     * pointer to next
+     */
+    SubPhrasePositions next;
+    /**
+     * hold actual term for debugging ?
+     */
+    String term = "";
+
+    SubPhrasePositions(TermPositions t, int o, Object o1) {
+      tp = t;
+      offset = o;
+      term = ((Term) o1).text();
+    }
+
+    /** Advance to the next doc; closes the TP and flags exhaustion when done. */
+    boolean next() throws IOException {
+      if (!tp.next()) {
+        tp.close();
+        // MAX_VALUE sinks exhausted nodes to the end of any doc ordering
+        doc = Integer.MAX_VALUE;
+        return false;
+      }
+      doc = tp.doc();
+      firstPosition();
+      return true;
+    }
+
+    /** Advance to the first doc at or past target; closes the TP when done. */
+    boolean skipTo(int target) throws IOException {
+      if (!tp.skipTo(target)) {
+        tp.close();
+        doc = Integer.MAX_VALUE;
+        return false;
+      }
+      doc = tp.doc();
+      firstPosition();
+      return true;
+    }
+
+    /** Reset the position cursor for the freshly-entered doc. */
+    void firstPosition() throws IOException {
+      count = tp.freq();
+      nextPosition();
+    }
+
+    /** Read the next position within the current doc, if any remain. */
+    boolean nextPosition() throws IOException {
+      if (count-- > 0) {
+        position = tp.nextPosition();
+        return true;
+      } else
+        return false;
+    }
+  }
+
+  /**
+   * Simple queue ordering SubPhrasePositions by doc id, then by position
+   * and then by query offset (ties broken in query order).
+   */
+  static class SubPhraseQueue extends PriorityQueue {
+    SubPhraseQueue(int size) {
+      initialize(size);
+    }
+
+    protected boolean lessThan(Object o1, Object o2) {
+      SubPhrasePositions a = (SubPhrasePositions) o1;
+      SubPhrasePositions b = (SubPhrasePositions) o2;
+      if (a.doc != b.doc)
+        return a.doc < b.doc;
+      if (a.position != b.position)
+        return a.position < b.position;
+      return a.offset < b.offset;
+    }
+  }
+}
Index: src/test/org/apache/lucene/search/TestSubPhraseQuery.java
===================================================================
--- src/test/org/apache/lucene/search/TestSubPhraseQuery.java (revision 0)
+++ src/test/org/apache/lucene/search/TestSubPhraseQuery.java (revision 0)
@@ -0,0 +1,172 @@
+package org.apache.lucene.search;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+
+public class TestSubPhraseQuery extends TestCase {
+
+  private IndexSearcher searcher;
+  private RAMDirectory directory;
+  private Analyzer analyzer;
+
+  /** Corpus: sub1..sub4 appear in runs of varying length per document. */
+  String[] docs = new String[]
+      {
+          "sub1 one sub2 two sub3 three sub1 four five sub2 sub3",
+          "one two sub1 sub2 sub3 sub4 three four sub3 sub2 sub1",
+          "one two three four",
+          "one two three sub2 sub1 four five six sub1 sub3",
+          "sub1 sub2 sub3",
+          "one two three sub3 four sub2 sub3 sub4"
+      };
+
+  /** Corpus for the duplicate test: doc 3 repeats the "sub1 sub2" run. */
+  String[] docsForIgnoreDuplicateTest = new String[]
+      {
+          "sub1 one sub2 two sub3 three sub1",
+          "one two sub1 sub2 sub3 sub4 three four sub3 sub2 sub1",
+          "one two three four",
+          "one two three sub1 sub2 four five six sub1 sub2 sub1 sub2 sub1 sub2",
+          "sub1 sub2 sub3",
+          "one two three sub3 four sub2 sub3 sub4"
+      };
+
+  /** Indexes {@link #docs} into an in-memory directory before each test. */
+  protected void setUp() throws Exception {
+
+    directory = new RAMDirectory();
+    //analyzer = new WhitespaceAnalyzer();
+    analyzer = new StandardAnalyzer();
+    IndexWriter writer = new IndexWriter(directory,
+        analyzer, true);
+
+    for (String content : docs) {
+      Document doc = new Document();
+      doc.add(new Field("f", content, Field.Store.YES,
+          Field.Index.TOKENIZED));
+      writer.addDocument(doc);
+    }
+
+    writer.close();
+
+    searcher = new IndexSearcher(directory);
+  }
+
+  /** Longer sub-phrase runs must rank higher than shorter ones. */
+  public void testSubPhrase() throws Exception {
+    String search = "sub1 sub2 sub3 sub4";
+    SubPhraseQuery pq = new SubPhraseQuery();
+    String[] terms = search.split("\\s+");
+    for (int i = 0; i < terms.length; i++) {
+      String term = terms[i];
+      pq.add(new Term("f", term));
+    }
+
+    SubPhraseQuery.SubPhraseConfig conf = new SubPhraseQuery.SubPhraseConfig();
+    conf.ignoreIdf = true;
+    conf.ignoreFieldNorms = true;
+    conf.matchOnlyLongest = false;
+    conf.ignoreDuplicates = true;
+    conf.phraseBoost = 2;
+    pq.setSubPhraseConf(conf);
+
+    Hits hits = searcher.search(pq);
+    assertTrue("returned correct # ", (hits.length() == 5));
+    // doc 1 contains the 4-word run "sub1 sub2 sub3 sub4" -> best score
+    assertTrue("returned correct match", (hits.id(0) == 1));
+    assertTrue("returned correct match", (hits.id(1) == 5));
+    assertTrue("returned correct match", (hits.id(2) == 4));
+    assertTrue("returned correct match", (hits.id(3) == 0));
+  }
+
+  /** With matchOnlyLongest, only the best single run contributes. */
+  public void testSubPhraseMatchLongest() throws Exception {
+    String search = "sub1 sub2 sub3 sub4";
+    SubPhraseQuery pq = new SubPhraseQuery();
+    String[] terms = search.split("\\s+");
+    for (int i = 0; i < terms.length; i++) {
+      String term = terms[i];
+      pq.add(new Term("f", term));
+    }
+
+    SubPhraseQuery.SubPhraseConfig conf = new SubPhraseQuery.SubPhraseConfig();
+    conf.ignoreIdf = true;
+    conf.ignoreFieldNorms = true;
+    conf.matchOnlyLongest = true;
+    conf.ignoreDuplicates = true;
+    conf.phraseBoost = 2;
+    pq.setSubPhraseConf(conf);
+
+    Hits hits = searcher.search(pq);
+    assertTrue("returned correct # ", (hits.length() == 5));
+    assertTrue("returned correct match", (hits.id(0) == 1));
+    // docs with equal longest runs must tie when extra runs are ignored
+    assertTrue("returned correct match", (hits.score(1) == hits.score(2)));
+    assertTrue("returned correct match", (hits.id(3) == 0));
+  }
+
+  /** phraseBoost = 1 scores a run by its plain length (no exponent). */
+  public void testSubPhrasePhraseBoost() throws Exception {
+    String search = "sub1 sub2 sub3 sub4";
+    SubPhraseQuery pq = new SubPhraseQuery();
+    String[] terms = search.split("\\s+");
+    for (int i = 0; i < terms.length; i++) {
+      String term = terms[i];
+      pq.add(new Term("f", term));
+    }
+    // Test proven by setting phraseBoost to 1, compared to other tests
+    // where it is set to 2
+    SubPhraseQuery.SubPhraseConfig conf = new SubPhraseQuery.SubPhraseConfig();
+    conf.ignoreIdf = true;
+    conf.ignoreFieldNorms = true;
+    conf.matchOnlyLongest = false;
+    conf.ignoreDuplicates = false;
+    conf.phraseBoost = 1;
+    pq.setSubPhraseConf(conf);
+
+    Hits hits = searcher.search(pq);
+    assertTrue("returned correct # ", (hits.length() == 5));
+    assertTrue("returned correct match", (hits.id(0) == 1));
+    assertTrue("returned correct match", (hits.id(1) == 0));
+  }
+
+  /** Repeated identical runs must not add extra score when ignoreDuplicates is on. */
+  public void testSubPhraseIgnoreDuplicates() throws Exception {
+
+    RAMDirectory directory = new RAMDirectory();
+    //analyzer = new WhitespaceAnalyzer();
+    StandardAnalyzer analyzer = new StandardAnalyzer();
+    IndexWriter writer = new IndexWriter(directory,
+        analyzer, true);
+
+    for (String content : docsForIgnoreDuplicateTest) {
+      Document doc = new Document();
+      doc.add(new Field("f", content, Field.Store.YES,
+          Field.Index.TOKENIZED));
+      writer.addDocument(doc);
+    }
+
+    writer.close();
+
+    IndexSearcher searcher = new IndexSearcher(directory);
+
+    String search = "sub1 sub2 sub3 sub4";
+    SubPhraseQuery pq = new SubPhraseQuery();
+    String[] terms = search.split("\\s+");
+    for (int i = 0; i < terms.length; i++) {
+      String term = terms[i];
+      pq.add(new Term("f", term));
+    }
+    // ignoreDuplicates is on: the repeated "sub1 sub2" runs in doc 3
+    // should be counted only once
+    SubPhraseQuery.SubPhraseConfig conf = new SubPhraseQuery.SubPhraseConfig();
+    conf.ignoreIdf = true;
+    conf.ignoreFieldNorms = true;
+    conf.matchOnlyLongest = false;
+    conf.ignoreDuplicates = true;
+    conf.phraseBoost = 1;
+    pq.setSubPhraseConf(conf);
+
+    Hits hits = searcher.search(pq);
+    assertTrue("returned correct # ", (hits.length() == 5));
+    assertTrue("returned correct match", (hits.id(0) == 1));
+    assertTrue("returned correct match", (hits.id(1) == 5));
+    assertTrue("returned correct match", (hits.score(2) == hits.score(3)));
+    assertTrue("returned correct match", (hits.id(4) == 3));
+  }
+}