Index: lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (revision 955326) +++ lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java (working copy) @@ -171,7 +171,10 @@ if (termArrays.size() == 0) // optimize zero-term case return null; + final Bits delDocs = MultiFields.getDeletedDocs(reader); + DocsAndPositionsEnum[] postings = new DocsAndPositionsEnum[termArrays.size()]; + for (int i=0; i 1) { postingsEnum = new UnionDocsAndPositionsEnum(reader, terms); } else { - postingsEnum = reader.termPositionsEnum(MultiFields.getDeletedDocs(reader), + final BytesRef text = new BytesRef(terms[0].text()); + postingsEnum = reader.termPositionsEnum(delDocs, terms[0].field(), - new BytesRef(terms[0].text())); - } + text); - if (postingsEnum == null) { - return null; + if (postingsEnum == null) { + if (MultiFields.getTermDocsEnum(reader, delDocs, terms[0].field(), text) != null) { + // term does exist, but has no positions + throw new IllegalStateException("field \"" + terms[0].field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + terms[0].text() + ")"); + } else { + // term does not exist + return null; + } + } } postings[i] = postingsEnum; @@ -231,13 +241,25 @@ fieldExpl.setDescription("fieldWeight("+getQuery()+" in "+doc+ "), product of:"); - PhraseScorer scorer = (PhraseScorer) scorer(reader, true, false); + Scorer scorer = (Scorer) scorer(reader, true, false); if (scorer == null) { return new Explanation(0.0f, "no matching docs"); } + // nocommit -- if not already, add test case calling + // explain() for slop = 0 / non-zero cases Explanation tfExplanation = new Explanation(); int d = scorer.advance(doc); - float phraseFreq = (d == doc) ? 
scorer.currentFreq() : 0.0f; + float phraseFreq; + if (d == doc) { + if (slop == 0) { + phraseFreq = ((ExactPhraseScorer) scorer).currentFreq(); + } else { + phraseFreq = ((SloppyPhraseScorer) scorer).currentFreq(); + } + } else { + phraseFreq = 0.0f; + } + tfExplanation.setValue(similarity.tf(phraseFreq)); tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); fieldExpl.addDetail(tfExplanation); @@ -456,11 +478,17 @@ List docsEnums = new LinkedList(); final Bits delDocs = MultiFields.getDeletedDocs(indexReader); for (int i = 0; i < terms.length; i++) { + final BytesRef text = new BytesRef(terms[i].text()); DocsAndPositionsEnum postings = indexReader.termPositionsEnum(delDocs, terms[i].field(), - new BytesRef(terms[i].text())); + text); if (postings != null) { docsEnums.add(postings); + } else { + if (MultiFields.getTermDocsEnum(indexReader, delDocs, terms[i].field(), text) != null) { + // term does exist, but has no positions + throw new IllegalStateException("field \"" + terms[i].field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + terms[i].text() + ")"); + } } } Index: lucene/src/java/org/apache/lucene/search/PhraseQuery.java =================================================================== --- lucene/src/java/org/apache/lucene/search/PhraseQuery.java (revision 955326) +++ lucene/src/java/org/apache/lucene/search/PhraseQuery.java (working copy) @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Set; import java.util.ArrayList; +import java.util.Arrays; import org.apache.lucene.index.Term; import org.apache.lucene.util.BytesRef; @@ -120,6 +121,24 @@ return super.rewrite(reader); } + private static class PostingsAndFreq implements Comparable<PostingsAndFreq> { + final DocsAndPositionsEnum postings; + final int docFreq; + final int position; + final BytesRef text; // nocommit only used for debugging + + public PostingsAndFreq(BytesRef text, DocsAndPositionsEnum postings, int docFreq, int position) { + 
this.text = text; + this.postings = postings; + this.docFreq = docFreq; + this.position = position; + } + + public int compareTo(PostingsAndFreq other) { + return docFreq - other.docFreq; + } + } + private class PhraseWeight extends Weight { private final Similarity similarity; private float value; @@ -163,7 +182,7 @@ if (terms.size() == 0) // optimize zero-term case return null; - DocsAndPositionsEnum[] postings = new DocsAndPositionsEnum[terms.size()]; + PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()]; final Bits delDocs = MultiFields.getDeletedDocs(reader); for (int i = 0; i < terms.size(); i++) { final Term t = terms.get(i); @@ -183,15 +202,31 @@ return null; } } - postings[i] = postingsEnum; + postingsFreqs[i] = new PostingsAndFreq(text, postingsEnum, reader.docFreq(t.field(), text), positions.get(i).intValue()); } + // sort by increasing docFreq order + //if (slop == 0) { + Arrays.sort(postingsFreqs); + //} + + final DocsAndPositionsEnum[] postings = new DocsAndPositionsEnum[postingsFreqs.length]; + final int positions[] = new int[postingsFreqs.length]; + + //System.out.println("PQ.scorer"); + for(int i=0;i doc) { + break; + } + i++; + } + + if (i == postings.length) { + // this doc has all the terms -- now test whether + // phrase occurs + docID = doc; + //System.out.println(" has all terms"); + + freq = phraseFreq(); + if (freq != 0) { + //System.out.println(" freq=" + freq); + return docID; + } else { + //System.out.println(" no phrase"); + } + } } - pqToList(); // rebuild list from pq + } - // for counting how many times the exact phrase is found in current document, - // just count how many times all PhrasePosition's have exactly the same position. 
- int freq = 0; - do { // find position w/ all terms - while (first.position < last.position) { // scan forward in first - do { - if (!first.nextPosition()) - return freq; - } while (first.position < last.position); - firstToLast(); + @Override + public int advance(int target) throws IOException { + //System.out.println("PQ.advance target=" + target); + int doc = postings[0].advance(target); + if (doc == DocsEnum.NO_MORE_DOCS) { + docID = DocsEnum.NO_MORE_DOCS; + //System.out.println(" end"); + return doc; + } + + while(true) { + + // all middle terms + // nocommit -- try sometimes not using .skipTo here? + int i = 1; + while(i < postings.length) { + int doc2 = postings[i].docID(); + if (doc2 < doc) { + doc2 = postings[i].advance(doc); + } + if (doc2 > doc) { + break; + } + i++; } - freq++; // all equal: a match - } while (last.nextPosition()); - + + if (i == postings.length) { + // this doc has all the terms -- now test whether + // phrase occurs + docID = doc; + freq = phraseFreq(); + //System.out.println(" doc=" + doc); + if (freq != 0) { + //System.out.println(" freq=" + freq); + return docID; + } + } + + doc = postings[0].nextDoc(); + if (doc == DocsEnum.NO_MORE_DOCS) { + docID = doc; + //System.out.println(" end2"); + return doc; + } + } + } + + @Override + public String toString() { + return "ExactPhraseScorer(" + weight + ")"; + } + + // used by MultiPhraseQuery + float currentFreq() { return freq; } + + @Override + public int docID() { + return docID; + } + + @Override + public float score() throws IOException { + //System.out.println("scoring " + first.doc); + final float raw; // raw score + if (freq < SCORE_CACHE_SIZE) { + raw = scoreCache[freq]; + } else { + raw = getSimilarity().tf((float) freq) * value; + } + return norms == null ? 
raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize + } + + private int phraseFreq() throws IOException { + + // nocommit -- we can do better -- as we visit each, + // skip the pos if count != my slot + + // TODO: break into chunks, so long docs don't take too + // much RAM + + // TODO: conceivably, a different ordering of terms + // could have better perf than the inverse-docFreq order + // that's best for ANDing. though this should be + // unusual. + + // nocommit: do this in chunks, just like BooleanScorer + + // first term + { + final DocsAndPositionsEnum posEnum = postings[0]; + final int limit = posEnum.freq(); + final int offset = offsets[0]; + //System.out.println(" pp start offset=" + offset); + int lastPos = -1; + for(int i=0;i