Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt	(revision 955429)
+++ lucene/CHANGES.txt	(working copy)
@@ -194,6 +194,11 @@
 * LUCENE-2489: Added PerFieldCodecWrapper (in oal.index.codecs) which
   lets you set the Codec per field (Mike McCandless)
 
+Optimizations
+
+* LUCENE-2410: ~2.5X speedup on exact (slop=0) PhraseQuery matching.
+  (Mike McCandless)
+
 ======================= Lucene 3.x (not yet released) =======================

Index: lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java
===================================================================
--- lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java	(revision 955429)
+++ lucene/src/test/org/apache/lucene/search/TestPhraseQuery.java	(working copy)
@@ -19,18 +19,21 @@
 
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.document.*;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.Term;
+import org.apache.lucene.index.*;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.*;
 import org.apache.lucene.util.Version;
+import org.apache.lucene.util._TestUtil;
 
 import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Random;
 
 /**
  * Tests {@link PhraseQuery}.
@@ -331,11 +334,11 @@
     writer.addDocument(doc);
 
     Document doc2 = new Document();
-    doc2.add(new Field("field", "foo firstname xxx lastname foo", Field.Store.YES, Field.Index.ANALYZED));
+    doc2.add(new Field("field", "foo firstname zzz lastname foo", Field.Store.YES, Field.Index.ANALYZED));
     writer.addDocument(doc2);
 
     Document doc3 = new Document();
-    doc3.add(new Field("field", "foo firstname xxx yyy lastname foo", Field.Store.YES, Field.Index.ANALYZED));
+    doc3.add(new Field("field", "foo firstname zzz yyy lastname foo", Field.Store.YES, Field.Index.ANALYZED));
     writer.addDocument(doc3);
 
     writer.optimize();
@@ -517,6 +520,9 @@
     //System.out.println("(exact) field: one two three: "+score0);
     QueryUtils.check(query,searcher);
 
+    // just make sure no exc:
+    searcher.explain(query, 0);
+
     // search on non palyndrome, find phrase with slop 3, though no slop required here.
     query.setSlop(4); // to use sloppy scorer
     hits = searcher.search(query, null, 1000).scoreDocs;
@@ -533,6 +539,10 @@
     query.add(new Term("palindrome", "two"));
     query.add(new Term("palindrome", "three"));
     hits = searcher.search(query, null, 1000).scoreDocs;
+
+    // just make sure no exc:
+    searcher.explain(query, 0);
+
     assertEquals("just sloppy enough", 1, hits.length);
     //float score2 = hits[0].score;
     //System.out.println("palindrome: one two three: "+score2);
@@ -572,4 +582,93 @@
     Query rewritten = pq.rewrite(searcher.getIndexReader());
     assertTrue(rewritten instanceof TermQuery);
   }
+
+  public void testRandomPhrases() throws Exception {
+    Directory dir = new MockRAMDirectory();
+    Analyzer analyzer = new MockAnalyzer();
+
+    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
+    List<List<String>> docs = new ArrayList<List<String>>();
+    Document d = new Document();
+    Field f = new Field("f", "", Field.Store.NO, Field.Index.ANALYZED);
+    d.add(f);
+
+    Random r = newRandom();
+
+    int NUM_DOCS = 10*_TestUtil.getRandomMultiplier();
+    for(int i=0;i<NUM_DOCS;i++) {
+      // must be > 4096 so it spans multiple chunks
+      int termCount = _TestUtil.nextInt(r, 10000, 30000);
+
+      List<String> doc = new ArrayList<String>();
+
+      StringBuilder sb = new StringBuilder();
+      while(doc.size() < termCount) {
+        if (r.nextInt(5) == 1 || docs.size() == 0) {
+          // make new non-empty-string term
+          String term;
+          while(true) {
+            term = _TestUtil.randomUnicodeString(r);
+            if (term.length() > 0) {
+              break;
+            }
+          }
+          TokenStream ts = analyzer.reusableTokenStream("ignore", new StringReader(term));
+          CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
+          while(ts.incrementToken()) {
+            String text = termAttr.toString();
+            doc.add(text);
+            sb.append(text).append(' ');
+          }
+        } else {
+          // pick existing sub-phrase
+          List<String> lastDoc = docs.get(r.nextInt(docs.size()));
+          int len = _TestUtil.nextInt(r, 1, 10);
+          int start = r.nextInt(lastDoc.size()-len);
+          for(int k=start;k<start+len;k++) {
+            doc.add(lastDoc.get(k));
+            sb.append(lastDoc.get(k)).append(' ');
+          }
+        }
+      }
+      docs.add(doc);
+      f.setValue(sb.toString());
+      w.addDocument(d);
+    }
+
+    IndexReader reader = w.getReader();
+    IndexSearcher s = new IndexSearcher(reader);
+    w.close();
+
+    // now search
+    int num = 100*_TestUtil.getRandomMultiplier();
+    for(int i=0;i<num;i++) {
+      int docID = r.nextInt(docs.size());
+      List<String> doc = docs.get(docID);
+
+      final int numTerm = _TestUtil.nextInt(r, 2, 20);
+      final int start = r.nextInt(doc.size()-numTerm);
+      PhraseQuery pq = new PhraseQuery();
+      StringBuilder sb = new StringBuilder();
+      for(int t=start;t<start+numTerm;t++) {
+        pq.add(new Term("f", doc.get(t)));
+        sb.append(doc.get(t)).append(' ');
+      }
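A note on the test design above (commentary, not part of the patch). Two choices matter: documents are built to exceed 4096 terms because the new ExactPhraseScorer processes positions in 4096-position chunks, so phrase matches must be found across chunk boundaries; and each new document copies short sub-phrases out of earlier documents, so a phrase query built from a random slice of any indexed document is guaranteed to have at least one hit. The following standalone sketch (class, term alphabet, and constants invented for illustration) shows the shape of that generator without the Lucene plumbing:

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

public class PhraseTestDataSketch {
  public static void main(String[] args) {
    Random r = new Random(42);
    List<List<String>> docs = new ArrayList<List<String>>();
    final int minTerms = 10000; // > 4096, the scorer's chunk size
    for (int i = 0; i < 10; i++) {
      List<String> doc = new ArrayList<String>();
      while (doc.size() < minTerms) {
        if (r.nextInt(5) == 1 || docs.isEmpty()) {
          // fresh random term (invented alphabet)
          doc.add("t" + r.nextInt(1000));
        } else {
          // copy a short sub-phrase from an earlier doc, so a phrase
          // query built from that doc is guaranteed a match here too
          List<String> prev = docs.get(r.nextInt(docs.size()));
          int len = 1 + r.nextInt(10);
          int start = r.nextInt(prev.size() - len);
          doc.addAll(prev.subList(start, start + len));
        }
      }
      docs.add(doc);
    }
    System.out.println("built " + docs.size() + " synthetic docs");
  }
}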
Index: lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java	(revision 955429)
+++ lucene/src/java/org/apache/lucene/search/MultiPhraseQuery.java	(working copy)
+        if (terms.length > 1) {
+          postingsEnum = new UnionDocsAndPositionsEnum(reader, terms);
+
+          // coarse -- this overcounts since a given doc can
+          // have more than one terms:
+          docFreq = 0;
+          for(int j=0;j<terms.length;j++) {
+            docFreq += reader.docFreq(terms[j]);
+          }
+        }
+
+      // sort by increasing docFreq order
+      if (slop == 0) {
+        Arrays.sort(postingsFreqs);
+      }
+
+      if (slop == 0) {  // optimize exact case
+
+        // coarse optimization: advance(target) is fairly
+        // costly, so, if the relative freq of the 2nd
+        // rarest term is not that much (> 1/5th) rarer than
+        // the first term, then we just use .nextDoc() when
+        // ANDing.  This buys ~15% gain for phrases where
+        // freq of rarest 2 terms is close:
+        final boolean useAdvance = postingsFreqs[0].docFreq != 0 && postingsFreqs[1].docFreq / postingsFreqs[0].docFreq > 5;
+
+        ExactPhraseScorer s = new ExactPhraseScorer(this, postings, positions, similarity,
+                                                    reader.norms(field), useAdvance);
+        if (s.noDocs) {
+          return null;
+        } else {
+          return s;
+        }
+      } else {
+        return new SloppyPhraseScorer(this, postings, positions, similarity, slop, reader.norms(field));
+      }
     }
 
   @Override
@@ -231,13 +280,24 @@
       fieldExpl.setDescription("fieldWeight("+getQuery()+" in "+doc+
                                "), product of:");
 
-      PhraseScorer scorer = (PhraseScorer) scorer(reader, true, false);
+      Scorer scorer = (Scorer) scorer(reader, true, false);
       if (scorer == null) {
         return new Explanation(0.0f, "no matching docs");
       }
+      Explanation tfExplanation = new Explanation();
       int d = scorer.advance(doc);
-      float phraseFreq = (d == doc) ? scorer.currentFreq() : 0.0f;
+      float phraseFreq;
+      if (d == doc) {
+        if (slop == 0) {
+          phraseFreq = ((ExactPhraseScorer) scorer).currentFreq();
+        } else {
+          phraseFreq = ((SloppyPhraseScorer) scorer).currentFreq();
+        }
+      } else {
+        phraseFreq = 0.0f;
+      }
+
       tfExplanation.setValue(similarity.tf(phraseFreq));
       tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
       fieldExpl.addDetail(tfExplanation);
@@ -456,11 +516,17 @@
     List<DocsAndPositionsEnum> docsEnums = new LinkedList<DocsAndPositionsEnum>();
     final Bits delDocs = MultiFields.getDeletedDocs(indexReader);
     for (int i = 0; i < terms.length; i++) {
+      final BytesRef text = new BytesRef(terms[i].text());
       DocsAndPositionsEnum postings = indexReader.termPositionsEnum(delDocs,
                                                                     terms[i].field(),
-                                                                    new BytesRef(terms[i].text()));
+                                                                    text);
       if (postings != null) {
         docsEnums.add(postings);
+      } else {
+        if (MultiFields.getTermDocsEnum(indexReader, delDocs, terms[i].field(), text) != null) {
+          // term does exist, but has no positions
+          throw new IllegalStateException("field \"" + terms[i].field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + terms[i].text() + ")");
+        }
       }
     }
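Commentary (not part of the patch): the summed docFreq above is deliberately coarse. For a multi-term position, a document containing two of the alternative terms is counted twice, so the sum is only an upper bound -- but it is used solely to order the phrase's positions from rarest to most frequent, where an overestimate does no harm. A hypothetical sketch of that estimate:

// Hypothetical sketch: an upper bound on a term-union's docFreq.
final class UnionDocFreqSketch {
  // A doc holding k of the terms contributes k instead of 1, so
  // this overcounts; good enough for sorting positions by rareness.
  static int coarseDocFreq(int[] perTermDocFreqs) {
    int sum = 0;
    for (int df : perTermDocFreqs) {
      sum += df;
    }
    return sum;
  }
}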
Index: lucene/src/java/org/apache/lucene/search/PhraseQuery.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/PhraseQuery.java	(revision 955429)
+++ lucene/src/java/org/apache/lucene/search/PhraseQuery.java	(working copy)
@@ -20,6 +20,7 @@
 import java.io.IOException;
 import java.util.Set;
 import java.util.ArrayList;
+import java.util.Arrays;
 
 import org.apache.lucene.index.Term;
 import org.apache.lucene.util.BytesRef;
@@ -120,6 +121,22 @@
     return super.rewrite(reader);
   }
 
+  static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
+    final DocsAndPositionsEnum postings;
+    final int docFreq;
+    final int position;
+
+    public PostingsAndFreq(DocsAndPositionsEnum postings, int docFreq, int position) {
+      this.postings = postings;
+      this.docFreq = docFreq;
+      this.position = position;
+    }
+
+    public int compareTo(PostingsAndFreq other) {
+      return docFreq - other.docFreq;
+    }
+  }
+
   private class PhraseWeight extends Weight {
     private final Similarity similarity;
     private float value;
@@ -163,7 +180,7 @@
       if (terms.size() == 0)			  // optimize zero-term case
         return null;
 
-      DocsAndPositionsEnum[] postings = new DocsAndPositionsEnum[terms.size()];
+      PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()];
       final Bits delDocs = MultiFields.getDeletedDocs(reader);
       for (int i = 0; i < terms.size(); i++) {
         final Term t = terms.get(i);
@@ -183,17 +200,45 @@
             return null;
           }
         }
-        postings[i] = postingsEnum;
+        postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), text), positions.get(i).intValue());
       }
 
-      if (slop == 0)  // optimize exact case
-        return new ExactPhraseScorer(this, postings, getPositions(), similarity,
-                                     reader.norms(field));
-      else
+      // sort by increasing docFreq order
+      if (slop == 0) {
+        Arrays.sort(postingsFreqs);
+      }
+
+      final DocsAndPositionsEnum[] postings = new DocsAndPositionsEnum[postingsFreqs.length];
+      final int positions[] = new int[postingsFreqs.length];
+
+      for(int i=0;i<postingsFreqs.length;i++) {
+        postings[i] = postingsFreqs[i].postings;
+        positions[i] = postingsFreqs[i].position;
+      }
+
+      if (slop == 0) {  // optimize exact case
+
+        // coarse optimization: advance(target) is fairly
+        // costly, so, if the relative freq of the 2nd
+        // rarest term is not that much (> 1/5th) rarer than
+        // the first term, then we just use .nextDoc() when
+        // ANDing.  This buys ~15% gain for phrases where
+        // freq of rarest 2 terms is close:
+        final boolean useAdvance = postingsFreqs[0].docFreq != 0 && postingsFreqs[1].docFreq / postingsFreqs[0].docFreq > 5;
+
+        ExactPhraseScorer s = new ExactPhraseScorer(this, postings, positions, similarity,
+                                                    reader.norms(field), useAdvance);
+        if (s.noDocs) {
+          return null;
+        } else {
+          return s;
+        }
+      } else {
         return
-          new SloppyPhraseScorer(this, postings, getPositions(), similarity, slop,
+          new SloppyPhraseScorer(this, postings, positions, similarity, slop,
                                  reader.norms(field));
-
+      }
     }
 
     @Override
@@ -244,13 +289,23 @@
       fieldExpl.setDescription("fieldWeight("+field+":"+query+" in "+doc+
                                "), product of:");
 
-      PhraseScorer scorer = (PhraseScorer) scorer(reader, true, false);
+      Scorer scorer = (Scorer) scorer(reader, true, false);
       if (scorer == null) {
        return new Explanation(0.0f, "no matching docs");
       }
       Explanation tfExplanation = new Explanation();
       int d = scorer.advance(doc);
-      float phraseFreq = (d == doc) ? scorer.currentFreq() : 0.0f;
+      float phraseFreq;
+      if (d == doc) {
+        if (slop == 0) {
+          phraseFreq = ((ExactPhraseScorer) scorer).currentFreq();
+        } else {
+          phraseFreq = ((SloppyPhraseScorer) scorer).currentFreq();
+        }
+      } else {
+        phraseFreq = 0.0f;
+      }
+
       tfExplanation.setValue(similarity.tf(phraseFreq));
       tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
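Commentary (not part of the patch): after sorting by docFreq, the rarest term drives document matching and the other postings catch up to its candidate doc, either with advance(target) or with repeated nextDoc(). The heuristic above enables advance() only when the second-rarest term is more than 5x more frequent than the rarest; when the two docFreqs are close, a few sequential nextDoc() calls tend to be cheaper than advance()'s seek. A minimal sketch of that policy, with invented names:

// Invented interface standing in for a postings iterator.
interface PostingsCursor {
  int docID();
  int nextDoc();            // returns Integer.MAX_VALUE when exhausted
  int advance(int target);  // skips to first doc >= target
}

final class AndPolicySketch {
  // Catch one cursor up to the rarest term's candidate doc.
  static int catchUp(PostingsCursor c, int target, boolean useAdvance) {
    int doc = c.docID();
    if (useAdvance) {
      // term is much more frequent than the rarest: seek
      if (doc < target) doc = c.advance(target);
    } else {
      // docFreqs are close: linear scan is cheaper than seeking
      while (doc < target) doc = c.nextDoc();
    }
    return doc; // == target means candidate survives the AND
  }
}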
Index: lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java	(revision 955429)
+++ lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java	(working copy)
@@ -18,39 +18,300 @@
  */
 
 import java.io.IOException;
+import java.util.Arrays;
+
 import org.apache.lucene.index.*;
 
-final class ExactPhraseScorer extends PhraseScorer {
+final class ExactPhraseScorer extends Scorer {
+  private final Weight weight;
+  private final byte[] norms;
+  private final float value;
 
+  private static final int SCORE_CACHE_SIZE = 32;
+  private final float[] scoreCache = new float[SCORE_CACHE_SIZE];
+
+  private final int endMinus1;
+
+  private final boolean useAdvance;
+
+  private final static int CHUNK = 4096;
+
+  private int gen;
+  private final int[] counts = new int[CHUNK];
+  private final int[] gens = new int[CHUNK];
+
+  boolean noDocs;
+
+  private final static class ChunkState {
+    final DocsAndPositionsEnum posEnum;
+    final int offset;
+    int posUpto;
+    int posLimit;
+    int pos;
+    int lastPos;
+
+    public ChunkState(DocsAndPositionsEnum posEnum, int offset) {
+      this.posEnum = posEnum;
+      this.offset = offset;
+    }
+  }
+
+  private final ChunkState[] chunkStates;
+
+  private int docID = -1;
+  private int freq;
+
   ExactPhraseScorer(Weight weight, DocsAndPositionsEnum[] postings, int[] offsets,
-                    Similarity similarity, byte[] norms) {
-    super(weight, postings, offsets, similarity, norms);
+                    Similarity similarity, byte[] norms, boolean useAdvance) throws IOException {
+    super(similarity);
+    this.weight = weight;
+    this.norms = norms;
+    this.value = weight.getValue();
+    this.useAdvance = useAdvance;
+
+    chunkStates = new ChunkState[postings.length];
+
+    endMinus1 = offsets.length-1;
+
+    for(int i=0;i<postings.length;i++) {
+      chunkStates[i] = new ChunkState(postings[i], -offsets[i]);
+      if (i > 0 && postings[i].nextDoc() == DocsEnum.NO_MORE_DOCS) {
+        noDocs = true;
+        return;
+      }
+    }
+
+    for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
+      scoreCache[i] = getSimilarity().tf((float) i) * value;
+    }
   }
 
   @Override
-  protected final float phraseFreq() throws IOException {
-    // sort list with pq
-    pq.clear();
-    for (PhrasePositions pp = first; pp != null; pp = pp.next) {
-      pp.firstPosition();
-      pq.add(pp);         // build pq from list
-    }
-    pqToList();           // rebuild list from pq
-
-    // for counting how many times the exact phrase is found in current document,
-    // just count how many times all PhrasePosition's have exactly the same position.
-    int freq = 0;
-    do {                                          // find position w/ all terms
-      while (first.position < last.position) {   // scan forward in first
-        do {
-          if (!first.nextPosition())
-            return freq;
-        } while (first.position < last.position);
-        firstToLast();
+  public int nextDoc() throws IOException {
+    while(true) {
+
+      // first (rarest) term
+      final int doc = chunkStates[0].posEnum.nextDoc();
+      if (doc == DocsEnum.NO_MORE_DOCS) {
+        docID = doc;
+        return doc;
+      }
+
+      // not-first terms
+      int i = 1;
+      while(i < chunkStates.length) {
+        int doc2 = chunkStates[i].posEnum.docID();
+        if (useAdvance) {
+          if (doc2 < doc) {
+            doc2 = chunkStates[i].posEnum.advance(doc);
+          }
+        } else {
+          while (doc2 < doc) {
+            doc2 = chunkStates[i].posEnum.nextDoc();
+          }
+        }
+        if (doc2 > doc) {
+          break;
+        }
+        i++;
+      }
+
+      if (i == chunkStates.length) {
+        // this doc has all the terms -- now test whether
+        // phrase occurs
+        docID = doc;
+
+        freq = phraseFreq();
+        if (freq != 0) {
+          return docID;
+        }
+      }
+    }
+  }
+
+  @Override
+  public int advance(int target) throws IOException {
+
+    // first term
+    int doc = chunkStates[0].posEnum.advance(target);
+    if (doc == DocsEnum.NO_MORE_DOCS) {
+      docID = DocsEnum.NO_MORE_DOCS;
+      return doc;
+    }
+
+    while(true) {
+
+      // not-first terms
+      int i = 1;
+      while(i < chunkStates.length) {
+        int doc2 = chunkStates[i].posEnum.docID();
+        if (doc2 < doc) {
+          doc2 = chunkStates[i].posEnum.advance(doc);
+        }
+        if (doc2 > doc) {
+          break;
+        }
+        i++;
       }
-      freq++;                                     // all equal: a match
-    } while (last.nextPosition());
-
+
+      if (i == chunkStates.length) {
+        // this doc has all the terms -- now test whether
+        // phrase occurs
+        docID = doc;
+        freq = phraseFreq();
+        if (freq != 0) {
+          return docID;
+        }
+      }
+
+      doc = chunkStates[0].posEnum.nextDoc();
+      if (doc == DocsEnum.NO_MORE_DOCS) {
+        docID = doc;
+        return doc;
+      }
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "ExactPhraseScorer(" + weight + ")";
+  }
+
+  // used by MultiPhraseQuery
+  float currentFreq() {
+    return freq;
+  }
+
+  @Override
+  public int docID() {
+    return docID;
+  }
+
+  @Override
+  public float score() throws IOException {
+    final float raw; // raw score
+    if (freq < SCORE_CACHE_SIZE) {
+      raw = scoreCache[freq];
+    } else {
+      raw = getSimilarity().tf((float) freq) * value;
+    }
+    return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[docID]);   // normalize
+  }
+
+  private int phraseFreq() throws IOException {
+
+    freq = 0;
+
+    // init chunks
+    for(int i=0;i<chunkStates.length;i++) {
+      final ChunkState cs = chunkStates[i];
+      cs.posLimit = cs.posEnum.freq();
+      cs.pos = cs.offset + cs.posEnum.nextPosition();
+      cs.posUpto = 1;
+      cs.lastPos = -1;
+    }
+
+    int chunkStart = 0;
+    int chunkEnd = CHUNK;
+
+    // process chunk by chunk
+    boolean end = false;
+
+    while(!end) {
+
+      gen++;
+
+      if (gen == 0) {
+        // wraparound
+        Arrays.fill(gens, 0);
+        gen++;
+      }
+
+      // first term
+      {
+        final ChunkState c = chunkStates[0];
+        while(c.pos < chunkEnd) {
+          if (c.pos > c.lastPos) {
+            c.lastPos = c.pos;
+            final int posIndex = c.pos - chunkStart;
+            counts[posIndex] = 1;
+            assert gens[posIndex] != gen;
+            gens[posIndex] = gen;
+          }
+
+          if (c.posUpto == c.posLimit) {
+            end = true;
+            break;
+          }
+          c.posUpto++;
+          c.pos = c.offset + c.posEnum.nextPosition();
+        }
+      }
+
+      // middle terms
+      boolean any = true;
+      for(int t=1;t<endMinus1;t++) {
+        final ChunkState c = chunkStates[t];
+        any = false;
+        while(c.pos < chunkEnd) {
+          if (c.pos > c.lastPos) {
+            c.lastPos = c.pos;
+            final int posIndex = c.pos - chunkStart;
+            if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
+              // viable
+              counts[posIndex]++;
+              any = true;
+            }
+          }
+
+          if (c.posUpto == c.posLimit) {
+            end = true;
+            break;
+          }
+          c.posUpto++;
+          c.pos = c.offset + c.posEnum.nextPosition();
+        }
+
+        if (!any) {
+          break;
+        }
+      }
+
+      if (!any) {
+        // petered out for this chunk
+        chunkStart += CHUNK;
+        chunkEnd += CHUNK;
+        continue;
+      }
+
+      // last term
+
+      {
+        final ChunkState c = chunkStates[endMinus1];
+        while(c.pos < chunkEnd) {
+          if (c.pos > c.lastPos) {
+            c.lastPos = c.pos;
+            final int posIndex = c.pos - chunkStart;
+            if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
+              freq++;
+            }
+          }
+
+          if (c.posUpto == c.posLimit) {
+            end = true;
+            break;
+          }
+          c.posUpto++;
+          c.pos = c.offset + c.posEnum.nextPosition();
+        }
+      }
+
+      chunkStart += CHUNK;
+      chunkEnd += CHUNK;
+    }
+
+    return freq;
+  }
+}
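Commentary (not part of the patch): the heart of the change is phraseFreq() above. Each term's positions are shifted by minus its offset in the phrase (the -offsets[i] passed to ChunkState), so an exact occurrence makes every term land on the same shifted position. Positions are then scanned in 4096-wide chunks: counts[p] records how many leading phrase terms have matched shifted position p in the current chunk, and gens[p] records which generation (chunk pass) that count belongs to, so neither array is ever cleared -- bumping gen invalidates the whole chunk at once. The following self-contained sketch (invented class, simplified: positions held in arrays rather than streamed from postings) demonstrates the same counting scheme:

// Standalone sketch of the chunked gen/counts phrase counting.
final class ChunkedPhraseCountSketch {
  static final int CHUNK = 4096;

  // termPositions[t] = sorted doc positions of phrase term t,
  // already shifted by -t (its offset in the phrase)
  static int phraseFreq(int[][] termPositions) {
    int freq = 0;
    final int[] counts = new int[CHUNK];
    final int[] gens = new int[CHUNK];
    int gen = 0;
    int maxPos = 0;
    for (int[] ps : termPositions)
      for (int p : ps) maxPos = Math.max(maxPos, p);
    for (int chunkStart = 0; chunkStart <= maxPos; chunkStart += CHUNK) {
      gen++; // new generation per chunk: stale counts are ignored
      for (int t = 0; t < termPositions.length; t++) {
        boolean any = false;
        for (int pos : termPositions[t]) {
          int idx = pos - chunkStart;
          if (idx < 0 || idx >= CHUNK) continue; // outside this chunk
          if (t == 0) {
            counts[idx] = 1;   // first term seeds the count
            gens[idx] = gen;
            any = true;
          } else if (gens[idx] == gen && counts[idx] == t) {
            counts[idx]++;     // terms 0..t all aligned here
            any = true;
            if (t == termPositions.length - 1) freq++; // full phrase
          }
        }
        if (!any) break; // no viable alignment survives this chunk
      }
    }
    return freq;
  }

  public static void main(String[] args) {
    // phrase "b c" in "a b c b c": "b" at {1,3}; "c" at {2,4},
    // shifted by -1 -> {1,3}; two aligned positions -> freq 2
    System.out.println(phraseFreq(new int[][]{{1, 3}, {1, 3}}));
  }
}

In the real scorer the positions arrive incrementally from DocsAndPositionsEnum.nextPosition(), and the per-term scan aborts as soon as no viable alignment survives a chunk ("petered out"), which is what the any flag tracks.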