Index: lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java (revision 1297129) +++ lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java (working copy) @@ -22,24 +22,24 @@ import org.apache.lucene.index.*; -final class ExactPhraseScorer extends Scorer { - private final byte[] norms; - private final float value; +class ExactPhraseScorer extends Scorer { + protected final byte[] norms; + protected final float value; private static final int SCORE_CACHE_SIZE = 32; private final float[] scoreCache = new float[SCORE_CACHE_SIZE]; - private final int endMinus1; + protected final int endMinus1; - private final static int CHUNK = 4096; + protected final static int CHUNK = 4096; - private int gen; - private final int[] counts = new int[CHUNK]; - private final int[] gens = new int[CHUNK]; + protected int gen; + protected final int[] counts = new int[CHUNK]; + protected final int[] gens = new int[CHUNK]; boolean noDocs; - private final static class ChunkState { + protected static class ChunkState { final TermPositions posEnum; final int offset; final boolean useAdvance; @@ -55,9 +55,9 @@ } } - private final ChunkState[] chunkStates; + protected final ChunkState[] chunkStates; - private int docID = -1; + protected int docID = -1; private int freq; ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, @@ -79,7 +79,7 @@ // ANDing. 
This buys ~15% gain for phrases where // freq of rarest 2 terms is close: final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq; - chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance); + chunkStates[i] = newChunkState(postings[i].postings, postings[i].position, useAdvance); if (i > 0 && !postings[i].postings.next()) { noDocs = true; return; @@ -91,6 +91,11 @@ } } + /** create a new ChunkState (extension point) */ + protected ChunkState newChunkState(TermPositions postings, int position, boolean useAdvance) { + return new ChunkState(postings, -position, useAdvance); + } + @Override public int nextDoc() throws IOException { while(true) { @@ -151,8 +156,7 @@ // phrase occurs docID = doc; - freq = phraseFreq(); - if (freq != 0) { + if (computePhraseFreq()) { return docID; } } @@ -193,8 +197,7 @@ // this doc has all the terms -- now test whether // phrase occurs docID = doc; - freq = phraseFreq(); - if (freq != 0) { + if (computePhraseFreq()) { return docID; } } @@ -234,7 +237,8 @@ return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize } - private int phraseFreq() throws IOException { + /** check if a document matches. 
Side effect for matches: set the phrase frequency */ + protected boolean computePhraseFreq() throws IOException { freq = 0; @@ -349,6 +353,6 @@ chunkEnd += CHUNK; } - return freq; + return freq > 0; } } Index: lucene/core/src/java/org/apache/lucene/search/NonExactPhraseScorer.java =================================================================== --- lucene/core/src/java/org/apache/lucene/search/NonExactPhraseScorer.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/search/NonExactPhraseScorer.java (working copy) @@ -0,0 +1,172 @@ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.search.PhraseQuery.PostingsAndFreq; + +public class NonExactPhraseScorer extends ExactPhraseScorer { + + private final int slop; //TODO use for the score + private float freq; // cannot use super.freq, latter is int, optimized that way. + + protected static class ChunkState extends ExactPhraseScorer.ChunkState { + + /** iteration with slop window */ + public ChunkState(TermPositions posEnum, int offset, boolean useAdvance) { + super(posEnum, offset, useAdvance); + } + + public void init() throws IOException { + posLimit = posEnum.freq(); + pos = offset + posEnum.nextPosition(); + posUpto = 1; + lastPos = -1; + } + + } + + NonExactPhraseScorer(Weight weight, PostingsAndFreq[] postings, + Similarity similarity, int slop, byte[] norms) throws IOException { + super(weight, postings, similarity, norms); + this.slop = slop; + } + + @Override + public String toString() { + return "NonExactPhraseScorer(" + weight + ")"; + } + + @Override + public float freq() { + return freq; + } + + @Override + public float score() throws IOException { + final float raw; // raw score + //TODO some score caching? + raw = getSimilarity().tf(freq) * value; + return norms == null ? 
raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize + } + + @Override + protected boolean computePhraseFreq() throws IOException { + freq = 0.0f; // reset the field itself; a local declared here would shadow it and leave freq()/score() stuck at 0 + + // init chunks + for(int i=0;i cs.lastPos) { + cs.lastPos = cs.pos; + final int posIndex = cs.pos - chunkStart; + counts[posIndex] = 1; //TODO Sloppy: increment more, and more cells + assert gens[posIndex] != gen; + gens[posIndex] = gen; + } + + if (cs.posUpto == cs.posLimit) { + end = true; + break; + } + cs.posUpto++; + cs.pos = cs.offset + cs.posEnum.nextPosition(); + } + } + + // middle terms + boolean any = true; + for(int t=1;t cs.lastPos) { + cs.lastPos = cs.pos; + final int posIndex = cs.pos - chunkStart; + if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) { // TODO sloppy: different condition + // viable + counts[posIndex]++; // TODO sloppy: increment more, and more cells + any = true; + } + } + + if (cs.posUpto == cs.posLimit) { + end = true; + break; + } + cs.posUpto++; + cs.pos = cs.offset + cs.posEnum.nextPosition(); + } + + if (!any) { + break; + } + } + + if (!any) { + // petered out for this chunk + chunkStart += CHUNK; + chunkEnd += CHUNK; + continue; + } + + // last term + + { + final ChunkState cs = (ChunkState) chunkStates[endMinus1]; + while(cs.pos < chunkEnd) { + if (cs.pos > cs.lastPos) { + cs.lastPos = cs.pos; + final int posIndex = cs.pos - chunkStart; + if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) { // TODO sloppy: different condition + freq++; //TODO consider slop + } + } + + if (cs.posUpto == cs.posLimit) { + end = true; + break; + } + cs.posUpto++; + cs.pos = cs.offset + cs.posEnum.nextPosition(); + } + } + + chunkStart += CHUNK; + chunkEnd += CHUNK; + } + + return freq>0; + } + + @Override + protected ExactPhraseScorer.ChunkState newChunkState(TermPositions postings, int position, boolean useAdvance) { + return new ChunkState(postings, -position, useAdvance); + } +} Property changes on: 
lucene/core/src/java/org/apache/lucene/search/NonExactPhraseScorer.java ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* Added: svn:eol-style ## -0,0 +1 ## +native